[build] Remove support for the yasm asm build

This commit is contained in:
Marko Viitanen 2022-04-28 14:12:09 +03:00
parent f8375f9bc6
commit 227556a13e
14 changed files with 2 additions and 3200 deletions

View file

@ -1,33 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ImportGroup Label="PropertySheets" />
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)-$(Configuration)\</IntDir>
<OutDir>$(SolutionDir)..\bin\$(Platform)-$(Configuration)\</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<CompileAs>CompileAsC</CompileAs>
<WarningLevel>Level4</WarningLevel>
<AssemblerOutput>AssemblyAndSourceCode</AssemblerOutput>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<PreprocessorDefinitions>KVZ_DLL_EXPORTS;KVZ_COMPILE_ASM;WIN32_LEAN_AND_MEAN;WIN32;WIN64;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(SolutionDir)..\src\threadwrapper\include;$(SolutionDir)..\src;$(SolutionDir)..\src\extras;$(SolutionDir)..\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4244;4204;4206;4028;4152;4996;4018;4456;4389;4100;4131;4459;4706;4214;4127;4201</DisableSpecificWarnings>
<OpenMPSupport>false</OpenMPSupport>
<TreatSpecificWarningsAsErrors>4013;4029;4047;4716;4700;4020;4021;4133</TreatSpecificWarningsAsErrors>
</ClCompile>
<Link>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
<GenerateDebugInformation>true</GenerateDebugInformation>
<SubSystem>Console</SubSystem>
<RandomizedBaseAddress>false</RandomizedBaseAddress>
</Link>
<YASM>
<Defines>HAVE_ALIGNED_STACK=1</Defines>
<IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths)</IncludePaths>
</YASM>
</ItemDefinitionGroup>
<ItemGroup />
</Project>

View file

@ -1,26 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ImportGroup Label="PropertySheets" />
<PropertyGroup Label="UserMacros" />
<PropertyGroup />
<ItemDefinitionGroup>
<ClCompile>
<FloatingPointModel>Fast</FloatingPointModel>
<Optimization>Full</Optimization>
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<IntrinsicFunctions>true</IntrinsicFunctions>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<BufferSecurityCheck>false</BufferSecurityCheck>
<WholeProgramOptimization>true</WholeProgramOptimization>
<OmitFramePointers>
</OmitFramePointers>
<EnableFiberSafeOptimizations>
</EnableFiberSafeOptimizations>
</ClCompile>
<Link>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
</ItemDefinitionGroup>
<ItemGroup />
</Project>

View file

@ -1,31 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup
Condition="'$(YASMBeforeTargets)' == '' and '$(YASMAfterTargets)' == '' and '$(ConfigurationType)' != 'Makefile'">
<YASMBeforeTargets>Midl</YASMBeforeTargets>
<YASMAfterTargets>CustomBuild</YASMAfterTargets>
</PropertyGroup>
<PropertyGroup>
<YASMDependsOn
Condition="'$(ConfigurationType)' != 'Makefile'">_SelectedFiles;$(YASMDependsOn)</YASMDependsOn>
</PropertyGroup>
<!-- Object format name for vsyasm must be in lower case. -->
<PropertyGroup Condition="'$(Platform)' == 'Win32'">
<YASMFormat>win32</YASMFormat>
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)' == 'x64'">
<YASMFormat>win64</YASMFormat>
</PropertyGroup>
<ItemDefinitionGroup>
<YASM>
<Debug>False</Debug>
<ObjectFile>$(IntDir)</ObjectFile>
<PreProc>0</PreProc>
<Parser>0</Parser>
<CommandLineTemplate>vsyasm.exe -Xvc -f $(YASMFormat) [AllOptions] [AdditionalOptions] [Inputs]</CommandLineTemplate>
<Outputs>%(ObjectFile)</Outputs>
<ExecutionDescription>Assembling %(Filename)%(Extension)</ExecutionDescription>
<ShowOnlyRuleProperties>false</ShowOnlyRuleProperties>
</YASM>
</ItemDefinitionGroup>
</Project>

View file

@ -1,109 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<PropertyPageSchema
Include="$(MSBuildThisFileDirectory)$(MSBuildThisFileName).xml" />
<AvailableItemName
Include="YASM">
<Targets>_YASM</Targets>
</AvailableItemName>
</ItemGroup>
<UsingTask
TaskName="YASM"
TaskFactory="XamlTaskFactory"
AssemblyName="Microsoft.Build.Tasks.v4.0">
<Task>$(MSBuildThisFileDirectory)$(MSBuildThisFileName).xml</Task>
</UsingTask>
<Target
Name="_YASM"
BeforeTargets="$(YASMBeforeTargets)"
AfterTargets="$(YASMAfterTargets)"
Condition="'@(YASM)' != ''"
DependsOnTargets="$(YASMDependsOn);ComputeYASMOutput"
Outputs="@(YASM->'$(ProjectDir)$(IntDir)%(Filename).obj')"
Inputs="@(YASM);%(YASM.AdditionalDependencies);$(MSBuildProjectFile)">
<ItemGroup
Condition="'@(SelectedFiles)' != ''">
<YASM
Remove="@(YASM)"
Condition="'%(Identity)' != '@(SelectedFiles)'" />
</ItemGroup>
<ItemGroup>
<YASM_tlog
Include="%(YASM.ObjectFile)"
Condition="'%(YASM.ObjectFile)' != '' and '%(YASM.ExcludedFromBuild)' != 'true'">
<Source>@(YASM->'%(FullPath)', '|')</Source>
</YASM_tlog>
</ItemGroup>
<Message
Importance="High"
Text="%(YASM.ExecutionDescription)" />
<WriteLinesToFile
Condition="'@(YASM_tlog)' != '' and '%(YASM_tlog.ExcludedFromBuild)' != 'true'"
File="$(TLogLocation)$(ProjectName).write.1.tlog"
Lines="^%(YASM_tlog.Source);@(YASM->'$(ProjectDir)$(IntDir)%(Filename).obj')"
Encoding="Unicode" />
<YASM
Condition="'@(YASM)' != '' and '%(YASM.ExcludedFromBuild)' != 'true'"
CommandLineTemplate="%(YASM.CommandLineTemplate)"
Debug="%(YASM.Debug)"
PreIncludeFile="%(YASM.PreIncludeFile)"
IncludePaths="%(YASM.IncludePaths)"
Defines="%(YASM.Defines)"
UnDefines="%(YASM.UnDefines)"
ObjectFile="%(YASM.ObjectFile)"
ListFile="%(YASM.ListFile)"
MapFile="%(YASM.MapFile)"
ErrorFile="%(YASM.ErrorFile)"
SymbolPrefix="%(YASM.SymbolPrefix)"
SymbolSuffix="%(YASM.SymbolSuffix)"
PreProc="%(YASM.PreProc)"
Parser="%(YASM.Parser)"
AdditionalOptions="%(YASM.AdditionalOptions)"
Inputs="@(YASM)" />
</Target>
<PropertyGroup>
<ComputeLinkInputsTargets>
$(ComputeLinkInputsTargets);
ComputeYASMOutput;
</ComputeLinkInputsTargets>
<ComputeLibInputsTargets>
$(ComputeLibInputsTargets);
ComputeYASMOutput;
</ComputeLibInputsTargets>
</PropertyGroup>
<Target
Name="ComputeYASMOutput"
Condition="'@(YASM)' != ''">
<ItemGroup>
<YASMDirsToMake
Condition="'@(YASM)' != '' and '%(YASM.ExcludedFromBuild)' != 'true' and !HasTrailingSlash('%(YASM.ObjectFile)')"
Include="%(YASM.ObjectFile)" />
<Link
Include="%(YASMDirsToMake.Identity)"
Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" />
<Lib
Include="%(YASMDirsToMake.Identity)"
Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" />
<ImpLib
Include="%(YASMDirsToMake.Identity)"
Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" />
</ItemGroup>
<ItemGroup>
<YASMDirsToMake
Condition="'@(YASM)' != '' and '%(YASM.ExcludedFromBuild)' != 'true' and HasTrailingSlash('%(YASM.ObjectFile)')"
Include="@(YASM->'%(ObjectFile)%(Filename).obj')" />
<Link
Include="%(YASMDirsToMake.Identity)"
Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" />
<Lib
Include="%(YASMDirsToMake.Identity)"
Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" />
<ImpLib
Include="%(YASMDirsToMake.Identity)"
Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" />
</ItemGroup>
<MakeDir
Directories="@(YASMDirsToMake->'%(RootDir)%(Directory)')" />
</Target>
</Project>

View file

@ -1,283 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<ProjectSchemaDefinitions xmlns="clr-namespace:Microsoft.Build.Framework.XamlTypes;assembly=Microsoft.Build.Framework" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:sys="clr-namespace:System;assembly=mscorlib" xmlns:transformCallback="Microsoft.Cpp.Dev10.ConvertPropertyCallback">
<Rule
Name="YASM"
PageTemplate="tool"
DisplayName="Yasm Assembler"
Order="200">
<Rule.DataSource>
<DataSource
Persistence="ProjectFile"
ItemType="YASM" />
</Rule.DataSource>
<Rule.Categories>
<Category
Name="General">
<Category.DisplayName>
<sys:String>General</sys:String>
</Category.DisplayName>
</Category>
<Category
Name="Symbols">
<Category.DisplayName>
<sys:String>Symbols</sys:String>
</Category.DisplayName>
</Category>
<Category
Name="Files">
<Category.DisplayName>
<sys:String>Files</sys:String>
</Category.DisplayName>
</Category>
<Category
Name="Command Line"
Subtype="CommandLine">
<Category.DisplayName>
<sys:String>Command Line</sys:String>
</Category.DisplayName>
</Category>
</Rule.Categories>
<StringListProperty
Name="Inputs"
Category="Command Line"
IsRequired="true"
Switch=" ">
<StringListProperty.DataSource>
<DataSource
Persistence="ProjectFile"
ItemType="YASM"
SourceType="Item" />
</StringListProperty.DataSource>
</StringListProperty>
<BoolProperty
Name="Debug"
Subcategory="Configuration"
HelpContext="0"
DisplayName="Debug Information"
Description="Generate debugging information"
Switch="-g cv8" />
<StringListProperty
Name="IncludePaths"
Subcategory="Configuration"
HelpContext="0"
DisplayName="Include Paths"
Description="Set the paths for any additional include files"
Switch="-i &quot;[value]&quot;" />
<StringListProperty
Name="Defines"
Category="Symbols"
Subcategory="Pre-Defined Symbols"
HelpContext="0"
DisplayName="Defined Symbols"
Description="Specify pre-defined symbols ('symbol' or 'symbol = value') "
Switch="-d &quot;[value]&quot;" />
<StringListProperty
Name="UnDefines"
Category="Symbols"
Subcategory="Pre-Defined Symbols"
HelpContext="0"
DisplayName="Remove Symbols"
Description="Remove pre-defined symbols "
Switch="-u &quot;[value]&quot;" />
<StringProperty
Name="ObjectFile"
Subcategory="Output"
HelpContext="0"
DisplayName="Object File Name"
Description="Select the output file name"
Switch="-o &quot;[value]&quot;" />
<StringProperty
Name="ListFile"
Category="Files"
Subcategory="Output"
HelpContext="0"
DisplayName="List File Name"
Description="Select an output listing by setting its file name"
Switch="-l &quot;[value]&quot;" />
<StringProperty
Name="PreIncludeFile"
Category="Files"
Subcategory="Configuration"
HelpContext="0"
DisplayName="Pre Include File"
Description="Select a pre-included file by setting its name"
Switch="-P &quot;[value]&quot;" />
<StringProperty
Name="MapFile"
Category="Files"
Subcategory="Output"
HelpContext="0"
DisplayName="Map File Name"
Description="Select a map output by setting its file name"
Switch="--mapdir= &quot;[value]&quot;" />
<StringProperty
Name="ErrorFile"
Category="Files"
Subcategory="Output"
HelpContext="0"
DisplayName="Error File Name"
Description="Send error/warning messages to a file by setting its name"
Switch="-E &quot;[value]&quot;" />
<StringProperty
Name="SymbolPrefix"
Category="Symbols"
Subcategory="Symbols"
HelpContext="0"
DisplayName="External Symbol Prefix"
Description="Prepend symbol to all external symbols"
Switch="--prefix &quot;[value]&quot;" />
<StringProperty
Name="SymbolSuffix"
Category="Symbols"
Subcategory="Symbols"
HelpContext="0"
DisplayName="External Symbol Suffix"
Description="Append symbol to all external symbols"
Switch="--suffix &quot;[value]&quot;" />
<EnumProperty
Name="PreProc"
Subcategory="Configuration"
HelpContext="0"
DisplayName="Pre-Processor"
Description="Select the pre-processor ('nasm' or 'raw')">
<EnumValue
Name="0"
DisplayName="Nasm "
Switch="-rnasm" />
<EnumValue
Name="1"
DisplayName="Raw"
Switch="-rraw" />
</EnumProperty>
<EnumProperty
Name="Parser"
Subcategory="Configuration"
HelpContext="0"
DisplayName="Parser"
Description="Select the parser for Intel ('nasm') or AT&amp;T ( 'gas') syntax">
<EnumValue
Name="0"
DisplayName="Nasm"
Switch="-pnasm" />
<EnumValue
Name="1"
DisplayName="Gas"
Switch="-pgas" />
</EnumProperty>
<StringProperty
Name="CommandLineTemplate"
DisplayName="Command Line"
Visible="False"
IncludeInCommandLine="False" />
<DynamicEnumProperty
Name="YASMBeforeTargets"
Category="General"
EnumProvider="Targets"
IncludeInCommandLine="False">
<DynamicEnumProperty.DisplayName>
<sys:String>Execute Before</sys:String>
</DynamicEnumProperty.DisplayName>
<DynamicEnumProperty.Description>
<sys:String>Specifies the targets for the build customization to run before.</sys:String>
</DynamicEnumProperty.Description>
<DynamicEnumProperty.ProviderSettings>
<NameValuePair
Name="Exclude"
Value="^YASMBeforeTargets|^Compute" />
</DynamicEnumProperty.ProviderSettings>
<DynamicEnumProperty.DataSource>
<DataSource
Persistence="ProjectFile"
HasConfigurationCondition="true" />
</DynamicEnumProperty.DataSource>
</DynamicEnumProperty>
<DynamicEnumProperty
Name="YASMAfterTargets"
Category="General"
EnumProvider="Targets"
IncludeInCommandLine="False">
<DynamicEnumProperty.DisplayName>
<sys:String>Execute After</sys:String>
</DynamicEnumProperty.DisplayName>
<DynamicEnumProperty.Description>
<sys:String>Specifies the targets for the build customization to run after.</sys:String>
</DynamicEnumProperty.Description>
<DynamicEnumProperty.ProviderSettings>
<NameValuePair
Name="Exclude"
Value="^YASMAfterTargets|^Compute" />
</DynamicEnumProperty.ProviderSettings>
<DynamicEnumProperty.DataSource>
<DataSource
Persistence="ProjectFile"
ItemType=""
HasConfigurationCondition="true" />
</DynamicEnumProperty.DataSource>
</DynamicEnumProperty>
<StringListProperty
Name="Outputs"
DisplayName="Outputs"
Visible="False"
IncludeInCommandLine="False" />
<StringProperty
Name="ExecutionDescription"
DisplayName="Execution Description"
Visible="False"
IncludeInCommandLine="False" />
<StringListProperty
Name="AdditionalDependencies"
DisplayName="Additional Dependencies"
IncludeInCommandLine="False"
Visible="true" />
<StringProperty
Subtype="AdditionalOptions"
Name="AdditionalOptions"
Category="Command Line">
<StringProperty.DisplayName>
<sys:String>Additional Options</sys:String>
</StringProperty.DisplayName>
<StringProperty.Description>
<sys:String>Additional Options</sys:String>
</StringProperty.Description>
</StringProperty>
</Rule>
<ItemType
Name="YASM"
DisplayName="Yasm Assembler" />
<FileExtension
Name="*.asm"
ContentType="YASM" />
<ContentType
Name="YASM"
DisplayName="Yasm Assembler"
ItemType="YASM" />
</ProjectSchemaDefinitions>

View file

@ -37,7 +37,6 @@
#include "strategies/generic/picture-generic.h" #include "strategies/generic/picture-generic.h"
#include "strategies/sse2/picture-sse2.h" #include "strategies/sse2/picture-sse2.h"
#include "strategies/sse41/picture-sse41.h" #include "strategies/sse41/picture-sse41.h"
#include "strategies/x86_asm/picture-x86-asm.h"
#include "strategyselector.h" #include "strategyselector.h"
@ -93,9 +92,6 @@ int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) {
if (kvz_g_hardware_flags.intel_flags.sse41) { if (kvz_g_hardware_flags.intel_flags.sse41) {
success &= kvz_strategy_register_picture_sse41(opaque, bitdepth); success &= kvz_strategy_register_picture_sse41(opaque, bitdepth);
} }
if (kvz_g_hardware_flags.intel_flags.avx) {
success &= kvz_strategy_register_picture_x86_asm_avx(opaque, bitdepth);
}
if (kvz_g_hardware_flags.intel_flags.avx2) { if (kvz_g_hardware_flags.intel_flags.avx2) {
success &= kvz_strategy_register_picture_avx2(opaque, bitdepth); success &= kvz_strategy_register_picture_avx2(opaque, bitdepth);
} }

View file

@ -1,385 +0,0 @@
;/*****************************************************************************
; * This file is part of Kvazaar HEVC encoder.
; *
; * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
; * All rights reserved.
; *
; * Redistribution and use in source and binary forms, with or without modification,
; * are permitted provided that the following conditions are met:
; *
; * * Redistributions of source code must retain the above copyright notice, this
; * list of conditions and the following disclaimer.
; *
; * * Redistributions in binary form must reproduce the above copyright notice, this
; * list of conditions and the following disclaimer in the documentation and/or
; * other materials provided with the distribution.
; *
; * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
; * contributors may be used to endorse or promote products derived from
; * this software without specific prior written permission.
; *
; * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
; * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
; * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
; ****************************************************************************/
%include "x86inc.asm"
;The cglobal and RET macros come from x86inc.asm.
;They push and pop the necessary registers to/from
;the stack depending on the operating system's
;calling convention.
;Usage: cglobal name, %1, %2, %3
;%1: Number of arguments
;%2: Number of general-purpose registers used
;%3: Number of xmm registers used.
;More info in x86inc.asm
SECTION .text
;Set x86inc.asm macros to use AVX encodings and xmm registers
INIT_XMM avx
;KVZ_SAD_4X4
;Calculates the SAD of 16 consecutive bytes in memory,
;i.e. a 4x4 block of 8-bit pixels stored contiguously (no stride).
;Declared in picture-x86-asm-sad.h as kvz_sad_4x4_avx.
;r0 address of the first value (current frame)
;r1 address of the first value (reference frame)
;Result is returned in eax.
cglobal sad_4x4, 2, 2, 2
;Load 16 bytes of both frames
vmovdqu m0, [r0]
vmovdqu m1, [r1]
;Calculate SAD. vpsadbw writes the two 8-byte
;partial sums into m0[15:0] and m0[79:64]
vpsadbw m0, m1
;Sum the two partial results
vmovhlps m1, m0
vpaddw m0, m1
;Write the result to eax
vmovd eax, m0
RET
;KVZ_SAD_4X4_STRIDE
;Calculates the SAD of a 4x4 block inside a frame with stride
;r0 address of the first value (current)
;r1 address of the first value (reference)
;r2 stride (distance between rows, in bytes)
;Result is returned in eax.
cglobal sad_4x4_stride, 3, 3, 2
;Gather the four 4-byte rows of the current block
;into the four dwords of m0
vpinsrd m0, [r0], 0
add r0, r2
;r0 now points to row 1; rows 2 and 3 are reached
;via the addressing modes r0+r2 and r0+r2*2
vpinsrd m0, [r0], 1
vpinsrd m0, [r0+r2], 2
vpinsrd m0, [r0+r2*2], 3
;Gather the reference block the same way into m1
vpinsrd m1, [r1], 0
add r1, r2
vpinsrd m1, [r1], 1
vpinsrd m1, [r1+r2], 2
vpinsrd m1, [r1+r2*2], 3
;SAD of the gathered rows; two 8-byte partial sums
vpsadbw m0, m1
;Sum the partial results and return in eax
vmovhlps m1, m0
vpaddw m0, m1
vmovd eax, m0
RET
;KVZ_SAD_8X8
;Calculates the SAD of 64 consecutive bytes in memory,
;i.e. an 8x8 block of 8-bit pixels stored contiguously (no stride).
;r0 address of the first value (current)
;r1 address of the first value (reference)
;Result is returned in eax.
cglobal sad_8x8, 2, 2, 5
;Load the first half (32 bytes) of both frames
vmovdqu m0, [r0]
vmovdqu m2, [r0+16]
vmovdqu m1, [r1]
vmovdqu m3, [r1+16]
;Calculate SADs of both 16-byte chunks
vpsadbw m0, m1
vpsadbw m2, m3
;Sum the partial results
vpaddw m0, m2
;Repeat for the latter half
vmovdqu m1, [r0+16*2]
vmovdqu m3, [r0+16*3]
vmovdqu m2, [r1+16*2]
vmovdqu m4, [r1+16*3]
vpsadbw m1, m2
vpsadbw m3, m4
vpaddw m1, m3
;Sum all the partial SADs and fold the high lane
vpaddw m0, m1
vmovhlps m1, m0
vpaddw m0, m1
vmovd eax, m0
RET
;KVZ_SAD_8X8_STRIDE
;Calculates the SAD of an 8x8 block inside a frame with stride
;r0 address of the first value (current)
;r1 address of the first value (reference)
;r2 stride (distance between rows, in bytes)
;Result is returned in eax.
cglobal sad_8x8_stride, 3, 3, 5
;Zero the accumulator register
vpxor m0, m0
;Each xmm register holds two 8-byte rows: one in the
;high 64 bits, one in the low 64 bits.
;Load rows 0-3 of the current frame into m1 and m3.
;Load to the high 64 bits of xmm
vmovhpd m1, [r0]
add r0, r2
;Load to the low 64 bits
vmovlpd m1, [r0]
vmovhpd m3, [r0+r2]
vmovlpd m3, [r0+r2*2]
;lea calculates the address into r0
;but doesn't load anything from
;the memory. Together with the add it is
;equivalent to three add r0, r2 instructions.
lea r0, [r0+r2*2]
add r0, r2
;Rows 0-3 of the reference frame into m2 and m4
vmovhpd m2, [r1]
add r1, r2
vmovlpd m2, [r1]
vmovhpd m4, [r1+r2]
vmovlpd m4, [r1+r2*2]
lea r1, [r1+r2*2]
add r1, r2
;SADs of rows 0-3, accumulated into m0
vpsadbw m1, m2
vpsadbw m3, m4
vpaddw m0, m1
vpaddw m0, m3
;Repeat for rows 4-7
vmovhpd m1, [r0]
add r0, r2
vmovlpd m1, [r0]
vmovhpd m3, [r0+r2]
vmovlpd m3, [r0+r2*2]
lea r0, [r0+r2*2]
add r0, r2
vmovhpd m2, [r1]
add r1, r2
vmovlpd m2, [r1]
vmovhpd m4, [r1+r2]
vmovlpd m4, [r1+r2*2]
lea r1, [r1+r2*2]
add r1, r2
vpsadbw m1, m2
vpsadbw m3, m4
vpaddw m0, m1
vpaddw m0, m3
;Fold the high-lane partial sum and return in eax
vmovhlps m1, m0
vpaddw m0, m1
vmovd eax, m0
RET
;KVZ_SAD_16X16
;Calculates the SAD of 256 consecutive bytes in memory,
;i.e. a 16x16 block of 8-bit pixels stored contiguously (no stride).
;r0 address of the first value (current)
;r1 address of the first value (reference)
;Result is returned in eax.
cglobal sad_16x16, 2, 2, 5
;Zero the accumulator register m4
vpxor m4, m4
%assign i 0
;Fully unrolled: 8 iterations, two 16-byte rows each.
%rep 8
;Load the next two rows of the current frame
vmovdqu m0, [r0 + 16 * i]
vmovdqu m2, [r0 + 16 * (i + 1)]
;Load the next two rows of the reference frame
vmovdqu m1, [r1 + 16 * i]
vmovdqu m3, [r1 + 16 * (i + 1)]
vpsadbw m0, m1
vpsadbw m2, m3
;Accumulate SADs to m4
vpaddw m4, m0
vpaddw m4, m2
%assign i i+2
%endrep
;Fold the high-lane partial sum to get the final result
vmovhlps m0, m4
vpaddw m4, m0
vmovd eax, m4
RET
;KVZ_SAD_16X16_STRIDE
;Calculates the SAD of a 16x16 block inside a frame with stride
;r0 address of the first value (current)
;r1 address of the first value (reference)
;r2 stride (distance between rows, in bytes)
;Result is returned in eax.
cglobal sad_16x16_stride, 3, 3, 5
;Zero the accumulator register
vpxor m4, m4
;Fully unrolled: 8 iterations, two rows each
%rep 8
; Load the next 2 rows from rec_buf to m0 and m2
vmovdqu m0, [r0]
vmovdqu m2, [r0 + r2]
lea r0, [r0 + r2*2]
; Load the next 2 rows from ref_buf to m1 and m3
vmovdqu m1, [r1]
vmovdqu m3, [r1 + r2]
lea r1, [r1 + r2*2]
vpsadbw m0, m1
vpsadbw m2, m3
;Accumulate partial SADs into m4
vpaddw m4, m0
vpaddw m4, m2
%endrep
;Fold the high-lane partial sum and return in eax
vmovhlps m0, m4
vpaddw m4, m0
vmovd eax, m4
RET
;KVZ_SAD_32x32_STRIDE
;Calculates the SAD of a 32x32 block inside a frame with stride
;r0 address of the first value (current)
;r1 address of the first value (reference)
;r2 stride (distance between rows, in bytes)
;Result is returned in eax.
cglobal sad_32x32_stride, 3, 3, 5
;Zero the accumulator register
vpxor m4, m4
; Handle 2 lines per iteration; each 32-byte row is
; processed as two 16-byte chunks.
%rep 16
vmovdqu m0, [r0]
vmovdqu m1, [r0 + 16]
vmovdqu m2, [r0 + r2]
vmovdqu m3, [r0 + r2 + 16]
lea r0, [r0 + 2 * r2]
;vpsadbw with memory source: reference rows are not
;loaded into registers first
vpsadbw m0, [r1]
vpsadbw m1, [r1 + 16]
vpsadbw m2, [r1 + r2]
vpsadbw m3, [r1 + r2 + 16]
lea r1, [r1 + 2 * r2]
;Accumulate with dword adds (sums can exceed 16 bits here)
vpaddd m4, m0
vpaddd m4, m1
vpaddd m4, m2
vpaddd m4, m3
%endrep
;Fold the high-lane partial sum and return in eax
vmovhlps m0, m4
vpaddd m4, m0
vmovd eax, m4
RET
;KVZ_SAD_64x64_STRIDE
;Calculates the SAD of a 64x64 block inside a frame with stride
;r0 address of the first value (current)
;r1 address of the first value (reference)
;r2 stride (distance between rows, in bytes)
;r3 loop counter (scratch)
;Result is returned in eax.
cglobal sad_64x64_stride, 3, 4, 5
vpxor m4, m4 ; sum accumulation register
mov r3, 4 ; number of iterations in the loop
;NOTE(review): this label is not an x86inc-style local
;label (.loop); presumably safe because the name is unique
;in the build — confirm before reusing this pattern.
Process16Lines:
; Intel optimization manual says to not unroll beyond 500 instructions.
; Didn't seem to have much of an effect on Ivy Bridge or Haswell, but
; smaller is better, when speed is the same, right?
%rep 16
;One full 64-byte row per iteration, as four 16-byte chunks
vmovdqu m0, [r0]
vmovdqu m1, [r0 + 1*16]
vmovdqu m2, [r0 + 2*16]
vmovdqu m3, [r0 + 3*16]
vpsadbw m0, [r1]
vpsadbw m1, [r1 + 1*16]
vpsadbw m2, [r1 + 2*16]
vpsadbw m3, [r1 + 3*16]
;Advance both pointers to the next row
lea r0, [r0 + r2]
lea r1, [r1 + r2]
;Accumulate with dword adds (sums can exceed 16 bits here)
vpaddd m4, m0
vpaddd m4, m1
vpaddd m4, m2
vpaddd m4, m3
%endrep
;Loop 4 times: 4 * 16 unrolled lines = 64 rows total
dec r3
jnz Process16Lines
;Fold the high-lane partial sum and return in eax
vmovhlps m0, m4
vpaddd m4, m0
vmovd eax, m4
RET

View file

@ -1,56 +0,0 @@
#ifndef _PICTURE_X86_ASM_SAD_H_
#define _PICTURE_X86_ASM_SAD_H_
/*****************************************************************************
* This file is part of Kvazaar HEVC encoder.
*
* Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
****************************************************************************/
/**
* \ingroup Optimization
* \file
* Optimizations for AVX, utilizing ASM implementations.
* Declares the SAD routines implemented in hand-written
* yasm assembly (picture-x86-asm-sad.asm).
*/
#include "global.h" // IWYU pragma: keep
#include "kvazaar.h"
/* The assembly implementations handle 8-bit pixels only. */
#if KVZ_BIT_DEPTH == 8
/* SAD of blocks stored contiguously in memory (no stride). */
unsigned kvz_sad_4x4_avx(const uint8_t*, const uint8_t*);
unsigned kvz_sad_8x8_avx(const uint8_t*, const uint8_t*);
unsigned kvz_sad_16x16_avx(const uint8_t*, const uint8_t*);
/* SAD of blocks inside a frame; stride is the distance between rows in bytes. */
unsigned kvz_sad_4x4_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride);
unsigned kvz_sad_8x8_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride);
unsigned kvz_sad_16x16_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride);
unsigned kvz_sad_32x32_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride);
unsigned kvz_sad_64x64_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride);
#endif // KVZ_BIT_DEPTH == 8
#endif

View file

@ -1,575 +0,0 @@
;/*****************************************************************************
; * This file is part of Kvazaar HEVC encoder.
; *
; * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
; * All rights reserved.
; *
; * Redistribution and use in source and binary forms, with or without modification,
; * are permitted provided that the following conditions are met:
; *
; * * Redistributions of source code must retain the above copyright notice, this
; * list of conditions and the following disclaimer.
; *
; * * Redistributions in binary form must reproduce the above copyright notice, this
; * list of conditions and the following disclaimer in the documentation and/or
; * other materials provided with the distribution.
; *
; * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
; * contributors may be used to endorse or promote products derived from
; * this software without specific prior written permission.
; *
; * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
; * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
; * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
; ****************************************************************************/
%include "x86inc.asm"
;cglobal and RET macros are from the x86.inc
;they push and pop the necessary registers to
;stack depending on the operating system
;Usage: cglobal name, %1, %2, %3
;1%: Number of arguments
;2%: Number of registers used
;3%: Number of xmm registers used.
;More info in x86inc.asm
SECTION .text
;Set x86inc.asm macros to use avx and xmm registers
INIT_XMM avx
;KVZ_ZERO_EXTEND_WD
;Zero extend the 8 packed words in one xmm register to dwords
;spread across 2 xmm registers.
;%1 source register (8 x u16)
;%2 destination for the low 4 words  (4 x u32); %2 may alias %1
;%3 destination for the high 4 words (4 x u32); %3 must NOT alias %1,
;   because the high half is extracted before the low half is extended
%macro KVZ_ZERO_EXTEND_WD 3
;Zero extend high 64 bits
;(2-operand vmovhlps form is expanded to the 3-operand AVX
; encoding by x86inc.asm)
vmovhlps %3, %1
vpmovzxwd %3, %3
;Zero extend low 64 bits
vpmovzxwd %2, %1
%endmacro ; KVZ_ZERO_EXTEND_WD
; Use nondestructive horizontal add and sub to calculate both at the same time.
; Performs the three butterfly stages of an 8-point Hadamard transform
; horizontally across the two input rows. Because vphaddw/vphsubw operate
; on adjacent word pairs of BOTH sources, the transforms of the two rows
; come out interleaved across the two output registers.
; TODO: It would probably be possible to do this with 3 registers (destructive vphsubw).
; args:
; 1, 2: input registers (two rows of 8 words each); clobbered
; 3, 4: output registers; must not alias the inputs
%macro SATD_HORIZONTAL_SUB_AND_ADD 4
; TODO: It might be possible to do this with 3 registers?
;First stage
vphaddw %3, %1, %2
vphsubw %4, %1, %2
;Second stage
vphaddw %1, %3, %4
vphsubw %2, %3, %4
;Third stage
vphaddw %3, %1, %2
vphsubw %4, %1, %2
%endmacro ; SATD_HORIZONTAL_SUB_AND_ADD
;KVZ_SATD_8X8_STRIDE
;Calculates SATD of one 8x8 block pair inside frames with stride.
;In:  r0 = address of the first byte of block 1
;     r1 = row stride (bytes) of block 1
;     r2 = address of the first byte of block 2
;     r3 = row stride (bytes) of block 2
;Out: r4 = SATD value ((sum + 2) >> 2)
;Side effects: r0 and r2 are advanced 6 rows; on 32-bit builds 64 bytes
;of stack are used transiently (restored before the macro ends).
;(The previous header comment documented the register roles incorrectly.)
%macro KVZ_SATD_8X8_STRIDE 0
;Calculate differences of the 8 rows into
;registers m0-m7
vpmovzxbw m0, [r0]
vpmovzxbw m7, [r2]
vpsubw m0, m7
vpmovzxbw m1, [r0+r1]
vpmovzxbw m7, [r2+r3]
vpsubw m1, m7
;Set r0 and r2 2 rows forward
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
vpmovzxbw m2, [r0]
vpmovzxbw m7, [r2]
vpsubw m2, m7
vpmovzxbw m3, [r0+r1]
vpmovzxbw m7, [r2+r3]
vpsubw m3, m7
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
vpmovzxbw m4, [r0]
vpmovzxbw m7, [r2]
vpsubw m4, m7
vpmovzxbw m5, [r0+r1]
vpmovzxbw m7, [r2+r3]
vpsubw m5, m7
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
vpmovzxbw m6, [r0]
vpmovzxbw m7, [r2]
vpsubw m6, m7
;32-bit AVX doesn't have registers
;xmm8-xmm15, use stack instead
%if ARCH_X86_64
vpmovzxbw m7, [r0+r1]
vpmovzxbw m8, [r2+r3]
vpsubw m7, m8
%else
%define temp0 esp+16*3
%define temp1 esp+16*2
%define temp2 esp+16*1
%define temp3 esp+16*0
;Reserve memory for 4 x 128 bits.
sub esp, 16*4
vpmovzxbw m7, [r2+r3]
vmovdqu [temp0], m7
vpmovzxbw m7, [r0+r1]
vpsubw m7, [temp0]
;Put rows 5-8 to stack
vmovdqu [temp0], m4
vmovdqu [temp1], m5
vmovdqu [temp2], m6
vmovdqu [temp3], m7
%endif
;Hadamard transform (FWHT algorithm)
;Horizontal transform
%if ARCH_X86_64
;Calculate horizontal transform for each row.
;Transforms of two rows are interleaved in register pairs.
;(m8 and m9, m10 and m11,...)
SATD_HORIZONTAL_SUB_AND_ADD m0, m1, m8, m9
SATD_HORIZONTAL_SUB_AND_ADD m2, m3, m10, m11
SATD_HORIZONTAL_SUB_AND_ADD m4, m5, m12, m13
SATD_HORIZONTAL_SUB_AND_ADD m6, m7, m14, m15
%else
;Calculate horizontal transforms for the first four rows.
;Then load the other four into the registers and store
;ready transforms in the stack.
;Input registers are m0-m3, results are written in
;registers m4-m7 (and memory).
SATD_HORIZONTAL_SUB_AND_ADD m0, m1, m4, m5
SATD_HORIZONTAL_SUB_AND_ADD m2, m3, m6, m7
vmovdqu m3, [temp3]
vmovdqu m2, [temp2]
vmovdqu m1, [temp1]
vmovdqu m0, [temp0]
vmovdqu [temp3], m7
vmovdqu [temp2], m6
vmovdqu [temp1], m5
vmovdqu [temp0], m4
SATD_HORIZONTAL_SUB_AND_ADD m0, m1, m4, m5
SATD_HORIZONTAL_SUB_AND_ADD m2, m3, m6, m7
%endif
;Vertical transform
;Transform columns of the 8x8 block.
;First sum the interleaved horizontally
;transformed values with one horizontal add
;for each pair of rows. Then calculate
;with regular packed additions and
;subtractions.
%if ARCH_X86_64
;Horizontally transformed values are in registers m8-m15
;Results are written in m0-m7
;First stage
vphaddw m0, m8, m9
vphsubw m1, m8, m9
vphaddw m2, m10, m11
vphsubw m3, m10, m11
vphaddw m4, m12, m13
vphsubw m5, m12, m13
vphaddw m6, m14, m15
vphsubw m7, m14, m15
;Second stage
vpaddw m8, m0, m2
vpaddw m9, m1, m3
vpsubw m10, m0, m2
vpsubw m11, m1, m3
vpaddw m12, m4, m6
vpaddw m13, m5, m7
vpsubw m14, m4, m6
vpsubw m15, m5, m7
;Third stage
vpaddw m0, m8, m12
vpaddw m1, m9, m13
vpaddw m2, m10, m14
vpaddw m3, m11, m15
vpsubw m4, m8, m12
vpsubw m5, m9, m13
vpsubw m6, m10, m14
vpsubw m7, m11, m15
%else
;Transformed values are in registers m4-m7
;and in memory(temp0-temp3). Transformed values
;are written in m4-m7. Also calculate absolute
;values for them and accumulate into ymm0.
;First stage
vphaddw m0, m4, m5
vphsubw m1, m4, m5
vphaddw m2, m6, m7
vphsubw m3, m6, m7
;Second stage
vpaddw m4, m0, m2
vpaddw m5, m1, m3
vpsubw m6, m0, m2
vpsubw m7, m1, m3
vmovdqu m3, [temp3]
vmovdqu m2, [temp2]
vmovdqu m1, [temp1]
vmovdqu m0, [temp0]
vmovdqu [temp3], m7
vmovdqu [temp2], m6
vmovdqu [temp1], m5
vmovdqu [temp0], m4
;First stage (second half)
vphaddw m4, m0, m1
vphsubw m5, m0, m1
vphaddw m6, m2, m3
vphsubw m7, m2, m3
;Second stage (second half)
vpaddw m0, m4, m6
vpaddw m1, m5, m7
vpsubw m2, m4, m6
vpsubw m3, m5, m7
;Third stage
vpaddw m4, m0, [temp0]
vpaddw m5, m1, [temp1]
vpsubw m6, m0, [temp0]
vpsubw m7, m1, [temp1]
;Calculate the absolute values and
;zero extend 16-bit values to 32-bit
;values. Then sum the values.
vpabsw m4, m4
KVZ_ZERO_EXTEND_WD m4, m4, m1
vpaddd m4, m1
vpabsw m5, m5
KVZ_ZERO_EXTEND_WD m5, m5, m1
vpaddd m5, m1
vpabsw m6, m6
KVZ_ZERO_EXTEND_WD m6, m6, m1
vpaddd m6, m1
vpabsw m7, m7
KVZ_ZERO_EXTEND_WD m7, m7, m1
vpaddd m7, m1
vpaddd m0, m4, m5
vpaddd m0, m6
vpaddd m0, m7
;Repeat for the rest
vpaddw m4, m2, [temp2]
vpaddw m5, m3, [temp3]
vpsubw m6, m2, [temp2]
vpsubw m7, m3, [temp3]
vpabsw m4, m4
KVZ_ZERO_EXTEND_WD m4, m4, m1
vpaddd m4, m1
vpabsw m5, m5
KVZ_ZERO_EXTEND_WD m5, m5, m1
vpaddd m5, m1
vpabsw m6, m6
KVZ_ZERO_EXTEND_WD m6, m6, m1
vpaddd m6, m1
vpabsw m7, m7
KVZ_ZERO_EXTEND_WD m7, m7, m1
vpaddd m7, m1
;Sum the other half of the packed results to ymm4
vpaddd m4, m5
vpaddd m4, m6
vpaddd m4, m7
;Sum all packed results to ymm0
vpaddd m0, m4
%endif
%if ARCH_X86_64
;Calculate the absolute values and
;zero extend 16-bit values to 32-bit
;values: each register ends up holding
;abs(low half) + abs(high half) as dwords.
vpabsw m0, m0
KVZ_ZERO_EXTEND_WD m0, m0, m8
vpaddd m0, m8
vpabsw m1, m1
KVZ_ZERO_EXTEND_WD m1, m1, m8
vpaddd m1, m8
vpabsw m2, m2
KVZ_ZERO_EXTEND_WD m2, m2, m8
;FIX: was "vpaddd m1, m8" -- row 2's zero-extended high half was
;accumulated into m1 instead of m2. The grand total was unaffected
;because m1 and m2 are both folded into m0 below, but the per-register
;partial sums were wrong and the asymmetry was a latent trap.
vpaddd m2, m8
vpabsw m3, m3
KVZ_ZERO_EXTEND_WD m3, m3, m8
vpaddd m3, m8
vpabsw m4, m4
KVZ_ZERO_EXTEND_WD m4, m4, m8
vpaddd m4, m8
vpabsw m5, m5
KVZ_ZERO_EXTEND_WD m5, m5, m8
vpaddd m5, m8
vpabsw m6, m6
KVZ_ZERO_EXTEND_WD m6, m6, m8
vpaddd m6, m8
vpabsw m7, m7
KVZ_ZERO_EXTEND_WD m7, m7, m8
vpaddd m7, m8
;Calculate packed sum of transformed values to ymm0
vpaddd m0, m1
vpaddd m0, m2
vpaddd m0, m3
vpaddd m0, m4
vpaddd m0, m5
vpaddd m0, m6
vpaddd m0, m7
%endif
;Sum the packed values to m0[32:0]
vphaddd m0, m0
vphaddd m0, m0
;The result is in the lowest 32 bits in m0
vmovd r4d, m0
;8x8 Hadamard transform requires
;adding 2 and dividing by 4
add r4, 2
shr r4, 2
;Zero high 128 bits of ymm registers to
;prevent AVX-SSE transition penalty.
vzeroupper
%if ARCH_X86_64 == 0
add esp, 16*4
%endif
%endmacro ; KVZ_SATD_8X8_STRIDE
;KVZ_SATD_4X4
;unsigned kvz_satd_4x4_avx(const kvz_pixel *org, const kvz_pixel *cur)
;Calculates SATD of a dense 4x4 block: 16 consecutive bytes per buffer
;(no row stride).
;r0 address of the first value(current)
;r1 address of the first value(reference)
;Returns the SATD in eax.
cglobal satd_4x4, 2, 2, 6
;Load 8 bytes from memory and zero extend
;to 16-bit values. Calculate difference.
;m0 = rows 0-1 diffs, m1 = rows 2-3 diffs (8 words each).
vpmovzxbw m0, [r0]
vpmovzxbw m2, [r1]
vpsubw m0, m2
vpmovzxbw m1, [r0+8]
vpmovzxbw m3, [r1+8]
vpsubw m1, m3
;Hadamard transform (4-point: two butterfly stages per direction)
;Horizontal phase
;First stage
vphaddw m4, m0, m1
vphsubw m5, m0, m1
;Second stage
vphaddw m0, m4, m5
vphsubw m1, m4, m5
;Vertical phase
;First stage
vphaddw m4, m0, m1
vphsubw m5, m0, m1
;Second stage
vphaddw m0, m4, m5
vphsubw m1, m4, m5
;Calculate absolute values
vpabsw m0, m0
vpabsw m1, m1
;Sum all the transformed values
;(three horizontal adds reduce the 8 words of m0 to one)
vpaddw m0, m1
vphaddw m0, m0
vphaddw m0, m0
vphaddw m0, m0
;Extract the lowest 16 bits of m0
;into eax (the 4x4 sum fits in 16 bits: 16 * 255 * 4 < 2^16 * ... ;
;max per-word magnitude keeps the total within range -- TODO confirm)
vpextrw eax, m0, 0
;4x4 Hadamard transform requires
;addition of 1 and division by 2
add eax, 1
shr eax, 1
RET
;KVZ_SATD_8X8
;unsigned kvz_satd_8x8_avx(const kvz_pixel *org, const kvz_pixel *cur)
;Calculates SATD of a dense 8x8 block: both buffers hold 64 contiguous
;bytes, so the row stride is the block width (8).
;(Note: this entry point takes only 2 pointer arguments; the previous
;comment incorrectly documented an "r2 stride" argument.)
;In:  r0 = org, r1 = cur
;Out: rax = SATD
;cglobal declares 4 args / 5 GPRs so r2-r4 are available as scratch
;for KVZ_SATD_8X8_STRIDE.
%if ARCH_X86_64
cglobal satd_8x8, 4, 5, 16
%else
cglobal satd_8x8, 4, 5, 8
%endif
;Set arguments: r2 = second pointer, r1 = r3 = stride of 8 bytes
mov r2, r1
mov r1, 8
mov r3, 8
;Calculate 8x8 SATD. Result is written
;in the register r4.
KVZ_SATD_8X8_STRIDE
mov rax, r4
RET
;KVZ_SATD_NXN
;unsigned kvz_satd_%1x%1_avx(const kvz_pixel *org, const kvz_pixel *cur)
;Calculates SATD of a dense NxN block (contiguous buffers, row stride N)
;as the sum of SATDs of its 8x8 sub-blocks.
;%1 block dimension N (must be a multiple of 8)
;r0 address of the first value(reference)
;r1 address of the first value(current)
;Returns the accumulated SATD in rax.
%macro KVZ_SATD_NXN 1
%if ARCH_X86_64
cglobal satd_%1x%1, 2, 7, 16
%else
cglobal satd_%1x%1, 2, 7, 8
%endif
;Set arguments: r2 = cur pointer, r1 = r3 = row stride = N
mov r2, r1
mov r1, %1
mov r3, %1
;Zero r5 and r6 (r5 = y counter, r6 = SATD accumulator)
xor r5, r5
xor r6, r6
;Calculate SATDs of each 8x8 sub-blocks
;and accumulate the results in r6. Repeat yloop
;N times. Repeat xloop N times. r4 and r5 are counters
;for the loops (both step by 8).
.yloop
;zero r4
xor r4, r4
.xloop
;r4 doubles as the x counter and the macro's result register,
;so preserve the counter across the macro call.
push r4
;Calculate SATD of the sub-block. Result is
;written in the register r4.
;(The macro also advances r0/r2 by 6 rows = 6*%1 bytes.)
KVZ_SATD_8X8_STRIDE
add r6, r4
;Set r2 and r0 to the next sub-block
;on the same row: undo the 6-row advance (-6*%1) and
;step 8 columns right (+8), i.e. subtract 6*%1-8.
sub r2, 6*%1-8
sub r0, 6*%1-8
pop r4
add r4, 8
cmp r4, %1
jne .xloop
;Set r2 and r0 to the first sub-block
;on the next row(of 8x8 sub-blocks):
;after the x loop the pointers sit %1 bytes past the row start,
;so add 8*%1 - %1 = 7*%1 to reach the next block-row.
add r2, 7*%1
add r0, 7*%1
add r5, 8
cmp r5, %1
jne .yloop
mov rax, r6
RET
%endmacro ; KVZ_SATD_NXN
;Instantiate the exported entry points.
KVZ_SATD_NXN 16
KVZ_SATD_NXN 32
KVZ_SATD_NXN 64

View file

@ -1,50 +0,0 @@
#ifndef _PICTURE_X86_ASM_SATD_H_
#define _PICTURE_X86_ASM_SATD_H_
/*****************************************************************************
* This file is part of uvg266 VVC encoder.
*
* Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
****************************************************************************/
/**
* \ingroup Optimization
* \file
* Optimizations for AVX, utilizing ASM implementations.
*/
#include "global.h" // IWYU pragma: keep
// Hand-written AVX assembly SATD kernels for dense NxN blocks:
// both buffers hold the block contiguously (N*N pixels, row stride N).
unsigned kvz_satd_4x4_avx(const kvz_pixel *org, const kvz_pixel *cur);
unsigned kvz_satd_8x8_avx(const kvz_pixel *org, const kvz_pixel *cur);
unsigned kvz_satd_16x16_avx(const kvz_pixel *org, const kvz_pixel *cur);
unsigned kvz_satd_32x32_avx(const kvz_pixel *org, const kvz_pixel *cur);
unsigned kvz_satd_64x64_avx(const kvz_pixel *org, const kvz_pixel *cur);
#endif

View file

@ -1,132 +0,0 @@
/*****************************************************************************
* This file is part of uvg266 VVC encoder.
*
* Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
****************************************************************************/
#include "strategies/x86_asm/picture-x86-asm.h"
#if defined(KVZ_COMPILE_ASM)
#include "kvazaar.h"
#if KVZ_BIT_DEPTH == 8
#include <stdlib.h>
#include "strategies/x86_asm/picture-x86-asm-sad.h"
#include "strategies/x86_asm/picture-x86-asm-satd.h"
#include "strategies/sse41/picture-sse41.h"
#include "strategyselector.h"
/**
 * \brief SAD of a dense 32x32 block, built from the 16x16 asm kernel.
 *
 * A dense 32x32 block is 1024 contiguous bytes; each consecutive
 * 256-byte quarter is fed to the dense 16x16 kernel.
 */
static unsigned kvz_sad_32x32_avx(const uint8_t *data1, const uint8_t *data2)
{
  unsigned sad = 0;
  for (int quarter = 0; quarter < 4; ++quarter) {
    const unsigned offset = quarter * 16 * 16;
    sad += kvz_sad_16x16_avx(data1 + offset, data2 + offset);
  }
  return sad;
}
/**
 * \brief SAD of a dense 64x64 block, built from the dense 32x32 helper.
 *
 * A dense 64x64 block is 4096 contiguous bytes; each consecutive
 * 1024-byte quarter is a dense 32x32 block.
 */
static unsigned kvz_sad_64x64_avx(const uint8_t *data1, const uint8_t *data2)
{
  unsigned sad = 0;
  for (int quarter = 0; quarter < 4; ++quarter) {
    const unsigned offset = quarter * 32 * 32;
    sad += kvz_sad_32x32_avx(data1 + offset, data2 + offset);
  }
  return sad;
}
/**
 * \brief Scalar SAD fallback for block sizes with no dedicated kernel.
 *
 * Both buffers are read with the SAME row stride.
 *
 * \param data1   first block
 * \param data2   second block
 * \param width   block width in pixels
 * \param height  block height in pixels
 * \param stride  row stride (in pixels) of both buffers
 * \return sum of absolute differences
 */
static unsigned kvz_sad_other_avx(const uint8_t *data1, const uint8_t *data2,
                                  int width, int height,
                                  unsigned stride)
{
  unsigned sad = 0;
  for (int row = 0; row < height; ++row) {
    const uint8_t *row1 = data1 + (size_t)row * stride;
    const uint8_t *row2 = data2 + (size_t)row * stride;
    for (int col = 0; col < width; ++col) {
      const int diff = (int)row1[col] - (int)row2[col];
      sad += (unsigned)(diff < 0 ? -diff : diff);
    }
  }
  return sad;
}
/**
 * \brief General SAD entry point: dispatch to the fastest suitable kernel.
 *
 * \param data1    first block
 * \param data2    second block
 * \param width    block width in pixels
 * \param height   block height in pixels
 * \param stride1  row stride (in pixels) of data1
 * \param stride2  row stride (in pixels) of data2
 * \return sum of absolute differences
 */
static unsigned reg_sad_x86_asm(const uint8_t *data1, const uint8_t * data2,
                                const int width, const int height,
                                const unsigned stride1, const unsigned stride2)
{
  // The dedicated asm kernels take a single stride, so they are only
  // valid when both buffers share the same stride. BUGFIX: previously
  // stride1 was passed unconditionally, silently computing a wrong SAD
  // whenever stride2 differed; such calls now fall through to the
  // general two-stride SSE4.1 path.
  if (width == height && stride1 == stride2) {
    if (width == 8) {
      return kvz_sad_8x8_stride_avx(data1, data2, stride1);
    } else if (width == 16) {
      return kvz_sad_16x16_stride_avx(data1, data2, stride1);
    } else if (width == 32) {
      return kvz_sad_32x32_stride_avx(data1, data2, stride1);
    } else if (width == 64) {
      return kvz_sad_64x64_stride_avx(data1, data2, stride1);
    }
  }

  if (width * height >= 16) {
    // Call the vectorized general SAD SSE41 function when the block
    // is big enough to make it worth it.
    return kvz_reg_sad_sse41(data1, data2, width, height, stride1, stride2);
  } else {
    // Tiny blocks: scalar fallback. NOTE(review): uses stride1 for both
    // buffers, preserving the original behavior -- confirm that callers
    // guarantee equal strides for sub-16-pixel blocks.
    return kvz_sad_other_avx(data1, data2, width, height, stride1);
  }
}
#endif // KVZ_BIT_DEPTH == 8
#endif //defined(KVZ_COMPILE_ASM)
/**
 * \brief Register the hand-written x86 AVX asm picture strategies.
 *
 * Only registers anything when the build enables KVZ_COMPILE_ASM,
 * KVZ_BIT_DEPTH is 8, and the encoder is running at 8-bit depth;
 * otherwise this is a no-op that reports success.
 *
 * \param opaque    strategy-selector context passed through to
 *                  kvz_strategyselector_register
 * \param bitdepth  runtime bit depth; asm kernels are 8-bit only
 * \return 1 on success (all registrations succeeded), 0 otherwise
 */
int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth)
{
  bool success = true;
#if defined(KVZ_COMPILE_ASM)
#if KVZ_BIT_DEPTH == 8
  if (bitdepth == 8){
    // Priority 30 places these above plain-C but below intrinsics-based
    // implementations -- TODO confirm against the other strategy files.
    success &= kvz_strategyselector_register(opaque, "reg_sad", "x86_asm_avx", 30, &reg_sad_x86_asm);
    success &= kvz_strategyselector_register(opaque, "sad_4x4", "x86_asm_avx", 30, &kvz_sad_4x4_avx);
    success &= kvz_strategyselector_register(opaque, "sad_8x8", "x86_asm_avx", 30, &kvz_sad_8x8_avx);
    success &= kvz_strategyselector_register(opaque, "sad_16x16", "x86_asm_avx", 30, &kvz_sad_16x16_avx);
    success &= kvz_strategyselector_register(opaque, "sad_32x32", "x86_asm_avx", 30, &kvz_sad_32x32_avx);
    success &= kvz_strategyselector_register(opaque, "sad_64x64", "x86_asm_avx", 30, &kvz_sad_64x64_avx);
    success &= kvz_strategyselector_register(opaque, "satd_4x4", "x86_asm_avx", 30, &kvz_satd_4x4_avx);
    success &= kvz_strategyselector_register(opaque, "satd_8x8", "x86_asm_avx", 30, &kvz_satd_8x8_avx);
    success &= kvz_strategyselector_register(opaque, "satd_16x16", "x86_asm_avx", 30, &kvz_satd_16x16_avx);
    success &= kvz_strategyselector_register(opaque, "satd_32x32", "x86_asm_avx", 30, &kvz_satd_32x32_avx);
    success &= kvz_strategyselector_register(opaque, "satd_64x64", "x86_asm_avx", 30, &kvz_satd_64x64_avx);
  }
#endif // KVZ_BIT_DEPTH == 8
#endif // defined(KVZ_COMPILE_ASM)
  return success;
}

View file

@ -1,46 +0,0 @@
#ifndef STRATEGIES_PICTURE_X86_ASM_H_
#define STRATEGIES_PICTURE_X86_ASM_H_
/*****************************************************************************
* This file is part of uvg266 VVC encoder.
*
* Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
****************************************************************************/
/**
* \ingroup Optimization
* \file
* Optimizations for AVX, utilizing ASM implementations.
*/
#include "global.h" // IWYU pragma: keep
int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth);
#endif //STRATEGIES_PICTURE_X86_ASM_H_

File diff suppressed because it is too large Load diff

View file

@ -258,7 +258,6 @@ int kvz_strategyselector_register(void * const opaque, const char * const type,
//Check what strategies are available when they are registered //Check what strategies are available when they are registered
if (strcmp(strategy_name, "avx") == 0) kvz_g_strategies_available.intel_flags.avx++; if (strcmp(strategy_name, "avx") == 0) kvz_g_strategies_available.intel_flags.avx++;
if (strcmp(strategy_name, "x86_asm_avx") == 0) kvz_g_strategies_available.intel_flags.avx++;
if (strcmp(strategy_name, "avx2") == 0) kvz_g_strategies_available.intel_flags.avx2++; if (strcmp(strategy_name, "avx2") == 0) kvz_g_strategies_available.intel_flags.avx2++;
if (strcmp(strategy_name, "mmx") == 0) kvz_g_strategies_available.intel_flags.mmx++; if (strcmp(strategy_name, "mmx") == 0) kvz_g_strategies_available.intel_flags.mmx++;
if (strcmp(strategy_name, "sse") == 0) kvz_g_strategies_available.intel_flags.sse++; if (strcmp(strategy_name, "sse") == 0) kvz_g_strategies_available.intel_flags.sse++;
@ -330,7 +329,6 @@ static void* strategyselector_choose_for(const strategy_list_t * const strategie
//Check what strategy we are going to use //Check what strategy we are going to use
if (strcmp(strategies->strategies[max_priority_i].strategy_name, "avx") == 0) kvz_g_strategies_in_use.intel_flags.avx++; if (strcmp(strategies->strategies[max_priority_i].strategy_name, "avx") == 0) kvz_g_strategies_in_use.intel_flags.avx++;
if (strcmp(strategies->strategies[max_priority_i].strategy_name, "x86_asm_avx") == 0) kvz_g_strategies_in_use.intel_flags.avx++;
if (strcmp(strategies->strategies[max_priority_i].strategy_name, "avx2") == 0) kvz_g_strategies_in_use.intel_flags.avx2++; if (strcmp(strategies->strategies[max_priority_i].strategy_name, "avx2") == 0) kvz_g_strategies_in_use.intel_flags.avx2++;
if (strcmp(strategies->strategies[max_priority_i].strategy_name, "mmx") == 0) kvz_g_strategies_in_use.intel_flags.mmx++; if (strcmp(strategies->strategies[max_priority_i].strategy_name, "mmx") == 0) kvz_g_strategies_in_use.intel_flags.mmx++;
if (strcmp(strategies->strategies[max_priority_i].strategy_name, "sse") == 0) kvz_g_strategies_in_use.intel_flags.sse++; if (strcmp(strategies->strategies[max_priority_i].strategy_name, "sse") == 0) kvz_g_strategies_in_use.intel_flags.sse++;