diff --git a/build/C_Properties.props b/build/C_Properties.props deleted file mode 100644 index e623c441..00000000 --- a/build/C_Properties.props +++ /dev/null @@ -1,33 +0,0 @@ - - - - - - $(Platform)-$(Configuration)\ - $(SolutionDir)..\bin\$(Platform)-$(Configuration)\ - - - - CompileAsC - Level4 - AssemblyAndSourceCode - MultiThreadedDebugDLL - KVZ_DLL_EXPORTS;KVZ_COMPILE_ASM;WIN32_LEAN_AND_MEAN;WIN32;WIN64;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) - $(SolutionDir)..\src\threadwrapper\include;$(SolutionDir)..\src;$(SolutionDir)..\src\extras;$(SolutionDir)..\;%(AdditionalIncludeDirectories) - 4244;4204;4206;4028;4152;4996;4018;4456;4389;4100;4131;4459;4706;4214;4127;4201 - false - 4013;4029;4047;4716;4700;4020;4021;4133 - - - %(AdditionalDependencies) - true - Console - false - - - HAVE_ALIGNED_STACK=1 - $(SolutionDir)..\src\extras;%(IncludePaths) - - - - \ No newline at end of file diff --git a/build/Release_Optimizations.props b/build/Release_Optimizations.props deleted file mode 100644 index 1d914e51..00000000 --- a/build/Release_Optimizations.props +++ /dev/null @@ -1,26 +0,0 @@ - - - - - - - - Fast - Full - AnySuitable - Speed - true - MultiThreadedDLL - false - true - - - - - - - UseLinkTimeCodeGeneration - - - - \ No newline at end of file diff --git a/build/yasm/vsyasm.props b/build/yasm/vsyasm.props deleted file mode 100644 index e2b9c26e..00000000 --- a/build/yasm/vsyasm.props +++ /dev/null @@ -1,31 +0,0 @@ - - - - Midl - CustomBuild - - - _SelectedFiles;$(YASMDependsOn) - - - - win32 - - - win64 - - - - False - $(IntDir) - 0 - 0 - vsyasm.exe -Xvc -f $(YASMFormat) [AllOptions] [AdditionalOptions] [Inputs] - %(ObjectFile) - Assembling %(Filename)%(Extension) - false - - - diff --git a/build/yasm/vsyasm.targets b/build/yasm/vsyasm.targets deleted file mode 100644 index 7bf0d2c9..00000000 --- a/build/yasm/vsyasm.targets +++ /dev/null @@ -1,109 +0,0 @@ - - - - - - _YASM - - - - $(MSBuildThisFileDirectory)$(MSBuildThisFileName).xml - - - - - - - - @(YASM->'%(FullPath)', '|') - - - - - - - - - $(ComputeLinkInputsTargets); - ComputeYASMOutput; - - - $(ComputeLibInputsTargets); - ComputeYASMOutput; - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/build/yasm/vsyasm.xml b/build/yasm/vsyasm.xml deleted file mode 100644 index cd08cbba..00000000 --- a/build/yasm/vsyasm.xml +++ /dev/null @@ -1,283 +0,0 @@ - - - - - - - - - - - - - General - - - - - - Symbols - - - - - - Files - - - - - - Command Line - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Execute Before - - - Specifies the targets for the build customization to run before. - - - - - - - - - - - - Execute After - - - Specifies the targets for the build customization to run after. - - - - - - - - - - - - - - - - - - Additional Options - - - Additional Options - - - - - - - - \ No newline at end of file diff --git a/src/strategies/strategies-picture.c b/src/strategies/strategies-picture.c index 56fe8709..928179e8 100644 --- a/src/strategies/strategies-picture.c +++ b/src/strategies/strategies-picture.c @@ -37,7 +37,6 @@ #include "strategies/generic/picture-generic.h" #include "strategies/sse2/picture-sse2.h" #include "strategies/sse41/picture-sse41.h" -#include "strategies/x86_asm/picture-x86-asm.h" #include "strategyselector.h" @@ -93,9 +92,6 @@ int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) { if (kvz_g_hardware_flags.intel_flags.sse41) { success &= kvz_strategy_register_picture_sse41(opaque, bitdepth); } - if (kvz_g_hardware_flags.intel_flags.avx) { - success &= kvz_strategy_register_picture_x86_asm_avx(opaque, bitdepth); - } if (kvz_g_hardware_flags.intel_flags.avx2) { success &= kvz_strategy_register_picture_avx2(opaque, bitdepth); } diff --git a/src/strategies/x86_asm/picture-x86-asm-sad.asm b/src/strategies/x86_asm/picture-x86-asm-sad.asm deleted file mode 100644 index dc7b3241..00000000 --- a/src/strategies/x86_asm/picture-x86-asm-sad.asm +++ /dev/null @@ -1,385 +0,0 @@ -;/***************************************************************************** -; * This file is part of Kvazaar HEVC encoder. -; * -; * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors -; * All rights reserved. -; * -; * Redistribution and use in source and binary forms, with or without modification, -; * are permitted provided that the following conditions are met: -; * -; * * Redistributions of source code must retain the above copyright notice, this -; * list of conditions and the following disclaimer. -; * -; * * Redistributions in binary form must reproduce the above copyright notice, this -; * list of conditions and the following disclaimer in the documentation and/or -; * other materials provided with the distribution. -; * -; * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its -; * contributors may be used to endorse or promote products derived from -; * this software without specific prior written permission. -; * -; * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -; * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -; * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -; * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; ****************************************************************************/ - -%include "x86inc.asm" - -;cglobal and RET macros are from the x86.inc -;they push and pop the necessary registers to -;stack depending on the operating system - -;Usage: cglobal name, %1, %2, %3 -;1%: Number of arguments -;2%: Number of registers used -;3%: Number of xmm registers used. -;More info in x86inc.asm - -SECTION .text - -;Set x86inc.asm macros to use avx and xmm registers -INIT_XMM avx - -;KVZ_SAD_4X4 -;Calculates SAD of the 16 consequtive bytes in memory -;r0 address of the first value(current frame) -;r1 address of the first value(reference frame) - -cglobal sad_4x4, 2, 2, 2 - - ;Load 16 bytes of both frames - vmovdqu m0, [r0] - vmovdqu m1, [r1] - - ;Calculate SAD. The results are written in - ;m0[15:0] and m0[79:64] - vpsadbw m0, m1 - - ;Sum the results - vmovhlps m1, m0 - vpaddw m0, m1 - - ;Write the result to eax - vmovd eax, m0 - - RET - - -;KVZ_SAD_4X4_STRIDE -;Calculates SAD of a 4x4 block inside a frame with stride -;r0 address of the first value(current) -;r1 address of the first value(reference) -;r2 stride - -cglobal sad_4x4_stride, 3, 3, 2 - - ;Load 4 times 4 bytes of both frames - vpinsrd m0, [r0], 0 - add r0, r2 - vpinsrd m0, [r0], 1 - vpinsrd m0, [r0+r2], 2 - vpinsrd m0, [r0+r2*2], 3 - - vpinsrd m1, [r1], 0 - add r1, r2 - vpinsrd m1, [r1], 1 - vpinsrd m1, [r1+r2], 2 - vpinsrd m1, [r1+r2*2], 3 - - vpsadbw m0, m1 - - vmovhlps m1, m0 - vpaddw m0, m1 - - vmovd eax, m0 - - RET - - -;KVZ_SAD_8X8 -;Calculates SAD of the 64 consequtive bytes in memory -;r0 address of the first value(current) -;r1 address of the first value(reference) - -cglobal sad_8x8, 2, 2, 5 - - ;Load the first half of both frames - vmovdqu m0, [r0] - vmovdqu m2, [r0+16] - - vmovdqu m1, [r1] - vmovdqu m3, [r1+16] - - ;Calculate SADs for both - vpsadbw m0, m1 - vpsadbw m2, m3 - - ;Sum - vpaddw m0, m2 - - ;Repeat for the latter half - vmovdqu m1, [r0+16*2] - vmovdqu m3, [r0+16*3] - - vmovdqu m2, [r1+16*2] - vmovdqu m4, [r1+16*3] - - vpsadbw m1, m2 - vpsadbw m3, m4 - - vpaddw m1, m3 - - ;Sum all the SADs - vpaddw m0, m1 - - vmovhlps m1, m0 - vpaddw m0, m1 - - vmovd eax, m0 - - RET - - -;KVZ_SAD_8X8_STRIDE -;Calculates SAD of a 8x8 block inside a frame with stride -;r0 address of the first value(current) -;r1 address of the first value(reference) -;r2 stride - -cglobal sad_8x8_stride, 3, 3, 5 - - ;Zero m0 register - vpxor m0, m0 - - ;Load the first half to m1 and m3 registers(cur) - ;Current frame - ;Load to the high 64 bits of xmm - vmovhpd m1, [r0] - add r0, r2 - ;Load to the low 64 bits - vmovlpd m1, [r0] - - vmovhpd m3, [r0+r2] - vmovlpd m3, [r0+r2*2] - ;lea calculates the address to r0, - ;but doesn't load anything from - ;the memory. Equivalent for - ;two add r0, r2 instructions. - lea r0, [r0+r2*2] - add r0, r2 - - ;Reference frame - vmovhpd m2, [r1] - add r1, r2 - vmovlpd m2, [r1] - - vmovhpd m4, [r1+r2] - vmovlpd m4, [r1+r2*2] - lea r1, [r1+r2*2] - add r1, r2 - - vpsadbw m1, m2 - vpsadbw m3, m4 - - vpaddw m0, m1 - vpaddw m0, m3 - - ;Repeat for the other half - vmovhpd m1, [r0] - add r0, r2 - vmovlpd m1, [r0] - - vmovhpd m3, [r0+r2] - vmovlpd m3, [r0+r2*2] - lea r0, [r0+r2*2] - add r0, r2 - - vmovhpd m2, [r1] - add r1, r2 - vmovlpd m2, [r1] - - vmovhpd m4, [r1+r2] - vmovlpd m4, [r1+r2*2] - lea r1, [r1+r2*2] - add r1, r2 - - vpsadbw m1, m2 - vpsadbw m3, m4 - - vpaddw m0, m1 - vpaddw m0, m3 - - vmovhlps m1, m0 - vpaddw m0, m1 - - vmovd eax, m0 - - RET - - -;KVZ_SAD_16X16 -;Calculates SAD of the 256 consequtive bytes in memory -;r0 address of the first value(current) -;r1 address of the first value(reference) - -cglobal sad_16x16, 2, 2, 5 - - ;Zero m4 - vpxor m4, m4 - - %assign i 0 - - ;Repeat 8 times. - %rep 8 - - ;Load the next to rows of the current frame - vmovdqu m0, [r0 + 16 * i] - vmovdqu m2, [r0 + 16 * (i + 1)] - - ;Load the next to rows of the reference frame - vmovdqu m1, [r1 + 16 * i] - vmovdqu m3, [r1 + 16 * (i + 1)] - - vpsadbw m0, m1 - vpsadbw m2, m3 - - ;Accumulate SADs to m4 - vpaddw m4, m0 - vpaddw m4, m2 - - %assign i i+2 - - %endrep - - ;Calculate the final sum - vmovhlps m0, m4 - vpaddw m4, m0 - - vmovd eax, m4 - - RET - - -;KVZ_SAD_16X16_STRIDE -;Calculates SAD of a 16x16 block inside a frame with stride -;r0 address of the first value(current) -;r1 address of the first value(reference) -;r2 stride - -cglobal sad_16x16_stride, 3, 3, 5 - - vpxor m4, m4 - - %rep 8 - - ; Load the next 2 rows from rec_buf to m0 and m2 - vmovdqu m0, [r0] - vmovdqu m2, [r0 + r2] - lea r0, [r0 + r2*2] - - ; Load the next 2 rows from ref_buf to m1 and m3 - vmovdqu m1, [r1] - vmovdqu m3, [r1 + r2] - lea r1, [r1 + r2*2] - - vpsadbw m0, m1 - vpsadbw m2, m3 - - vpaddw m4, m0 - vpaddw m4, m2 - - %endrep - - vmovhlps m0, m4 - vpaddw m4, m0 - - vmovd eax, m4 - - RET - - -;KVZ_SAD_32x32_STRIDE -;Calculates SAD of a 32x32 block inside a frame with stride -;r0 address of the first value(current) -;r1 address of the first value(reference) -;r2 stride -cglobal sad_32x32_stride, 3, 3, 5 - vpxor m4, m4 - - ; Handle 2 lines per iteration - %rep 16 - vmovdqu m0, [r0] - vmovdqu m1, [r0 + 16] - vmovdqu m2, [r0 + r2] - vmovdqu m3, [r0 + r2 + 16] - lea r0, [r0 + 2 * r2] - - vpsadbw m0, [r1] - vpsadbw m1, [r1 + 16] - vpsadbw m2, [r1 + r2] - vpsadbw m3, [r1 + r2 + 16] - lea r1, [r1 + 2 * r2] - - vpaddd m4, m0 - vpaddd m4, m1 - vpaddd m4, m2 - vpaddd m4, m3 - %endrep - - vmovhlps m0, m4 - vpaddd m4, m0 - - vmovd eax, m4 - - RET - - -;KVZ_SAD_64x64_STRIDE -;Calculates SAD of a 64x64 block inside a frame with stride -;r0 address of the first value(current) -;r1 address of the first value(reference) -;r2 stride -cglobal sad_64x64_stride, 3, 4, 5 - vpxor m4, m4 ; sum accumulation register - mov r3, 4 ; number of iterations in the loop - -Process16Lines: - ; Intel optimization manual says to not unroll beyond 500 instructions. - ; Didn't seem to have much of an affect on Ivy Bridge or Haswell, but - ; smaller is better, when speed is the same, right? - %rep 16 - vmovdqu m0, [r0] - vmovdqu m1, [r0 + 1*16] - vmovdqu m2, [r0 + 2*16] - vmovdqu m3, [r0 + 3*16] - - vpsadbw m0, [r1] - vpsadbw m1, [r1 + 1*16] - vpsadbw m2, [r1 + 2*16] - vpsadbw m3, [r1 + 3*16] - - lea r0, [r0 + r2] - lea r1, [r1 + r2] - - vpaddd m4, m0 - vpaddd m4, m1 - vpaddd m4, m2 - vpaddd m4, m3 - %endrep - - dec r3 - jnz Process16Lines - - vmovhlps m0, m4 - vpaddd m4, m0 - - vmovd eax, m4 - - RET diff --git a/src/strategies/x86_asm/picture-x86-asm-sad.h b/src/strategies/x86_asm/picture-x86-asm-sad.h deleted file mode 100644 index 88d6ce66..00000000 --- a/src/strategies/x86_asm/picture-x86-asm-sad.h +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef _PICTURE_X86_ASM_SAD_H_ -#define _PICTURE_X86_ASM_SAD_H_ -/***************************************************************************** - * This file is part of uvg266 VVC encoder. - * - * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or - * other materials provided with the distribution. - * - * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS - ****************************************************************************/ - -/** - * \ingroup Optimization - * \file - * Optimizations for AVX, utilizing ASM implementations. - */ - -#include "global.h" // IWYU pragma: keep -#include "kvazaar.h" - -#if KVZ_BIT_DEPTH == 8 -unsigned kvz_sad_4x4_avx(const uint8_t*, const uint8_t*); -unsigned kvz_sad_8x8_avx(const uint8_t*, const uint8_t*); -unsigned kvz_sad_16x16_avx(const uint8_t*, const uint8_t*); - -unsigned kvz_sad_4x4_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); -unsigned kvz_sad_8x8_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); -unsigned kvz_sad_16x16_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); -unsigned kvz_sad_32x32_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); -unsigned kvz_sad_64x64_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); -#endif // KVZ_BIT_DEPTH == 8 - -#endif diff --git a/src/strategies/x86_asm/picture-x86-asm-satd.asm b/src/strategies/x86_asm/picture-x86-asm-satd.asm deleted file mode 100644 index ef915260..00000000 --- a/src/strategies/x86_asm/picture-x86-asm-satd.asm +++ /dev/null @@ -1,575 +0,0 @@ -;/***************************************************************************** -; * This file is part of Kvazaar HEVC encoder. -; * -; * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors -; * All rights reserved. -; * -; * Redistribution and use in source and binary forms, with or without modification, -; * are permitted provided that the following conditions are met: -; * -; * * Redistributions of source code must retain the above copyright notice, this -; * list of conditions and the following disclaimer. -; * -; * * Redistributions in binary form must reproduce the above copyright notice, this -; * list of conditions and the following disclaimer in the documentation and/or -; * other materials provided with the distribution. -; * -; * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its -; * contributors may be used to endorse or promote products derived from -; * this software without specific prior written permission. -; * -; * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -; * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -; * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -; * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; ****************************************************************************/ - -%include "x86inc.asm" - -;cglobal and RET macros are from the x86.inc -;they push and pop the necessary registers to -;stack depending on the operating system - -;Usage: cglobal name, %1, %2, %3 -;1%: Number of arguments -;2%: Number of registers used -;3%: Number of xmm registers used. -;More info in x86inc.asm - -SECTION .text - -;Set x86inc.asm macros to use avx and xmm registers -INIT_XMM avx - -;KVZ_ZERO_EXTEND_WD -;zero extend all packed words in xmm to dwords in 2 xmm registers -;%1 source register -;%2 lower destination register -;%3 higher destination register - -%macro KVZ_ZERO_EXTEND_WD 3 - - ;Zero extend high 64 bits - vmovhlps %3, %1 - vpmovzxwd %3, %3 - ;Zero extend low 64 bits - vpmovzxwd %2, %1 - -%endmacro ; KVZ_ZERO_EXTEND_WD - -; Use nondestructive horizontal add and sub to calculate both at the same time. -; TODO: It would probably be possible to do this with 3 registers (destructive vphsubw). -; args: -; 1, 2: input registers -; 3, 4: output registers - -%macro SATD_HORIZONTAL_SUB_AND_ADD 4 - - ; TODO: It might be possible to do this with 3 registers? - - ;First stage - vphaddw %3, %1, %2 - vphsubw %4, %1, %2 - - ;Second stage - vphaddw %1, %3, %4 - vphsubw %2, %3, %4 - - ;Third stage - vphaddw %3, %1, %2 - vphsubw %4, %1, %2 - -%endmacro ; SATD_HORIZONTAL_SUB_AND_ADD - -;KVZ_SATD_8X8_STRIDE -;Calculates SATD of a 8x8 block inside a frame with stride -;r0 address of the first value(reference) -;r1 address of the first value(current) -;r2 stride -; -;The Result is written in the register r4 - -%macro KVZ_SATD_8X8_STRIDE 0 - - ;Calculate differences of the 8 rows into - ;registers m0-m7 - vpmovzxbw m0, [r0] - vpmovzxbw m7, [r2] - vpsubw m0, m7 - - vpmovzxbw m1, [r0+r1] - vpmovzxbw m7, [r2+r3] - vpsubw m1, m7 - - ;Set r0 and r2 2 rows forward - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - - vpmovzxbw m2, [r0] - vpmovzxbw m7, [r2] - vpsubw m2, m7 - - vpmovzxbw m3, [r0+r1] - vpmovzxbw m7, [r2+r3] - vpsubw m3, m7 - - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - - vpmovzxbw m4, [r0] - vpmovzxbw m7, [r2] - vpsubw m4, m7 - - vpmovzxbw m5, [r0+r1] - vpmovzxbw m7, [r2+r3] - vpsubw m5, m7 - - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - - vpmovzxbw m6, [r0] - vpmovzxbw m7, [r2] - vpsubw m6, m7 - - ;32-bit AVX doesn't have registers - ;xmm8-xmm15, use stack instead - - %if ARCH_X86_64 - vpmovzxbw m7, [r0+r1] - vpmovzxbw m8, [r2+r3] - vpsubw m7, m8 - %else - %define temp0 esp+16*3 - %define temp1 esp+16*2 - %define temp2 esp+16*1 - %define temp3 esp+16*0 - - ;Reserve memory for 4 x 128 bits. - sub esp, 16*4 - - vpmovzxbw m7, [r2+r3] - vmovdqu [temp0], m7 - vpmovzxbw m7, [r0+r1] - vpsubw m7, [temp0] - - ;Put rows 5-8 to stack - vmovdqu [temp0], m4 - vmovdqu [temp1], m5 - vmovdqu [temp2], m6 - vmovdqu [temp3], m7 - %endif - - ;Hadamard transform (FWHT algorithm) - ;Horizontal transform - - %if ARCH_X86_64 - ;Calculate horizontal transform for each row. - ;Transforms of two rows are interleaved in register pairs. - ;(m8 and m9, m10 and m11,...) - - SATD_HORIZONTAL_SUB_AND_ADD m0, m1, m8, m9 - SATD_HORIZONTAL_SUB_AND_ADD m2, m3, m10, m11 - SATD_HORIZONTAL_SUB_AND_ADD m4, m5, m12, m13 - SATD_HORIZONTAL_SUB_AND_ADD m6, m7, m14, m15 - - %else - ;Calculate horizontal transforms for the first four rows. - ;Then load the other four into the registers and store - ;ready transforms in the stack. - ;Input registers are m0-m3, results are written in - ;registers m4-m7 (and memory). - - SATD_HORIZONTAL_SUB_AND_ADD m0, m1, m4, m5 - SATD_HORIZONTAL_SUB_AND_ADD m2, m3, m6, m7 - - vmovdqu m3, [temp3] - vmovdqu m2, [temp2] - vmovdqu m1, [temp1] - vmovdqu m0, [temp0] - - vmovdqu [temp3], m7 - vmovdqu [temp2], m6 - vmovdqu [temp1], m5 - vmovdqu [temp0], m4 - - SATD_HORIZONTAL_SUB_AND_ADD m0, m1, m4, m5 - SATD_HORIZONTAL_SUB_AND_ADD m2, m3, m6, m7 - %endif - - - ;Vertical transform - ;Transform columns of the 8x8 block. - ;First sum the interleaved horizontally - ;transformed values with one horizontal add - ;for each pair of rows. Then calculate - ;with regular packed additions and - ;subtractions. - - %if ARCH_X86_64 - ;Horizontally transformed values are in registers m8-m15 - ;Results are written in m0-m7 - - ;First stage - vphaddw m0, m8, m9 - vphsubw m1, m8, m9 - - vphaddw m2, m10, m11 - vphsubw m3, m10, m11 - - vphaddw m4, m12, m13 - vphsubw m5, m12, m13 - - vphaddw m6, m14, m15 - vphsubw m7, m14, m15 - - ;Second stage - vpaddw m8, m0, m2 - vpaddw m9, m1, m3 - vpsubw m10, m0, m2 - vpsubw m11, m1, m3 - - vpaddw m12, m4, m6 - vpaddw m13, m5, m7 - vpsubw m14, m4, m6 - vpsubw m15, m5, m7 - - ;Third stage - vpaddw m0, m8, m12 - vpaddw m1, m9, m13 - vpaddw m2, m10, m14 - vpaddw m3, m11, m15 - - vpsubw m4, m8, m12 - vpsubw m5, m9, m13 - vpsubw m6, m10, m14 - vpsubw m7, m11, m15 - - %else - ;Transformed values are in registers m4-m7 - ;and in memory(temp0-temp3). Transformed values - ;are written in m4-m7. Also calculate absolute - ;values for them and accumulate into ymm0. - - ;First stage - vphaddw m0, m4, m5 - vphsubw m1, m4, m5 - - vphaddw m2, m6, m7 - vphsubw m3, m6, m7 - - ;Second stage - vpaddw m4, m0, m2 - vpaddw m5, m1, m3 - vpsubw m6, m0, m2 - vpsubw m7, m1, m3 - - vmovdqu m3, [temp3] - vmovdqu m2, [temp2] - vmovdqu m1, [temp1] - vmovdqu m0, [temp0] - - vmovdqu [temp3], m7 - vmovdqu [temp2], m6 - vmovdqu [temp1], m5 - vmovdqu [temp0], m4 - - ;First stage (second half) - vphaddw m4, m0, m1 - vphsubw m5, m0, m1 - - vphaddw m6, m2, m3 - vphsubw m7, m2, m3 - - ;Second stage (second half) - vpaddw m0, m4, m6 - vpaddw m1, m5, m7 - vpsubw m2, m4, m6 - vpsubw m3, m5, m7 - - ;Third stage - vpaddw m4, m0, [temp0] - vpaddw m5, m1, [temp1] - vpsubw m6, m0, [temp0] - vpsubw m7, m1, [temp1] - - ;Calculate the absolute values and - ;zero extend 16-bit values to 32-bit - ;values. Then sum the values. - - vpabsw m4, m4 - KVZ_ZERO_EXTEND_WD m4, m4, m1 - vpaddd m4, m1 - - vpabsw m5, m5 - KVZ_ZERO_EXTEND_WD m5, m5, m1 - vpaddd m5, m1 - - vpabsw m6, m6 - KVZ_ZERO_EXTEND_WD m6, m6, m1 - vpaddd m6, m1 - - vpabsw m7, m7 - KVZ_ZERO_EXTEND_WD m7, m7, m1 - vpaddd m7, m1 - - vpaddd m0, m4, m5 - vpaddd m0, m6 - vpaddd m0, m7 - - ;Repeat for the rest - vpaddw m4, m2, [temp2] - vpaddw m5, m3, [temp3] - vpsubw m6, m2, [temp2] - vpsubw m7, m3, [temp3] - - vpabsw m4, m4 - KVZ_ZERO_EXTEND_WD m4, m4, m1 - vpaddd m4, m1 - - vpabsw m5, m5 - KVZ_ZERO_EXTEND_WD m5, m5, m1 - vpaddd m5, m1 - - vpabsw m6, m6 - KVZ_ZERO_EXTEND_WD m6, m6, m1 - vpaddd m6, m1 - - vpabsw m7, m7 - KVZ_ZERO_EXTEND_WD m7, m7, m1 - vpaddd m7, m1 - - ;Sum the other half of the packed results to ymm4 - vpaddd m4, m5 - vpaddd m4, m6 - vpaddd m4, m7 - - ;Sum all packed results to ymm0 - vpaddd m0, m4 - - %endif - - %if ARCH_X86_64 - - ;Calculate the absolute values and - ;zero extend 16-bit values to 32-bit - ;values. In other words: extend xmm to - ;corresponding ymm. - - vpabsw m0, m0 - KVZ_ZERO_EXTEND_WD m0, m0, m8 - vpaddd m0, m8 - - vpabsw m1, m1 - KVZ_ZERO_EXTEND_WD m1, m1, m8 - vpaddd m1, m8 - - vpabsw m2, m2 - KVZ_ZERO_EXTEND_WD m2, m2, m8 - vpaddd m1, m8 - - vpabsw m3, m3 - KVZ_ZERO_EXTEND_WD m3, m3, m8 - vpaddd m3, m8 - - vpabsw m4, m4 - KVZ_ZERO_EXTEND_WD m4, m4, m8 - vpaddd m4, m8 - - vpabsw m5, m5 - KVZ_ZERO_EXTEND_WD m5, m5, m8 - vpaddd m5, m8 - - vpabsw m6, m6 - KVZ_ZERO_EXTEND_WD m6, m6, m8 - vpaddd m6, m8 - - vpabsw m7, m7 - KVZ_ZERO_EXTEND_WD m7, m7, m8 - vpaddd m7, m8 - - ;Calculate packed sum of transformed values to ymm0 - vpaddd m0, m1 - vpaddd m0, m2 - vpaddd m0, m3 - vpaddd m0, m4 - vpaddd m0, m5 - vpaddd m0, m6 - vpaddd m0, m7 - %endif - - ;Sum the packed values to m0[32:0] - vphaddd m0, m0 - vphaddd m0, m0 - - ;The result is in the lowest 32 bits in m0 - vmovd r4d, m0 - - ;8x8 Hadamard transform requires - ;adding 2 and dividing by 4 - add r4, 2 - shr r4, 2 - - ;Zero high 128 bits of ymm registers to - ;prevent AVX-SSE transition penalty. - vzeroupper - - %if ARCH_X86_64 == 0 - add esp, 16*4 - %endif - -%endmacro ; KVZ_SATD_8X8_STRIDE - -;KVZ_SATD_4X4 -;Calculates SATD of the 16 consequtive bytes in memory -;r0 address of the first value(current) -;r1 address of the first value(reference) - -cglobal satd_4x4, 2, 2, 6 - - ;Load 8 bytes from memory and zero extend - ;to 16-bit values. Calculate difference. - vpmovzxbw m0, [r0] - vpmovzxbw m2, [r1] - vpsubw m0, m2 - - vpmovzxbw m1, [r0+8] - vpmovzxbw m3, [r1+8] - vpsubw m1, m3 - - ;Hadamard transform - ;Horizontal phase - ;First stage - vphaddw m4, m0, m1 - vphsubw m5, m0, m1 - ;Second stage - vphaddw m0, m4, m5 - vphsubw m1, m4, m5 - - ;Vertical phase - ;First stage - vphaddw m4, m0, m1 - vphsubw m5, m0, m1 - ;Second stage - vphaddw m0, m4, m5 - vphsubw m1, m4, m5 - - ;Calculate absolute values - vpabsw m0, m0 - vpabsw m1, m1 - - ;Sum the all the transformed values - vpaddw m0, m1 - - vphaddw m0, m0 - vphaddw m0, m0 - vphaddw m0, m0 - - ;Extract the lowest 16 bits of m0 - ;into eax - vpextrw eax, m0, 0 - - ;4x4 Hadamard transform requires - ;Addition of 1 and division by 2 - add eax, 1 - shr eax, 1 - - RET - - - -;KVZ_SATD_8X8 -;Calculates SATD of a 8x8 block inside a frame with stride -;r0 address of the first value(reference) -;r1 address of the first value(current) -;r2 stride - -%if ARCH_X86_64 - cglobal satd_8x8, 4, 5, 16 -%else - cglobal satd_8x8, 4, 5, 8 -%endif - - ;Set arguments - mov r2, r1 - mov r1, 8 - mov r3, 8 - - ;Calculate 8x8 SATD. Result is written - ;in the register r4. - KVZ_SATD_8X8_STRIDE - mov rax, r4 - RET - -;KVZ_SATD_NXN -;Calculates SATD of a NxN block inside a frame with stride -;r0 address of the first value(reference) -;r1 address of the first value(current) - -%macro KVZ_SATD_NXN 1 - - %if ARCH_X86_64 - cglobal satd_%1x%1, 2, 7, 16 - %else - cglobal satd_%1x%1, 2, 7, 8 - %endif - - ;Set arguments - mov r2, r1 - mov r1, %1 - mov r3, %1 - - ;Zero r5 and r6 - xor r5, r5 - xor r6, r6 - - ;Calculate SATDs of each 8x8 sub-blocks - ;and accumulate the results in r6. Repeat yloop - ;N times. Repeat xloop N times. r4 and r5 are counters - ;for the loops. - - .yloop - - ;zero r4 - xor r4, r4 - - .xloop - push r4 - - ;Calculate SATD of the sub-block. Result is - ;written in the register r4. - KVZ_SATD_8X8_STRIDE - add r6, r4 - - ;Set r2 and r0 to the next sub-block - ;on the same row - sub r2, 6*%1-8 - sub r0, 6*%1-8 - - pop r4 - add r4, 8 - cmp r4, %1 - jne .xloop - - ;Set r2 and r0 to the first sub-block - ;on the next row(of 8x8 sub-blocks) - add r2, 7*%1 - add r0, 7*%1 - - add r5, 8 - cmp r5, %1 - jne .yloop - - mov rax, r6 - RET - -%endmacro ; KVZ_SATD_NXN - -KVZ_SATD_NXN 16 -KVZ_SATD_NXN 32 -KVZ_SATD_NXN 64 diff --git a/src/strategies/x86_asm/picture-x86-asm-satd.h b/src/strategies/x86_asm/picture-x86-asm-satd.h deleted file mode 100644 index b7377c70..00000000 --- a/src/strategies/x86_asm/picture-x86-asm-satd.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef _PICTURE_X86_ASM_SATD_H_ -#define _PICTURE_X86_ASM_SATD_H_ -/***************************************************************************** - * This file is part of uvg266 VVC encoder. - * - * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or - * other materials provided with the distribution. - * - * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS - ****************************************************************************/ - -/** - * \ingroup Optimization - * \file - * Optimizations for AVX, utilizing ASM implementations. - */ - -#include "global.h" // IWYU pragma: keep - - -unsigned kvz_satd_4x4_avx(const kvz_pixel *org, const kvz_pixel *cur); -unsigned kvz_satd_8x8_avx(const kvz_pixel *org, const kvz_pixel *cur); -unsigned kvz_satd_16x16_avx(const kvz_pixel *org, const kvz_pixel *cur); -unsigned kvz_satd_32x32_avx(const kvz_pixel *org, const kvz_pixel *cur); -unsigned kvz_satd_64x64_avx(const kvz_pixel *org, const kvz_pixel *cur); - -#endif diff --git a/src/strategies/x86_asm/picture-x86-asm.c b/src/strategies/x86_asm/picture-x86-asm.c deleted file mode 100644 index cbc960e2..00000000 --- a/src/strategies/x86_asm/picture-x86-asm.c +++ /dev/null @@ -1,132 +0,0 @@ -/***************************************************************************** - * This file is part of uvg266 VVC encoder. - * - * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or - * other materials provided with the distribution. - * - * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS - ****************************************************************************/ - -#include "strategies/x86_asm/picture-x86-asm.h" - -#if defined(KVZ_COMPILE_ASM) -#include "kvazaar.h" -#if KVZ_BIT_DEPTH == 8 -#include - -#include "strategies/x86_asm/picture-x86-asm-sad.h" -#include "strategies/x86_asm/picture-x86-asm-satd.h" -#include "strategies/sse41/picture-sse41.h" -#include "strategyselector.h" - - -static unsigned kvz_sad_32x32_avx(const uint8_t *data1, const uint8_t *data2) -{ - unsigned sad = 0; - sad += kvz_sad_16x16_avx(data1, data2); - sad += kvz_sad_16x16_avx(data1 + 8 * 32, data2 + 8 * 32); - sad += kvz_sad_16x16_avx(data1 + 16 * 32, data2 + 16 * 32); - sad += kvz_sad_16x16_avx(data1 + 24 * 32, data2 + 24 * 32); - return sad; -} - -static unsigned kvz_sad_64x64_avx(const uint8_t *data1, const uint8_t *data2) -{ - unsigned sad = 0; - sad += kvz_sad_32x32_avx(data1, data2); - sad += kvz_sad_32x32_avx(data1 + 16 * 64, data2 + 16 * 64); - sad += kvz_sad_32x32_avx(data1 + 32 * 64, data2 + 32 * 64); - sad += kvz_sad_32x32_avx(data1 + 48 * 64, data2 + 48 * 64); - return sad; -} - -static unsigned kvz_sad_other_avx(const uint8_t *data1, const uint8_t *data2, - int width, int height, - unsigned stride) -{ - unsigned sad = 0; - - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - sad += abs(data1[y * stride + x] - data2[y * stride + x]); - } - } - - return sad; -} - -static unsigned reg_sad_x86_asm(const uint8_t *data1, const uint8_t * data2, - const int width, const int height, - const unsigned stride1, const unsigned stride2) -{ - if (width == height) { - if (width == 8) { - return kvz_sad_8x8_stride_avx(data1, data2, stride1); - } else if (width == 16) { - return kvz_sad_16x16_stride_avx(data1, data2, stride1); - } else if (width == 32) { - return kvz_sad_32x32_stride_avx(data1, data2, stride1); - } else if (width == 64) { - return kvz_sad_64x64_stride_avx(data1, data2, stride1); - } - } - - if (width * height >= 16) { - // Call the vectorized general SAD SSE41 function when the block - // is big enough to make it worth it. - return kvz_reg_sad_sse41(data1, data2, width, height, stride1, stride2); - } else { - return kvz_sad_other_avx(data1, data2, width, height, stride1); - } -} - -#endif // KVZ_BIT_DEPTH == 8 -#endif //defined(KVZ_COMPILE_ASM) - -int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth) -{ - bool success = true; -#if defined(KVZ_COMPILE_ASM) -#if KVZ_BIT_DEPTH == 8 - if (bitdepth == 8){ - success &= kvz_strategyselector_register(opaque, "reg_sad", "x86_asm_avx", 30, ®_sad_x86_asm); - - success &= kvz_strategyselector_register(opaque, "sad_4x4", "x86_asm_avx", 30, &kvz_sad_4x4_avx); - success &= kvz_strategyselector_register(opaque, "sad_8x8", "x86_asm_avx", 30, &kvz_sad_8x8_avx); - success &= kvz_strategyselector_register(opaque, "sad_16x16", "x86_asm_avx", 30, &kvz_sad_16x16_avx); - success &= kvz_strategyselector_register(opaque, "sad_32x32", "x86_asm_avx", 30, &kvz_sad_32x32_avx); - success &= kvz_strategyselector_register(opaque, "sad_64x64", "x86_asm_avx", 30, &kvz_sad_64x64_avx); - - success &= kvz_strategyselector_register(opaque, "satd_4x4", "x86_asm_avx", 30, &kvz_satd_4x4_avx); - success &= kvz_strategyselector_register(opaque, "satd_8x8", "x86_asm_avx", 30, &kvz_satd_8x8_avx); - success &= kvz_strategyselector_register(opaque, "satd_16x16", "x86_asm_avx", 30, &kvz_satd_16x16_avx); - success &= kvz_strategyselector_register(opaque, "satd_32x32", "x86_asm_avx", 30, &kvz_satd_32x32_avx); - success &= kvz_strategyselector_register(opaque, "satd_64x64", "x86_asm_avx", 30, &kvz_satd_64x64_avx); - } -#endif // KVZ_BIT_DEPTH == 8 -#endif //!defined(KVZ_COMPILE_ASM) - return success; -} diff --git a/src/strategies/x86_asm/picture-x86-asm.h b/src/strategies/x86_asm/picture-x86-asm.h deleted file mode 100644 index ce00e0d8..00000000 --- a/src/strategies/x86_asm/picture-x86-asm.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef STRATEGIES_PICTURE_X86_ASM_H_ -#define STRATEGIES_PICTURE_X86_ASM_H_ -/***************************************************************************** - * This file is part of uvg266 VVC encoder. - * - * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or - * other materials provided with the distribution. - * - * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS - ****************************************************************************/ - -/** - * \ingroup Optimization - * \file - * Optimizations for AVX, utilizing ASM implementations. - */ - -#include "global.h" // IWYU pragma: keep - - -int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth); - -#endif //STRATEGIES_PICTURE_X86_ASM_H_ diff --git a/src/strategies/x86_asm/x86inc.asm b/src/strategies/x86_asm/x86inc.asm deleted file mode 100644 index a0652d9a..00000000 --- a/src/strategies/x86_asm/x86inc.asm +++ /dev/null @@ -1,1466 +0,0 @@ -;***************************************************************************** -;* x86inc.asm: x264asm abstraction layer -;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project -;* -;* Authors: Loren Merritt -;* Anton Mitrofanov -;* Jason Garrett-Glaser -;* Henrik Gramner -;* -;* Permission to use, copy, modify, and/or distribute this software for any -;* purpose with or without fee is hereby granted, provided that the above -;* copyright notice and this permission notice appear in all copies. -;* -;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -;***************************************************************************** - -; This is a header file for the x264ASM assembly language, which uses -; NASM/YASM syntax combined with a large number of macros to provide easy -; abstraction between different calling conventions (x86_32, win64, linux64). -; It also has various other useful features to simplify writing the kind of -; DSP functions that are most often used in x264. - -; Unlike the rest of x264, this file is available under an ISC license, as it -; has significant usefulness outside of x264 and we want it to be available -; to the largest audience possible. Of course, if you modify it for your own -; purposes to add a new feature, we strongly encourage contributing a patch -; as this feature might be useful for others as well. Send patches or ideas -; to x264-devel@videolan.org . - -%ifndef private_prefix - %define private_prefix kvz -%endif - -%ifndef public_prefix - %define public_prefix private_prefix -%endif - -%define WIN64 0 -%define UNIX64 0 -%if ARCH_X86_64 - %ifidn __OUTPUT_FORMAT__,win32 - %define WIN64 1 - %elifidn __OUTPUT_FORMAT__,win64 - %define WIN64 1 - %elifidn __OUTPUT_FORMAT__,x64 - %define WIN64 1 - %else - %define UNIX64 1 - %endif -%endif - -%ifdef PREFIX - %define mangle(x) _ %+ x -%else - %define mangle(x) x -%endif - -%macro SECTION_RODATA 0-1 16 - SECTION .rodata align=%1 -%endmacro - -%macro SECTION_TEXT 0-1 16 - SECTION .text align=%1 -%endmacro - -%if WIN64 - %define PIC -%elif ARCH_X86_64 == 0 -; x86_32 doesn't require PIC. -; Some distros prefer shared objects to be PIC, but nothing breaks if -; the code contains a few textrels, so we'll skip that complexity. - %undef PIC -%endif -%ifdef PIC - default rel -%endif - -%macro CPUNOP 1 - %ifdef __YASM_MAJOR__ - CPU %1 - %endif -%endmacro - -; Always use long nops (reduces 0x90 spam in disassembly on x86_32) -CPUNOP amdnop - -; Macros to eliminate most code duplication between x86_32 and x86_64: -; Currently this works only for leaf functions which load all their arguments -; into registers at the start, and make no other use of the stack. Luckily that -; covers most of x264's asm. - -; PROLOGUE: -; %1 = number of arguments. loads them from stack if needed. -; %2 = number of registers used. pushes callee-saved regs if needed. -; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. -; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x, -; MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes), -; and an extra register will be allocated to hold the original stack -; pointer (to not invalidate r0m etc.). To prevent the use of an extra -; register as stack pointer, request a negative stack size. -; %4+/%5+ = list of names to define to registers -; PROLOGUE can also be invoked by adding the same options to cglobal - -; e.g. -; cglobal foo, 2,3,0, dst, src, tmp -; declares a function (foo), taking two args (dst and src) and one local variable (tmp) - -; TODO Some functions can use some args directly from the stack. If they're the -; last args then you can just not declare them, but if they're in the middle -; we need more flexible macro. - -; RET: -; Pops anything that was pushed by PROLOGUE, and returns. - -; REP_RET: -; Use this instead of RET if it's a branch target. - -; registers: -; rN and rNq are the native-size register holding function argument N -; rNd, rNw, rNb are dword, word, and byte size -; rNh is the high 8 bits of the word size -; rNm is the original location of arg N (a register or on the stack), dword -; rNmp is native size - -%macro DECLARE_REG 2-3 - %define r%1q %2 - %define r%1d %2d - %define r%1w %2w - %define r%1b %2b - %define r%1h %2h - %if %0 == 2 - %define r%1m %2d - %define r%1mp %2 - %elif ARCH_X86_64 ; memory - %define r%1m [rstk + stack_offset + %3] - %define r%1mp qword r %+ %1 %+ m - %else - %define r%1m [rstk + stack_offset + %3] - %define r%1mp dword r %+ %1 %+ m - %endif - %define r%1 %2 -%endmacro - -%macro DECLARE_REG_SIZE 3 - %define r%1q r%1 - %define e%1q r%1 - %define r%1d e%1 - %define e%1d e%1 - %define r%1w %1 - %define e%1w %1 - %define r%1h %3 - %define e%1h %3 - %define r%1b %2 - %define e%1b %2 -%if ARCH_X86_64 == 0 - %define r%1 e%1 -%endif -%endmacro - -DECLARE_REG_SIZE ax, al, ah -DECLARE_REG_SIZE bx, bl, bh -DECLARE_REG_SIZE cx, cl, ch -DECLARE_REG_SIZE dx, dl, dh -DECLARE_REG_SIZE si, sil, null -DECLARE_REG_SIZE di, dil, null -DECLARE_REG_SIZE bp, bpl, null - -; t# defines for when per-arch register allocation is more complex than just function arguments - -%macro DECLARE_REG_TMP 1-* - %assign %%i 0 - %rep %0 - CAT_XDEFINE t, %%i, r%1 - %assign %%i %%i+1 - %rotate 1 - %endrep -%endmacro - -%macro DECLARE_REG_TMP_SIZE 0-* - %rep %0 - %define t%1q t%1 %+ q - %define t%1d t%1 %+ d - %define t%1w t%1 %+ w - %define t%1h t%1 %+ h - %define t%1b t%1 %+ b - %rotate 1 - %endrep -%endmacro - -DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 - -%if ARCH_X86_64 - %define gprsize 8 -%else - %define gprsize 4 -%endif - -%macro PUSH 1 - push %1 - %ifidn rstk, rsp - %assign stack_offset stack_offset+gprsize - %endif -%endmacro - -%macro POP 1 - pop %1 - %ifidn rstk, rsp - %assign stack_offset stack_offset-gprsize - %endif -%endmacro - -%macro PUSH_IF_USED 1-* - %rep %0 - %if %1 < regs_used - PUSH r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro POP_IF_USED 1-* - %rep %0 - %if %1 < regs_used - pop r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro LOAD_IF_USED 1-* - %rep %0 - %if %1 < num_args - mov r%1, r %+ %1 %+ mp - %endif - %rotate 1 - %endrep -%endmacro - -%macro SUB 2 - sub %1, %2 - %ifidn %1, rstk - %assign stack_offset stack_offset+(%2) - %endif -%endmacro - -%macro ADD 2 - add %1, %2 - %ifidn %1, rstk - %assign stack_offset stack_offset-(%2) - %endif -%endmacro - -%macro movifnidn 2 - %ifnidn %1, %2 - mov %1, %2 - %endif -%endmacro - -%macro movsxdifnidn 2 - %ifnidn %1, %2 - movsxd %1, %2 - %endif -%endmacro - -%macro ASSERT 1 - %if (%1) == 0 - %error assert failed - %endif -%endmacro - -%macro DEFINE_ARGS 0-* - %ifdef n_arg_names - %assign %%i 0 - %rep n_arg_names - CAT_UNDEF arg_name %+ %%i, q - CAT_UNDEF arg_name %+ %%i, d - CAT_UNDEF arg_name %+ %%i, w - CAT_UNDEF arg_name %+ %%i, h - CAT_UNDEF arg_name %+ %%i, b - CAT_UNDEF arg_name %+ %%i, m - CAT_UNDEF arg_name %+ %%i, mp - CAT_UNDEF arg_name, %%i - %assign %%i %%i+1 - %endrep - %endif - - %xdefine %%stack_offset stack_offset - %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine - %assign %%i 0 - %rep %0 - %xdefine %1q r %+ %%i %+ q - %xdefine %1d r %+ %%i %+ d - %xdefine %1w r %+ %%i %+ w - %xdefine %1h r %+ %%i %+ h - %xdefine %1b r %+ %%i %+ b - %xdefine %1m r %+ %%i %+ m - %xdefine %1mp r %+ %%i %+ mp - CAT_XDEFINE arg_name, %%i, %1 - %assign %%i %%i+1 - %rotate 1 - %endrep - %xdefine stack_offset %%stack_offset - %assign n_arg_names %0 -%endmacro - -%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) - %ifnum %1 - %if %1 != 0 - %assign %%stack_alignment ((mmsize + 15) & ~15) - %assign stack_size %1 - %if stack_size < 0 - %assign stack_size -stack_size - %endif - %assign stack_size_padded stack_size - %if WIN64 - %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space - %if mmsize != 8 - %assign xmm_regs_used %2 - %if xmm_regs_used > 8 - %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16 - %endif - %endif - %endif - %if mmsize <= 16 && HAVE_ALIGNED_STACK - %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) - SUB rsp, stack_size_padded - %else - %assign %%reg_num (regs_used - 1) - %xdefine rstk r %+ %%reg_num - ; align stack, and save original stack location directly above - ; it, i.e. in [rsp+stack_size_padded], so we can restore the - ; stack in a single instruction (i.e. mov rsp, rstk or mov - ; rsp, [rsp+stack_size_padded]) - mov rstk, rsp - %if %1 < 0 ; need to store rsp on stack - sub rsp, gprsize+stack_size_padded - and rsp, ~(%%stack_alignment-1) - %xdefine rstkm [rsp+stack_size_padded] - mov rstkm, rstk - %else ; can keep rsp in rstk during whole function - sub rsp, stack_size_padded - and rsp, ~(%%stack_alignment-1) - %xdefine rstkm rstk - %endif - %endif - WIN64_PUSH_XMM - %endif - %endif -%endmacro - -%macro SETUP_STACK_POINTER 1 - %ifnum %1 - %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32) - %if %1 > 0 - %assign regs_used (regs_used + 1) - %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 - %warning "Stack pointer will overwrite register argument" - %endif - %endif - %endif -%endmacro - -%macro DEFINE_ARGS_INTERNAL 3+ - %ifnum %2 - DEFINE_ARGS %3 - %elif %1 == 4 - DEFINE_ARGS %2 - %elif %1 > 4 - DEFINE_ARGS %2, %3 - %endif -%endmacro - -%if WIN64 ; Windows x64 ;================================================= - -DECLARE_REG 0, rcx -DECLARE_REG 1, rdx -DECLARE_REG 2, R8 -DECLARE_REG 3, R9 -DECLARE_REG 4, R10, 40 -DECLARE_REG 5, R11, 48 -DECLARE_REG 6, rax, 56 -DECLARE_REG 7, rdi, 64 -DECLARE_REG 8, rsi, 72 -DECLARE_REG 9, rbx, 80 -DECLARE_REG 10, rbp, 88 -DECLARE_REG 11, R12, 96 -DECLARE_REG 12, R13, 104 -DECLARE_REG 13, R14, 112 -DECLARE_REG 14, R15, 120 - -%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 15 - PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4, %3 - %if mmsize != 8 && stack_size == 0 - WIN64_SPILL_XMM %3 - %endif - LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%macro WIN64_PUSH_XMM 0 - ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. - %if xmm_regs_used > 6 - movaps [rstk + stack_offset + 8], xmm6 - %endif - %if xmm_regs_used > 7 - movaps [rstk + stack_offset + 24], xmm7 - %endif - %if xmm_regs_used > 8 - %assign %%i 8 - %rep xmm_regs_used-8 - movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i - %assign %%i %%i+1 - %endrep - %endif -%endmacro - -%macro WIN64_SPILL_XMM 1 - %assign xmm_regs_used %1 - ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 8 - %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32 - SUB rsp, stack_size_padded - %endif - WIN64_PUSH_XMM -%endmacro - -%macro WIN64_RESTORE_XMM_INTERNAL 1 - %assign %%pad_size 0 - %if xmm_regs_used > 8 - %assign %%i xmm_regs_used - %rep xmm_regs_used-8 - %assign %%i %%i-1 - movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] - %endrep - %endif - %if stack_size_padded > 0 - %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0) - mov rsp, rstkm - %else - add %1, stack_size_padded - %assign %%pad_size stack_size_padded - %endif - %endif - %if xmm_regs_used > 7 - movaps xmm7, [%1 + stack_offset - %%pad_size + 24] - %endif - %if xmm_regs_used > 6 - movaps xmm6, [%1 + stack_offset - %%pad_size + 8] - %endif -%endmacro - -%macro WIN64_RESTORE_XMM 1 - WIN64_RESTORE_XMM_INTERNAL %1 - %assign stack_offset (stack_offset-stack_size_padded) - %assign xmm_regs_used 0 -%endmacro - -%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 - -%macro RET 0 - WIN64_RESTORE_XMM_INTERNAL rsp - POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%elif ARCH_X86_64 ; *nix x64 ;============================================= - -DECLARE_REG 0, rdi -DECLARE_REG 1, rsi -DECLARE_REG 2, rdx -DECLARE_REG 3, rcx -DECLARE_REG 4, R8 -DECLARE_REG 5, R9 -DECLARE_REG 6, rax, 8 -DECLARE_REG 7, R10, 16 -DECLARE_REG 8, R11, 24 -DECLARE_REG 9, rbx, 32 -DECLARE_REG 10, rbp, 40 -DECLARE_REG 11, R12, 48 -DECLARE_REG 12, R13, 56 -DECLARE_REG 13, R14, 64 -DECLARE_REG 14, R15, 72 - -%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 15 - PUSH_IF_USED 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4 - LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 - -%macro RET 0 -%if stack_size_padded > 0 -%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 - mov rsp, rstkm -%else - add rsp, stack_size_padded -%endif -%endif - POP_IF_USED 14, 13, 12, 11, 10, 9 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%else ; X86_32 ;============================================================== - -DECLARE_REG 0, eax, 4 -DECLARE_REG 1, ecx, 8 -DECLARE_REG 2, edx, 12 -DECLARE_REG 3, ebx, 16 -DECLARE_REG 4, esi, 20 -DECLARE_REG 5, edi, 24 -DECLARE_REG 6, ebp, 28 -%define rsp esp - -%macro DECLARE_ARG 1-* - %rep %0 - %define r%1m [rstk + stack_offset + 4*%1 + 4] - %define r%1mp dword r%1m - %rotate 1 - %endrep -%endmacro - -DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 - -%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - %if num_args > 7 - %assign num_args 7 - %endif - %if regs_used > 7 - %assign regs_used 7 - %endif - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 7 - PUSH_IF_USED 3, 4, 5, 6 - ALLOC_STACK %4 - LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 - -%macro RET 0 -%if stack_size_padded > 0 -%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 - mov rsp, rstkm -%else - add rsp, stack_size_padded -%endif -%endif - POP_IF_USED 6, 5, 4, 3 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%endif ;====================================================================== - -%if WIN64 == 0 -%macro WIN64_SPILL_XMM 1 -%endmacro -%macro WIN64_RESTORE_XMM 1 -%endmacro -%macro WIN64_PUSH_XMM 0 -%endmacro -%endif - -; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either -; a branch or a branch target. So switch to a 2-byte form of ret in that case. -; We can automatically detect "follows a branch", but not a branch target. -; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) -%macro REP_RET 0 - %if has_epilogue - RET - %else - rep ret - %endif -%endmacro - -%define last_branch_adr $$ -%macro AUTO_REP_RET 0 - %ifndef cpuflags - times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. - %elif notcpuflag(ssse3) - times ((last_branch_adr-$)>>31)+1 rep - %endif - ret -%endmacro - -%macro BRANCH_INSTR 0-* - %rep %0 - %macro %1 1-2 %1 - %2 %1 - %%branch_instr: - %xdefine last_branch_adr %%branch_instr - %endmacro - %rotate 1 - %endrep -%endmacro - -BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp - -%macro TAIL_CALL 2 ; callee, is_nonadjacent - %if has_epilogue - call %1 - RET - %elif %2 - jmp %1 - %endif -%endmacro - -;============================================================================= -; arch-independent part -;============================================================================= - -%assign function_align 16 - -; Begin a function. -; Applies any symbol mangling needed for C linkage, and sets up a define such that -; subsequent uses of the function name automatically refer to the mangled version. -; Appends cpuflags to the function name if cpuflags has been specified. -; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX -; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). -%macro cglobal 1-2+ "" ; name, [PROLOGUE args] - cglobal_internal 1, %1 %+ SUFFIX, %2 -%endmacro -%macro cvisible 1-2+ "" ; name, [PROLOGUE args] - cglobal_internal 0, %1 %+ SUFFIX, %2 -%endmacro -%macro cglobal_internal 2-3+ - %if %1 - %xdefine %%FUNCTION_PREFIX private_prefix - %xdefine %%VISIBILITY hidden - %else - %xdefine %%FUNCTION_PREFIX public_prefix - %xdefine %%VISIBILITY - %endif - %ifndef cglobaled_%2 - %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) - %xdefine %2.skip_prologue %2 %+ .skip_prologue - CAT_XDEFINE cglobaled_, %2, 1 - %endif - %xdefine current_function %2 - %ifidn __OUTPUT_FORMAT__,elf - global %2:function %%VISIBILITY - %else - global %2 - %endif - align function_align - %2: - RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer - %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required - %assign stack_offset 0 ; stack pointer offset relative to the return address - %assign stack_size 0 ; amount of stack space that can be freely used inside a function - %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding - %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 - %ifnidn %3, "" - PROLOGUE %3 - %endif -%endmacro - -%macro cextern 1 - %xdefine %1 mangle(private_prefix %+ _ %+ %1) - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -; like cextern, but without the prefix -%macro cextern_naked 1 - %xdefine %1 mangle(%1) - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -%macro const 1-2+ - %xdefine %1 mangle(private_prefix %+ _ %+ %1) - %ifidn __OUTPUT_FORMAT__,elf - global %1:data hidden - %else - global %1 - %endif - %1: %2 -%endmacro - -; This is needed for ELF, otherwise the GNU linker assumes the stack is -; executable by default. -%ifidn __OUTPUT_FORMAT__,elf -SECTION .note.GNU-stack noalloc noexec nowrite progbits -%endif - -; cpuflags - -%assign cpuflags_mmx (1<<0) -%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx -%assign cpuflags_3dnow (1<<2) | cpuflags_mmx -%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow -%assign cpuflags_sse (1<<4) | cpuflags_mmx2 -%assign cpuflags_sse2 (1<<5) | cpuflags_sse -%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 -%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 -%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 -%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 -%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 -%assign cpuflags_avx (1<<11)| cpuflags_sse42 -%assign cpuflags_xop (1<<12)| cpuflags_avx -%assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_avx2 (1<<14)| cpuflags_avx -%assign cpuflags_fma3 (1<<15)| cpuflags_avx - -%assign cpuflags_cache32 (1<<16) -%assign cpuflags_cache64 (1<<17) -%assign cpuflags_slowctz (1<<18) -%assign cpuflags_lzcnt (1<<19) -%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<21) -%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 - -%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) -%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) - -; Takes up to 2 cpuflags from the above list. -; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. -; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. -%macro INIT_CPUFLAGS 0-2 - CPUNOP amdnop - %if %0 >= 1 - %xdefine cpuname %1 - %assign cpuflags cpuflags_%1 - %if %0 >= 2 - %xdefine cpuname %1_%2 - %assign cpuflags cpuflags | cpuflags_%2 - %endif - %xdefine SUFFIX _ %+ cpuname - %if cpuflag(avx) - %assign avx_enabled 1 - %endif - %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) - %define mova movaps - %define movu movups - %define movnta movntps - %endif - %if cpuflag(aligned) - %define movu mova - %elifidn %1, sse3 - %define movu lddqu - %endif - %if ARCH_X86_64 == 0 && notcpuflag(sse2) - CPUNOP basicnop - %endif - %else - %xdefine SUFFIX - %undef cpuname - %undef cpuflags - %endif -%endmacro - -; Merge mmx and sse* -; m# is a simd register of the currently selected size -; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# -; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# -; (All 3 remain in sync through SWAP.) - -%macro CAT_XDEFINE 3 - %xdefine %1%2 %3 -%endmacro - -%macro CAT_UNDEF 2 - %undef %1%2 -%endmacro - -%macro INIT_MMX 0-1+ - %assign avx_enabled 0 - %define RESET_MM_PERMUTATION INIT_MMX %1 - %define mmsize 8 - %define num_mmregs 8 - %define mova movq - %define movu movq - %define movh movd - %define movnta movntq - %assign %%i 0 - %rep 8 - CAT_XDEFINE m, %%i, mm %+ %%i - CAT_XDEFINE nmm, %%i, %%i - %assign %%i %%i+1 - %endrep - %rep 8 - CAT_UNDEF m, %%i - CAT_UNDEF nmm, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -%macro INIT_XMM 0-1+ - %assign avx_enabled 0 - %define RESET_MM_PERMUTATION INIT_XMM %1 - %define mmsize 16 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %define movh movq - %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, xmm %+ %%i - CAT_XDEFINE nxmm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -%macro INIT_YMM 0-1+ - %assign avx_enabled 1 - %define RESET_MM_PERMUTATION INIT_YMM %1 - %define mmsize 32 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %undef movh - %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, ymm %+ %%i - CAT_XDEFINE nymm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -INIT_XMM - -%macro DECLARE_MMCAST 1 - %define mmmm%1 mm%1 - %define mmxmm%1 mm%1 - %define mmymm%1 mm%1 - %define xmmmm%1 mm%1 - %define xmmxmm%1 xmm%1 - %define xmmymm%1 xmm%1 - %define ymmmm%1 mm%1 - %define ymmxmm%1 xmm%1 - %define ymmymm%1 ymm%1 - %define xm%1 xmm %+ m%1 - %define ym%1 ymm %+ m%1 -%endmacro - -%assign i 0 -%rep 16 - DECLARE_MMCAST i -%assign i i+1 -%endrep - -; I often want to use macros that permute their arguments. e.g. there's no -; efficient way to implement butterfly or transpose or dct without swapping some -; arguments. -; -; I would like to not have to manually keep track of the permutations: -; If I insert a permutation in the middle of a function, it should automatically -; change everything that follows. For more complex macros I may also have multiple -; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. -; -; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that -; permutes its arguments. It's equivalent to exchanging the contents of the -; registers, except that this way you exchange the register names instead, so it -; doesn't cost any cycles. - -%macro PERMUTE 2-* ; takes a list of pairs to swap -%rep %0/2 - %xdefine %%tmp%2 m%2 - %rotate 2 -%endrep -%rep %0/2 - %xdefine m%1 %%tmp%2 - CAT_XDEFINE n, m%1, %1 - %rotate 2 -%endrep -%endmacro - -%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) -%ifnum %1 ; SWAP 0, 1, ... - SWAP_INTERNAL_NUM %1, %2 -%else ; SWAP m0, m1, ... - SWAP_INTERNAL_NAME %1, %2 -%endif -%endmacro - -%macro SWAP_INTERNAL_NUM 2-* - %rep %0-1 - %xdefine %%tmp m%1 - %xdefine m%1 m%2 - %xdefine m%2 %%tmp - CAT_XDEFINE n, m%1, %1 - CAT_XDEFINE n, m%2, %2 - %rotate 1 - %endrep -%endmacro - -%macro SWAP_INTERNAL_NAME 2-* - %xdefine %%args n %+ %1 - %rep %0-1 - %xdefine %%args %%args, n %+ %2 - %rotate 1 - %endrep - SWAP_INTERNAL_NUM %%args -%endmacro - -; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later -; calls to that function will automatically load the permutation, so values can -; be returned in mmregs. -%macro SAVE_MM_PERMUTATION 0-1 - %if %0 - %xdefine %%f %1_m - %else - %xdefine %%f current_function %+ _m - %endif - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE %%f, %%i, m %+ %%i - %assign %%i %%i+1 - %endrep -%endmacro - -%macro LOAD_MM_PERMUTATION 1 ; name to load from - %ifdef %1_m0 - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, %1_m %+ %%i - CAT_XDEFINE n, m %+ %%i, %%i - %assign %%i %%i+1 - %endrep - %endif -%endmacro - -; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't -%macro call 1 - call_internal %1, %1 %+ SUFFIX -%endmacro -%macro call_internal 2 - %xdefine %%i %1 - %ifndef cglobaled_%1 - %ifdef cglobaled_%2 - %xdefine %%i %2 - %endif - %endif - call %%i - LOAD_MM_PERMUTATION %%i -%endmacro - -; Substitutions that reduce instruction size but are functionally equivalent -%macro add 2 - %ifnum %2 - %if %2==128 - sub %1, -128 - %else - add %1, %2 - %endif - %else - add %1, %2 - %endif -%endmacro - -%macro sub 2 - %ifnum %2 - %if %2==128 - add %1, -128 - %else - sub %1, %2 - %endif - %else - sub %1, %2 - %endif -%endmacro - -;============================================================================= -; AVX abstraction layer -;============================================================================= - -%assign i 0 -%rep 16 - %if i < 8 - CAT_XDEFINE sizeofmm, i, 8 - %endif - CAT_XDEFINE sizeofxmm, i, 16 - CAT_XDEFINE sizeofymm, i, 32 -%assign i i+1 -%endrep -%undef i - -%macro CHECK_AVX_INSTR_EMU 3-* - %xdefine %%opcode %1 - %xdefine %%dst %2 - %rep %0-2 - %ifidn %%dst, %3 - %error non-avx emulation of ``%%opcode'' is not supported - %endif - %rotate 1 - %endrep -%endmacro - -;%1 == instruction -;%2 == 1 if float, 0 if int -;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise -;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not -;%5+: operands -%macro RUN_AVX_INSTR 5-8+ - %ifnum sizeof%6 - %assign %%sizeofreg sizeof%6 - %elifnum sizeof%5 - %assign %%sizeofreg sizeof%5 - %else - %assign %%sizeofreg mmsize - %endif - %assign %%emulate_avx 0 - %if avx_enabled && %%sizeofreg >= 16 - %xdefine %%instr v%1 - %else - %xdefine %%instr %1 - %if %0 >= 7+%3 - %assign %%emulate_avx 1 - %endif - %endif - - %if %%emulate_avx - %xdefine %%src1 %6 - %xdefine %%src2 %7 - %ifnidn %5, %6 - %if %0 >= 8 - CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8 - %else - CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7 - %endif - %if %4 && %3 == 0 - %ifnid %7 - ; 3-operand AVX instructions with a memory arg can only have it in src2, - ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). - ; So, if the instruction is commutative with a memory arg, swap them. - %xdefine %%src1 %7 - %xdefine %%src2 %6 - %endif - %endif - %if %%sizeofreg == 8 - MOVQ %5, %%src1 - %elif %2 - MOVAPS %5, %%src1 - %else - MOVDQA %5, %%src1 - %endif - %endif - %if %0 >= 8 - %1 %5, %%src2, %8 - %else - %1 %5, %%src2 - %endif - %elif %0 >= 8 - %%instr %5, %6, %7, %8 - %elif %0 == 7 - %%instr %5, %6, %7 - %elif %0 == 6 - %%instr %5, %6 - %else - %%instr %5 - %endif -%endmacro - -;%1 == instruction -;%2 == 1 if float, 0 if int -;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise -;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not -%macro AVX_INSTR 1-4 0, 1, 0 - %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4 - %ifidn %2, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1 - %elifidn %3, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2 - %elifidn %4, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3 - %elifidn %5, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4 - %else - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5 - %endif - %endmacro -%endmacro - -; Instructions with both VEX and non-VEX encodings -; Non-destructive instructions are written without parameters -AVX_INSTR addpd, 1, 0, 1 -AVX_INSTR addps, 1, 0, 1 -AVX_INSTR addsd, 1, 0, 1 -AVX_INSTR addss, 1, 0, 1 -AVX_INSTR addsubpd, 1, 0, 0 -AVX_INSTR addsubps, 1, 0, 0 -AVX_INSTR aesdec, 0, 0, 0 -AVX_INSTR aesdeclast, 0, 0, 0 -AVX_INSTR aesenc, 0, 0, 0 -AVX_INSTR aesenclast, 0, 0, 0 -AVX_INSTR aesimc -AVX_INSTR aeskeygenassist -AVX_INSTR andnpd, 1, 0, 0 -AVX_INSTR andnps, 1, 0, 0 -AVX_INSTR andpd, 1, 0, 1 -AVX_INSTR andps, 1, 0, 1 -AVX_INSTR blendpd, 1, 0, 0 -AVX_INSTR blendps, 1, 0, 0 -AVX_INSTR blendvpd, 1, 0, 0 -AVX_INSTR blendvps, 1, 0, 0 -AVX_INSTR cmppd, 1, 1, 0 -AVX_INSTR cmpps, 1, 1, 0 -AVX_INSTR cmpsd, 1, 1, 0 -AVX_INSTR cmpss, 1, 1, 0 -AVX_INSTR comisd -AVX_INSTR comiss -AVX_INSTR cvtdq2pd -AVX_INSTR cvtdq2ps -AVX_INSTR cvtpd2dq -AVX_INSTR cvtpd2ps -AVX_INSTR cvtps2dq -AVX_INSTR cvtps2pd -AVX_INSTR cvtsd2si -AVX_INSTR cvtsd2ss -AVX_INSTR cvtsi2sd -AVX_INSTR cvtsi2ss -AVX_INSTR cvtss2sd -AVX_INSTR cvtss2si -AVX_INSTR cvttpd2dq -AVX_INSTR cvttps2dq -AVX_INSTR cvttsd2si -AVX_INSTR cvttss2si -AVX_INSTR divpd, 1, 0, 0 -AVX_INSTR divps, 1, 0, 0 -AVX_INSTR divsd, 1, 0, 0 -AVX_INSTR divss, 1, 0, 0 -AVX_INSTR dppd, 1, 1, 0 -AVX_INSTR dpps, 1, 1, 0 -AVX_INSTR extractps -AVX_INSTR haddpd, 1, 0, 0 -AVX_INSTR haddps, 1, 0, 0 -AVX_INSTR hsubpd, 1, 0, 0 -AVX_INSTR hsubps, 1, 0, 0 -AVX_INSTR insertps, 1, 1, 0 -AVX_INSTR lddqu -AVX_INSTR ldmxcsr -AVX_INSTR maskmovdqu -AVX_INSTR maxpd, 1, 0, 1 -AVX_INSTR maxps, 1, 0, 1 -AVX_INSTR maxsd, 1, 0, 1 -AVX_INSTR maxss, 1, 0, 1 -AVX_INSTR minpd, 1, 0, 1 -AVX_INSTR minps, 1, 0, 1 -AVX_INSTR minsd, 1, 0, 1 -AVX_INSTR minss, 1, 0, 1 -AVX_INSTR movapd -AVX_INSTR movaps -AVX_INSTR movd -AVX_INSTR movddup -AVX_INSTR movdqa -AVX_INSTR movdqu -AVX_INSTR movhlps, 1, 0, 0 -AVX_INSTR movhpd, 1, 0, 0 -AVX_INSTR movhps, 1, 0, 0 -AVX_INSTR movlhps, 1, 0, 0 -AVX_INSTR movlpd, 1, 0, 0 -AVX_INSTR movlps, 1, 0, 0 -AVX_INSTR movmskpd -AVX_INSTR movmskps -AVX_INSTR movntdq -AVX_INSTR movntdqa -AVX_INSTR movntpd -AVX_INSTR movntps -AVX_INSTR movq -AVX_INSTR movsd, 1, 0, 0 -AVX_INSTR movshdup -AVX_INSTR movsldup -AVX_INSTR movss, 1, 0, 0 -AVX_INSTR movupd -AVX_INSTR movups -AVX_INSTR mpsadbw, 0, 1, 0 -AVX_INSTR mulpd, 1, 0, 1 -AVX_INSTR mulps, 1, 0, 1 -AVX_INSTR mulsd, 1, 0, 1 -AVX_INSTR mulss, 1, 0, 1 -AVX_INSTR orpd, 1, 0, 1 -AVX_INSTR orps, 1, 0, 1 -AVX_INSTR pabsb -AVX_INSTR pabsd -AVX_INSTR pabsw -AVX_INSTR packsswb, 0, 0, 0 -AVX_INSTR packssdw, 0, 0, 0 -AVX_INSTR packuswb, 0, 0, 0 -AVX_INSTR packusdw, 0, 0, 0 -AVX_INSTR paddb, 0, 0, 1 -AVX_INSTR paddw, 0, 0, 1 -AVX_INSTR paddd, 0, 0, 1 -AVX_INSTR paddq, 0, 0, 1 -AVX_INSTR paddsb, 0, 0, 1 -AVX_INSTR paddsw, 0, 0, 1 -AVX_INSTR paddusb, 0, 0, 1 -AVX_INSTR paddusw, 0, 0, 1 -AVX_INSTR palignr, 0, 1, 0 -AVX_INSTR pand, 0, 0, 1 -AVX_INSTR pandn, 0, 0, 0 -AVX_INSTR pavgb, 0, 0, 1 -AVX_INSTR pavgw, 0, 0, 1 -AVX_INSTR pblendvb, 0, 0, 0 -AVX_INSTR pblendw, 0, 1, 0 -AVX_INSTR pclmulqdq, 0, 1, 0 -AVX_INSTR pcmpestri -AVX_INSTR pcmpestrm -AVX_INSTR pcmpistri -AVX_INSTR pcmpistrm -AVX_INSTR pcmpeqb, 0, 0, 1 -AVX_INSTR pcmpeqw, 0, 0, 1 -AVX_INSTR pcmpeqd, 0, 0, 1 -AVX_INSTR pcmpeqq, 0, 0, 1 -AVX_INSTR pcmpgtb, 0, 0, 0 -AVX_INSTR pcmpgtw, 0, 0, 0 -AVX_INSTR pcmpgtd, 0, 0, 0 -AVX_INSTR pcmpgtq, 0, 0, 0 -AVX_INSTR pextrb -AVX_INSTR pextrd -AVX_INSTR pextrq -AVX_INSTR pextrw -AVX_INSTR phaddw, 0, 0, 0 -AVX_INSTR phaddd, 0, 0, 0 -AVX_INSTR phaddsw, 0, 0, 0 -AVX_INSTR phminposuw -AVX_INSTR phsubw, 0, 0, 0 -AVX_INSTR phsubd, 0, 0, 0 -AVX_INSTR phsubsw, 0, 0, 0 -AVX_INSTR pinsrb, 0, 1, 0 -AVX_INSTR pinsrd, 0, 1, 0 -AVX_INSTR pinsrq, 0, 1, 0 -AVX_INSTR pinsrw, 0, 1, 0 -AVX_INSTR pmaddwd, 0, 0, 1 -AVX_INSTR pmaddubsw, 0, 0, 0 -AVX_INSTR pmaxsb, 0, 0, 1 -AVX_INSTR pmaxsw, 0, 0, 1 -AVX_INSTR pmaxsd, 0, 0, 1 -AVX_INSTR pmaxub, 0, 0, 1 -AVX_INSTR pmaxuw, 0, 0, 1 -AVX_INSTR pmaxud, 0, 0, 1 -AVX_INSTR pminsb, 0, 0, 1 -AVX_INSTR pminsw, 0, 0, 1 -AVX_INSTR pminsd, 0, 0, 1 -AVX_INSTR pminub, 0, 0, 1 -AVX_INSTR pminuw, 0, 0, 1 -AVX_INSTR pminud, 0, 0, 1 -AVX_INSTR pmovmskb -AVX_INSTR pmovsxbw -AVX_INSTR pmovsxbd -AVX_INSTR pmovsxbq -AVX_INSTR pmovsxwd -AVX_INSTR pmovsxwq -AVX_INSTR pmovsxdq -AVX_INSTR pmovzxbw -AVX_INSTR pmovzxbd -AVX_INSTR pmovzxbq -AVX_INSTR pmovzxwd -AVX_INSTR pmovzxwq -AVX_INSTR pmovzxdq -AVX_INSTR pmuldq, 0, 0, 1 -AVX_INSTR pmulhrsw, 0, 0, 1 -AVX_INSTR pmulhuw, 0, 0, 1 -AVX_INSTR pmulhw, 0, 0, 1 -AVX_INSTR pmullw, 0, 0, 1 -AVX_INSTR pmulld, 0, 0, 1 -AVX_INSTR pmuludq, 0, 0, 1 -AVX_INSTR por, 0, 0, 1 -AVX_INSTR psadbw, 0, 0, 1 -AVX_INSTR pshufb, 0, 0, 0 -AVX_INSTR pshufd -AVX_INSTR pshufhw -AVX_INSTR pshuflw -AVX_INSTR psignb, 0, 0, 0 -AVX_INSTR psignw, 0, 0, 0 -AVX_INSTR psignd, 0, 0, 0 -AVX_INSTR psllw, 0, 0, 0 -AVX_INSTR pslld, 0, 0, 0 -AVX_INSTR psllq, 0, 0, 0 -AVX_INSTR pslldq, 0, 0, 0 -AVX_INSTR psraw, 0, 0, 0 -AVX_INSTR psrad, 0, 0, 0 -AVX_INSTR psrlw, 0, 0, 0 -AVX_INSTR psrld, 0, 0, 0 -AVX_INSTR psrlq, 0, 0, 0 -AVX_INSTR psrldq, 0, 0, 0 -AVX_INSTR psubb, 0, 0, 0 -AVX_INSTR psubw, 0, 0, 0 -AVX_INSTR psubd, 0, 0, 0 -AVX_INSTR psubq, 0, 0, 0 -AVX_INSTR psubsb, 0, 0, 0 -AVX_INSTR psubsw, 0, 0, 0 -AVX_INSTR psubusb, 0, 0, 0 -AVX_INSTR psubusw, 0, 0, 0 -AVX_INSTR ptest -AVX_INSTR punpckhbw, 0, 0, 0 -AVX_INSTR punpckhwd, 0, 0, 0 -AVX_INSTR punpckhdq, 0, 0, 0 -AVX_INSTR punpckhqdq, 0, 0, 0 -AVX_INSTR punpcklbw, 0, 0, 0 -AVX_INSTR punpcklwd, 0, 0, 0 -AVX_INSTR punpckldq, 0, 0, 0 -AVX_INSTR punpcklqdq, 0, 0, 0 -AVX_INSTR pxor, 0, 0, 1 -AVX_INSTR rcpps, 1, 0, 0 -AVX_INSTR rcpss, 1, 0, 0 -AVX_INSTR roundpd -AVX_INSTR roundps -AVX_INSTR roundsd -AVX_INSTR roundss -AVX_INSTR rsqrtps, 1, 0, 0 -AVX_INSTR rsqrtss, 1, 0, 0 -AVX_INSTR shufpd, 1, 1, 0 -AVX_INSTR shufps, 1, 1, 0 -AVX_INSTR sqrtpd, 1, 0, 0 -AVX_INSTR sqrtps, 1, 0, 0 -AVX_INSTR sqrtsd, 1, 0, 0 -AVX_INSTR sqrtss, 1, 0, 0 -AVX_INSTR stmxcsr -AVX_INSTR subpd, 1, 0, 0 -AVX_INSTR subps, 1, 0, 0 -AVX_INSTR subsd, 1, 0, 0 -AVX_INSTR subss, 1, 0, 0 -AVX_INSTR ucomisd -AVX_INSTR ucomiss -AVX_INSTR unpckhpd, 1, 0, 0 -AVX_INSTR unpckhps, 1, 0, 0 -AVX_INSTR unpcklpd, 1, 0, 0 -AVX_INSTR unpcklps, 1, 0, 0 -AVX_INSTR xorpd, 1, 0, 1 -AVX_INSTR xorps, 1, 0, 1 - -; 3DNow instructions, for sharing code between AVX, SSE and 3DN -AVX_INSTR pfadd, 1, 0, 1 -AVX_INSTR pfsub, 1, 0, 0 -AVX_INSTR pfmul, 1, 0, 1 - -; base-4 constants for shuffles -%assign i 0 -%rep 256 - %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) - %if j < 10 - CAT_XDEFINE q000, j, i - %elif j < 100 - CAT_XDEFINE q00, j, i - %elif j < 1000 - CAT_XDEFINE q0, j, i - %else - CAT_XDEFINE q, j, i - %endif -%assign i i+1 -%endrep -%undef i -%undef j - -%macro FMA_INSTR 3 - %macro %1 4-7 %1, %2, %3 - %if cpuflag(xop) - v%5 %1, %2, %3, %4 - %else - %6 %1, %2, %3 - %7 %1, %4 - %endif - %endmacro -%endmacro - -FMA_INSTR pmacsdd, pmulld, paddd -FMA_INSTR pmacsww, pmullw, paddw -FMA_INSTR pmadcswd, pmaddwd, paddd - -; convert FMA4 to FMA3 if possible -%macro FMA4_INSTR 4 - %macro %1 4-8 %1, %2, %3, %4 - %if cpuflag(fma4) - v%5 %1, %2, %3, %4 - %elifidn %1, %2 - v%6 %1, %4, %3 ; %1 = %1 * %3 + %4 - %elifidn %1, %3 - v%7 %1, %2, %4 ; %1 = %2 * %1 + %4 - %elifidn %1, %4 - v%8 %1, %2, %3 ; %1 = %2 * %3 + %1 - %else - %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported - %endif - %endmacro -%endmacro - -FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd -FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps -FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd -FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss - -FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd -FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps -FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd -FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps - -FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd -FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps -FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd -FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss - -FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd -FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps -FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd -FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss - -FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd -FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps -FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd -FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss - -; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug -%if ARCH_X86_64 == 0 -%macro vpbroadcastq 2 -%if sizeof%1 == 16 - movddup %1, %2 -%else - vbroadcastsd %1, %2 -%endif -%endmacro -%endif - -%ifidn __OUTPUT_FORMAT__,elf -section .note.GNU-stack noalloc noexec nowrite progbits -%endif -%ifidn __OUTPUT_FORMAT__,elf32 -section .note.GNU-stack noalloc noexec nowrite progbits -%endif -%ifidn __OUTPUT_FORMAT__,elf64 -section .note.GNU-stack noalloc noexec nowrite progbits -%endif diff --git a/src/strategyselector.c b/src/strategyselector.c index a027a672..5826b509 100644 --- a/src/strategyselector.c +++ b/src/strategyselector.c @@ -257,8 +257,7 @@ int kvz_strategyselector_register(void * const opaque, const char * const type, } //Check what strategies are available when they are registered - if (strcmp(strategy_name, "avx") == 0) kvz_g_strategies_available.intel_flags.avx++; - if (strcmp(strategy_name, "x86_asm_avx") == 0) kvz_g_strategies_available.intel_flags.avx++; + if (strcmp(strategy_name, "avx") == 0) kvz_g_strategies_available.intel_flags.avx++; if (strcmp(strategy_name, "avx2") == 0) kvz_g_strategies_available.intel_flags.avx2++; if (strcmp(strategy_name, "mmx") == 0) kvz_g_strategies_available.intel_flags.mmx++; if (strcmp(strategy_name, "sse") == 0) kvz_g_strategies_available.intel_flags.sse++; @@ -329,8 +328,7 @@ static void* strategyselector_choose_for(const strategy_list_t * const strategie } //Check what strategy we are going to use - if (strcmp(strategies->strategies[max_priority_i].strategy_name, "avx") == 0) kvz_g_strategies_in_use.intel_flags.avx++; - if (strcmp(strategies->strategies[max_priority_i].strategy_name, "x86_asm_avx") == 0) kvz_g_strategies_in_use.intel_flags.avx++; + if (strcmp(strategies->strategies[max_priority_i].strategy_name, "avx") == 0) kvz_g_strategies_in_use.intel_flags.avx++; if (strcmp(strategies->strategies[max_priority_i].strategy_name, "avx2") == 0) kvz_g_strategies_in_use.intel_flags.avx2++; if (strcmp(strategies->strategies[max_priority_i].strategy_name, "mmx") == 0) kvz_g_strategies_in_use.intel_flags.mmx++; if (strcmp(strategies->strategies[max_priority_i].strategy_name, "sse") == 0) kvz_g_strategies_in_use.intel_flags.sse++;