Added AVX strategy

Ari Lemmetti 2014-06-16 18:14:11 +03:00
parent ab3845c9c7
commit bdef5384ef
4 changed files with 823 additions and 0 deletions


@@ -120,6 +120,9 @@
<ClCompile Include="..\..\src\strategies\strategies-picture.c">
<Filter>Source Files\strategies</Filter>
</ClCompile>
<ClCompile Include="..\..\src\strategies\picture\picture-avx.c">
<Filter>Source Files\strategies\picture</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\src\global.h">
@@ -236,10 +239,19 @@
<ClInclude Include="..\..\src\strategies\nal\nal-generic.c">
<Filter>Source Files\strategies\nal</Filter>
</ClInclude>
<ClInclude Include="..\..\src\x86\picture_x86.h">
<Filter>Header Files\x86</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<YASM Include="..\..\src\x86\cpu.asm">
<Filter>Source Files\x86</Filter>
</YASM>
<YASM Include="..\..\src\x86\picture_x86.asm">
<Filter>Source Files\x86</Filter>
</YASM>
<YASM Include="..\..\src\x86\x86inc.asm">
<Filter>Source Files\x86</Filter>
</YASM>
</ItemGroup>
</Project>

src/strategies/picture/picture-avx.c Normal file (152 lines added)

@@ -0,0 +1,152 @@
/*****************************************************************************
* This file is part of Kvazaar HEVC encoder.
*
* Copyright (C) 2013-2014 Tampere University of Technology and others (see
* COPYING file).
*
* Kvazaar is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation.
*
* Kvazaar is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
****************************************************************************/
/*
 * \file picture-avx.c
 * \brief AVX strategies for SAD and SATD calculation.
 */

#include "../strategyselector.h"
//#include "../picture.h"
#include "../x86/picture_x86.h"

#include <stdlib.h> // abs() used by kvz_sad_generic
#include <math.h>

#ifdef __GNUC__
__attribute__((__target__("avx")))
#endif
static unsigned kvz_sad_32x32(const pixel *data1, const pixel *data2)
{
  // A packed 32x32 block is 1024 consecutive bytes; each kvz_sad_16x16 call
  // covers 256 of them (8 rows of the 32-pixel-wide block).
  unsigned sad = 0;
  sad += kvz_sad_16x16(data1, data2);
  sad += kvz_sad_16x16(data1 + 8 * 32, data2 + 8 * 32);
  sad += kvz_sad_16x16(data1 + 16 * 32, data2 + 16 * 32);
  sad += kvz_sad_16x16(data1 + 24 * 32, data2 + 24 * 32);
  return sad;
}

static unsigned kvz_sad_32x32_stride(const pixel *data1, const pixel *data2, unsigned stride)
{
  // Split the 32x32 area into four 16x16 quadrants inside the strided frame.
  unsigned sad = 0;
  sad += kvz_sad_16x16_stride(data1, data2, stride);
  sad += kvz_sad_16x16_stride(data1 + 16, data2 + 16, stride);
  sad += kvz_sad_16x16_stride(data1 + 16 * stride, data2 + 16 * stride, stride);
  sad += kvz_sad_16x16_stride(data1 + 16 * stride + 16, data2 + 16 * stride + 16, stride);
  return sad;
}

static unsigned kvz_sad_64x64(const pixel *data1, const pixel *data2)
{
  // A packed 64x64 block is 4096 consecutive bytes; each kvz_sad_32x32 call
  // covers 1024 of them (16 rows of the 64-pixel-wide block).
  unsigned sad = 0;
  sad += kvz_sad_32x32(data1, data2);
  sad += kvz_sad_32x32(data1 + 16 * 64, data2 + 16 * 64);
  sad += kvz_sad_32x32(data1 + 32 * 64, data2 + 32 * 64);
  sad += kvz_sad_32x32(data1 + 48 * 64, data2 + 48 * 64);
  return sad;
}

static unsigned kvz_sad_64x64_stride(const pixel *data1, const pixel *data2, unsigned stride)
{
  // Split the 64x64 area into four 32x32 quadrants inside the strided frame.
  unsigned sad = 0;
  sad += kvz_sad_32x32_stride(data1, data2, stride);
  sad += kvz_sad_32x32_stride(data1 + 32, data2 + 32, stride);
  sad += kvz_sad_32x32_stride(data1 + 32 * stride, data2 + 32 * stride, stride);
  sad += kvz_sad_32x32_stride(data1 + 32 * stride + 32, data2 + 32 * stride + 32, stride);
  return sad;
}

static unsigned kvz_sad_generic(const pixel * const data1, const pixel * const data2,
                                const int width, const int height,
                                const unsigned stride1, const unsigned stride2)
{
  // Plain C fallback for block sizes that have no dedicated assembly routine.
  int y, x;
  unsigned sad = 0;

  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
    }
  }

  return sad;
}

static unsigned reg_sad_avx(const pixel * const data1, const pixel * const data2,
                            const int width, const int height,
                            const unsigned stride1, const unsigned stride2)
{
  // Note: the fixed-size fast paths pass stride1 for both buffers.
  if (width == 4 && height == 4) {
    return kvz_sad_4x4_stride(data1, data2, stride1);
  } else if (width == 8 && height == 8) {
    return kvz_sad_8x8_stride(data1, data2, stride1);
  } else if (width == 16 && height == 16) {
    return kvz_sad_16x16_stride(data1, data2, stride1);
  } else if (width == 32 && height == 32) {
    return kvz_sad_32x32_stride(data1, data2, stride1);
  } else if (width == 64 && height == 64) {
    return kvz_sad_64x64_stride(data1, data2, stride1);
  } else {
    return kvz_sad_generic(data1, data2, width, height, stride1, stride2);
  }
}

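/*
 * Usage sketch (hedged, with made-up buffer names; in practice reg_sad_avx is
 * reached through the registered "reg_sad" strategy pointer rather than being
 * called directly):
 *
 *   pixel cur[64 * 64], ref[64 * 64];
 *   // ... fill both 64-pixel-wide frames ...
 *   unsigned cost = reg_sad_avx(cur, ref, 32, 32, 64, 64);
 *   // A 32x32 block with stride 64 takes the kvz_sad_32x32_stride fast path.
 */
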
// Function macro for defining hadamard calculating functions
// for fixed size blocks. They calculate hadamard for integer
// multiples of 8x8 with the 8x8 hadamard function.
#define KVZ_SATD_NXN(n, pixel_type, suffix) \
static unsigned kvz_satd_ ## suffix ## _ ## n ## x ## n ## _stride( \
    const pixel_type * const block1, const pixel_type * const block2) \
{ \
  unsigned x, y; \
  unsigned sum = 0; \
  for (y = 0; y < (n); y += 8) { \
    unsigned row = y * (n); \
    for (x = 0; x < (n); x += 8) { \
      sum += kvz_satd_8x8_stride(&block1[row + x], (n), &block2[row + x], (n)); \
    } \
  } \
  return sum; \
}

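/*
 * For reference, KVZ_SATD_NXN(16, pixel, 8bit) below effectively expands to:
 *
 *   static unsigned kvz_satd_8bit_16x16_stride(
 *       const pixel * const block1, const pixel * const block2)
 *   {
 *     unsigned x, y;
 *     unsigned sum = 0;
 *     for (y = 0; y < 16; y += 8) {
 *       unsigned row = y * 16;
 *       for (x = 0; x < 16; x += 8) {
 *         sum += kvz_satd_8x8_stride(&block1[row + x], 16, &block2[row + x], 16);
 *       }
 *     }
 *     return sum;
 *   }
 */
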
// Declare these functions to make sure the signature of the macro matches.
static cost_pixel_nxn_func kvz_satd_8bit_16x16_stride;
static cost_pixel_nxn_func kvz_satd_8bit_32x32_stride;
static cost_pixel_nxn_func kvz_satd_8bit_64x64_stride;

// These macros define kvz_satd_8bit_NxN_stride for N = 16, 32, 64.
KVZ_SATD_NXN(16, pixel, 8bit)
KVZ_SATD_NXN(32, pixel, 8bit)
KVZ_SATD_NXN(64, pixel, 8bit)

static int strategy_register_picture_avx(void* opaque) {
  bool success = true;

  success &= strategyselector_register(opaque, "reg_sad", "avx", 30, &reg_sad_avx);

  success &= strategyselector_register(opaque, "sad_8bit_4x4", "avx", 30, &kvz_sad_4x4);
  success &= strategyselector_register(opaque, "sad_8bit_8x8", "avx", 30, &kvz_sad_8x8);
  success &= strategyselector_register(opaque, "sad_8bit_16x16", "avx", 30, &kvz_sad_16x16);
  success &= strategyselector_register(opaque, "sad_8bit_32x32", "avx", 30, &kvz_sad_32x32);
  success &= strategyselector_register(opaque, "sad_8bit_64x64", "avx", 30, &kvz_sad_64x64);

  success &= strategyselector_register(opaque, "satd_8bit_4x4", "avx", 30, &kvz_satd_4x4);
  success &= strategyselector_register(opaque, "satd_8bit_8x8", "avx", 30, &kvz_satd_8x8_stride);
  success &= strategyselector_register(opaque, "satd_8bit_16x16", "avx", 30, &kvz_satd_8bit_16x16_stride);
  success &= strategyselector_register(opaque, "satd_8bit_32x32", "avx", 30, &kvz_satd_8bit_32x32_stride);
  success &= strategyselector_register(opaque, "satd_8bit_64x64", "avx", 30, &kvz_satd_8bit_64x64_stride);

  return success;
}

src/x86/picture_x86.asm Normal file (620 lines added)

@@ -0,0 +1,620 @@
;/*****************************************************************************
;* This file is part of Kvazaar HEVC encoder.
;*
;* Copyright (C) 2013-2014 Tampere University of Technology and others (see
;* COPYING file).
;*
;* Kvazaar is free software: you can redistribute it and/or modify
;* it under the terms of the GNU General Public License version 2 as published
;* by the Free Software Foundation.
;*
;* Kvazaar is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
;****************************************************************************/
%include "x86inc.asm"
SECTION .text
;KVZ_SAD_4X4
;Calculates SAD of the 16 consecutive bytes in memory
;r0 address of the first value (current)
;r1 address of the first value (reference)
cglobal sad_4x4, 2, 2, 2
vmovdqu m0, [r0]
vmovdqu m1, [r1]
vpsadbw m0, m1
vmovhlps m1, m0
vpaddw m0, m1
vmovd eax, m0
RET
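; For reference, a scalar C equivalent of the routine above (hedged sketch;
; vpsadbw produces two partial sums, one per 8-byte half, which the
; vmovhlps/vpaddw pair folds together before vmovd extracts the total):
;   unsigned sad_4x4_ref(const uint8_t *cur, const uint8_t *ref) {
;     unsigned sad = 0;
;     for (int i = 0; i < 16; ++i) sad += abs(cur[i] - ref[i]);
;     return sad;
;   }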
;KVZ_SAD_4X4_STRIDE
;Calculates SAD of a 4x4 block inside a frame with stride
;r0 address of the first value(current)
;r1 address of the first value(reference)
;r2 stride
cglobal sad_4x4_stride, 3, 3, 2
vpinsrd m0, [r0], 0
add r0, r2
vpinsrd m0, [r0], 1
vpinsrd m0, [r0+r2], 2
vpinsrd m0, [r0+r2*2], 3
vpinsrd m1, [r1], 0
add r1, r2
vpinsrd m1, [r1], 1
vpinsrd m1, [r1+r2], 2
vpinsrd m1, [r1+r2*2], 3
vpsadbw m0, m1
vmovhlps m1, m0
vpaddw m0, m1
vmovd eax, m0
RET
;KVZ_SAD_8X8
;Calculates SAD of the 64 consecutive bytes in memory
;r0 address of the first value (current)
;r1 address of the first value (reference)
cglobal sad_8x8, 2, 2, 5
vpxor m0, m0
%rep 2
vmovdqu m1, [r0]
vmovdqu m3, [r0+16]
add r0, 32
vmovdqu m2, [r1]
vmovdqu m4, [r1+16]
add r1, 32
vpsadbw m1, m2
vpsadbw m3, m4
vpaddw m0, m1
vpaddw m0, m3
%endrep
vmovhlps m1, m0
vpaddw m0, m1
vmovd eax, m0
RET
;KVZ_SAD_8X8_STRIDE
;Calculates SAD of an 8x8 block inside a frame with stride
;r0 address of the first value (current)
;r1 address of the first value (reference)
;r2 stride
cglobal sad_8x8_stride, 3, 3, 5
vpxor m0, m0
vmovhpd m1, [r0]
add r0, r2
vmovlpd m1, [r0]
vmovhpd m3, [r0+r2]
vmovlpd m3, [r0+r2*2]
lea r0, [r0+r2*2]
add r0, r2
vmovhpd m2, [r1]
add r1, r2
vmovlpd m2, [r1]
vmovhpd m4, [r1+r2]
vmovlpd m4, [r1+r2*2]
lea r1, [r1+r2*2]
add r1, r2
vpsadbw m1, m2
vpsadbw m3, m4
vpaddw m0, m1
vpaddw m0, m3
vmovhpd m1, [r0]
add r0, r2
vmovlpd m1, [r0]
vmovhpd m3, [r0+r2]
vmovlpd m3, [r0+r2*2]
lea r0, [r0+r2*2]
add r0, r2
vmovhpd m2, [r1]
add r1, r2
vmovlpd m2, [r1]
vmovhpd m4, [r1+r2]
vmovlpd m4, [r1+r2*2]
lea r1, [r1+r2*2]
add r1, r2
vpsadbw m1, m2
vpsadbw m3, m4
vpaddw m0, m1
vpaddw m0, m3
vmovhlps m1, m0
vpaddw m0, m1
vmovd eax, m0
RET
;KVZ_SAD_16X16
;Calculates SAD of the 256 consecutive bytes in memory
;r0 address of the first value (current)
;r1 address of the first value (reference)
cglobal sad_16x16, 2, 2, 5
vpxor m4, m4
%rep 8
; Load 2 rows from rec_buf to m0 and m2
vmovdqu m0, [r0]
vmovdqu m2, [r0 + 16]
add r0, 32
; Load 2 rows from ref_buf to m1 and m3
vmovdqu m1, [r1]
vmovdqu m3, [r1 + 16]
add r1, 32
vpsadbw m0, m1
vpsadbw m2, m3
vpaddw m4, m0
vpaddw m4, m2
%endrep
vmovhlps m0, m4
vpaddw m4, m0
vmovd eax, m4
RET
;KVZ_SAD_16X16_STRIDE
;Calculates SAD of a 16x16 block inside a frame with stride
;r0 address of the first value(current)
;r1 address of the first value(reference)
;r2 stride
cglobal sad_16x16_stride, 3, 3, 5
vpxor m4, m4
%rep 8
; Load 2 rows from rec_buf to m0 and m2
vmovdqu m0, [r0]
vmovdqu m2, [r0 + r2]
lea r0, [r0 + r2*2]
; Load 2 rows from ref_buf to m1 and m3
vmovdqu m1, [r1]
vmovdqu m3, [r1 + r2]
lea r1, [r1 + r2*2]
vpsadbw m0, m1
vpsadbw m2, m3
vpaddw m4, m0
vpaddw m4, m2
%endrep
vmovhlps m0, m4
vpaddw m4, m0
vmovd eax, m4
RET
;KVZ_SATD_4X4
;Calculates SATD of the 16 consecutive bytes in memory
;r0 address of the first value (current)
;r1 address of the first value (reference)
cglobal satd_4x4, 2, 2, 6
vpmovzxbw m0, [r0]
vpmovzxbw m2, [r1]
vpsubw m0, m2
vpmovzxbw m1, [r0+8]
vpmovzxbw m3, [r1+8]
vpsubw m1, m3
;Horizontal phase
;rows 1-2
vphaddw m4, m0, m1
vphsubw m5, m0, m1
vphaddw m0, m4, m5
vphsubw m1, m4, m5
;Vertical phase
vphaddw m4, m0, m1
vphsubw m5, m0, m1
vphaddw m0, m4, m5
vphsubw m1, m4, m5
vpabsw m0, m0
vpabsw m1, m1
vpaddw m0, m1
vphaddw m0, m0
vphaddw m0, m0
vphaddw m0, m0
vpextrw eax, m0, 0
;Uncomment if transformed values not divided elsewhere
;add eax, 1
;shr eax, 1
RET
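; For reference, a scalar C sketch of the SATD computed above (hedged; each
; vphaddw/vphsubw pair is one butterfly stage (a+b, a-b), two stages per
; direction give the 4x4 Hadamard transform of the difference block, and the
; coefficient order differs from the SIMD code, which does not change the
; absolute sum):
;   unsigned satd_4x4_ref(const uint8_t *a, const uint8_t *b) {
;     int d[16], t[16];
;     unsigned sum = 0;
;     for (int i = 0; i < 16; ++i) d[i] = a[i] - b[i];
;     for (int i = 0; i < 4; ++i) {   /* horizontal Hadamard per row */
;       int s0 = d[4*i+0] + d[4*i+1], s1 = d[4*i+0] - d[4*i+1];
;       int s2 = d[4*i+2] + d[4*i+3], s3 = d[4*i+2] - d[4*i+3];
;       t[4*i+0] = s0 + s2; t[4*i+1] = s1 + s3;
;       t[4*i+2] = s0 - s2; t[4*i+3] = s1 - s3;
;     }
;     for (int i = 0; i < 4; ++i) {   /* vertical Hadamard per column, then sum */
;       int s0 = t[i] + t[4+i],    s1 = t[i] - t[4+i];
;       int s2 = t[8+i] + t[12+i], s3 = t[8+i] - t[12+i];
;       sum += abs(s0 + s2) + abs(s1 + s3) + abs(s0 - s2) + abs(s1 - s3);
;     }
;     return sum;   /* like the assembly, no final division by 2 here */
;   }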
;KVZ_SATD_8X8_STRIDE
;Calculates SATD of an 8x8 block inside a frame with stride
;r0 address of the first value (block 1)
;r1 stride of block 1
;r2 address of the first value (block 2)
;r3 stride of block 2
%if ARCH_X86_64
cglobal satd_8x8_stride, 4, 4, 16
%else
cglobal satd_8x8_stride, 4, 4, 8
%endif
vpmovzxbw m0, [r0]
vpmovzxbw m7, [r2]
vpsubw m0, m7
vpmovzxbw m1, [r0+r1]
lea r0, [r0+r1*2]
vpmovzxbw m7, [r2+r3]
lea r2, [r2+r3*2]
vpsubw m1, m7
vpmovzxbw m2, [r0]
vpmovzxbw m7, [r2]
vpsubw m2, m7
vpmovzxbw m3, [r0+r1]
lea r0, [r0+r1*2]
vpmovzxbw m7, [r2+r3]
lea r2, [r2+r3*2]
vpsubw m3, m7
vpmovzxbw m4, [r0]
vpmovzxbw m7, [r2]
vpsubw m4, m7
vpmovzxbw m5, [r0+r1]
lea r0, [r0+r1*2]
vpmovzxbw m7, [r2+r3]
lea r2, [r2+r3*2]
vpsubw m5, m7
vpmovzxbw m6, [r0]
vpmovzxbw m7, [r2]
vpsubw m6, m7
%if ARCH_X86_64
vpmovzxbw m7, [r0+r1]
vpmovzxbw m8, [r2+r3]
vpsubw m7, m8
%else
vpmovzxbw m7, [r2+r3]
movdqu [esp-16], m7
vpmovzxbw m7, [r0+r1]
vpsubw m7, [esp-16]
movdqu [esp-16], m4
movdqu [esp-16*2], m5
movdqu [esp-16*3], m6
movdqu [esp-16*4], m7
lea esp, [esp-16*4]
%endif
;Horizontal phase
%if ARCH_X86_64
vphaddw m8, m0, m1
vphsubw m9, m0, m1
vphaddw m10, m2, m3
vphsubw m11, m2, m3
vphaddw m12, m4, m5
vphsubw m13, m4, m5
vphaddw m14, m6, m7
vphsubw m15, m6, m7
vphaddw m0, m8, m9
vphsubw m1, m8, m9
vphaddw m2, m10, m11
vphsubw m3, m10, m11
vphaddw m4, m12, m13
vphsubw m5, m12, m13
vphaddw m6, m14, m15
vphsubw m7, m14, m15
vphaddw m8, m0, m1
vphsubw m9, m0, m1
vphaddw m10, m2, m3
vphsubw m11, m2, m3
vphaddw m12, m4, m5
vphsubw m13, m4, m5
vphaddw m14, m6, m7
vphsubw m15, m6, m7
%else
vphaddw m4, m0, m1
vphsubw m5, m0, m1
vphaddw m6, m2, m3
vphsubw m7, m2, m3
vphaddw m0, m4, m5
vphsubw m1, m4, m5
vphaddw m2, m6, m7
vphsubw m3, m6, m7
vphaddw m4, m0, m1
vphsubw m5, m0, m1
vphaddw m6, m2, m3
vphsubw m7, m2, m3
movdqu m3, [esp]
movdqu m2, [esp+16]
movdqu m1, [esp+16*2]
movdqu m0, [esp+16*3]
movdqu [esp], m7
movdqu [esp+16*1], m6
movdqu [esp+16*2], m5
movdqu [esp+16*3], m4
vphaddw m4, m0, m1
vphsubw m5, m0, m1
vphaddw m6, m2, m3
vphsubw m7, m2, m3
vphaddw m0, m4, m5
vphsubw m1, m4, m5
vphaddw m2, m6, m7
vphsubw m3, m6, m7
vphaddw m4, m0, m1
vphsubw m5, m0, m1
vphaddw m6, m2, m3
vphsubw m7, m2, m3
%endif
;Vertical phase
%if ARCH_X86_64
vphaddw m0, m8, m9
vphsubw m1, m8, m9
vphaddw m2, m10, m11
vphsubw m3, m10, m11
vphaddw m4, m12, m13
vphsubw m5, m12, m13
vphaddw m6, m14, m15
vphsubw m7, m14, m15
vpmovzxwd m0, m0
vpmovzxwd m1, m1
vpmovzxwd m2, m2
vpmovzxwd m3, m3
vpmovzxwd m4, m4
vpmovzxwd m5, m5
vpmovzxwd m6, m6
vpmovzxwd m7, m7
vpaddd m8, m0, m2
vpaddd m9, m1, m3
vpsubd m10, m0, m2
vpsubd m11, m1, m3
vpaddd m12, m4, m6
vpaddd m13, m5, m7
vpsubd m14, m4, m6
vpsubd m15, m5, m7
vpaddd m0, m8, m12
vpaddd m1, m9, m13
vpaddd m2, m10, m14
vpaddd m3, m11, m15
vpsubd m4, m8, m12
vpsubd m5, m9, m13
vpsubd m6, m10, m14
vpsubd m7, m11, m15
%else
vphaddw m0, m4, m5
vphsubw m1, m4, m5
vphaddw m2, m6, m7
vphsubw m3, m6, m7
vpmovzxwd m0, m0
vpmovzxwd m1, m1
vpmovzxwd m2, m2
vpmovzxwd m3, m3
vpaddd m4, m0, m2
vpaddd m5, m1, m3
vpsubd m6, m0, m2
vpsubd m7, m1, m3
movdqu m3, [esp]
movdqu m2, [esp+16]
movdqu m1, [esp+16*2]
movdqu m0, [esp+16*3]
movdqu [esp], m7
movdqu [esp+16*1], m6
movdqu [esp+16*2], m5
movdqu [esp+16*3], m4
vphaddw m0, m4, m5
vphsubw m1, m4, m5
vphaddw m2, m6, m7
vphsubw m3, m6, m7
vpmovzxwd m0, m0
vpmovzxwd m1, m1
vpmovzxwd m2, m2
vpmovzxwd m3, m3
vpaddd m4, m0, m2
vpaddd m5, m1, m3
vpsubd m6, m0, m2
vpsubd m7, m1, m3
vpaddd m4, m2, [esp-16]
vpaddd m5, m3, [esp]
vpsubd m6, m2, [esp-16]
vpsubd m7, m3, [esp]
vpabsd m4, m4
vpabsd m5, m5
vpabsd m6, m6
vpabsd m7, m7
vpaddd m2, m4, m5
vpaddd m2, m6
vpaddd m2, m7
vpaddd m4, m0, [esp-16*3]
vpaddd m5, m1, [esp-16*2]
vpsubd m6, m0, [esp-16*3]
vpsubd m7, m1, [esp-16*2]
vpabsd m4, m4
vpabsd m5, m5
vpabsd m6, m6
vpabsd m7, m7
vpaddd m0, m4, m5
vpaddd m0, m6
vpaddd m0, m7
vpaddd m0, m2
%endif
%if ARCH_X86_64
vpabsd m0, m0
vpabsd m1, m1
vpabsd m2, m2
vpabsd m3, m3
vpabsd m4, m4
vpabsd m5, m5
vpabsd m6, m6
vpabsd m7, m7
vpaddd m0, m1
vpaddd m0, m2
vpaddd m0, m3
vpaddd m0, m4
vpaddd m0, m5
vpaddd m0, m6
vpaddd m0, m7
%endif
vphaddd m0, m0
vphaddd m0, m0
vpextrd eax, m0, 1
vpinsrd m1, eax, 0
vpaddd m0, m1
vpextrd eax, m0, 1
vpinsrd m1, eax, 0
vpaddd m0, m1
%if ARCH_X86_64 == 0
lea esp, [esp+16*4]
%endif
vmovd eax, m0
;Uncomment if transformed values not divided elsewhere
;add eax, 2
;shr eax, 2
RET

src/x86/picture_x86.h Normal file (39 lines added)

@@ -0,0 +1,39 @@
#ifndef _PICTURE_X86_H_
#define _PICTURE_X86_H_
/*****************************************************************************
* This file is part of Kvazaar HEVC encoder.
*
* Copyright (C) 2013-2014 Tampere University of Technology and others (see
* COPYING file).
*
* Kvazaar is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation.
*
* Kvazaar is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
****************************************************************************/
/*! \file picture_x86.h
    \brief Assembly function declarations for SAD and SATD
*/

#include <stdint.h>     // int32_t in the satd_8x8_stride prototype
#include "../global.h"  // the pixel type used in these prototypes

unsigned kvz_sad_4x4(const pixel*, const pixel*);
unsigned kvz_sad_8x8(const pixel*, const pixel*);
unsigned kvz_sad_16x16(const pixel*, const pixel*);
unsigned kvz_sad_4x4_stride(const pixel *data1, const pixel *data2, unsigned stride);
unsigned kvz_sad_8x8_stride(const pixel *data1, const pixel *data2, unsigned stride);
unsigned kvz_sad_16x16_stride(const pixel *data1, const pixel *data2, unsigned stride);
unsigned kvz_satd_4x4(const pixel *org, const pixel *cur);
unsigned kvz_satd_8x8(const pixel *org, const pixel *cur);
unsigned kvz_satd_8x8_stride(const pixel *org, int32_t org_stride, const pixel *cur, int32_t cur_stride);
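
/* Usage sketch (hedged, hypothetical frame buffers and coordinates): SATD of
 * co-located 8x8 blocks at (x, y) in two frames that both have width `stride`:
 *
 *   unsigned cost = kvz_satd_8x8_stride(org + y * stride + x, stride,
 *                                       cur + y * stride + x, stride);
 */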
#endif