uvg266/src/strategies/x86_asm/picture-x86-asm-sad.h
Ari Koivula 02cd17b427 Add faster AVX inter SAD for 32x32 and 64x64
Add implementations for these functions that process the image line by
line instead of calling the 16x16 function block by block (see the
plain-C sketch below).

On Haswell, the 32x32 version is around 30% faster and the 64x64
version around 15% faster.

Before:
PASS inter_sad: 28.744M x reg_sad(32x32):x86_asm_avx (1014 ticks, 1.014 sec)
PASS inter_sad: 7.882M x reg_sad(64x64):x86_asm_avx (1014 ticks, 1.014 sec)
After:
PASS inter_sad: 37.828M x reg_sad(32x32):x86_asm_avx (1014 ticks, 1.014 sec)
PASS inter_sad: 9.081M x reg_sad(64x64):x86_asm_avx (1014 ticks, 1.014 sec)
2016-09-01 21:36:39 +03:00
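The assembly bodies live in the matching .asm file; purely as a sketch of the line-by-line idea the commit message describes (plain C, not the shipped implementation; kvz_pixel is normally defined in global.h, and an 8-bit build is assumed here):

/* Plain-C illustration of line-by-line SAD: one pass per 32-pixel row
 * instead of dispatching the 16x16 kernel four times per 32x32 block.
 * Sketch only; assumes 8-bit kvz_pixel samples. */
#include <stdlib.h>

typedef unsigned char kvz_pixel; /* assumption: 8-bit pixel build */

unsigned sad_32x32_linewise(const kvz_pixel *data1,
                            const kvz_pixel *data2,
                            unsigned stride)
{
  unsigned sad = 0;
  for (int y = 0; y < 32; ++y) {
    /* Accumulate one full row, then step both pointers to the next row. */
    for (int x = 0; x < 32; ++x) {
      sad += abs(data1[x] - data2[x]);
    }
    data1 += stride;
    data2 += stride;
  }
  return sad;
}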


#ifndef _PICTURE_X86_ASM_SAD_H_
#define _PICTURE_X86_ASM_SAD_H_
/*****************************************************************************
* This file is part of the Kvazaar HEVC encoder.
*
* Copyright (C) 2013-2015 Tampere University of Technology and others (see
* COPYING file).
*
* Kvazaar is free software: you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the
* Free Software Foundation; either version 2.1 of the License, or (at your
* option) any later version.
*
* Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
* more details.
*
* You should have received a copy of the GNU Lesser General Public License
* with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
****************************************************************************/
/**
* \ingroup Optimization
* \file
* SAD optimizations for AVX, implemented in assembly.
*/
#include "global.h" // IWYU pragma: keep

// SAD of two blocks stored contiguously in memory (the no-stride
// variants treat the block as packed, i.e. stride equals block width).
unsigned kvz_sad_4x4_avx(const kvz_pixel *data1, const kvz_pixel *data2);
unsigned kvz_sad_8x8_avx(const kvz_pixel *data1, const kvz_pixel *data2);
unsigned kvz_sad_16x16_avx(const kvz_pixel *data1, const kvz_pixel *data2);

// SAD of two blocks whose rows lie `stride` pixels apart in memory.
unsigned kvz_sad_4x4_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_8x8_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_16x16_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_32x32_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_64x64_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
#endif // _PICTURE_X86_ASM_SAD_H_
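
For illustration only (not part of the header): a hypothetical caller might use the stride variants as below, assuming the x86 assembly build is enabled and that both blocks share the same row stride, which is what the single stride parameter suggests.

/* Hypothetical usage sketch (not from the Kvazaar sources): inter SAD
 * between a block of the current frame and the co-located block of a
 * reference frame, both read with the same row stride. */
#include "strategies/x86_asm/picture-x86-asm-sad.h"

unsigned inter_cost_32x32(const kvz_pixel *cur,
                          const kvz_pixel *ref,
                          unsigned stride)
{
  return kvz_sad_32x32_stride_avx(cur, ref, stride);
}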