diff --git a/src/Makefile b/src/Makefile
index b7a63518..b2353db0 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -5,7 +5,7 @@ ifeq (, $(ARCH))
 endif
 SYSTEM = $(shell uname -s)
 ASMFLAGS =
-DFLAGS = -O2 -g -Werror -DUSE_TILES=1
+DFLAGS = -O2 -g -Werror -DUSE_TILES=1 -march=native
 
 # ARCH related flags
 ifeq ($(ARCH), x86_64)
diff --git a/src/picture-sse2.c b/src/picture-sse2.c
new file mode 100644
index 00000000..f15c259c
--- /dev/null
+++ b/src/picture-sse2.c
@@ -0,0 +1,90 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/*
+ * \file
+ */
+
+#include "picture.h"
+#include <immintrin.h>
+
+#ifdef __SSE2__
+static unsigned reg_sad(const pixel * const data1, const pixel * const data2,
+                        const int width, const int height, const unsigned stride1, const unsigned stride2)
+{
+  int y, x;
+  unsigned sad = 0;
+  __m128i sse_inc = _mm_setzero_si128 ();
+  long long int sse_inc_array[2];
+
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x <= width-16; x+=16) {
+      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
+      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
+      sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a,b));
+    }
+
+#ifdef __SSE4_1__
+    {
+      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
+      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
+      switch (((width - (width%2)) - x)/2) {
+        case 0:
+          break;
+        case 1:
+          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x01)));
+          break;
+        case 2:
+          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x03)));
+          break;
+        case 3:
+          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x07)));
+          break;
+        case 4:
+          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x0f)));
+          break;
+        case 5:
+          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x1f)));
+          break;
+        case 6:
+          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x3f)));
+          break;
+        case 7:
+          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x7f)));
+          break;
+        default:
+          //Should not happen
+          assert(0);
+      }
+      x = (width - (width%2));
+    }
+#endif //__SSE4_1__
+
+    for (; x < width; ++x) {
+      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
+    }
+  }
+  _mm_storeu_si128((__m128i*) sse_inc_array, sse_inc);
+  sad += sse_inc_array[0] + sse_inc_array[1];
+
+  return sad;
+}
+#else
+#error picture-sse2.c requires __SSE2__
+#endif //__SSE2__
diff --git a/src/picture.c b/src/picture.c
index 87e6fc13..8b00cb61 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -768,6 +768,13 @@ static unsigned hor_sad(const pixel *pic_data, const pixel *ref_data,
   return sad;
 }
 
+
+#if defined(__SSE2__)
+#include "picture-sse2.c"
+#elif defined(__ALTIVEC__)
+#include "picture-altivec.c"
+#else
+//Generic implementations
 /**
  * \brief Calculate Sum of Absolute Differences (SAD)
  *
@@ -782,20 +789,21 @@ static unsigned hor_sad(const pixel *pic_data, const pixel *ref_data,
  *
  * \returns Sum of Absolute Differences
  */
-static unsigned reg_sad(const pixel *data1, const pixel *data2,
-                        int width, int height, unsigned stride1, unsigned stride2)
+static unsigned reg_sad(const pixel * const data1, const pixel * const data2,
+                        const int width, const int height, const unsigned stride1, const unsigned stride2)
 {
   int y, x;
   unsigned sad = 0;
 
   for (y = 0; y < height; ++y) {
     for (x = 0; x < width; ++x) {
-      sad += abs((int)data1[y * stride1 + x] - (int)data2[y * stride2 + x]);
+      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
     }
   }
 
   return sad;
 }
+#endif
 
 /**
  * \brief Handle special cases of comparing blocks that are not completely