uvg266/src/strategies/picture-sse41.c
Ari Koivula 42295d3cb9 Pass preprocessor defines for supported intrinsics in VS2010 explicitly.
- _M_IX86_FP defines whether VS should generate code using SSE or SSE2
  instructions. It isn't correct to use it to check whether optional runtime
  optimizations should be compiled in. It's also not defined at all in 64-bit
  mode.

- So let's just keep it simple and give a list of everything that is supported
  as release optimizations. It's not clear from the documentation if all of
  these are really supported. It just lists a bunch of intrinsics from those
  headers that are.
2014-04-30 17:41:15 +03:00

93 lines
3.2 KiB
C

/*****************************************************************************
* This file is part of Kvazaar HEVC encoder.
*
* Copyright (C) 2013-2014 Tampere University of Technology and others (see
* COPYING file).
*
* Kvazaar is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation.
*
* Kvazaar is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
****************************************************************************/
/*
 * \file
 * SSE4.1-accelerated picture comparison (SAD) strategy implementations.
 */
#include "../strategyselector.h"
#include "../picture.h"

#include <assert.h>
#include <immintrin.h>
#include <stdlib.h>
#ifdef __GNUC__
__attribute__ ((__target__ ("sse2,sse4.1")))
#endif
/**
 * \brief Sum of absolute differences (SAD) of two pixel blocks, SSE4.1 version.
 *
 * Processes each row in 16-pixel chunks with _mm_sad_epu8. A residual of
 * 1..15 pixels is handled by loading a full 16-byte vector and blending only
 * the valid even number of pixels from data2 into a copy of data1, so the
 * invalid lanes contribute zero to the SAD. A trailing odd pixel (if any) is
 * handled with scalar code.
 *
 * \param data1   First block (top-left pixel).
 * \param data2   Second block (top-left pixel).
 * \param width   Block width in pixels.
 * \param height  Block height in pixels.
 * \param stride1 Row stride of data1 in pixels.
 * \param stride2 Row stride of data2 in pixels.
 * \return Sum of absolute differences over the width x height block.
 */
static unsigned reg_sad_sse41(const pixel * const data1, const pixel * const data2,
const int width, const int height, const unsigned stride1, const unsigned stride2)
{
  int y, x;
  unsigned sad = 0;
  __m128i sse_inc = _mm_setzero_si128();
  long long int sse_inc_array[2];

  for (y = 0; y < height; ++y) {
    // Full 16-pixel chunks of the row.
    for (x = 0; x <= width - 16; x += 16) {
      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
      sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, b));
    }

    // Residual of 1..15 pixels. The guard avoids an out-of-bounds 16-byte
    // load when the width is an exact multiple of 16 (or zero); the original
    // code performed the load unconditionally even though the switch then
    // added nothing.
    if (x < width) {
      // NOTE(review): these loads still read up to 15 bytes past the last
      // valid pixel of the row; this is safe only if rows are padded or the
      // stride provides enough slack — verify against callers/allocation.
      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
      // Number of valid 2-pixel (16-bit) lanes in the residual; selects the
      // blend mask that takes exactly those lanes from b (rest stay a, so
      // their per-byte difference is zero).
      switch (((width - (width % 2)) - x) / 2) {
        case 0:
          break;
        case 1:
          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x01)));
          break;
        case 2:
          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x03)));
          break;
        case 3:
          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x07)));
          break;
        case 4:
          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x0f)));
          break;
        case 5:
          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x1f)));
          break;
        case 6:
          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x3f)));
          break;
        case 7:
          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x7f)));
          break;
        default:
          // Unreachable: a residual of < 16 pixels has at most 7 whole pairs.
          assert(0);
      }
      x = width - (width % 2);
    }

    // Trailing odd pixel (if any), handled with scalar code.
    for (; x < width; ++x) {
      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
    }
  }

  // _mm_sad_epu8 accumulates into two 64-bit lanes; fold them into the total.
  _mm_storeu_si128((__m128i*) sse_inc_array, sse_inc);
  sad += sse_inc_array[0] + sse_inc_array[1];

  return sad;
}
// Register the SSE4.1 reg_sad implementation with the strategy selector.
static int strategy_register_picture_sse41(void* opaque) {
  // Priority 20: preferred over lower-priority (e.g. plain C) implementations
  // when SSE4.1 is available at runtime.
  const unsigned priority = 20;
  return strategyselector_register(opaque, "reg_sad", "sse41", priority, &reg_sad_sse41);
}