mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-27 19:24:06 +00:00
42295d3cb9
- _M_IX86_FP defines whether VS should generate code using SSE or SSE2 instructions. It isn't correct to use it to check whether optional runtime optimizations should be compiled in. It's also not defined at all in 64-bit mode. - So let's just keep it simple and give a list of everything that is supported as release optimizations. It's not clear from the documentation whether all of these are really supported; it just lists a bunch of intrinsics from these that are.
93 lines
3.2 KiB
C
/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (C) 2013-2014 Tampere University of Technology and others (see
 * COPYING file).
 *
 * Kvazaar is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * Kvazaar is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/
/*
 * \file
 * \brief SSE4.1-accelerated sum of absolute differences (SAD) strategy.
 */
#include "../strategyselector.h"
#include "../picture.h"

#include <assert.h>
#include <immintrin.h>
#include <stdlib.h>
#ifdef __GNUC__
|
|
__attribute__ ((__target__ ("sse2,sse4.1")))
|
|
#endif
|
|
static unsigned reg_sad_sse41(const pixel * const data1, const pixel * const data2,
|
|
const int width, const int height, const unsigned stride1, const unsigned stride2)
|
|
{
|
|
int y, x;
|
|
unsigned sad = 0;
|
|
__m128i sse_inc = _mm_setzero_si128 ();
|
|
long long int sse_inc_array[2];
|
|
|
|
for (y = 0; y < height; ++y) {
|
|
for (x = 0; x <= width-16; x+=16) {
|
|
const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
|
|
const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
|
|
sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a,b));
|
|
}
|
|
|
|
{
|
|
const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
|
|
const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
|
|
switch (((width - (width%2)) - x)/2) {
|
|
case 0:
|
|
break;
|
|
case 1:
|
|
sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x01)));
|
|
break;
|
|
case 2:
|
|
sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x03)));
|
|
break;
|
|
case 3:
|
|
sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x07)));
|
|
break;
|
|
case 4:
|
|
sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x0f)));
|
|
break;
|
|
case 5:
|
|
sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x1f)));
|
|
break;
|
|
case 6:
|
|
sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x3f)));
|
|
break;
|
|
case 7:
|
|
sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x7f)));
|
|
break;
|
|
default:
|
|
//Should not happen
|
|
assert(0);
|
|
}
|
|
x = (width - (width%2));
|
|
}
|
|
|
|
for (; x < width; ++x) {
|
|
sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
|
|
}
|
|
}
|
|
_mm_storeu_si128((__m128i*) sse_inc_array, sse_inc);
|
|
sad += sse_inc_array[0] + sse_inc_array[1];
|
|
|
|
return sad;
|
|
}
static int strategy_register_picture_sse41(void* opaque) {
|
|
return strategyselector_register(opaque, "reg_sad", "sse41", 20, ®_sad_sse41);
|
|
}
|