mirror of https://github.com/ultravideo/uvg266.git
Create strategy for ver_sad
Easy to vectorize
This commit is contained in:
parent
ca94ae9529
commit
f781dc31f0
src/image.c | 38
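The first hunk below removes the scalar ver_sad from src/image.c; it reappears as ver_sad_generic in the generic strategy further down. ver_sad sums absolute differences between every row of a block and one and the same reference row — the case where the motion search extends past the top or bottom frame border and the border row is effectively repeated. Since no row depends on any other, the loop is trivially data-parallel, which is what the commit message alludes to. A standalone sketch of the semantics (plain C, with the kvazaar pixel type replaced by uint8_t):

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

/* Reference semantics of ver_sad: every row of the block is compared
 * against the single reference row pointed to by ref_row. */
static unsigned ver_sad_model(const uint8_t *pic, const uint8_t *ref_row,
                              int width, int height, unsigned stride)
{
  unsigned sad = 0;
  for (int y = 0; y < height; ++y)
    for (int x = 0; x < width; ++x)
      sad += abs(pic[y * stride + x] - ref_row[x]);
  return sad;
}

int main(void)
{
  const uint8_t block[2][4] = { { 10, 20, 30, 40 },
                                { 12, 18, 33, 37 } };
  const uint8_t edge_row[4] = { 11, 20, 31, 40 };
  /* Row 0 contributes 1+0+1+0 = 2, row 1 contributes 1+2+2+3 = 8. */
  printf("%u\n", ver_sad_model(&block[0][0], edge_row, 4, 2, 4));  /* 10 */
  return 0;
}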
@@ -260,32 +260,6 @@ static unsigned cor_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
   return sad;
 }
 
-/**
- * \brief Vertically interpolate SAD outside the frame.
- *
- * \param data1   Starting point of the first picture.
- * \param data2   Starting point of the second picture.
- * \param width   Width of the region for which SAD is calculated.
- * \param height  Height of the region for which SAD is calculated.
- * \param width   Width of the pixel array.
- *
- * \returns Sum of Absolute Differences
- */
-static unsigned ver_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
-                        int block_width, int block_height, unsigned pic_stride)
-{
-  int x, y;
-  unsigned sad = 0;
-
-  for (y = 0; y < block_height; ++y) {
-    for (x = 0; x < block_width; ++x) {
-      sad += abs(pic_data[y * pic_stride + x] - ref_data[x]);
-    }
-  }
-
-  return sad;
-}
-
 /**
  * \brief Horizontally interpolate SAD outside the frame.
  *
@ -370,7 +344,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
|
|||
result += cor_sad(pic_data,
|
||||
&ref_data[top * ref->stride + left],
|
||||
left, top, pic->stride);
|
||||
result += ver_sad(&pic_data[left],
|
||||
result += kvz_ver_sad(&pic_data[left],
|
||||
&ref_data[top * ref->stride + left],
|
||||
block_width - left, top, pic->stride);
|
||||
result += hor_sad(&pic_data[top * pic->stride],
|
||||
|
@@ -380,7 +354,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
                       &ref_data[top * ref->stride + left],
                       block_width - left, block_height - top, pic->stride, ref->stride);
   } else if (top && right) {
-    result += ver_sad(pic_data,
+    result += kvz_ver_sad(pic_data,
                       &ref_data[top * ref->stride],
                       block_width - right, top, pic->stride);
     result += cor_sad(&pic_data[block_width - right],
@@ -402,7 +376,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
     result += cor_sad(&pic_data[(block_height - bottom) * pic->stride],
                       &ref_data[(block_height - bottom - 1) * ref->stride + left],
                       left, bottom, pic->stride);
-    result += ver_sad(&pic_data[(block_height - bottom) * pic->stride + left],
+    result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride + left],
                       &ref_data[(block_height - bottom - 1) * ref->stride + left],
                       block_width - left, bottom, pic->stride);
   } else if (bottom && right) {
@@ -412,14 +386,14 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
     result += hor_sad(&pic_data[block_width - right],
                       &ref_data[block_width - right - 1],
                       right, block_height - bottom, pic->stride, ref->stride);
-    result += ver_sad(&pic_data[(block_height - bottom) * pic->stride],
+    result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride],
                       &ref_data[(block_height - bottom - 1) * ref->stride],
                       block_width - right, bottom, pic->stride);
     result += cor_sad(&pic_data[(block_height - bottom) * pic->stride + block_width - right],
                       &ref_data[(block_height - bottom - 1) * ref->stride + block_width - right - 1],
                       right, bottom, pic->stride);
   } else if (top) {
-    result += ver_sad(pic_data,
+    result += kvz_ver_sad(pic_data,
                       &ref_data[top * ref->stride],
                       block_width, top, pic->stride);
     result += reg_sad_maybe_optimized(&pic_data[top * pic->stride],
@@ -431,7 +405,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
                                       ref_data,
                                       block_width, block_height - bottom, pic->stride, ref->stride,
                                       optimized_sad);
-    result += ver_sad(&pic_data[(block_height - bottom) * pic->stride],
+    result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride],
                       &ref_data[(block_height - bottom - 1) * ref->stride],
                       block_width, bottom, pic->stride);
   } else if (left) {
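With the scalar function gone, every call site in image_interpolated_sad now dispatches through the kvz_ver_sad function pointer declared in the strategies files (see the hunks near the end of this commit). A stripped-down model of that indirection, with hypothetical names and ignoring the CPU-feature probing the real strategyselector performs:

#include <stdint.h>

typedef uint32_t (ver_sad_fn)(const uint8_t *pic, const uint8_t *ref,
                              int32_t width, int32_t height, uint32_t stride);

/* The strategy pointer: assigned once at startup, called everywhere. */
static ver_sad_fn *ver_sad_ptr;

static uint32_t ver_sad_scalar(const uint8_t *pic, const uint8_t *ref,
                               int32_t w, int32_t h, uint32_t stride)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < h; ++y)
    for (int32_t x = 0; x < w; ++x) {
      int32_t d = (int32_t)pic[y * stride + x] - (int32_t)ref[x];
      sad += (uint32_t)(d < 0 ? -d : d);
    }
  return sad;
}

void pick_strategies(void)
{
  /* The real selector would prefer an SSE4.1/AVX2 version when available. */
  ver_sad_ptr = &ver_sad_scalar;
}

uint32_t border_cost(const uint8_t *pic, const uint8_t *ref,
                     int32_t w, int32_t h, uint32_t stride)
{
  return ver_sad_ptr(pic, ref, w, h, stride);  /* indirect call, like kvz_ver_sad */
}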
@@ -1277,6 +1277,24 @@ static optimized_sad_func_ptr_t get_optimized_sad_avx2(int32_t width)
   else
     return NULL;
 }
+
+static uint32_t ver_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                             int32_t width, int32_t height, uint32_t stride)
+{
+  if (width == 0)
+    return 0;
+  if (width == 4)
+    return ver_sad_w4(pic_data, ref_data, height, stride);
+  if (width == 8)
+    return ver_sad_w8(pic_data, ref_data, height, stride);
+  if (width == 12)
+    return ver_sad_w12(pic_data, ref_data, height, stride);
+  if (width == 16)
+    return ver_sad_w16(pic_data, ref_data, height, stride);
+  else
+    return ver_sad_arbitrary(pic_data, ref_data, width, height, stride);
+}
+
 #endif //COMPILE_INTEL_AVX2
 
 int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
@@ -1312,6 +1330,7 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
     success &= kvz_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2);
     success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "avx2", 40, &inter_recon_bipred_avx2);
     success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "avx2", 40, &get_optimized_sad_avx2);
+    success &= kvz_strategyselector_register(opaque, "ver_sad", "avx2", 40, &ver_sad_avx2);
 
   }
 #endif
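Note the numeric argument in these registrations: the generic implementation registers "ver_sad" with priority 0, the SSE4.1 one with 20, and this AVX2 one with 40. Assuming the selector keeps the highest-priority implementation whose requirements the running CPU meets, a toy model of the bookkeeping (hypothetical code, not the actual strategyselector) could look like:

#include <string.h>

typedef struct {
  const char *name;  /* strategy slot, e.g. "ver_sad"      */
  int         prio;  /* priority of the current winner     */
  void       *fptr;  /* implementation currently selected  */
} strategy_slot;

/* Keep whichever implementation registers with the highest priority. */
static int register_impl(strategy_slot *slot, const char *name,
                         int prio, void *fptr)
{
  if (strcmp(slot->name, name) != 0)
    return 0;                /* not this slot */
  if (prio >= slot->prio) {  /* 40 (avx2) beats 20 (sse41) beats 0 */
    slot->prio = prio;
    slot->fptr = fptr;
  }
  return 1;
}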
@@ -593,6 +593,32 @@ static optimized_sad_func_ptr_t get_optimized_sad_generic(int32_t width)
   return NULL;
 }
 
+/**
+ * \brief Vertically interpolate SAD outside the frame.
+ *
+ * \param pic_data      Starting point of the first picture.
+ * \param ref_data      Starting point of the second picture.
+ * \param block_width   Width of the region for which SAD is calculated.
+ * \param block_height  Height of the region for which SAD is calculated.
+ * \param pic_stride    Width (stride) of the picture pixel array.
+ *
+ * \returns Sum of Absolute Differences
+ */
+static uint32_t ver_sad_generic(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                                int block_width, int block_height, unsigned pic_stride)
+{
+  int x, y;
+  unsigned sad = 0;
+
+  for (y = 0; y < block_height; ++y) {
+    for (x = 0; x < block_width; ++x) {
+      sad += abs(pic_data[y * pic_stride + x] - ref_data[x]);
+    }
+  }
+
+  return sad;
+}
+
 int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
 {
   bool success = true;
@@ -629,6 +655,7 @@ int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
   success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "generic", 0, &inter_recon_bipred_generic);
 
   success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "generic", 0, &get_optimized_sad_generic);
+  success &= kvz_strategyselector_register(opaque, "ver_sad", "generic", 0, &ver_sad_generic);
 
   return success;
 }
@@ -66,6 +66,23 @@ static optimized_sad_func_ptr_t get_optimized_sad_sse41(int32_t width)
   return NULL;
 }
 
+static uint32_t ver_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                              int32_t width, int32_t height, uint32_t stride)
+{
+  if (width == 0)
+    return 0;
+  if (width == 4)
+    return ver_sad_w4(pic_data, ref_data, height, stride);
+  if (width == 8)
+    return ver_sad_w8(pic_data, ref_data, height, stride);
+  if (width == 12)
+    return ver_sad_w12(pic_data, ref_data, height, stride);
+  if (width == 16)
+    return ver_sad_w16(pic_data, ref_data, height, stride);
+  else
+    return ver_sad_arbitrary(pic_data, ref_data, width, height, stride);
+}
+
 #endif //COMPILE_INTEL_SSE41
 
 
@@ -75,6 +92,7 @@ int kvz_strategy_register_picture_sse41(void* opaque, uint8_t bitdepth) {
   if (bitdepth == 8){
     success &= kvz_strategyselector_register(opaque, "reg_sad", "sse41", 20, &kvz_reg_sad_sse41);
     success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "sse41", 20, &get_optimized_sad_sse41);
+    success &= kvz_strategyselector_register(opaque, "ver_sad", "sse41", 20, &ver_sad_sse41);
   }
 #endif
   return success;
@@ -314,4 +314,232 @@ static INLINE uint32_t reg_sad_arbitrary(const kvz_pixel * const data1, const kv
   return _mm_cvtsi128_si32(sad);
 }
 
+static uint32_t ver_sad_w4(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                           int32_t height, uint32_t stride)
+{
+  __m128i ref_row = _mm_set1_epi32(*(const uint32_t *)ref_data);
+  __m128i sse_inc = _mm_setzero_si128();
+  int32_t y;
+
+  const int32_t height_fourline_groups = height & ~3;
+  const int32_t height_residual_lines = height & 3;
+
+  for (y = 0; y < height_fourline_groups; y += 4) {
+    __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * stride));
+
+    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * stride), 1);
+    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 2) * stride), 2);
+    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * stride), 3);
+
+    __m128i curr_sads = _mm_sad_epu8(a, ref_row);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+  }
+  if (height_residual_lines) {
+    // Only pick the last dword, because we're comparing single dwords (lines)
+    ref_row = _mm_bsrli_si128(ref_row, 12);
+
+    for (; y < height; y++) {
+      __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * stride));
+
+      __m128i curr_sads = _mm_sad_epu8(a, ref_row);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+    }
+  }
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad       = _mm_add_epi64 (sse_inc, sse_inc_2);
+
+  return _mm_cvtsi128_si32(sad);
+}
+
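Above, ver_sad_w4 packs four 4-pixel rows into one XMM register with _mm_insert_epi32 and compares them against four copies of the reference row in a single _mm_sad_epu8. That instruction (PSADBW) produces two partial sums, one per 64-bit half of the register, which is why every kernel in this file ends with the same shuffle-and-add reduction before extracting the low 32 bits. A self-contained demonstration of the idiom:

#include <emmintrin.h>  /* SSE2 */
#include <stdio.h>

int main(void)
{
  /* 16 bytes of "picture" vs 16 bytes of "reference". */
  __m128i a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8,
                            9, 10, 11, 12, 13, 14, 15, 16);
  __m128i b = _mm_setzero_si128();

  /* PSADBW: 1+...+8 = 36 in the low 64-bit lane, 9+...+16 = 100 high. */
  __m128i sads = _mm_sad_epu8(a, b);

  /* Swap the two 64-bit lanes and add, so the total lands in lane 0. */
  __m128i swapped = _mm_shuffle_epi32(sads, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i total   = _mm_add_epi64(sads, swapped);

  printf("%d\n", _mm_cvtsi128_si32(total));  /* prints 136 */
  return 0;
}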
+static uint32_t ver_sad_w8(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                           int32_t height, uint32_t stride)
+{
+  const __m128i ref_row = _mm_set1_epi64x(*(const uint64_t *)ref_data);
+  __m128i sse_inc = _mm_setzero_si128();
+  uint64_t result = 0;
+  int32_t y;
+
+  const int32_t height_fourline_groups = height & ~3;
+  const int32_t height_residual_lines = height & 3;
+
+  for (y = 0; y < height_fourline_groups; y += 4) {
+    __m128d a_d = _mm_setzero_pd();
+    __m128d c_d = _mm_setzero_pd();
+
+    a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * stride));
+    a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * stride));
+
+    c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * stride));
+    c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * stride));
+
+    __m128i a = _mm_castpd_si128(a_d);
+    __m128i c = _mm_castpd_si128(c_d);
+
+    __m128i curr_sads_ab = _mm_sad_epu8(a, ref_row);
+    __m128i curr_sads_cd = _mm_sad_epu8(c, ref_row);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
+  }
+  if (height_residual_lines) {
+    __m64 b = (__m64)_mm_cvtsi128_si64(ref_row);
+
+    for (; y < height; y++) {
+      __m64 a = *(__m64 *)(pic_data + y * stride);
+      __m64 sads = _mm_sad_pu8(a, b);
+      result += (uint64_t)sads;
+    }
+  }
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad       = _mm_add_epi64 (sse_inc, sse_inc_2);
+
+  result += _mm_cvtsi128_si32(sad);
+  return result;
+}
+
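ver_sad_w8 above fills one XMM register with two 8-pixel rows by using the double-precision loads _mm_loadl_pd and _mm_loadh_pd, then reinterpreting the register with _mm_castpd_si128, which compiles to nothing; leftover rows fall back to the 64-bit MMX _mm_sad_pu8. A sketch of just the two-row packing, with the same type-punning caveat as the kernel itself:

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stdio.h>

/* Pack rows r0 and r1 (8 bytes each) into one 16-byte register. */
static __m128i pack_two_rows(const uint8_t *r0, const uint8_t *r1)
{
  __m128d v = _mm_setzero_pd();
  v = _mm_loadl_pd(v, (const double *)r0);  /* bytes 0..7  */
  v = _mm_loadh_pd(v, (const double *)r1);  /* bytes 8..15 */
  return _mm_castpd_si128(v);               /* free bit-cast */
}

int main(void)
{
  uint8_t r0[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  uint8_t r1[8] = { 8, 9, 10, 11, 12, 13, 14, 15 };
  uint8_t out[16];
  _mm_storeu_si128((__m128i *)out, pack_two_rows(r0, r1));
  for (int i = 0; i < 16; i++)
    printf("%d ", out[i]);  /* prints 0 1 2 ... 15 */
  printf("\n");
  return 0;
}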
+static uint32_t ver_sad_w12(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                            int32_t height, uint32_t stride)
+{
+  const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
+  __m128i sse_inc = _mm_setzero_si128();
+  int32_t y;
+
+  for (y = 0; y < height; y++) {
+    __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride));
+
+    __m128i a_masked  = _mm_blend_epi16(ref_row, a, 0x3f);
+    __m128i curr_sads = _mm_sad_epu8 (ref_row, a_masked);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+  }
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad       = _mm_add_epi64 (sse_inc, sse_inc_2);
+  return _mm_cvtsi128_si32(sad);
+}
+
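In ver_sad_w12 above, each row load grabs a full 16 bytes, and _mm_blend_epi16 with immediate 0x3f rebuilds the register so that the low six words (12 bytes) come from the picture row while the top four bytes are ref_row's own; those lanes then contribute |r - r| = 0 to the SAD. The trick in isolation:

#include <smmintrin.h>  /* SSE4.1 for _mm_blend_epi16 */
#include <stdio.h>

int main(void)
{
  __m128i ref = _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9,
                              9, 9, 9, 9, 7, 7, 7, 7);
  __m128i pic = _mm_setr_epi8(8, 8, 8, 8, 8, 8, 8, 8,
                              8, 8, 8, 8, 0, 0, 0, 0);

  /* Keep 12 picture bytes (six words), splice ref's last 4 bytes in,
   * so the out-of-block lanes cancel against ref in the SAD. */
  __m128i masked = _mm_blend_epi16(ref, pic, 0x3f);
  __m128i sads   = _mm_sad_epu8(ref, masked);

  /* Reduce the two 64-bit lane sums: 12 lanes differ by 1, 4 by 0. */
  __m128i swapped = _mm_shuffle_epi32(sads, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i total   = _mm_add_epi64(sads, swapped);
  printf("%d\n", _mm_cvtsi128_si32(total));  /* prints 12 */
  return 0;
}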
+static uint32_t ver_sad_w16(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                            int32_t height, uint32_t stride)
+{
+  const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
+  __m128i sse_inc = _mm_setzero_si128();
+  int32_t y;
+
+  const int32_t height_fourline_groups = height & ~3;
+  const int32_t height_residual_lines = height & 3;
+
+  for (y = 0; y < height_fourline_groups; y += 4) {
+    __m128i pic_row_1 = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
+    __m128i pic_row_2 = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * stride));
+    __m128i pic_row_3 = _mm_loadu_si128((__m128i *)(pic_data + (y + 2) * stride));
+    __m128i pic_row_4 = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * stride));
+
+    __m128i curr_sads_1 = _mm_sad_epu8 (pic_row_1, ref_row);
+    __m128i curr_sads_2 = _mm_sad_epu8 (pic_row_2, ref_row);
+    __m128i curr_sads_3 = _mm_sad_epu8 (pic_row_3, ref_row);
+    __m128i curr_sads_4 = _mm_sad_epu8 (pic_row_4, ref_row);
+
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_1);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_2);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_3);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_4);
+  }
+  if (height_residual_lines) {
+    for (; y < height; y++) {
+      __m128i pic_row   = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
+      __m128i curr_sads = _mm_sad_epu8 (pic_row, ref_row);
+
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+    }
+  }
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad       = _mm_add_epi64 (sse_inc, sse_inc_2);
+
+  return _mm_cvtsi128_si32(sad);
+}
+
+static uint32_t ver_sad_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                                  int32_t width, int32_t height, uint32_t stride)
+{
+  int32_t y, x;
+  __m128i sse_inc = _mm_setzero_si128();
+
+  // Bytes in block in 128-bit blocks per each scanline, and remainder
+  const int32_t width_xmms            = width & ~15;
+  const int32_t width_residual_pixels = width &  15;
+
+  const int32_t height_fourline_groups = height & ~3;
+  const int32_t height_residual_lines  = height &  3;
+
+  const __m128i rds    = _mm_set1_epi8 (width_residual_pixels);
+  const __m128i ns     = _mm_setr_epi8 (0, 1, 2, 3, 4, 5, 6, 7,
+                                        8, 9, 10, 11, 12, 13, 14, 15);
+  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);
+
+  for (x = 0; x < width_xmms; x += 16) {
+    const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x));
+    for (y = 0; y < height_fourline_groups; y += 4) {
+      __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
+      __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
+      __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
+      __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));
+
+      __m128i curr_sads_ab = _mm_sad_epu8(ref_row, a);
+      __m128i curr_sads_cd = _mm_sad_epu8(ref_row, c);
+      __m128i curr_sads_ef = _mm_sad_epu8(ref_row, e);
+      __m128i curr_sads_gh = _mm_sad_epu8(ref_row, g);
+
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
+    }
+    if (height_residual_lines) {
+      for (; y < height; y++) {
+        __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));
+
+        __m128i curr_sads = _mm_sad_epu8(a, ref_row);
+
+        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+      }
+    }
+  }
+
+  if (width_residual_pixels) {
+    const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x));
+    for (y = 0; y < height_fourline_groups; y += 4) {
+      __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
+      __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
+      __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
+      __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));
+
+      __m128i a_masked = _mm_blendv_epi8(ref_row, a, rdmask);
+      __m128i c_masked = _mm_blendv_epi8(ref_row, c, rdmask);
+      __m128i e_masked = _mm_blendv_epi8(ref_row, e, rdmask);
+      __m128i g_masked = _mm_blendv_epi8(ref_row, g, rdmask);
+
+      __m128i curr_sads_ab = _mm_sad_epu8 (ref_row, a_masked);
+      __m128i curr_sads_cd = _mm_sad_epu8 (ref_row, c_masked);
+      __m128i curr_sads_ef = _mm_sad_epu8 (ref_row, e_masked);
+      __m128i curr_sads_gh = _mm_sad_epu8 (ref_row, g_masked);
+
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
+    }
+    if (height_residual_lines) {
+      for (; y < height; y++) {
+        __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));
+
+        __m128i a_masked  = _mm_blendv_epi8(ref_row, a, rdmask);
+        __m128i curr_sads = _mm_sad_epu8 (ref_row, a_masked);
+
+        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+      }
+    }
+  }
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad       = _mm_add_epi64 (sse_inc, sse_inc_2);
+
+  return _mm_cvtsi128_si32(sad);
+}
+
 #endif
 
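ver_sad_arbitrary handles the leftover width % 16 columns with a byte mask built once, outside all loops: broadcast the residual count, compare it against the lane indices 0..15, and _mm_cmpgt_epi8 leaves 0xff exactly in the in-range lanes; _mm_blendv_epi8 then splices ref_row's own bytes into the out-of-range lanes so they cancel in the SAD, the same way the w12 blend does. The mask construction on its own:

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  const int residual = 5;  /* pretend width % 16 == 5 */

  __m128i rds = _mm_set1_epi8((char)residual);
  __m128i ns  = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                              8, 9, 10, 11, 12, 13, 14, 15);
  /* Lane i becomes 0xff iff i < residual (signed byte compare). */
  __m128i rdmask = _mm_cmpgt_epi8(rds, ns);

  uint8_t m[16];
  _mm_storeu_si128((__m128i *)m, rdmask);
  for (int i = 0; i < 16; i++)
    printf("%s", m[i] ? "1" : "0");  /* prints 1111100000000000 */
  printf("\n");
  return 0;
}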
@@ -64,6 +64,7 @@ pixels_calc_ssd_func * kvz_pixels_calc_ssd = 0;
 inter_recon_bipred_func * kvz_inter_recon_bipred_blend = 0;
 
 get_optimized_sad_func *kvz_get_optimized_sad = 0;
+ver_sad_func *kvz_ver_sad = 0;
 
 
 int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) {
@@ -113,7 +113,9 @@ typedef void (cost_pixel_any_size_multi_func)(int width, int height, const kvz_p
 
 typedef unsigned (pixels_calc_ssd_func)(const kvz_pixel *const ref, const kvz_pixel *const rec, const int ref_stride, const int rec_stride, const int width);
 typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t);
+
+typedef uint32_t (ver_sad_func)(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                                int32_t block_width, int32_t block_height,
+                                uint32_t pic_stride);
 
 typedef void (inter_recon_bipred_func)(const int hi_prec_luma_rec0,
                                        const int hi_prec_luma_rec1,
@@ -167,6 +169,7 @@ extern pixels_calc_ssd_func *kvz_pixels_calc_ssd;
 extern inter_recon_bipred_func * kvz_inter_recon_bipred_blend;
 
 extern get_optimized_sad_func *kvz_get_optimized_sad;
+extern ver_sad_func *kvz_ver_sad;
 
 int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth);
 cost_pixel_nxn_func * kvz_pixels_get_satd_func(unsigned n);
@@ -201,6 +204,7 @@ cost_pixel_nxn_multi_func * kvz_pixels_get_sad_dual_func(unsigned n);
   {"pixels_calc_ssd", (void**) &kvz_pixels_calc_ssd}, \
   {"inter_recon_bipred", (void**) &kvz_inter_recon_bipred_blend}, \
   {"get_optimized_sad", (void**) &kvz_get_optimized_sad}, \
+  {"ver_sad", (void**) &kvz_ver_sad}, \