Create strategy for ver_sad

Easy to vectorize
Pauli Oikkonen 2019-01-22 15:57:16 +02:00
parent ca94ae9529
commit f781dc31f0
7 changed files with 304 additions and 33 deletions
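The patch turns the ver_sad helper, previously a static function in image.c, into a Kvazaar "strategy": a typedef and a global function pointer (kvz_ver_sad) are added to the picture strategy headers, the scalar code moves into the generic strategy file, and vectorized SSE4.1 and AVX2 versions are registered alongside it. Below is a minimal, self-contained model of that dispatch pattern; register_ver_sad, ver_sad_ptr and ver_sad_scalar are illustrative stand-ins, not the real kvz_strategyselector API, which also checks CPU features before accepting a registration.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uint8_t kvz_pixel;

/* Same shape as the ver_sad_func typedef this commit adds to strategies-picture.h. */
typedef uint32_t (ver_sad_func)(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
                                int32_t block_width, int32_t block_height,
                                uint32_t pic_stride);

/* Stand-ins for the global kvz_ver_sad pointer and the registration call. */
static ver_sad_func *ver_sad_ptr = NULL;
static int best_priority = -1;

static void register_ver_sad(const char *name, int priority, ver_sad_func *fn)
{
  /* Keep the highest-priority implementation registered so far. */
  if (priority > best_priority) {
    best_priority = priority;
    ver_sad_ptr = fn;
    printf("selected ver_sad: %s (priority %d)\n", name, priority);
  }
}

/* Scalar version, equivalent to ver_sad_generic in this commit: every picture
 * line is compared against the same single reference row. */
static uint32_t ver_sad_scalar(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
                               int32_t block_width, int32_t block_height,
                               uint32_t pic_stride)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < block_height; y++)
    for (int32_t x = 0; x < block_width; x++)
      sad += abs(pic_data[y * pic_stride + x] - ref_data[x]);
  return sad;
}

int main(void)
{
  register_ver_sad("generic", 0, &ver_sad_scalar);
  /* A SIMD build would also register "sse41" (20) and "avx2" (40) here,
   * overriding the generic version. */

  const kvz_pixel pic[8] = { 10, 20, 30, 40, 12, 22, 28, 44 };
  const kvz_pixel ref[4] = { 11, 19, 30, 41 };
  /* Two 4-pixel lines against one reference row: (1+1+0+1) + (1+3+2+3) = 12. */
  printf("SAD = %u\n", ver_sad_ptr(pic, ref, 4, 2, 4));
  return 0;
}

In the patch itself the generic implementation is registered with priority 0, the SSE4.1 one with 20 and the AVX2 one with 40, so the highest-priority variant usable on the running CPU ends up behind kvz_ver_sad.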

@@ -260,32 +260,6 @@ static unsigned cor_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
return sad;
}
/**
* \brief Vertically interpolate SAD outside the frame.
*
* \param data1 Starting point of the first picture.
* \param data2 Starting point of the second picture.
* \param width Width of the region for which SAD is calculated.
* \param height Height of the region for which SAD is calculated.
* \param width Width of the pixel array.
*
* \returns Sum of Absolute Differences
*/
static unsigned ver_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int block_width, int block_height, unsigned pic_stride)
{
int x, y;
unsigned sad = 0;
for (y = 0; y < block_height; ++y) {
for (x = 0; x < block_width; ++x) {
sad += abs(pic_data[y * pic_stride + x] - ref_data[x]);
}
}
return sad;
}
/**
* \brief Horizontally interpolate SAD outside the frame.
*
@@ -370,7 +344,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
result += cor_sad(pic_data,
&ref_data[top * ref->stride + left],
left, top, pic->stride);
result += ver_sad(&pic_data[left],
result += kvz_ver_sad(&pic_data[left],
&ref_data[top * ref->stride + left],
block_width - left, top, pic->stride);
result += hor_sad(&pic_data[top * pic->stride],
@@ -380,7 +354,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
&ref_data[top * ref->stride + left],
block_width - left, block_height - top, pic->stride, ref->stride);
} else if (top && right) {
result += ver_sad(pic_data,
result += kvz_ver_sad(pic_data,
&ref_data[top * ref->stride],
block_width - right, top, pic->stride);
result += cor_sad(&pic_data[block_width - right],
@@ -402,7 +376,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
result += cor_sad(&pic_data[(block_height - bottom) * pic->stride],
&ref_data[(block_height - bottom - 1) * ref->stride + left],
left, bottom, pic->stride);
result += ver_sad(&pic_data[(block_height - bottom) * pic->stride + left],
result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride + left],
&ref_data[(block_height - bottom - 1) * ref->stride + left],
block_width - left, bottom, pic->stride);
} else if (bottom && right) {
@@ -412,14 +386,14 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
result += hor_sad(&pic_data[block_width - right],
&ref_data[block_width - right - 1],
right, block_height - bottom, pic->stride, ref->stride);
result += ver_sad(&pic_data[(block_height - bottom) * pic->stride],
result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride],
&ref_data[(block_height - bottom - 1) * ref->stride],
block_width - right, bottom, pic->stride);
result += cor_sad(&pic_data[(block_height - bottom) * pic->stride + block_width - right],
&ref_data[(block_height - bottom - 1) * ref->stride + block_width - right - 1],
right, bottom, pic->stride);
} else if (top) {
result += ver_sad(pic_data,
result += kvz_ver_sad(pic_data,
&ref_data[top * ref->stride],
block_width, top, pic->stride);
result += reg_sad_maybe_optimized(&pic_data[top * pic->stride],
@@ -431,7 +405,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
ref_data,
block_width, block_height - bottom, pic->stride, ref->stride,
optimized_sad);
result += ver_sad(&pic_data[(block_height - bottom) * pic->stride],
result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride],
&ref_data[(block_height - bottom - 1) * ref->stride],
block_width, bottom, pic->stride);
} else if (left) {
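In image_interpolated_sad above, a motion vector that points partly outside the reference frame splits the SAD into corner (cor_sad), vertical (kvz_ver_sad), horizontal (hor_sad) and fully in-frame (reg_sad) regions. For the vertical regions, the reference pixels above or below the frame are the frame's edge row repeated on every line, which is why ver_sad takes only a single reference row and no reference stride. The following standalone check (not Kvazaar code; plain scalar loops assuming 8-bit pixels) illustrates that equivalence.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

typedef uint8_t kvz_pixel;

/* Scalar SAD between two blocks with independent strides (reg_sad-like). */
static uint32_t sad_2d(const kvz_pixel *a, const kvz_pixel *b,
                       int w, int h, int a_stride, int b_stride)
{
  uint32_t sad = 0;
  for (int y = 0; y < h; y++)
    for (int x = 0; x < w; x++)
      sad += abs(a[y * a_stride + x] - b[y * b_stride + x]);
  return sad;
}

/* Scalar SAD against a single reference row (ver_sad-like). */
static uint32_t sad_vs_row(const kvz_pixel *a, const kvz_pixel *row,
                           int w, int h, int a_stride)
{
  uint32_t sad = 0;
  for (int y = 0; y < h; y++)
    for (int x = 0; x < w; x++)
      sad += abs(a[y * a_stride + x] - row[x]);
  return sad;
}

int main(void)
{
  enum { W = 8, TOP = 3 };
  kvz_pixel pic[TOP * W], edge_row[W], replicated[TOP * W];
  for (int i = 0; i < TOP * W; i++) pic[i] = (kvz_pixel)(rand() & 0xff);
  for (int x = 0; x < W; x++)       edge_row[x] = (kvz_pixel)(rand() & 0xff);

  /* Replicating the edge row over the out-of-frame area... */
  for (int y = 0; y < TOP; y++)
    for (int x = 0; x < W; x++)
      replicated[y * W + x] = edge_row[x];

  /* ...gives exactly the SAD that ver_sad computes from the one row alone. */
  assert(sad_2d(pic, replicated, W, TOP, W, W) ==
         sad_vs_row(pic, edge_row, W, TOP, W));
  return 0;
}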

@@ -1277,6 +1277,24 @@ static optimized_sad_func_ptr_t get_optimized_sad_avx2(int32_t width)
else
return NULL;
}
static uint32_t ver_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t width, int32_t height, uint32_t stride)
{
if (width == 0)
return 0;
if (width == 4)
return ver_sad_w4(pic_data, ref_data, height, stride);
if (width == 8)
return ver_sad_w8(pic_data, ref_data, height, stride);
if (width == 12)
return ver_sad_w12(pic_data, ref_data, height, stride);
if (width == 16)
return ver_sad_w16(pic_data, ref_data, height, stride);
else
return ver_sad_arbitrary(pic_data, ref_data, width, height, stride);
}
#endif //COMPILE_INTEL_AVX2
int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
@@ -1312,6 +1330,7 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
success &= kvz_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2);
success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "avx2", 40, &inter_recon_bipred_avx2);
success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "avx2", 40, &get_optimized_sad_avx2);
success &= kvz_strategyselector_register(opaque, "ver_sad", "avx2", 40, &ver_sad_avx2);
}
#endif

@@ -593,6 +593,32 @@ static optimized_sad_func_ptr_t get_optimized_sad_generic(int32_t width)
return NULL;
}
/**
* \brief Vertically interpolate SAD outside the frame.
*
* \param pic_data Starting point of the picture block.
* \param ref_data Pointer to the single reference row compared against every line.
* \param block_width Width of the region for which SAD is calculated.
* \param block_height Height of the region for which SAD is calculated.
* \param pic_stride Stride of the picture pixel array.
*
* \returns Sum of Absolute Differences
*/
static uint32_t ver_sad_generic(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t block_width, int32_t block_height, uint32_t pic_stride)
{
int x, y;
unsigned sad = 0;
for (y = 0; y < block_height; ++y) {
for (x = 0; x < block_width; ++x) {
sad += abs(pic_data[y * pic_stride + x] - ref_data[x]);
}
}
return sad;
}
int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
{
bool success = true;
@@ -629,6 +655,7 @@ int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "generic", 0, &inter_recon_bipred_generic);
success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "generic", 0, &get_optimized_sad_generic);
success &= kvz_strategyselector_register(opaque, "ver_sad", "generic", 0, &ver_sad_generic);
return success;
}

@@ -66,6 +66,23 @@ static optimized_sad_func_ptr_t get_optimized_sad_sse41(int32_t width)
return NULL;
}
static uint32_t ver_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t width, int32_t height, uint32_t stride)
{
if (width == 0)
return 0;
if (width == 4)
return ver_sad_w4(pic_data, ref_data, height, stride);
if (width == 8)
return ver_sad_w8(pic_data, ref_data, height, stride);
if (width == 12)
return ver_sad_w12(pic_data, ref_data, height, stride);
if (width == 16)
return ver_sad_w16(pic_data, ref_data, height, stride);
else
return ver_sad_arbitrary(pic_data, ref_data, width, height, stride);
}
#endif //COMPILE_INTEL_SSE41
@@ -75,6 +92,7 @@ int kvz_strategy_register_picture_sse41(void* opaque, uint8_t bitdepth) {
if (bitdepth == 8){
success &= kvz_strategyselector_register(opaque, "reg_sad", "sse41", 20, &kvz_reg_sad_sse41);
success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "sse41", 20, &get_optimized_sad_sse41);
success &= kvz_strategyselector_register(opaque, "ver_sad", "sse41", 20, &ver_sad_sse41);
}
#endif
return success;

@@ -314,4 +314,232 @@ static INLINE uint32_t reg_sad_arbitrary(const kvz_pixel * const data1, const kv
return _mm_cvtsi128_si32(sad);
}
static uint32_t ver_sad_w4(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t height, uint32_t stride)
{
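// Broadcast the 4-byte reference row into every dword lane so that four picture lines can be compared against it with a single _mm_sad_epu8.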
__m128i ref_row = _mm_set1_epi32(*(const uint32_t *)ref_data);
__m128i sse_inc = _mm_setzero_si128();
int32_t y;
const int32_t height_fourline_groups = height & ~3;
const int32_t height_residual_lines = height & 3;
for (y = 0; y < height_fourline_groups; y += 4) {
__m128i a = _mm_cvtsi32_si128(*(uint32_t *)(pic_data + y * stride));
a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * stride), 1);
a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 2) * stride), 2);
a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * stride), 3);
__m128i curr_sads = _mm_sad_epu8(a, ref_row);
sse_inc = _mm_add_epi64(sse_inc, curr_sads);
}
if (height_residual_lines) {
// Keep the reference dword only in the low lane (upper lanes zeroed), since the leftover lines are compared one dword at a time.
ref_row = _mm_bsrli_si128(ref_row, 12);
for (; y < height; y++) {
__m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * stride));
__m128i curr_sads = _mm_sad_epu8(a, ref_row);
sse_inc = _mm_add_epi64(sse_inc, curr_sads);
}
}
__m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
__m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2);
return _mm_cvtsi128_si32(sad);
}
static uint32_t ver_sad_w8(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t height, uint32_t stride)
{
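// The 8-byte reference row fills both qword lanes, so each _mm_sad_epu8 compares two picture lines at once.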
const __m128i ref_row = _mm_set1_epi64x(*(const uint64_t *)ref_data);
__m128i sse_inc = _mm_setzero_si128();
uint64_t result = 0;
int32_t y;
const int32_t height_fourline_groups = height & ~3;
const int32_t height_residual_lines = height & 3;
for (y = 0; y < height_fourline_groups; y += 4) {
__m128d a_d = _mm_setzero_pd();
__m128d c_d = _mm_setzero_pd();
a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * stride));
a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * stride));
c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * stride));
c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * stride));
__m128i a = _mm_castpd_si128(a_d);
__m128i c = _mm_castpd_si128(c_d);
__m128i curr_sads_ab = _mm_sad_epu8(a, ref_row);
__m128i curr_sads_cd = _mm_sad_epu8(c, ref_row);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
}
if (height_residual_lines) {
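// Handle the leftover rows one at a time with the 64-bit MMX SAD, reusing the same 8-byte reference row.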
__m64 b = (__m64)_mm_cvtsi128_si64(ref_row);
for (; y < height; y++) {
__m64 a = *(__m64 *)(pic_data + y * stride);
__m64 sads = _mm_sad_pu8(a, b);
result += (uint64_t)sads;
}
}
__m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
__m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2);
result += _mm_cvtsi128_si32(sad);
return result;
}
static uint32_t ver_sad_w12(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t height, uint32_t stride)
{
const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
__m128i sse_inc = _mm_setzero_si128();
int32_t y;
for (y = 0; y < height; y++) {
__m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride));
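// Take the 12 block pixels (words 0..5) from the picture row and the top 4 bytes from ref_row, so the out-of-block bytes cancel to zero in the SAD.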
__m128i a_masked = _mm_blend_epi16(ref_row, a, 0x3f);
__m128i curr_sads = _mm_sad_epu8 (ref_row, a_masked);
sse_inc = _mm_add_epi64(sse_inc, curr_sads);
}
__m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
__m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2);
return _mm_cvtsi128_si32(sad);
}
static uint32_t ver_sad_w16(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t height, uint32_t stride)
{
const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
__m128i sse_inc = _mm_setzero_si128();
int32_t y;
const int32_t height_fourline_groups = height & ~3;
const int32_t height_residual_lines = height & 3;
for (y = 0; y < height_fourline_groups; y += 4) {
__m128i pic_row_1 = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
__m128i pic_row_2 = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * stride));
__m128i pic_row_3 = _mm_loadu_si128((__m128i *)(pic_data + (y + 2) * stride));
__m128i pic_row_4 = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * stride));
__m128i curr_sads_1 = _mm_sad_epu8 (pic_row_1, ref_row);
__m128i curr_sads_2 = _mm_sad_epu8 (pic_row_2, ref_row);
__m128i curr_sads_3 = _mm_sad_epu8 (pic_row_3, ref_row);
__m128i curr_sads_4 = _mm_sad_epu8 (pic_row_4, ref_row);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_1);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_2);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_3);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_4);
}
if (height_residual_lines) {
for (; y < height; y++) {
__m128i pic_row = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
__m128i curr_sads = _mm_sad_epu8 (pic_row, ref_row);
sse_inc = _mm_add_epi64(sse_inc, curr_sads);
}
}
__m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
__m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2);
return _mm_cvtsi128_si32(sad);
}
static uint32_t ver_sad_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t width, int32_t height, uint32_t stride)
{
int32_t y, x;
__m128i sse_inc = _mm_setzero_si128();
// Pixels per scanline that fit into whole 16-byte XMM chunks, and the leftover pixel count
const int32_t width_xmms = width & ~15;
const int32_t width_residual_pixels = width & 15;
const int32_t height_fourline_groups = height & ~3;
const int32_t height_residual_lines = height & 3;
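// rdmask is 0xFF in the lanes holding leftover pixels and 0x00 elsewhere; lanes outside the block are later filled from ref_row so they add nothing to the SAD.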
const __m128i rds = _mm_set1_epi8 (width_residual_pixels);
const __m128i ns = _mm_setr_epi8 (0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15);
const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);
for (x = 0; x < width_xmms; x += 16) {
const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x));
for (y = 0; y < height_fourline_groups; y += 4) {
__m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
__m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
__m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
__m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));
__m128i curr_sads_ab = _mm_sad_epu8(ref_row, a);
__m128i curr_sads_cd = _mm_sad_epu8(ref_row, c);
__m128i curr_sads_ef = _mm_sad_epu8(ref_row, e);
__m128i curr_sads_gh = _mm_sad_epu8(ref_row, g);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
}
if (height_residual_lines) {
for (; y < height; y++) {
__m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));
__m128i curr_sads = _mm_sad_epu8(a, ref_row);
sse_inc = _mm_add_epi64(sse_inc, curr_sads);
}
}
}
if (width_residual_pixels) {
const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x));
for (y = 0; y < height_fourline_groups; y += 4) {
__m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
__m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
__m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
__m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));
__m128i a_masked = _mm_blendv_epi8(ref_row, a, rdmask);
__m128i c_masked = _mm_blendv_epi8(ref_row, c, rdmask);
__m128i e_masked = _mm_blendv_epi8(ref_row, e, rdmask);
__m128i g_masked = _mm_blendv_epi8(ref_row, g, rdmask);
__m128i curr_sads_ab = _mm_sad_epu8 (ref_row, a_masked);
__m128i curr_sads_cd = _mm_sad_epu8 (ref_row, c_masked);
__m128i curr_sads_ef = _mm_sad_epu8 (ref_row, e_masked);
__m128i curr_sads_gh = _mm_sad_epu8 (ref_row, g_masked);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
}
if (height_residual_lines) {
for (; y < height; y++) {
__m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));
__m128i a_masked = _mm_blendv_epi8(ref_row, a, rdmask);
__m128i curr_sads = _mm_sad_epu8 (ref_row, a_masked);
sse_inc = _mm_add_epi64(sse_inc, curr_sads);
}
}
}
__m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
__m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2);
return _mm_cvtsi128_si32(sad);
}
#endif
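Every kernel above finishes with the same three-step reduction: _mm_sad_epu8 leaves one 16-bit partial sum in each 64-bit half of the accumulator, so swapping the halves with _MM_SHUFFLE(1, 0, 3, 2), adding, and extracting the low 32 bits yields the total. A standalone sanity check of that pattern (not part of the patch; plain SSE2 with 8-bit pixels assumed):

#include <assert.h>
#include <emmintrin.h>   /* SSE2: _mm_sad_epu8, _mm_shuffle_epi32, ... */
#include <stdint.h>
#include <stdlib.h>

int main(void)
{
  uint8_t a[16], b[16];
  uint32_t scalar_sad = 0;
  for (int i = 0; i < 16; i++) {
    a[i] = (uint8_t)(rand() & 0xff);
    b[i] = (uint8_t)(rand() & 0xff);
    scalar_sad += abs(a[i] - b[i]);
  }

  __m128i va = _mm_loadu_si128((const __m128i *)a);
  __m128i vb = _mm_loadu_si128((const __m128i *)b);

  /* Two partial sums: bytes 0..7 land in the low qword, bytes 8..15 in the high qword. */
  __m128i sse_inc = _mm_sad_epu8(va, vb);

  /* Swap the two qwords and add, so the low qword holds the full sum. */
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64(sse_inc, sse_inc_2);

  assert((uint32_t)_mm_cvtsi128_si32(sad) == scalar_sad);
  return 0;
}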

@@ -64,6 +64,7 @@ pixels_calc_ssd_func * kvz_pixels_calc_ssd = 0;
inter_recon_bipred_func * kvz_inter_recon_bipred_blend = 0;
get_optimized_sad_func *kvz_get_optimized_sad = 0;
ver_sad_func *kvz_ver_sad = 0;
int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) {

@@ -113,7 +113,9 @@ typedef void (cost_pixel_any_size_multi_func)(int width, int height, const kvz_p
typedef unsigned (pixels_calc_ssd_func)(const kvz_pixel *const ref, const kvz_pixel *const rec, const int ref_stride, const int rec_stride, const int width);
typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t);
typedef uint32_t (ver_sad_func)(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t block_width, int32_t block_height,
uint32_t pic_stride);
typedef void (inter_recon_bipred_func)(const int hi_prec_luma_rec0,
const int hi_prec_luma_rec1,
@@ -167,6 +169,7 @@ extern pixels_calc_ssd_func *kvz_pixels_calc_ssd;
extern inter_recon_bipred_func * kvz_inter_recon_bipred_blend;
extern get_optimized_sad_func *kvz_get_optimized_sad;
extern ver_sad_func *kvz_ver_sad;
int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth);
cost_pixel_nxn_func * kvz_pixels_get_satd_func(unsigned n);
@@ -201,6 +204,7 @@ cost_pixel_nxn_multi_func * kvz_pixels_get_sad_dual_func(unsigned n);
{"pixels_calc_ssd", (void**) &kvz_pixels_calc_ssd}, \
{"inter_recon_bipred", (void**) &kvz_inter_recon_bipred_blend}, \
{"get_optimized_sad", (void**) &kvz_get_optimized_sad}, \
{"ver_sad", (void**) &kvz_ver_sad}, \