mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-27 19:24:06 +00:00
Avoid overreading block borders in hor_sad_sse41_arbitrary
This commit is contained in:
parent
41f51c08c4
commit
448eacba7b
|
@ -820,145 +820,107 @@ static uint32_t hor_sad_sse41_w16(const kvz_pixel *pic_data, const kvz_pixel *re
|
|||
return _mm_cvtsi128_si32(sad);
|
||||
}
|
||||
|
||||
static uint32_t hor_sad_sse41_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
|
||||
int32_t width, int32_t height, uint32_t pic_stride,
|
||||
uint32_t ref_stride, uint32_t left, uint32_t right)
|
||||
static INLINE uint32_t hor_sad_sse41_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
|
||||
int32_t width, int32_t height, uint32_t pic_stride,
|
||||
uint32_t ref_stride, uint32_t left, uint32_t right)
|
||||
{
|
||||
const size_t xmm_width = 16;
|
||||
const __m128i xmm_widths = _mm_set1_epi8(xmm_width);
|
||||
|
||||
// Bytes in block in 128-bit blocks per each scanline, and remainder
|
||||
const int32_t width_xmms = width & ~(xmm_width - 1);
|
||||
const int32_t width_residual_pixels = width & (xmm_width - 1);
|
||||
|
||||
const int32_t height_fourline_groups = height & ~3;
|
||||
const int32_t height_residual_lines = height & 3;
|
||||
|
||||
__m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
const __m128i rds = _mm_set1_epi8 (width_residual_pixels);
|
||||
const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);
|
||||
|
||||
int32_t border_idx;
|
||||
__m128i is_right_border = _mm_setzero_si128();
|
||||
if (left) {
|
||||
border_idx = left;
|
||||
} else {
|
||||
border_idx = width - (right + 1);
|
||||
is_right_border = _mm_cmpeq_epi8(is_right_border, is_right_border);
|
||||
}
|
||||
const __m128i epol_src_idx = _mm_set1_epi8(border_idx);
|
||||
|
||||
int32_t x, y;
|
||||
__m128i sse_inc = _mm_setzero_si128();
|
||||
__m128i epol_mask;
|
||||
for (x = 0; x < width_xmms; x += xmm_width) {
|
||||
|
||||
// This is a dirty hack, but it saves us an easily predicted branch! It
|
||||
// also marks the first or last valid pixel (the border one) for
|
||||
// extrapolating, but that makes no difference since the pixels marked
|
||||
// for extrapolation will always be written over with that exact pixel's
|
||||
// value.
|
||||
epol_mask = _mm_cmpgt_epi8(epol_src_idx, ns);
|
||||
epol_mask = _mm_xor_si128 (epol_mask, is_right_border);
|
||||
const size_t vec_width = 16;
|
||||
const size_t vecwid_bitmask = 15;
|
||||
const size_t vec_width_log2 = 4;
|
||||
|
||||
for (y = 0; y < height_fourline_groups; y += 4) {
|
||||
__m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + x));
|
||||
__m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + x));
|
||||
__m128i c = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * pic_stride + x));
|
||||
__m128i d = _mm_loadu_si128((__m128i *)(ref_data + (y + 1) * ref_stride + x));
|
||||
__m128i e = _mm_loadu_si128((__m128i *)(pic_data + (y + 2) * pic_stride + x));
|
||||
__m128i f = _mm_loadu_si128((__m128i *)(ref_data + (y + 2) * ref_stride + x));
|
||||
__m128i g = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * pic_stride + x));
|
||||
__m128i h = _mm_loadu_si128((__m128i *)(ref_data + (y + 3) * ref_stride + x));
|
||||
const __m128i rights = _mm_set1_epi8((uint8_t)right);
|
||||
const __m128i blk_widths = _mm_set1_epi8((uint8_t)width);
|
||||
const __m128i vec_widths = _mm_set1_epi8((uint8_t)vec_width);
|
||||
const __m128i nslo = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
__m128i border_px_b = _mm_set1_epi8 (*(uint8_t *)(ref_data + (y + 0) * ref_stride + border_idx));
|
||||
__m128i border_px_d = _mm_set1_epi8 (*(uint8_t *)(ref_data + (y + 1) * ref_stride + border_idx));
|
||||
__m128i border_px_f = _mm_set1_epi8 (*(uint8_t *)(ref_data + (y + 2) * ref_stride + border_idx));
|
||||
__m128i border_px_h = _mm_set1_epi8 (*(uint8_t *)(ref_data + (y + 3) * ref_stride + border_idx));
|
||||
__m128i b_epol = _mm_blendv_epi8(b, border_px_b, epol_mask);
|
||||
__m128i d_epol = _mm_blendv_epi8(d, border_px_d, epol_mask);
|
||||
__m128i f_epol = _mm_blendv_epi8(f, border_px_f, epol_mask);
|
||||
__m128i h_epol = _mm_blendv_epi8(h, border_px_h, epol_mask);
|
||||
|
||||
__m128i curr_sads_ab = _mm_sad_epu8(a, b_epol);
|
||||
__m128i curr_sads_cd = _mm_sad_epu8(c, d_epol);
|
||||
__m128i curr_sads_ef = _mm_sad_epu8(e, f_epol);
|
||||
__m128i curr_sads_gh = _mm_sad_epu8(g, h_epol);
|
||||
|
||||
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
|
||||
sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
|
||||
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
|
||||
sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
|
||||
}
|
||||
if (height_residual_lines) {
|
||||
for (; y < height; y++) {
|
||||
__m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + x));
|
||||
__m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + x));
|
||||
|
||||
__m128i border_px_b = _mm_set1_epi8 (*(uint8_t *)(ref_data + (y + 0) * ref_stride + border_idx));
|
||||
__m128i b_epol = _mm_blendv_epi8(b, border_px_b, epol_mask);
|
||||
|
||||
__m128i curr_sads_ab = _mm_sad_epu8(a, b_epol);
|
||||
|
||||
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
|
||||
}
|
||||
}
|
||||
ns = _mm_add_epi8(ns, xmm_widths);
|
||||
uint32_t outside_vecs, inside_vecs, left_offset, is_left_bm;
|
||||
int32_t outside_width, inside_width, border_off, invec_lstart,
|
||||
invec_lend, invec_linc;
|
||||
if (left) {
|
||||
outside_vecs = left >> vec_width_log2;
|
||||
inside_vecs = (( width + vecwid_bitmask) >> vec_width_log2) - outside_vecs;
|
||||
outside_width = outside_vecs * vec_width;
|
||||
inside_width = inside_vecs * vec_width;
|
||||
left_offset = left;
|
||||
border_off = left;
|
||||
invec_lstart = 0;
|
||||
invec_lend = inside_vecs;
|
||||
invec_linc = 1;
|
||||
is_left_bm = -1;
|
||||
} else {
|
||||
inside_vecs = ((width - right) + vecwid_bitmask) >> vec_width_log2;
|
||||
outside_vecs = (( width + vecwid_bitmask) >> vec_width_log2) - inside_vecs;
|
||||
outside_width = outside_vecs * vec_width;
|
||||
inside_width = inside_vecs * vec_width;
|
||||
left_offset = right - width;
|
||||
border_off = width - 1 - right;
|
||||
invec_lstart = inside_vecs - 1;
|
||||
invec_lend = -1;
|
||||
invec_linc = -1;
|
||||
is_left_bm = 0;
|
||||
}
|
||||
if (width_residual_pixels) {
|
||||
epol_mask = _mm_cmpgt_epi8(epol_src_idx, ns);
|
||||
epol_mask = _mm_xor_si128 (epol_mask, is_right_border);
|
||||
left_offset &= vecwid_bitmask;
|
||||
|
||||
for (y = 0; y < height_fourline_groups; y += 4) {
|
||||
__m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + x));
|
||||
__m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + x));
|
||||
__m128i c = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * pic_stride + x));
|
||||
__m128i d = _mm_loadu_si128((__m128i *)(ref_data + (y + 1) * ref_stride + x));
|
||||
__m128i e = _mm_loadu_si128((__m128i *)(pic_data + (y + 2) * pic_stride + x));
|
||||
__m128i f = _mm_loadu_si128((__m128i *)(ref_data + (y + 2) * ref_stride + x));
|
||||
__m128i g = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * pic_stride + x));
|
||||
__m128i h = _mm_loadu_si128((__m128i *)(ref_data + (y + 3) * ref_stride + x));
|
||||
const __m128i left_offsets = _mm_set1_epi8 ((uint8_t)left_offset);
|
||||
const __m128i is_left = _mm_cmpeq_epi8(rights, _mm_setzero_si128());
|
||||
const __m128i vw_for_left = _mm_and_si128 (is_left, vec_widths);
|
||||
|
||||
__m128i border_px_b = _mm_set1_epi8 (*(uint8_t *)(ref_data + (y + 0) * ref_stride + border_idx));
|
||||
__m128i border_px_d = _mm_set1_epi8 (*(uint8_t *)(ref_data + (y + 1) * ref_stride + border_idx));
|
||||
__m128i border_px_f = _mm_set1_epi8 (*(uint8_t *)(ref_data + (y + 2) * ref_stride + border_idx));
|
||||
__m128i border_px_h = _mm_set1_epi8 (*(uint8_t *)(ref_data + (y + 3) * ref_stride + border_idx));
|
||||
// -x == (x ^ 0xff) + 1 = (x ^ 0xff) - 0xff. Also x == (x ^ 0x00) - 0x00.
|
||||
// In other words, negate left_offsets (two's complement) in the lanes where is_left is set.
|
||||
const __m128i offs_neg = _mm_xor_si128 (left_offsets, is_left);
|
||||
const __m128i offs_for_sm1 = _mm_sub_epi8 (offs_neg, is_left);
|
||||
|
||||
__m128i b_epol_1 = _mm_blendv_epi8(b, border_px_b, epol_mask);
|
||||
__m128i d_epol_1 = _mm_blendv_epi8(d, border_px_d, epol_mask);
|
||||
__m128i f_epol_1 = _mm_blendv_epi8(f, border_px_f, epol_mask);
|
||||
__m128i h_epol_1 = _mm_blendv_epi8(h, border_px_h, epol_mask);
|
||||
const __m128i ns_for_sm1 = _mm_or_si128 (vw_for_left, nslo);
|
||||
const __m128i shufmask1 = _mm_add_epi8 (ns_for_sm1, offs_for_sm1);
|
||||
|
||||
__m128i b_epol_2 = _mm_blendv_epi8(a, b_epol_1, rdmask);
|
||||
__m128i d_epol_2 = _mm_blendv_epi8(c, d_epol_1, rdmask);
|
||||
__m128i f_epol_2 = _mm_blendv_epi8(e, f_epol_1, rdmask);
|
||||
__m128i h_epol_2 = _mm_blendv_epi8(g, h_epol_1, rdmask);
|
||||
const __m128i mo2bmask_l = _mm_cmpgt_epi8(left_offsets, nslo);
|
||||
const __m128i mo2bimask_l = _mm_cmpeq_epi8(mo2bmask_l, _mm_setzero_si128());
|
||||
const __m128i mo2bimask_r = _mm_cmpgt_epi8(vec_widths, shufmask1);
|
||||
const __m128i move_old_to_b_imask = _mm_blendv_epi8(mo2bimask_r, mo2bimask_l, is_left);
|
||||
|
||||
__m128i curr_sads_ab = _mm_sad_epu8(a, b_epol_2);
|
||||
__m128i curr_sads_cd = _mm_sad_epu8(c, d_epol_2);
|
||||
__m128i curr_sads_ef = _mm_sad_epu8(e, f_epol_2);
|
||||
__m128i curr_sads_gh = _mm_sad_epu8(g, h_epol_2);
|
||||
const int32_t outvec_offset = (~is_left_bm) & inside_width;
|
||||
int32_t x, y;
|
||||
for (y = 0; y < height; y++) {
|
||||
__m128i borderpx_vec = _mm_set1_epi8(ref_data[(int32_t)((y + 0) * ref_stride + border_off)]);
|
||||
for (x = 0; x < outside_vecs; x++) {
|
||||
__m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + outvec_offset));
|
||||
|
||||
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
|
||||
sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
|
||||
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
|
||||
sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
|
||||
__m128i startoffs = _mm_set1_epi8 ((x + inside_vecs) << vec_width_log2);
|
||||
__m128i ns = _mm_add_epi8 (startoffs, nslo);
|
||||
|
||||
// Unread imask is (is_left NOR unrd_imask_for_right), do the maths etc
|
||||
__m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, ns);
|
||||
unrd_imask = _mm_or_si128 (unrd_imask, is_left);
|
||||
__m128i unrd_mask = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128());
|
||||
__m128i b_unread = _mm_blendv_epi8(borderpx_vec, a, unrd_mask);
|
||||
|
||||
__m128i sad_ab = _mm_sad_epu8 (a, b_unread);
|
||||
sse_inc = _mm_add_epi64(sse_inc, sad_ab);
|
||||
}
|
||||
if (height_residual_lines) {
|
||||
for (; y < height; y++) {
|
||||
__m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + x));
|
||||
__m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + x));
|
||||
int32_t a_off = outside_width & is_left_bm;
|
||||
int32_t leftoff_with_sign_neg = (left_offset ^ is_left_bm) - is_left_bm;
|
||||
|
||||
__m128i border_px_b = _mm_set1_epi8 (*(uint8_t *)(ref_data + (y + 0) * ref_stride + border_idx));
|
||||
__m128i b_epol_1 = _mm_blendv_epi8(b, border_px_b, epol_mask);
|
||||
__m128i b_epol_2 = _mm_blendv_epi8(a, b_epol_1, rdmask);
|
||||
__m128i old_b = borderpx_vec;
|
||||
for (x = invec_lstart; x != invec_lend; x += invec_linc) {
|
||||
__m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + a_off));
|
||||
__m128i b = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 0) * ref_stride + a_off - leftoff_with_sign_neg));
|
||||
|
||||
__m128i curr_sads_ab = _mm_sad_epu8(a, b_epol_2);
|
||||
__m128i b_shifted = _mm_shuffle_epi8(b, shufmask1);
|
||||
__m128i b_with_old = _mm_blendv_epi8 (old_b, b_shifted, move_old_to_b_imask);
|
||||
|
||||
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
|
||||
}
|
||||
uint8_t startoff = (x << vec_width_log2) + a_off;
|
||||
__m128i startoffs = _mm_set1_epi8 (startoff);
|
||||
__m128i curr_ns = _mm_add_epi8 (startoffs, nslo);
|
||||
__m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, curr_ns);
|
||||
__m128i unrd_mask = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128());
|
||||
__m128i b_unread = _mm_blendv_epi8 (b_with_old, a, unrd_mask);
|
||||
|
||||
old_b = b_shifted;
|
||||
|
||||
__m128i sad_ab = _mm_sad_epu8(a, b_unread);
|
||||
sse_inc = _mm_add_epi64(sse_inc, sad_ab);
|
||||
}
|
||||
}
|
||||
__m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
|
|
Loading…
Reference in a new issue