Create hor_sad_w8 and w4 epol mask the way w16 works

This commit is contained in:
Pauli Oikkonen 2019-02-06 19:26:08 +02:00
parent aa19bcac8a
commit 770db825b9

View file

@ -546,34 +546,30 @@ static uint32_t hor_sad_sse41_w4(const kvz_pixel *pic_data, const kvz_pixel *ref
int32_t height, uint32_t pic_stride, uint32_t ref_stride, int32_t height, uint32_t pic_stride, uint32_t ref_stride,
uint32_t left, uint32_t right) uint32_t left, uint32_t right)
{ {
int32_t leftoff = left; const int32_t right_border_idx = 3 - right;
int8_t border_idx; const int32_t border_idx = left ? left : right_border_idx;
if (left)
border_idx = left; const __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
else 8, 9, 10, 11, 12, 13, 14, 15);
border_idx = 3 - right;
const int32_t border_idx_negative = border_idx >> 31;
const int32_t leftoff = border_idx_negative | left;
// Dualword (ie. line) base indexes, ie. the edges the lines read will be // Dualword (ie. line) base indexes, ie. the edges the lines read will be
// clamped towards // clamped towards
const __m128i dwbaseids = _mm_setr_epi8(0, 0, 0, 0, 4, 4, 4, 4, const __m128i dwbaseids = _mm_setr_epi8(0, 0, 0, 0, 4, 4, 4, 4,
8, 8, 8, 8, 12, 12, 12, 12); 8, 8, 8, 8, 12, 12, 12, 12);
const __m128i border_idxs = _mm_set1_epi8(border_idx); __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx);
const __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, __m128i left_128 = _mm_set1_epi8((int8_t)left);
8, 9, 10, 11, 12, 13, 14, 15);
__m128i epol_mask; right_border_idxs = _mm_add_epi8 (right_border_idxs, dwbaseids);
if (left) {
__m128i mask1 = _mm_sub_epi8(ns, border_idxs); __m128i mask_right = _mm_min_epi8 (ns, right_border_idxs);
epol_mask = _mm_max_epi8(mask1, dwbaseids); __m128i mask1 = _mm_sub_epi8 (mask_right, left_128);
} else {
if (right != 4) { const __m128i epol_mask = _mm_max_epi8(mask1, dwbaseids);
__m128i border_idxs_linewise = _mm_add_epi8(border_idxs, dwbaseids);
epol_mask = _mm_min_epi8(ns, border_idxs_linewise);
} else {
epol_mask = dwbaseids;
leftoff = -1;
}
}
const int32_t height_fourline_groups = height & ~3; const int32_t height_fourline_groups = height & ~3;
const int32_t height_residual_lines = height & 3; const int32_t height_residual_lines = height & 3;
@ -614,12 +610,13 @@ static uint32_t hor_sad_sse41_w8(const kvz_pixel *pic_data, const kvz_pixel *ref
int32_t height, uint32_t pic_stride, uint32_t ref_stride, int32_t height, uint32_t pic_stride, uint32_t ref_stride,
uint32_t left, uint32_t right) uint32_t left, uint32_t right)
{ {
int32_t leftoff = left; // right is the number of overhanging pixels in the vector, so it has to be
int8_t border_idx; // handled this way to produce the index of last valid (border) pixel
if (left) const int32_t right_border_idx = 7 - right;
border_idx = left; const int32_t border_idx = left ? left : right_border_idx;
else
border_idx = 7 - right; const __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15);
// Quadword (ie. line) base indexes, ie. the edges the lines read will be // Quadword (ie. line) base indexes, ie. the edges the lines read will be
// clamped towards; higher qword (lower line) bytes tend towards 8 and lower // clamped towards; higher qword (lower line) bytes tend towards 8 and lower
@ -627,22 +624,34 @@ static uint32_t hor_sad_sse41_w8(const kvz_pixel *pic_data, const kvz_pixel *ref
const __m128i qwbaseids = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, const __m128i qwbaseids = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
8, 8, 8, 8, 8, 8, 8, 8); 8, 8, 8, 8, 8, 8, 8, 8);
const __m128i border_idxs = _mm_set1_epi8(border_idx); // Dirty hack alert! If right == block_width (ie. the entire vector is
const __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, // outside the frame), move the block offset one pixel to the left (so
8, 9, 10, 11, 12, 13, 14, 15); // that the leftmost pixel in vector is actually the valid border pixel
__m128i epol_mask; // from which we want to extrapolate), and use an epol mask that will
if (left) { // simply stretch the pixel all over the vector.
__m128i mask1 = _mm_sub_epi8(ns, border_idxs); //
epol_mask = _mm_max_epi8(mask1, qwbaseids); // To avoid a branch here:
} else { // The mask will be -1 (0xffffffff) for border_idx -1 and 0 for >= 0
if (right != 8) { const int32_t border_idx_negative = border_idx >> 31;
__m128i border_idxs_linewise = _mm_add_epi8(border_idxs, qwbaseids); const int32_t leftoff = border_idx_negative | left;
epol_mask = _mm_min_epi8(ns, border_idxs_linewise);
} else { __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx);
epol_mask = qwbaseids; __m128i left_128 = _mm_set1_epi8((int8_t)left);
leftoff = -1;
} right_border_idxs = _mm_add_epi8 (right_border_idxs, qwbaseids);
}
// If we're straddling the left border, right_border_idx is 7 and the min
// below does nothing. If we're straddling the right border, left is 0 and
// the subtraction below does nothing.
__m128i mask_right = _mm_min_epi8 (ns, right_border_idxs);
__m128i mask1 = _mm_sub_epi8 (mask_right, left_128);
// If right == 8 (we're completely outside the frame), right_border_idx is
// -1, so every byte of mask1 ends up below its row's base index (-1 in the
// low qword, 7 in the high one). Clamp those values up to qwbaseids and, as
// discussed earlier, adjust the load offset instead so that the "-1'st"
// pixel of each row is loaded; using qwbaseids as the shuffle mask then
// broadcasts that pixel all over the row.
const __m128i epol_mask = _mm_max_epi8(mask1, qwbaseids);
const __m64 epol_mask_64 = (__m64)_mm_cvtsi128_si64(epol_mask); const __m64 epol_mask_64 = (__m64)_mm_cvtsi128_si64(epol_mask);
const int32_t height_fourline_groups = height & ~3; const int32_t height_fourline_groups = height & ~3;