Refactor luma deblocking

Changes luma deblocking to use gather and scatter instead of reading
to and writing from here and there in memory. Should make them
faster and easier to vectorize, or at least cleaner.

Splits strong and weak luma deblocking to two functions, as they have
almost nothing in common.
This commit is contained in:
Ari Koivula 2017-01-17 21:31:48 +02:00
parent 4cb2fca924
commit 24a69c7467

View file

@ -75,67 +75,80 @@ const int8_t kvz_g_chroma_filter[8][4] =
// FUNCTIONS
/**
* \brief
* \brief Perform in strong luma filtering in place.
* \param line line of 8 pixels, with center at index 4
* \param tc tc treshold
* \return Reach of the filter starting from center.
*/
static INLINE void kvz_filter_deblock_luma(const encoder_control_t * const encoder,
kvz_pixel *src,
int32_t offset,
int32_t tc,
int8_t sw,
int8_t part_P_nofilter,
int8_t part_Q_nofilter,
int32_t thr_cut,
int8_t filter_second_P,
int8_t filter_second_Q)
static INLINE int kvz_filter_deblock_luma_strong(
kvz_pixel *line,
int32_t tc)
{
int32_t delta;
const kvz_pixel m0 = line[0];
const kvz_pixel m1 = line[1];
const kvz_pixel m2 = line[2];
const kvz_pixel m3 = line[3];
const kvz_pixel m4 = line[4];
const kvz_pixel m5 = line[5];
const kvz_pixel m6 = line[6];
const kvz_pixel m7 = line[7];
int16_t m0 = src[-offset * 4];
int16_t m1 = src[-offset * 3];
int16_t m2 = src[-offset * 2];
int16_t m3 = src[-offset];
int16_t m4 = src[0];
int16_t m5 = src[offset];
int16_t m6 = src[offset * 2];
int16_t m7 = src[offset * 3];
line[1] = CLIP(m1 - 2*tc, m1 + 2*tc, (2*m0 + 3*m1 + m2 + m3 + m4 + 4) >> 3);
line[2] = CLIP(m2 - 2*tc, m2 + 2*tc, ( m1 + m2 + m3 + m4 + 2) >> 2);
line[3] = CLIP(m3 - 2*tc, m3 + 2*tc, ( m1 + 2*m2 + 2*m3 + 2*m4 + m5 + 4) >> 3);
line[4] = CLIP(m4 - 2*tc, m4 + 2*tc, ( m2 + 2*m3 + 2*m4 + 2*m5 + m6 + 4) >> 3);
line[5] = CLIP(m5 - 2*tc, m5 + 2*tc, ( m3 + m4 + m5 + m6 + 2) >> 2);
line[6] = CLIP(m6 - 2*tc, m6 + 2*tc, ( m3 + m4 + m5 + 3*m6 + 2*m7 + 4) >> 3);
if (sw) {
src[-offset * 3] = CLIP(m1 - 2*tc, m1 + 2*tc, (2*m0 + 3*m1 + m2 + m3 + m4 + 4) >> 3);
src[-offset * 2] = CLIP(m2 - 2*tc, m2 + 2*tc, ( m1 + m2 + m3 + m4 + 2) >> 2);
src[-offset] = CLIP(m3 - 2*tc, m3 + 2*tc, ( m1 + 2*m2 + 2*m3 + 2*m4 + m5 + 4) >> 3);
src[0] = CLIP(m4 - 2*tc, m4 + 2*tc, ( m2 + 2*m3 + 2*m4 + 2*m5 + m6 + 4) >> 3);
src[offset] = CLIP(m5 - 2*tc, m5 + 2*tc, ( m3 + m4 + m5 + m6 + 2) >> 2);
src[offset * 2] = CLIP(m6 - 2*tc, m6 + 2*tc, ( m3 + m4 + m5 + 3*m6 + 2*m7 + 4) >> 3);
return 3;
}
/**
* \brief Perform in weak luma filtering in place.
* \param encoder Encoder
* \param line Line of 8 pixels, with center at index 4
* \param tc The tc treshold
* \param p_2nd Whether to filter the 2nd line of P
* \param q_2nd Whether to filter the 2nd line of Q
*/
static INLINE int kvz_filter_deblock_luma_weak(
const encoder_control_t * const encoder,
kvz_pixel *line,
int32_t tc,
bool p_2nd,
bool q_2nd)
{
const kvz_pixel m1 = line[1];
const kvz_pixel m2 = line[2];
const kvz_pixel m3 = line[3];
const kvz_pixel m4 = line[4];
const kvz_pixel m5 = line[5];
const kvz_pixel m6 = line[6];
int32_t delta = (9 * (m4 - m3) - 3 * (m5 - m2) + 8) >> 4;
if (abs(delta) >= tc * 10) {
return 0;
} else {
// Weak filter
delta = (9*(m4 - m3) - 3*(m5 - m2) + 8) >> 4;
int32_t tc2 = tc >> 1;
delta = CLIP(-tc, tc, delta);
line[3] = CLIP(0, (1 << encoder->bitdepth) - 1, (m3 + delta));
line[4] = CLIP(0, (1 << encoder->bitdepth) - 1, (m4 - delta));
if (abs(delta) < thr_cut) {
int32_t tc2 = tc >> 1;
delta = CLIP(-tc, tc, delta);
src[-offset] = CLIP(0, (1 << encoder->bitdepth) - 1, (m3 + delta));
src[0] = CLIP(0, (1 << encoder->bitdepth) - 1, (m4 - delta));
if(filter_second_P) {
int32_t delta1 = CLIP(-tc2, tc2, (((m1 + m3 + 1) >> 1) - m2 + delta) >> 1);
src[-offset * 2] = CLIP(0, (1 << encoder->bitdepth) - 1, m2 + delta1);
}
if(filter_second_Q) {
int32_t delta2 = CLIP(-tc2, tc2, (((m6 + m4 + 1) >> 1) - m5 - delta) >> 1);
src[offset] = CLIP(0, (1 << encoder->bitdepth) - 1, m5 + delta2);
}
if (p_2nd) {
int32_t delta1 = CLIP(-tc2, tc2, (((m1 + m3 + 1) >> 1) - m2 + delta) >> 1);
line[2] = CLIP(0, (1 << encoder->bitdepth) - 1, m2 + delta1);
}
if (q_2nd) {
int32_t delta2 = CLIP(-tc2, tc2, (((m6 + m4 + 1) >> 1) - m5 - delta) >> 1);
line[5] = CLIP(0, (1 << encoder->bitdepth) - 1, m5 + delta2);
}
if (p_2nd || q_2nd) {
return 2;
} else {
return 1;
}
}
if(part_P_nofilter) {
src[-offset] = (kvz_pixel)m3;
src[-offset * 2] = (kvz_pixel)m2;
src[-offset * 3] = (kvz_pixel)m1;
}
if(part_Q_nofilter) {
src[0] = (kvz_pixel)m4;
src[offset] = (kvz_pixel)m5;
src[offset * 2] = (kvz_pixel)m6;
}
}
@ -268,11 +281,33 @@ static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir
return (qp_p + qp_q + 1) >> 1;
}
static INLINE void gather_deblock_pixels(const kvz_pixel *src, int stride, int offset, kvz_pixel result[2][8])
/**
* \brief Gather pixels needed for deblocking
*/
static INLINE void gather_deblock_pixels(
const kvz_pixel *src,
int step,
int stride,
int reach,
kvz_pixel *dst)
{
for (int i = 0; i < 8; ++i) {
result[0][i] = src[(i - 4) * stride];
result[1][i] = src[(i - 4) * stride + offset];
for (int i = -reach; i < +reach; ++i) {
dst[i + 4] = src[i * step + stride];
}
}
/**
* \brief Scatter pixels
*/
static INLINE void scatter_deblock_pixels(
const kvz_pixel *src,
int step,
int stride,
int reach,
kvz_pixel *dst)
{
for (int i = -reach; i < +reach; ++i) {
dst[i * step + stride] = src[i + 4];
}
}
@ -328,12 +363,13 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
int32_t side_threshold = (beta + (beta >>1 )) >> 3;
int32_t tc_index;
int32_t tc;
int32_t thr_cut;
uint32_t num_4px_parts = length / 4;
const int32_t offset = (dir == EDGE_HOR) ? stride : 1;
const int32_t step = (dir == EDGE_HOR) ? 1 : stride;
// Transpose the image by swapping x and y strides when doing horizontal
// edges.
const int32_t x_stride = (dir == EDGE_VER) ? 1 : stride;
const int32_t y_stride = (dir == EDGE_VER) ? stride : 1;
// TODO: add CU based QP calculation
@ -436,42 +472,51 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
tc_index = CLIP(0, 51 + 2, (int32_t)(qp + 2*(strength - 1) + (tc_offset_div2 << 1)));
tc = kvz_g_tc_table_8x8[tc_index] * bitdepth_scale;
thr_cut = tc * 10;
}
if (strength == 0) continue;
// Gather the 6 pixels from each line required for the filter on/off
// decision. Include 2 more for the weak/strong filtering decision.
kvz_pixel b[2][8];
if (dir == EDGE_VER) {
gather_deblock_pixels(&src[block_idx * 4 * step], 1, 3 * stride, b);
} else {
gather_deblock_pixels(&src[block_idx * 4 * step], stride, 3, b);
}
// +-- edge_src
// v
// line0 p3 p2 p1 p0 q0 q1 q2 q3
kvz_pixel *edge_src = &src[block_idx * 4 * y_stride];
// Gather the lines of pixels required for the filter on/off decision.
kvz_pixel b[4][8];
gather_deblock_pixels(edge_src, x_stride, 0 * y_stride, 4, &b[0][0]);
gather_deblock_pixels(edge_src, x_stride, 3 * y_stride, 4, &b[3][0]);
int_fast32_t dp0 = abs(b[0][1] - 2 * b[0][2] + b[0][3]);
int_fast32_t dq0 = abs(b[0][4] - 2 * b[0][5] + b[0][6]);
int_fast32_t dp3 = abs(b[1][1] - 2 * b[1][2] + b[1][3]);
int_fast32_t dq3 = abs(b[1][4] - 2 * b[1][5] + b[1][6]);
int_fast32_t dp3 = abs(b[3][1] - 2 * b[3][2] + b[3][3]);
int_fast32_t dq3 = abs(b[3][4] - 2 * b[3][5] + b[3][6]);
int_fast32_t dp = dp0 + dp3;
int_fast32_t dq = dq0 + dq3;
if (dp + dq < beta) {
int8_t filter_P = (dp < side_threshold) ? 1 : 0;
int8_t filter_Q = (dq < side_threshold) ? 1 : 0;
// Strong filtering flag checking
int8_t sw = 2 * (dp0 + dq0) < beta >> 2 &&
2 * (dp3 + dq3) < beta >> 2 &&
abs(b[0][3] - b[0][4]) < (5 * tc + 1) >> 1 &&
abs(b[1][3] - b[1][4]) < (5 * tc + 1) >> 1 &&
abs(b[3][3] - b[3][4]) < (5 * tc + 1) >> 1 &&
abs(b[0][0] - b[0][3]) + abs(b[0][4] - b[0][7]) < beta >> 3 &&
abs(b[1][0] - b[1][3]) + abs(b[1][4] - b[1][7]) < beta >> 3;
abs(b[3][0] - b[3][3]) + abs(b[3][4] - b[3][7]) < beta >> 3;
// Filter four rows/columns
for (int i = 0; i < 4; i++) {
kvz_filter_deblock_luma(encoder, src + step * (4*block_idx + i), offset, tc, sw, 0, 0, thr_cut, filter_P, filter_Q);
// Read lines 1 and 2. Weak filtering doesn't use the outermost pixels
// but let's give them anyway to simplify control flow.
gather_deblock_pixels(edge_src, x_stride, 1 * y_stride, 4, &b[1][0]);
gather_deblock_pixels(edge_src, x_stride, 2 * y_stride, 4, &b[2][0]);
for (int i = 0; i < 4; ++i) {
int filter_reach;
if (sw) {
filter_reach = kvz_filter_deblock_luma_strong(&b[i][0], tc);
} else {
bool p_2nd = dp < side_threshold;
bool q_2nd = dq < side_threshold;
filter_reach = kvz_filter_deblock_luma_weak(encoder, &b[i][0], tc, p_2nd, q_2nd);
}
scatter_deblock_pixels(&b[i][0], x_stride, i * y_stride, filter_reach, edge_src);
}
}
}