diff --git a/src/filter.c b/src/filter.c index 7ff98ea2..f3f2afcc 100644 --- a/src/filter.c +++ b/src/filter.c @@ -75,67 +75,80 @@ const int8_t kvz_g_chroma_filter[8][4] = // FUNCTIONS /** - * \brief + * \brief Perform in strong luma filtering in place. + * \param line line of 8 pixels, with center at index 4 + * \param tc tc treshold + * \return Reach of the filter starting from center. */ -static INLINE void kvz_filter_deblock_luma(const encoder_control_t * const encoder, - kvz_pixel *src, - int32_t offset, - int32_t tc, - int8_t sw, - int8_t part_P_nofilter, - int8_t part_Q_nofilter, - int32_t thr_cut, - int8_t filter_second_P, - int8_t filter_second_Q) +static INLINE int kvz_filter_deblock_luma_strong( + kvz_pixel *line, + int32_t tc) { - int32_t delta; + const kvz_pixel m0 = line[0]; + const kvz_pixel m1 = line[1]; + const kvz_pixel m2 = line[2]; + const kvz_pixel m3 = line[3]; + const kvz_pixel m4 = line[4]; + const kvz_pixel m5 = line[5]; + const kvz_pixel m6 = line[6]; + const kvz_pixel m7 = line[7]; - int16_t m0 = src[-offset * 4]; - int16_t m1 = src[-offset * 3]; - int16_t m2 = src[-offset * 2]; - int16_t m3 = src[-offset]; - int16_t m4 = src[0]; - int16_t m5 = src[offset]; - int16_t m6 = src[offset * 2]; - int16_t m7 = src[offset * 3]; + line[1] = CLIP(m1 - 2*tc, m1 + 2*tc, (2*m0 + 3*m1 + m2 + m3 + m4 + 4) >> 3); + line[2] = CLIP(m2 - 2*tc, m2 + 2*tc, ( m1 + m2 + m3 + m4 + 2) >> 2); + line[3] = CLIP(m3 - 2*tc, m3 + 2*tc, ( m1 + 2*m2 + 2*m3 + 2*m4 + m5 + 4) >> 3); + line[4] = CLIP(m4 - 2*tc, m4 + 2*tc, ( m2 + 2*m3 + 2*m4 + 2*m5 + m6 + 4) >> 3); + line[5] = CLIP(m5 - 2*tc, m5 + 2*tc, ( m3 + m4 + m5 + m6 + 2) >> 2); + line[6] = CLIP(m6 - 2*tc, m6 + 2*tc, ( m3 + m4 + m5 + 3*m6 + 2*m7 + 4) >> 3); - if (sw) { - src[-offset * 3] = CLIP(m1 - 2*tc, m1 + 2*tc, (2*m0 + 3*m1 + m2 + m3 + m4 + 4) >> 3); - src[-offset * 2] = CLIP(m2 - 2*tc, m2 + 2*tc, ( m1 + m2 + m3 + m4 + 2) >> 2); - src[-offset] = CLIP(m3 - 2*tc, m3 + 2*tc, ( m1 + 2*m2 + 2*m3 + 2*m4 + m5 + 4) >> 3); - src[0] = CLIP(m4 - 2*tc, m4 + 2*tc, ( m2 + 2*m3 + 2*m4 + 2*m5 + m6 + 4) >> 3); - src[offset] = CLIP(m5 - 2*tc, m5 + 2*tc, ( m3 + m4 + m5 + m6 + 2) >> 2); - src[offset * 2] = CLIP(m6 - 2*tc, m6 + 2*tc, ( m3 + m4 + m5 + 3*m6 + 2*m7 + 4) >> 3); + return 3; +} + +/** + * \brief Perform in weak luma filtering in place. + * \param encoder Encoder + * \param line Line of 8 pixels, with center at index 4 + * \param tc The tc treshold + * \param p_2nd Whether to filter the 2nd line of P + * \param q_2nd Whether to filter the 2nd line of Q + */ +static INLINE int kvz_filter_deblock_luma_weak( + const encoder_control_t * const encoder, + kvz_pixel *line, + int32_t tc, + bool p_2nd, + bool q_2nd) +{ + const kvz_pixel m1 = line[1]; + const kvz_pixel m2 = line[2]; + const kvz_pixel m3 = line[3]; + const kvz_pixel m4 = line[4]; + const kvz_pixel m5 = line[5]; + const kvz_pixel m6 = line[6]; + + int32_t delta = (9 * (m4 - m3) - 3 * (m5 - m2) + 8) >> 4; + + if (abs(delta) >= tc * 10) { + return 0; } else { - // Weak filter - delta = (9*(m4 - m3) - 3*(m5 - m2) + 8) >> 4; + int32_t tc2 = tc >> 1; + delta = CLIP(-tc, tc, delta); + line[3] = CLIP(0, (1 << encoder->bitdepth) - 1, (m3 + delta)); + line[4] = CLIP(0, (1 << encoder->bitdepth) - 1, (m4 - delta)); - if (abs(delta) < thr_cut) { - int32_t tc2 = tc >> 1; - delta = CLIP(-tc, tc, delta); - src[-offset] = CLIP(0, (1 << encoder->bitdepth) - 1, (m3 + delta)); - src[0] = CLIP(0, (1 << encoder->bitdepth) - 1, (m4 - delta)); - - if(filter_second_P) { - int32_t delta1 = CLIP(-tc2, tc2, (((m1 + m3 + 1) >> 1) - m2 + delta) >> 1); - src[-offset * 2] = CLIP(0, (1 << encoder->bitdepth) - 1, m2 + delta1); - } - if(filter_second_Q) { - int32_t delta2 = CLIP(-tc2, tc2, (((m6 + m4 + 1) >> 1) - m5 - delta) >> 1); - src[offset] = CLIP(0, (1 << encoder->bitdepth) - 1, m5 + delta2); - } + if (p_2nd) { + int32_t delta1 = CLIP(-tc2, tc2, (((m1 + m3 + 1) >> 1) - m2 + delta) >> 1); + line[2] = CLIP(0, (1 << encoder->bitdepth) - 1, m2 + delta1); + } + if (q_2nd) { + int32_t delta2 = CLIP(-tc2, tc2, (((m6 + m4 + 1) >> 1) - m5 - delta) >> 1); + line[5] = CLIP(0, (1 << encoder->bitdepth) - 1, m5 + delta2); + } + + if (p_2nd || q_2nd) { + return 2; + } else { + return 1; } - } - - if(part_P_nofilter) { - src[-offset] = (kvz_pixel)m3; - src[-offset * 2] = (kvz_pixel)m2; - src[-offset * 3] = (kvz_pixel)m1; - } - if(part_Q_nofilter) { - src[0] = (kvz_pixel)m4; - src[offset] = (kvz_pixel)m5; - src[offset * 2] = (kvz_pixel)m6; } } @@ -268,11 +281,33 @@ static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir return (qp_p + qp_q + 1) >> 1; } -static INLINE void gather_deblock_pixels(const kvz_pixel *src, int stride, int offset, kvz_pixel result[2][8]) +/** + * \brief Gather pixels needed for deblocking + */ +static INLINE void gather_deblock_pixels( + const kvz_pixel *src, + int step, + int stride, + int reach, + kvz_pixel *dst) { - for (int i = 0; i < 8; ++i) { - result[0][i] = src[(i - 4) * stride]; - result[1][i] = src[(i - 4) * stride + offset]; + for (int i = -reach; i < +reach; ++i) { + dst[i + 4] = src[i * step + stride]; + } +} + +/** +* \brief Scatter pixels +*/ +static INLINE void scatter_deblock_pixels( + const kvz_pixel *src, + int step, + int stride, + int reach, + kvz_pixel *dst) +{ + for (int i = -reach; i < +reach; ++i) { + dst[i * step + stride] = src[i + 4]; } } @@ -328,12 +363,13 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, int32_t side_threshold = (beta + (beta >>1 )) >> 3; int32_t tc_index; int32_t tc; - int32_t thr_cut; uint32_t num_4px_parts = length / 4; - const int32_t offset = (dir == EDGE_HOR) ? stride : 1; - const int32_t step = (dir == EDGE_HOR) ? 1 : stride; + // Transpose the image by swapping x and y strides when doing horizontal + // edges. + const int32_t x_stride = (dir == EDGE_VER) ? 1 : stride; + const int32_t y_stride = (dir == EDGE_VER) ? stride : 1; // TODO: add CU based QP calculation @@ -436,42 +472,51 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, tc_index = CLIP(0, 51 + 2, (int32_t)(qp + 2*(strength - 1) + (tc_offset_div2 << 1))); tc = kvz_g_tc_table_8x8[tc_index] * bitdepth_scale; - thr_cut = tc * 10; } if (strength == 0) continue; - // Gather the 6 pixels from each line required for the filter on/off - // decision. Include 2 more for the weak/strong filtering decision. - kvz_pixel b[2][8]; - if (dir == EDGE_VER) { - gather_deblock_pixels(&src[block_idx * 4 * step], 1, 3 * stride, b); - } else { - gather_deblock_pixels(&src[block_idx * 4 * step], stride, 3, b); - } + // +-- edge_src + // v + // line0 p3 p2 p1 p0 q0 q1 q2 q3 + kvz_pixel *edge_src = &src[block_idx * 4 * y_stride]; + + // Gather the lines of pixels required for the filter on/off decision. + kvz_pixel b[4][8]; + gather_deblock_pixels(edge_src, x_stride, 0 * y_stride, 4, &b[0][0]); + gather_deblock_pixels(edge_src, x_stride, 3 * y_stride, 4, &b[3][0]); int_fast32_t dp0 = abs(b[0][1] - 2 * b[0][2] + b[0][3]); int_fast32_t dq0 = abs(b[0][4] - 2 * b[0][5] + b[0][6]); - int_fast32_t dp3 = abs(b[1][1] - 2 * b[1][2] + b[1][3]); - int_fast32_t dq3 = abs(b[1][4] - 2 * b[1][5] + b[1][6]); + int_fast32_t dp3 = abs(b[3][1] - 2 * b[3][2] + b[3][3]); + int_fast32_t dq3 = abs(b[3][4] - 2 * b[3][5] + b[3][6]); int_fast32_t dp = dp0 + dp3; int_fast32_t dq = dq0 + dq3; if (dp + dq < beta) { - int8_t filter_P = (dp < side_threshold) ? 1 : 0; - int8_t filter_Q = (dq < side_threshold) ? 1 : 0; - // Strong filtering flag checking int8_t sw = 2 * (dp0 + dq0) < beta >> 2 && 2 * (dp3 + dq3) < beta >> 2 && abs(b[0][3] - b[0][4]) < (5 * tc + 1) >> 1 && - abs(b[1][3] - b[1][4]) < (5 * tc + 1) >> 1 && + abs(b[3][3] - b[3][4]) < (5 * tc + 1) >> 1 && abs(b[0][0] - b[0][3]) + abs(b[0][4] - b[0][7]) < beta >> 3 && - abs(b[1][0] - b[1][3]) + abs(b[1][4] - b[1][7]) < beta >> 3; + abs(b[3][0] - b[3][3]) + abs(b[3][4] - b[3][7]) < beta >> 3; - // Filter four rows/columns - for (int i = 0; i < 4; i++) { - kvz_filter_deblock_luma(encoder, src + step * (4*block_idx + i), offset, tc, sw, 0, 0, thr_cut, filter_P, filter_Q); + // Read lines 1 and 2. Weak filtering doesn't use the outermost pixels + // but let's give them anyway to simplify control flow. + gather_deblock_pixels(edge_src, x_stride, 1 * y_stride, 4, &b[1][0]); + gather_deblock_pixels(edge_src, x_stride, 2 * y_stride, 4, &b[2][0]); + + for (int i = 0; i < 4; ++i) { + int filter_reach; + if (sw) { + filter_reach = kvz_filter_deblock_luma_strong(&b[i][0], tc); + } else { + bool p_2nd = dp < side_threshold; + bool q_2nd = dq < side_threshold; + filter_reach = kvz_filter_deblock_luma_weak(encoder, &b[i][0], tc, p_2nd, q_2nd); + } + scatter_deblock_pixels(&b[i][0], x_stride, i * y_stride, filter_reach, edge_src); } } }