Merge branch 'deblock_fix' into 'master'

[Deblock] Fix deblock when using inter. See merge request cs/ultravideo/vvc/uvg266!8
2024-11-30 12:44:07 +00:00 · 2021-12-31 14:41:06 +02:00 · 2021-12-31 14:41:06 +02:00 · 25a8a40de0
parent 67ba444884 57d8623931
commit 25a8a40de0
4 changed files with 51 additions and 32 deletions
--- a/src/filter.c
+++ b/src/filter.c
@ -374,6 +374,21 @@ static INLINE void gather_deblock_pixels(
  }
 }

+/**
+* \brief Gather numel pixels from src into dst: dst[i] = src[i * step + stride].
+*/
+static INLINE void gather_pixels(
+    const kvz_pixel *src, // base pointer of the source pixels; only read
+    int step,             // distance in src between consecutive gathered pixels
+    int stride,           // constant offset added to every src index (not scaled by i)
+    int numel,            // number of pixels to copy
+    kvz_pixel *dst)       // destination; elements dst[0] .. dst[numel - 1] are written
+{
+  for (int i = 0; i < numel; ++i) {
+    dst[i] = src[i * step + stride];
+  }
+}
+
 /**
 * \brief Scatter pixels
 */
@ -693,8 +708,8 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
    int32_t tc_index;
    int32_t tc;

-    //Deblock adapted to halve pixel mvd. TODO: Tie into actual number of fractional mv bits
-    const int16_t mvdThreashold = 2; //(1 << (MV_INTERNAL_FRACTIONAL_BITS - 1))
+    //Deblock adapted to half-pixel mvd.
+    const int16_t mvdThreashold = 1 << (INTERNAL_MV_PREC - 1);

    uint32_t num_4px_parts  = length / 4;

@ -722,7 +737,7 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
        } else {
          x_coord = x + 4 * block_idx;
          cu_p = kvz_cu_array_at(frame->cu_array, x_coord, y - 1);
-          cu_q = kvz_cu_array_at(frame->cu_array, x_coord, y    );
+          cu_q = kvz_cu_array_at(frame->cu_array, x_coord, y);
        }

        bool nonzero_coeffs = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_Y)
@ -730,24 +745,15 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,

        // Filter strength
        strength = 0;
-        if (cu_q->type == CU_INTRA || cu_p->type == CU_INTRA) {
+        if (cu_q->type == CU_INTRA || cu_p->type == CU_INTRA) { // Intra is used
          strength = 2;
-        } else if (tu_boundary && nonzero_coeffs) {
+        }
+        else if (tu_boundary && nonzero_coeffs) {
          // Non-zero residual/coeffs and transform boundary
          // Neither CU is intra so tr_depth <= MAX_DEPTH.
          strength = 1;
-        } else if (cu_p->inter.mv_dir != 3 && cu_q->inter.mv_dir != 3 &&
-                 ((abs(cu_q->inter.mv[cu_q->inter.mv_dir - 1][0] - cu_p->inter.mv[cu_p->inter.mv_dir - 1][0]) >= mvdThreashold) ||
-                  (abs(cu_q->inter.mv[cu_q->inter.mv_dir - 1][1] - cu_p->inter.mv[cu_p->inter.mv_dir - 1][1]) >= mvdThreashold))) {
-          // Absolute motion vector diff between blocks >= 0.5 (Integer pixel)
-          strength = 1;
-        } else if (cu_p->inter.mv_dir != 3 && cu_q->inter.mv_dir != 3 &&
-                   cu_q->inter.mv_ref[cu_q->inter.mv_dir - 1] != cu_p->inter.mv_ref[cu_p->inter.mv_dir - 1]) {
-          strength = 1;
        }
-        
-        // B-slice related checks
-        if(!strength && state->frame->slicetype == KVZ_SLICE_B) {
+        else if(cu_p->inter.mv_dir == 3 || cu_q->inter.mv_dir == 3 || state->frame->slicetype == KVZ_SLICE_B) { // B-slice related checks. TODO: Need to account for cu_p being in another slice?

          // Zero all undefined motion vectors for easier usage
          if(!(cu_q->inter.mv_dir & 1)) {
@ -807,6 +813,17 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
            strength = 1;
          }
        }
+        else /*if (cu_p->inter.mv_dir != 3 && cu_q->inter.mv_dir != 3)*/ { //is P-slice
+          if (cu_q->inter.mv_ref[cu_q->inter.mv_dir - 1] != cu_p->inter.mv_ref[cu_p->inter.mv_dir - 1]) {
+            // Reference pictures are different
+            strength = 1;
+          } else if (
+            ((abs(cu_q->inter.mv[cu_q->inter.mv_dir - 1][0] - cu_p->inter.mv[cu_p->inter.mv_dir - 1][0]) >= mvdThreashold) ||
+            (abs(cu_q->inter.mv[cu_q->inter.mv_dir - 1][1] - cu_p->inter.mv[cu_p->inter.mv_dir - 1][1]) >= mvdThreashold))) {
+            // Absolute motion vector diff between blocks >= 0.5 (Integer pixel)
+            strength = 1;
+          }
+        }
      
        tc_index        = CLIP(0, MAX_QP + 2, (int32_t)(qp + 2*(strength - 1) + (tc_offset_div2 << 1)));
        tc              = lumaBitdepth < 10 ? ((kvz_g_tc_table_8x8[tc_index] + (1 << (9 - lumaBitdepth))) >> (10 - lumaBitdepth))
@ -867,19 +884,20 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
        int_fast32_t dp3L = dp3;
        int_fast32_t dq3L = dq3;
        
+        //In case of large blocks, need to gather extra pixels
        //bL:
        //line0 p7 p6 p5 p4 q4 q5 q6 q7
        kvz_pixel bL[4][8];

        if (is_side_P_large) {
-          gather_deblock_pixels(edge_src - 6 * x_stride, x_stride, 0 * y_stride, 2, &bL[0][0]/* - 2 */);
-          gather_deblock_pixels(edge_src - 6 * x_stride, x_stride, 3 * y_stride, 2, &bL[3][0]/* - 2 */);
+          gather_pixels(edge_src - 8 * x_stride, x_stride, 0 * y_stride, 4, &bL[0][0]);
+          gather_pixels(edge_src - 8 * x_stride, x_stride, 3 * y_stride, 4, &bL[3][0]);
          dp0L = (dp0L + abs(bL[0][2] - 2 * bL[0][3] + b[0][0]) + 1) >> 1;
          dp3L = (dp3L + abs(bL[3][2] - 2 * bL[3][3] + b[3][0]) + 1) >> 1;
        }
        if (is_side_Q_large) {
-          gather_deblock_pixels(edge_src + 6 * x_stride, x_stride, 0 * y_stride, 2, &bL[0][2]);
-          gather_deblock_pixels(edge_src + 6 * x_stride, x_stride, 3 * y_stride, 2, &bL[3][2]);
+          gather_pixels(edge_src + 4 * x_stride, x_stride, 0 * y_stride, 4, &bL[0][4]);
+          gather_pixels(edge_src + 4 * x_stride, x_stride, 3 * y_stride, 4, &bL[3][4]);
          dq0L = (dq0L + abs(b[0][7] - 2 * bL[0][4] + bL[0][5]) + 1) >> 1;
          dq3L = (dq3L + abs(b[3][7] - 2 * bL[3][4] + bL[3][5]) + 1) >> 1;
        }
@ -897,13 +915,13 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
            gather_deblock_pixels(edge_src, x_stride, 2 * y_stride, 4, &b[2][0]);
            if (is_side_P_large)
            {
-              gather_deblock_pixels(edge_src - 6 * x_stride, x_stride, 1 * y_stride, 2, &bL[1][0] - 2);
-              gather_deblock_pixels(edge_src - 6 * x_stride, x_stride, 2 * y_stride, 2, &bL[2][0] - 2);
+              gather_pixels(edge_src - 8 * x_stride, x_stride, 1 * y_stride, 4, &bL[1][0]);
+              gather_pixels(edge_src - 8 * x_stride, x_stride, 2 * y_stride, 4, &bL[2][0]);
            }
            if (is_side_Q_large)
            {
-              gather_deblock_pixels(edge_src + 6 * x_stride, x_stride, 1 * y_stride, 2, &bL[1][2]);
-              gather_deblock_pixels(edge_src + 6 * x_stride, x_stride, 2 * y_stride, 2, &bL[2][2]);
+              gather_pixels(edge_src + 4 * x_stride, x_stride, 1 * y_stride, 4, &bL[1][4]);
+              gather_pixels(edge_src + 4 * x_stride, x_stride, 2 * y_stride, 4, &bL[2][4]);
            }

            for (int i = 0; i < 4; ++i) {
@ -1302,7 +1320,6 @@ static void filter_deblock_lcu_rightmost(encoder_state_t * const state,
 // - Strength calculation to include average Luma level (Luma Adaptive Deblocing Filter LADF) (optional)
 // - Deblocking strength for CIIP and IBC modes (CIIP/IBC not currently used)
 // - Handle new prediction modes (i.e. PLT) (PLT not currently used)
-// - Luma deblocking on a 4x4 grid
 // - Deblocking filter for subblock boundaries
 // - Allow loop filtering across slice/tile boundaries?
 void kvz_filter_deblock_lcu(encoder_state_t * const state, int x_px, int y_px)
--- a/src/search.c
+++ b/src/search.c
@ -200,8 +200,10 @@ static void lcu_fill_cbf(lcu_t *lcu, int x_local, int y_local, int width, cu_inf
      cu_info_t *cu_from = LCU_GET_CU_AT_PX(lcu, x & mask, y & mask);
      cu_info_t *cu_to   = LCU_GET_CU_AT_PX(lcu, x, y);
      if (cu_from != cu_to) {
-        // Chroma coeff data is not used, luma is needed for deblocking
+        // Chroma and luma coeff data is needed for deblocking
        cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_Y);
+        cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_U);
+        cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_V);
      }
    }
  }
--- a/tests/test_slices.sh
+++ b/tests/test_slices.sh
@ -3,6 +3,6 @@
 set -eu
 . "${0%/*}/util.sh"

-valgrind_test 512x256 10 yuv420p --threads=2 --owf=1 --preset=ultrafast --tiles=2x2 --no-deblock
+valgrind_test 512x256 10 yuv420p --threads=2 --owf=1 --preset=ultrafast --tiles=2x2
 #valgrind_test 264x130 10 --threads=2 --owf=1 --preset=ultrafast --slices=wpp
 #if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 20 --threads=2 --owf=1 --preset=fast --slices=wpp --no-open-gop; fi
--- a/tests/test_tools.sh
+++ b/tests/test_tools.sh
@ -5,7 +5,7 @@
 set -eu
 . "${0%/*}/util.sh"

-common_args='264x128 10 yuv420p -p0 -r1 --threads=2 --wpp --owf=1 --rd=0 --pu-depth-inter 0-3 --no-bipred --no-tmvp --no-deblock --gop=0'
+common_args='264x128 10 yuv420p -p0 -r1 --threads=2 --wpp --owf=1 --rd=0 --pu-depth-inter 0-3 --no-bipred --no-tmvp --gop=0'

 valgrind_test $common_args --no-rdoq --no-deblock --no-sao --no-signhide --subme=1 --pu-depth-intra=2-3
 valgrind_test $common_args --no-rdoq --no-signhide --subme=0 --bipred