From b49d32af214b6ea24b02eb22bdf14853132c70c7 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Wed, 29 Jun 2022 08:59:20 +0300
Subject: [PATCH] [ibc] Add IBC buffers

---
 src/cu.h                        |  2 +-
 src/encode_coding_tree.c        | 95 +-------------------------------
 src/encoder_state-ctors_dtors.c | 21 +++++++
 src/encoderstate.c              | 52 +++++++++++++++++
 src/encoderstate.h              |  3 -
 src/global.h                    |  4 +-
 src/inter.c                     | 98 +++++++++++++++++++--------------
 src/search.c                    |  8 +--
 src/videoframe.h                |  3 +
 9 files changed, 143 insertions(+), 143 deletions(-)

diff --git a/src/cu.h b/src/cu.h
index e3555d08..ddddaf55 100644
--- a/src/cu.h
+++ b/src/cu.h
@@ -147,7 +147,7 @@ enum uvg_tree_type {
  */
 typedef struct
 {
-  uint8_t type        : 2; //!< \brief block type, one of cu_type_t values
+  uint8_t type        : 3; //!< \brief block type, one of cu_type_t values
   uint8_t depth       : 3; //!< \brief depth / size of this block
   uint8_t part_size   : 3; //!< \brief partition mode, one of part_mode_t values
   uint8_t tr_depth    : 3; //!< \brief transform depth
diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c
index cb27099b..fa73e08e 100644
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@@ -1262,95 +1262,6 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state,
   if (cabac->only_count && bits_out) *bits_out += bits;
 }
 
-/**
-static void encode_part_mode(encoder_state_t * const state,
-                             cabac_data_t * const cabac,
-                             const cu_info_t * const cur_cu,
-                             int depth)
-{
-  // Binarization from Table 9-34 of the HEVC spec:
-  //
-  //                |   log2CbSize >     |    log2CbSize ==
-  //                |   MinCbLog2SizeY   |    MinCbLog2SizeY
-  // -------+-------+----------+---------+-----------+----------
-  //  pred  | part  | AMP      | AMP     |           |
-  //  mode  | mode  | disabled | enabled | size == 8 | size > 8
-  // -------+-------+----------+---------+-----------+----------
-  //  intra | 2Nx2N |        -         - |         1          1
-  //        |   NxN |        -         - |         0          0
-  // -------+-------+--------------------+----------------------
-  //  inter | 2Nx2N |        1         1 |         1          1
-  //        |  2NxN |       01       011 |        01         01
-  //        |  Nx2N |       00       001 |        00        001
-  //        |   NxN |        -         - |         -        000
-  //        | 2NxnU |        -      0100 |         -          -
-  //        | 2NxnD |        -      0101 |         -          -
-  //        | nLx2N |        -      0000 |         -          -
-  //        | nRx2N |        -      0001 |         -          -
-  // -------+-------+--------------------+----------------------
-  //
-  //
-  // Context indices from Table 9-37 of the HEVC spec:
-  //
-  //                                      binIdx
-  //                               |  0  1  2       3
-  // ------------------------------+------------------
-  //  log2CbSize == MinCbLog2SizeY |  0  1  2  bypass
-  //  log2CbSize >  MinCbLog2SizeY |  0  1  3  bypass
-  // ------------------------------+------------------
-  double bits = 0;
-  if (cur_cu->type == CU_INTRA) {
-    if (depth == MAX_DEPTH) {
-      cabac->cur_ctx = &(cabac->ctx.part_size_model[0]);
-      if (cur_cu->part_size == SIZE_2Nx2N) {
-        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N");
-      } else {
-        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode NxN");
-      }
-    }
-  } else {
-
-    cabac->cur_ctx = &(cabac->ctx.part_size_model[0]);
-    if (cur_cu->part_size == SIZE_2Nx2N) {
-      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N");
-      return bits;
-    }
-    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode split");
-
-    cabac->cur_ctx = &(cabac->ctx.part_size_model[1]);
-    if (cur_cu->part_size == SIZE_2NxN ||
-        cur_cu->part_size == SIZE_2NxnU ||
-        cur_cu->part_size == SIZE_2NxnD) {
-      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 1, bits, "part_mode vertical");
-    } else {
-      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 0, bits, "part_mode horizontal");
-    }
-
-    if (state->encoder_control->cfg.amp_enable && depth < MAX_DEPTH) {
-      cabac->cur_ctx = &(cabac->ctx.part_size_model[3]);
-
-      if (cur_cu->part_size == SIZE_2NxN ||
-          cur_cu->part_size == SIZE_Nx2N) {
-        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 1, bits, "part_mode SMP");
-        return bits;
-      }
-      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 0, bits, "part_mode AMP");
-
-      if (cur_cu->part_size == SIZE_2NxnU ||
-          cur_cu->part_size == SIZE_nLx2N) {
-        CABAC_BINS_EP(cabac, 0, 1, "part_mode AMP");
-        if(cabac->only_count) bits += 1;
-      } else {
-        CABAC_BINS_EP(cabac, 1, 1, "part_mode AMP");
-        if(cabac->only_count) bits += 1;
-      }
-    }
-  }
-  return bits;
-}
-**/
-
-
 bool uvg_write_split_flag(
   const encoder_state_t * const state,
   cabac_data_t* cabac,
@@ -1684,7 +1595,7 @@ void uvg_encode_coding_tree(
   } else 
 #endif
 
-  if (cur_cu->type == CU_INTER) {
+  if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) {
     uint8_t imv_mode = UVG_IMV_OFF;
     
     const int num_pu = uvg_part_mode_num_parts[cur_cu->part_size];
@@ -1706,10 +1617,10 @@ void uvg_encode_coding_tree(
     // 0 = off, 1 = fullpel, 2 = 4-pel, 3 = half-pel
     if (ctrl->cfg.amvr && non_zero_mvd) {
       cabac->cur_ctx = &(cabac->ctx.imv_flag[0]);
-      CABAC_BIN(cabac, (imv_mode > UVG_IMV_OFF), "imv_flag");
+      if(cur_cu->type != CU_IBC) CABAC_BIN(cabac, (imv_mode > UVG_IMV_OFF), "imv_flag");
       if (imv_mode > UVG_IMV_OFF) {
         cabac->cur_ctx = &(cabac->ctx.imv_flag[4]);
-        CABAC_BIN(cabac, (imv_mode < UVG_IMV_HPEL), "imv_flag");
+        if(cur_cu->type != CU_IBC) CABAC_BIN(cabac, (imv_mode < UVG_IMV_HPEL), "imv_flag");
         if (imv_mode < UVG_IMV_HPEL) {
           cabac->cur_ctx = &(cabac->ctx.imv_flag[1]);
           CABAC_BIN(cabac, (imv_mode > UVG_IMV_FPEL), "imv_flag"); // 1 indicates 4PEL, 0 FPEL
diff --git a/src/encoder_state-ctors_dtors.c b/src/encoder_state-ctors_dtors.c
index bb1300af..037f61d8 100644
--- a/src/encoder_state-ctors_dtors.c
+++ b/src/encoder_state-ctors_dtors.c
@@ -122,6 +122,18 @@ static int encoder_state_config_tile_init(encoder_state_t * const state,
   state->tile->frame->hmvp_lut = malloc(sizeof(cu_info_t) * height_in_lcu * MAX_NUM_HMVP_CANDS);
   state->tile->frame->hmvp_size = calloc(1, sizeof(uint8_t) * height_in_lcu);
 
+  if (state->encoder_control->cfg.ibc) {
+    // Allocate an IBC pixel buffer for each LCU row; U and V point into the same allocation as Y, so only the Y pointer owns memory
+    state->tile->frame->ibc_buffer_y = malloc(sizeof(uvg_pixel*) * state->tile->frame->height_in_lcu);
+    state->tile->frame->ibc_buffer_u = malloc(sizeof(uvg_pixel*) * state->tile->frame->height_in_lcu);
+    state->tile->frame->ibc_buffer_v = malloc(sizeof(uvg_pixel*) * state->tile->frame->height_in_lcu);
+    for (uint32_t i = 0; i < state->tile->frame->height_in_lcu; i++) {
+      state->tile->frame->ibc_buffer_y[i] = malloc(sizeof(uvg_pixel) * IBC_BUFFER_SIZE * 3); // Three planes of IBC_BUFFER_SIZE pixels (sized in pixels, not bytes). ToDo: we don't need this much, but it would also support 4:4:4
+      state->tile->frame->ibc_buffer_u[i] = &state->tile->frame->ibc_buffer_y[i][IBC_BUFFER_SIZE];
+      state->tile->frame->ibc_buffer_v[i] = &state->tile->frame->ibc_buffer_y[i][IBC_BUFFER_SIZE * 2];
+    }
+  }
+
   state->tile->frame->rec = NULL;
   
   state->tile->frame->source = NULL;
@@ -197,6 +209,15 @@ static void encoder_state_config_tile_finalize(encoder_state_t * const state) {
   FREE_POINTER(state->tile->frame->hmvp_lut);
   FREE_POINTER(state->tile->frame->hmvp_size);
 
+  if (state->encoder_control->cfg.ibc) {
+    for (uint32_t i = 0; i < state->tile->frame->height_in_lcu; i++) {
+      FREE_POINTER(state->tile->frame->ibc_buffer_y[i]);
+    }
+    FREE_POINTER(state->tile->frame->ibc_buffer_y);
+    FREE_POINTER(state->tile->frame->ibc_buffer_u);
+    FREE_POINTER(state->tile->frame->ibc_buffer_v);
+  }
+
   uvg_videoframe_free(state->tile->frame);
   state->tile->frame = NULL;
   FREE_POINTER(state->tile->wf_jobs);
diff --git a/src/encoderstate.c b/src/encoderstate.c
index 9bed1b86..bee55980 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -250,6 +250,58 @@ static void encoder_state_recdata_to_bufs(encoder_state_t * const state,
                       frame->rec->stride / 2, 1);
     }
   }
+
+  // Fill the IBC buffer of this LCU row with the reconstructed pixels of the current LCU
+  if (state->encoder_control->cfg.ibc) {
+    // Destination column: the LCU's own x position while it still fits in the buffer, otherwise the last LCU-wide slot
+    uint32_t ibc_buffer_pos_x = lcu->position_px.x + LCU_WIDTH > IBC_BUFFER_WIDTH ? IBC_BUFFER_WIDTH - LCU_WIDTH: lcu->position_px.x;
+    uint32_t ibc_buffer_pos_x_c = ibc_buffer_pos_x >> 1; // NOTE(review): halving assumes 2x chroma subsampling - confirm for non-4:2:0 input
+    uint32_t ibc_buffer_row     = lcu->position_px.y / LCU_WIDTH; // index of the per-LCU-row buffer
+
+    // If the LCU does not fit in the buffer, shift every line left by LCU_WIDTH pixels (LCU_WIDTH_C for chroma), overwriting the leftmost columns
+    if (lcu->position_px.x + LCU_WIDTH > IBC_BUFFER_WIDTH) {
+      for (uint32_t i = 0; i < LCU_WIDTH; i++) {
+        memmove(
+          &frame->ibc_buffer_y[ibc_buffer_row][i * IBC_BUFFER_WIDTH],
+          &frame->ibc_buffer_y[ibc_buffer_row][i * IBC_BUFFER_WIDTH + LCU_WIDTH],
+          sizeof(uvg_pixel) * (IBC_BUFFER_WIDTH - LCU_WIDTH));
+      }
+      if (state->encoder_control->chroma_format != UVG_CSP_400) {
+        for (uint32_t i = 0; i < LCU_WIDTH_C; i++) {
+          memmove(
+            &frame->ibc_buffer_u[ibc_buffer_row][i * IBC_BUFFER_WIDTH_C],
+            &frame->ibc_buffer_u[ibc_buffer_row]
+                                [i * IBC_BUFFER_WIDTH_C + LCU_WIDTH_C],
+            sizeof(uvg_pixel) * (IBC_BUFFER_WIDTH_C - LCU_WIDTH_C));
+          memmove(
+            &frame->ibc_buffer_v[ibc_buffer_row][i * IBC_BUFFER_WIDTH_C],
+            &frame->ibc_buffer_v[ibc_buffer_row]
+                                [i * IBC_BUFFER_WIDTH_C + LCU_WIDTH_C],
+            sizeof(uvg_pixel) * (IBC_BUFFER_WIDTH_C - LCU_WIDTH_C));
+        }
+      }
+    }
+    // Clip the copied block to what remains of the frame to the right of and below the LCU origin
+    const uint32_t ibc_block_width = MIN(LCU_WIDTH, (state->tile->frame->width-lcu->position_px.x));
+    const uint32_t ibc_block_height = MIN(LCU_WIDTH, (state->tile->frame->height-lcu->position_px.y));
+
+    uvg_pixels_blit(&frame->rec->y[lcu->position_px.y * frame->rec->stride + lcu->position_px.x],
+                    &frame->ibc_buffer_y[ibc_buffer_row][ibc_buffer_pos_x],
+                    ibc_block_width, ibc_block_height,
+                    frame->rec->stride, IBC_BUFFER_WIDTH);
+    // Chroma planes are skipped for 4:0:0; NOTE(review): the >> 1 scaling below assumes chroma is subsampled by 2 in both directions - confirm
+    if (state->encoder_control->chroma_format != UVG_CSP_400) {
+       uvg_pixels_blit(&frame->rec->u[(lcu->position_px.y >> 1) * (frame->rec->stride >> 1) + (lcu->position_px.x >> 1)],
+                       &frame->ibc_buffer_u[ibc_buffer_row][ibc_buffer_pos_x_c],
+                       ibc_block_width>>1, ibc_block_height>>1,
+                       frame->rec->stride >> 1, IBC_BUFFER_WIDTH_C);
+       uvg_pixels_blit(&frame->rec->v[(lcu->position_px.y >> 1) * (frame->rec->stride >> 1) + (lcu->position_px.x >> 1)],
+                       &frame->ibc_buffer_v[ibc_buffer_row][ibc_buffer_pos_x_c],
+                       ibc_block_width>>1, ibc_block_height>>1,
+                       frame->rec->stride >> 1, IBC_BUFFER_WIDTH_C);
+
+     }
+  }
   
 }
 
diff --git a/src/encoderstate.h b/src/encoderstate.h
index 40e1dc24..55d265e3 100644
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@@ -192,9 +192,6 @@ typedef struct encoder_state_config_frame_t {
   double *c_para;
   double *k_para;
 
-
-  cu_info_t* hmvp_lut; //!< \brief Look-up table for HMVP, one for each LCU row
-  uint8_t* hmvp_size; //!< \brief HMVP LUT size
   bool jccr_sign; 
 
 } encoder_state_config_frame_t;
diff --git a/src/global.h b/src/global.h
index 1c2da76f..773f9c15 100644
--- a/src/global.h
+++ b/src/global.h
@@ -176,7 +176,6 @@ typedef int32_t mv_t;
 //! pow(2, MIN_SIZE)
 #define CU_MIN_SIZE_PIXELS (1 << MIN_SIZE)
 
-//! Round frame size up to this interval (8 pixels)
 #define CONF_WINDOW_PAD_IN_PIXELS ((1 << MIN_SIZE)<<1)
 
 //! spec: CtbSizeY
@@ -259,6 +258,9 @@ typedef int32_t mv_t;
  *
  */
 #define IBC_MRG_MAX_NUM_CANDS 6
+#define IBC_BUFFER_SIZE       (256*128)
+#define IBC_BUFFER_WIDTH      (IBC_BUFFER_SIZE / LCU_WIDTH)
+#define IBC_BUFFER_WIDTH_C    ((IBC_BUFFER_SIZE / LCU_WIDTH) >> 1)
 
 
 #define MAX_NUM_HMVP_CANDS 5
diff --git a/src/inter.c b/src/inter.c
index f89ddf50..7f4c81bf 100644
--- a/src/inter.c
+++ b/src/inter.c
@@ -626,49 +626,63 @@ void uvg_inter_pred_pu(const encoder_state_t * const state,
   const int pu_h = PU_GET_H(cu->part_size, width, i_pu);
   cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y));
 
-  if (pu->inter.mv_dir == 3) {
-    const uvg_picture *const refs[2] = {
-      state->frame->ref->images[
-        state->frame->ref_LX[0][
-          pu->inter.mv_ref[0]]],
-      state->frame->ref->images[
-        state->frame->ref_LX[1][
-          pu->inter.mv_ref[1]]],
-    };
-    uvg_inter_recon_bipred(state,
-      refs[0], refs[1],
-      pu_x, pu_y,
-      pu_w, pu_h,
-      pu->inter.mv,
-      lcu,
-      predict_luma, predict_chroma);
+  if (pu->type == CU_IBC) {
+    const int offset = x_scu + y_scu * LCU_WIDTH;
+    const int offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C;
+    uvg_pixels_blit(lcu->rec.y + offset, lcu->rec.y + offset, width, width, LCU_WIDTH, LCU_WIDTH);
+    uvg_pixels_blit(lcu->rec.u + offset_c, lcu->rec.joint_u + offset_c, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C);
+    uvg_pixels_blit(lcu->rec.v + offset_c, lcu->rec.joint_v + offset_c, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C);
+  } else {
+
+    if (pu->inter.mv_dir == 3) {
+      const uvg_picture * const refs[2] = {
+        state->frame->ref->images[state->frame->ref_LX[0][pu->inter.mv_ref[0]]],
+        state->frame->ref->images[state->frame->ref_LX[1][pu->inter.mv_ref[1]]],
+      };
+      uvg_inter_recon_bipred(
+        state,
+        refs[0],
+        refs[1],
+        pu_x,
+        pu_y,
+        pu_w,
+        pu_h,
+        pu->inter.mv,
+        lcu,
+        predict_luma,
+        predict_chroma);
+    } else {
+      const int                 mv_idx = pu->inter.mv_dir - 1;
+      const uvg_picture * const ref =
+        (cu->type == CU_IBC) ?
+          state->tile->frame->rec :
+          (state->frame->ref
+             ->images[state->frame->ref_LX[mv_idx][pu->inter.mv_ref[mv_idx]]]);
+
+      const unsigned offset_luma = SUB_SCU(pu_y) * LCU_WIDTH + SUB_SCU(pu_x);
+      const unsigned offset_chroma =
+        SUB_SCU(pu_y) / 2 * LCU_WIDTH_C + SUB_SCU(pu_x) / 2;
+      yuv_t lcu_adapter; // View into the LCU reconstruction planes, starting at this PU's offset
+      lcu_adapter.size = pu_w * pu_h;
+      lcu_adapter.y    = lcu->rec.y + offset_luma;
+      lcu_adapter.u    = lcu->rec.u + offset_chroma;
+      lcu_adapter.v    = lcu->rec.v + offset_chroma;
+
+      inter_recon_unipred(
+        state,
+        ref,
+        pu_x,
+        pu_y,
+        pu_w,
+        pu_h,
+        LCU_WIDTH,
+        pu->inter.mv[mv_idx],
+        &lcu_adapter,
+        NULL,
+        predict_luma,
+        predict_chroma);
+    }
   }
-  else {
-    const int mv_idx = pu->inter.mv_dir - 1;
-    const uvg_picture *const ref =
-      state->frame->ref->images[
-        state->frame->ref_LX[mv_idx][
-          pu->inter.mv_ref[mv_idx]]];
-
-    const unsigned offset_luma = SUB_SCU(pu_y) * LCU_WIDTH + SUB_SCU(pu_x);
-    const unsigned offset_chroma = SUB_SCU(pu_y) / 2 * LCU_WIDTH_C + SUB_SCU(pu_x) / 2;
-    yuv_t lcu_adapter;
-    lcu_adapter.size = pu_w * pu_h;
-    lcu_adapter.y = lcu->rec.y + offset_luma,
-    lcu_adapter.u = lcu->rec.u + offset_chroma,
-    lcu_adapter.v = lcu->rec.v + offset_chroma,
-
-    inter_recon_unipred(state,
-      ref,
-      pu_x, pu_y,
-      pu_w, pu_h,
-      LCU_WIDTH,
-      pu->inter.mv[mv_idx],
-      &lcu_adapter,
-      NULL,
-      predict_luma, predict_chroma);
-  }
-
   if (predict_chroma && state->encoder_control->cfg.jccr) {
     const int offset = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C;
     uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C);
diff --git a/src/search.c b/src/search.c
index 2e594126..59c99473 100644
--- a/src/search.c
+++ b/src/search.c
@@ -179,7 +179,7 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in
   }
 }
 
-static void lcu_fill_inter(lcu_t *lcu, int x_local, int y_local, int cu_width)
+static void lcu_fill_inter(lcu_t *lcu, int x_local, int y_local, int cu_width, uint8_t type)
 {
   const part_mode_t part_mode = LCU_GET_CU_AT_PX(lcu, x_local, y_local)->part_size;
   const int num_pu = uvg_part_mode_num_parts[part_mode];
@@ -191,7 +191,7 @@ static void lcu_fill_inter(lcu_t *lcu, int x_local, int y_local, int cu_width)
     const int height_pu = PU_GET_H(part_mode, cu_width, i);
 
     cu_info_t *pu  = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu);
-    pu->type = CU_INTER;
+    pu->type = type;
     lcu_fill_cu_info(lcu, x_pu, y_pu, width_pu, height_pu, pu);
   }
 }
@@ -1034,7 +1034,7 @@ static double search_cu(
       lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
 
 
-    } else if (cur_cu->type == CU_INTER) {
+    } else if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) {
 
       if (!cur_cu->skipped) {
 
@@ -1080,7 +1080,7 @@ static double search_cu(
           inter_bitcost += cur_cu->merge_idx;        
         }
       }
-      lcu_fill_inter(lcu, x_local, y_local, cu_width);
+      lcu_fill_inter(lcu, x_local, y_local, cu_width, cur_cu->type);
       lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu);
     }
   }
diff --git a/src/videoframe.h b/src/videoframe.h
index e1a82181..54f17689 100644
--- a/src/videoframe.h
+++ b/src/videoframe.h
@@ -78,6 +78,9 @@ typedef struct videoframe
 
   int32_t poc;           //!< \brief Picture order count
   cu_info_t* hmvp_lut; //!< \brief Look-up table for HMVP, one for each LCU row
+  uvg_pixel **ibc_buffer_y; //!< \brief Intra Block Copy buffer for each LCU row 
+  uvg_pixel **ibc_buffer_u; //!< \brief Intra Block Copy buffer for each LCU row 
+  uvg_pixel **ibc_buffer_v; //!< \brief Intra Block Copy buffer for each LCU row 
 
   uint8_t* hmvp_size; //!< \brief HMVP LUT size
   bool source_lmcs_mapped; //!< \brief Indicate if source_lmcs is available and mapped to LMCS