From cc4c7576950117e6675df8ecf83b2b3db0857b70 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Sat, 2 Jul 2022 18:18:42 +0300
Subject: [PATCH] [ibc] Fix bugs in IBC reconstruction and add a simple search
 for I-frames

---
 src/debug.c              |  2 +-
 src/encode_coding_tree.c |  2 +-
 src/encoderstate.c       |  2 +-
 src/inter.c              | 22 +++++------
 src/search.c             | 85 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 99 insertions(+), 14 deletions(-)

diff --git a/src/debug.c b/src/debug.c
index eed773ee..1a2f00a0 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -131,7 +131,7 @@ void uvg_dbg_yuview_init(const encoder_control_t* const encoder, char* filename,
   fprintf(yuview_output, "%%;scaleFactor;16\r\n");
   fprintf(yuview_output, "%%;type;13;MVInterL0;vector\r\n");
   fprintf(yuview_output, "%%;vectorColor;0;0;0;255\r\n");
-  fprintf(yuview_output, "%%;scaleFactor;16\r\n");
+  fprintf(yuview_output, "%%;scaleFactor;4\r\n");
   fprintf(yuview_output, "%%;type;14;MVInterL1;vector\r\n");
   fprintf(yuview_output, "%%;vectorColor;255;255;255;255\r\n");
   fprintf(yuview_output, "%%;scaleFactor;16\r\n");
diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c
index fa73e08e..88aec44e 100644
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@@ -1458,7 +1458,7 @@ void uvg_encode_coding_tree(
    // CABAC_BIN(cabac, 0, "split_transform_flag");
   }
 
-  DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_width, (cur_cu->type == CU_INTRA)?0:1);
+  DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_width, cur_cu->type-1);
 
   if (ctrl->cfg.lossless) {
     cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass;
diff --git a/src/encoderstate.c b/src/encoderstate.c
index 7bb12de8..e6f8546e 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -254,7 +254,7 @@ static void encoder_state_recdata_to_bufs(encoder_state_t * const state,
   // Fill IBC buffer
   if (state->encoder_control->cfg.ibc) {
 
-    uint32_t ibc_buffer_pos_x = lcu->position_px.x + LCU_WIDTH > IBC_BUFFER_WIDTH ? IBC_BUFFER_WIDTH - LCU_WIDTH: lcu->position_px.x;
+    uint32_t ibc_buffer_pos_x = lcu->position_px.x + LCU_WIDTH >= IBC_BUFFER_WIDTH ? IBC_BUFFER_WIDTH - LCU_WIDTH: lcu->position_px.x;
     uint32_t ibc_buffer_pos_x_c = ibc_buffer_pos_x >> 1;
     uint32_t ibc_buffer_row     = lcu->position_px.y / LCU_WIDTH;
 
diff --git a/src/inter.c b/src/inter.c
index 5fd8c21c..944f9c47 100644
--- a/src/inter.c
+++ b/src/inter.c
@@ -612,20 +612,20 @@ static void ibc_recon_cu(const encoder_state_t * const state,
   int32_t    mv_y     = cu->inter.mv[0][1] >> UVG_IMV_4PEL;
   uint32_t   ibc_row  = y / LCU_WIDTH;
 
-  int32_t    buffer_x = ((x - x_scu) + LCU_WIDTH < IBC_BUFFER_WIDTH ?
+  int32_t    buffer_x = ((x - x_scu) + LCU_WIDTH <= IBC_BUFFER_WIDTH ?
                           x :
-                          x - (((x - x_scu) + LCU_WIDTH) - IBC_BUFFER_WIDTH)) + mv_x;
+                          x - (((x - x_scu)) - IBC_BUFFER_WIDTH)) + mv_x;
   int32_t buffer_y = y_scu + mv_y;
 
-  // The whole block must fir to the left of the current position
+  // The whole block must be to the left of the current position
   assert(-mv_x >= width);
 
   // Predicted block completely outside of this LCU
   if (mv_x + x_scu + width <= 0) {  
     if(predict_luma) uvg_pixels_blit(&state->tile->frame->ibc_buffer_y[ibc_row][buffer_y * IBC_BUFFER_WIDTH + buffer_x], lcu->rec.y + offset, width, width, IBC_BUFFER_WIDTH, LCU_WIDTH);
     if (predict_chroma) {
-      uvg_pixels_blit(&state->tile->frame->ibc_buffer_u[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.u + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C);
-      uvg_pixels_blit(&state->tile->frame->ibc_buffer_v[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.v + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C);
+      uvg_pixels_blit(&state->tile->frame->ibc_buffer_u[ibc_row][(buffer_y / 2) * IBC_BUFFER_WIDTH_C + (buffer_x / 2)], lcu->rec.u + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C);
+      uvg_pixels_blit(&state->tile->frame->ibc_buffer_v[ibc_row][(buffer_y / 2) * IBC_BUFFER_WIDTH_C + (buffer_x / 2)], lcu->rec.v + offset_c, width / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C);
     }
   } else if (mv_x + x_scu + width >= width) { // Completely in current LCU
     if(predict_luma) uvg_pixels_blit(&lcu->rec.y[(y_scu + mv_y) * LCU_WIDTH + x_scu + mv_x], lcu->rec.y + offset, width, width, LCU_WIDTH, LCU_WIDTH);
@@ -639,15 +639,15 @@ static void ibc_recon_cu(const encoder_state_t * const state,
     uint32_t width_lcu    = width - width_buffer;
     if(predict_luma) uvg_pixels_blit(&state->tile->frame->ibc_buffer_y[ibc_row][buffer_y * IBC_BUFFER_WIDTH + buffer_x], lcu->rec.y + offset, width_buffer, width, IBC_BUFFER_WIDTH, LCU_WIDTH);
     if (predict_chroma) {    
-      uvg_pixels_blit(&state->tile->frame->ibc_buffer_u[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.u + offset_c, width_buffer / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C);
-      uvg_pixels_blit(&state->tile->frame->ibc_buffer_v[ibc_row][(buffer_y >> 1) * IBC_BUFFER_WIDTH_C + (buffer_x >> 1)], lcu->rec.v + offset_c, width_buffer / 2, width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C);
+      uvg_pixels_blit(&state->tile->frame->ibc_buffer_u[ibc_row][(buffer_y / 2) * IBC_BUFFER_WIDTH_C + (buffer_x / 2)], lcu->rec.u + offset_c, width_buffer / 2 + (width_buffer&1), width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C);
+      uvg_pixels_blit(&state->tile->frame->ibc_buffer_v[ibc_row][(buffer_y / 2) * IBC_BUFFER_WIDTH_C + (buffer_x / 2)], lcu->rec.v + offset_c, width_buffer / 2 + (width_buffer&1), width / 2, IBC_BUFFER_WIDTH_C, LCU_WIDTH_C);
     }
 
     offset += width_buffer;
-    offset_c += width_buffer/2;
+    offset_c += width_buffer/2 + (width_buffer&1);
 
     if(predict_luma) uvg_pixels_blit(&lcu->rec.y[(y_scu + mv_y) * LCU_WIDTH + x_scu + mv_x + width_buffer], lcu->rec.y + offset, width_lcu, width, LCU_WIDTH, LCU_WIDTH);
-    if (predict_chroma) {
+    if (predict_chroma && (width_lcu / 2)) {
       uvg_pixels_blit(&lcu->rec.u[((y_scu+mv_y) / 2) * LCU_WIDTH_C + (x_scu + mv_x + width_buffer) / 2], lcu->rec.u + offset_c, width_lcu / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C);
       uvg_pixels_blit(&lcu->rec.v[((y_scu+mv_y) / 2) * LCU_WIDTH_C + (x_scu + mv_x + width_buffer) / 2], lcu->rec.v + offset_c, width_lcu / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C);
     }
@@ -1202,8 +1202,8 @@ static void get_ibc_merge_candidates(const encoder_state_t * const state,
     const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH);
     const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS;
     int32_t num_cand = state->tile->frame->hmvp_size_ibc[ctu_row];
-    for (int i = 0; i < MIN(MAX_NUM_HMVP_CANDS,num_cand); i++) {
-      cu_info_t* cand = &state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five + num_cand - 1 - i];
+    for (int i = 0; i < MIN(4,num_cand); i++) {
+      cu_info_t* cand = &state->tile->frame->hmvp_lut_ibc[ctu_row_mul_five + i];
       mv_cand[candidates][0] = cand->inter.mv[0][0];
       mv_cand[candidates][1] = cand->inter.mv[0][1];
       candidates++;
diff --git a/src/search.c b/src/search.c
index 3fefd1c2..9743905e 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1008,6 +1008,91 @@ static double search_cu(
       }
     }
 
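+    // Simple IBC search: on I-slices, try horizontal-only block vectors of -(cu_width + i), i = 0..7, and keep the best one if its luma cost beats the current mode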
+    if (can_use_intra && state->frame->slicetype == UVG_SLICE_I
+           && state->encoder_control->cfg.ibc) {
+      cu_info_t cu_backup  = *cur_cu;
+
+      uint32_t ibc_cost      = MAX_INT;
+      uint32_t ibc_cost_y    = MAX_INT;
+      uint32_t base_cost     = MAX_INT;
+      uint32_t base_cost_y   = MAX_INT;
+
+      
+      if(cur_cu->type == CU_INTRA) {
+         uvg_intra_recon_cu(state,x, y,depth, &intra_search,NULL,lcu);
+      } else {
+        uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, state->encoder_control->chroma_format != UVG_CSP_400);
+      }
+
+      bool ibc_better = false;
+      cur_cu->type    = CU_IBC;
+      cur_cu->inter.mv_dir   = 1;
+      cur_cu->skipped                         = false;
+      cur_cu->merged                          = false;
+      cur_cu->inter.mv_cand0                  = 0;
+      optimized_sad_func_ptr_t optimized_sad = uvg_get_optimized_sad(cu_width);
+      uint32_t  source_stride = state->tile->frame->width;
+      const int x_scu    = SUB_SCU(x);
+      const int y_scu    = SUB_SCU(y);
+      const uint32_t offset = x_scu + y_scu * LCU_WIDTH;
+      const uint32_t offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C;
+
+      mv_t   best_vector[2] = {0, 0};
+
+
+      if (optimized_sad != NULL) {
+          base_cost_y = base_cost = optimized_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width, LCU_WIDTH, source_stride);
+          if(state->encoder_control->chroma_format != UVG_CSP_400) {
+            base_cost += optimized_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, LCU_WIDTH_C, source_stride / 2);
+            base_cost += optimized_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, LCU_WIDTH_C, source_stride / 2);
+          }
+        } else {
+          base_cost_y = base_cost = uvg_reg_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width,cu_width, LCU_WIDTH, source_stride);
+          if(state->encoder_control->chroma_format != UVG_CSP_400) {
+            base_cost += uvg_reg_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, cu_width / 2, LCU_WIDTH_C, source_stride / 2);
+            base_cost += uvg_reg_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, cu_width / 2, LCU_WIDTH_C, source_stride / 2);
+          }
+        }
+
+      for(int i = 0; i < 8; i++) {
+        cur_cu->inter.mv[0][0] = (-cu_width - i) << UVG_IMV_4PEL;
+        cur_cu->inter.mv[0][1] = 0;
+
+        if (x -cu_width - i < 0) break;
+
+        uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, state->encoder_control->chroma_format != UVG_CSP_400);
+        
+        if (optimized_sad != NULL) {
+          ibc_cost_y = ibc_cost = optimized_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width, LCU_WIDTH, source_stride);
+          if(state->encoder_control->chroma_format != UVG_CSP_400) {
+            ibc_cost += optimized_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, LCU_WIDTH_C, source_stride / 2);
+            ibc_cost += optimized_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, LCU_WIDTH_C, source_stride / 2);
+          }
+        } else {
+          ibc_cost_y = ibc_cost = uvg_reg_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * source_stride + x], cu_width,cu_width, LCU_WIDTH, source_stride);
+          if(state->encoder_control->chroma_format != UVG_CSP_400) {
+            ibc_cost += uvg_reg_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, cu_width / 2, LCU_WIDTH_C, source_stride / 2);
+            ibc_cost += uvg_reg_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * source_stride / 2 + x / 2], cu_width / 2, cu_width / 2, LCU_WIDTH_C, source_stride / 2);
+          }
+        }
+        if (ibc_cost_y < base_cost_y) {
+          ibc_better     = true;
+          base_cost_y    = ibc_cost_y;
+          best_vector[0] = cur_cu->inter.mv[0][0];          
+          best_vector[1] = cur_cu->inter.mv[0][1];
+          //break;
+        }
+      }
+
+      if (!ibc_better) *cur_cu = cu_backup;
+      else {
+        cur_cu->inter.mv[0][0] = best_vector[0];
+        cur_cu->inter.mv[0][1] = best_vector[1];
+        //fprintf(stderr, "Coding IBC: %d, %d: %d, %d size: %d\r\n", x,y,cur_cu->inter.mv[0][0] / 4, cur_cu->inter.mv[0][1] / 4, cu_width);
+      }
+    }
+
     // Reconstruct best mode because we need the reconstructed pixels for
     // mode search of adjacent CUs.
     if (cur_cu->type == CU_INTRA) {