From 4cd5bc38a1ec618caf6f142b84d35df8b3a680f9 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Mon, 24 May 2021 17:23:17 +0300
Subject: [PATCH] [LMCS] Luma mapping works after some rework; reconstruction
 must be kept in the mapped domain

---
 src/encoderstate.c                     | 35 ++++++++++++++++++++------
 src/search.c                           | 10 --------
 src/strategies/avx2/quant-avx2.c       | 14 +++++++++--
 src/strategies/generic/quant-generic.c | 16 +++++++++---
 4 files changed, 53 insertions(+), 22 deletions(-)

diff --git a/src/encoderstate.c b/src/encoderstate.c
index 6e67d143..d70a0294 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -646,6 +646,19 @@ static void encoder_state_worker_encode_lcu(void * opaque)
     set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp);
   }
 
+
+  if (state->encoder_control->cfg.lmcs_enable) {
+    kvz_pixel* luma = &state->tile->frame->rec->y[lcu->position_px.x + lcu->position_px.y * state->tile->frame->rec->stride];
+    for (int y = 0; y < LCU_WIDTH; y++) {
+      if (lcu->position_px.y + y < state->tile->frame->rec->height) {
+        for (int x = 0; x < LCU_WIDTH; x++) {
+          if (lcu->position_px.x + x < state->tile->frame->rec->width) luma[x] = state->tile->frame->lmcs_aps->m_invLUT[luma[x]];
+        }
+      }
+      luma += state->tile->frame->rec->stride;
+    }
+  }
+
   if (encoder->cfg.deblock_enable) {
     kvz_filter_deblock_lcu(state, lcu->position_px.x, lcu->position_px.y);
   }
@@ -771,6 +784,18 @@ static void encoder_state_worker_encode_lcu_search(void * opaque)
     set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp);
   }
 
+  if (state->encoder_control->cfg.lmcs_enable) {
+    kvz_pixel* luma = &state->tile->frame->rec->y[lcu->position_px.x + lcu->position_px.y * state->tile->frame->rec->stride];
+    for (int y = 0; y < LCU_WIDTH; y++) {
+      if (lcu->position_px.y+y < state->tile->frame->rec->height) {
+        for (int x = 0; x < LCU_WIDTH; x++) {
+          if (lcu->position_px.x+x < state->tile->frame->rec->width) luma[x] = state->tile->frame->lmcs_aps->m_invLUT[luma[x]];
+        }
+      }
+      luma += state->tile->frame->rec->stride;
+    }
+  }
+
   if (encoder->cfg.deblock_enable) {
     kvz_filter_deblock_lcu(state, lcu->position_px.x, lcu->position_px.y);
   }
@@ -1343,13 +1368,6 @@ static void encoder_set_source_picture(encoder_state_t * const state, kvz_pictur
   }
   state->tile->frame->rec_lmcs = state->tile->frame->rec;
 
-  if (state->encoder_control->cfg.lmcs_enable) {
-    state->tile->frame->source_lmcs = kvz_image_alloc(state->encoder_control->chroma_format, frame->width, frame->height);
-    state->tile->frame->rec_lmcs = kvz_image_alloc(state->encoder_control->chroma_format, frame->width, frame->height);
-    state->tile->frame->lmcs_aps = calloc(1, sizeof(lmcs_aps));
-    kvz_init_lmcs_aps(state->tile->frame->lmcs_aps, state->encoder_control->cfg.width, state->encoder_control->cfg.height, LCU_CU_WIDTH, LCU_CU_WIDTH, state->encoder_control->bitdepth);
-  }
-
   kvz_videoframe_set_poc(state->tile->frame, state->frame->poc);
 }
 
@@ -1618,6 +1636,9 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict
   }
 
   if (state->encoder_control->cfg.lmcs_enable) {
+    state->tile->frame->lmcs_aps = calloc(1, sizeof(lmcs_aps));
+    kvz_init_lmcs_aps(state->tile->frame->lmcs_aps, state->encoder_control->cfg.width, state->encoder_control->cfg.height, LCU_CU_WIDTH, LCU_CU_WIDTH, state->encoder_control->bitdepth);
+
     // ToDo: support other signal types in LMCS
     kvz_lmcs_preanalyzer(state, state->tile->frame, state->tile->frame->lmcs_aps, RESHAPE_SIGNAL_SDR);
     kvz_construct_reshaper_lmcs(state->tile->frame->lmcs_aps);
diff --git a/src/search.c b/src/search.c
index 0c8474e2..52b891eb 100644
--- a/src/search.c
+++ b/src/search.c
@@ -910,9 +910,6 @@ static void init_lcu_t(const encoder_state_t * const state, const int x, const i
 
       memcpy(&lcu->top_ref.y[x_min_in_lcu], &hor_buf->y[luma_offset], luma_bytes);
 
-      if(state->encoder_control->cfg.lmcs_enable)
-        for (int i = 0; i < luma_bytes; i++) lcu->top_ref.y[x_min_in_lcu + i] = state->tile->frame->lmcs_aps->m_fwdLUT[lcu->top_ref.y[x_min_in_lcu + i]];
-
       if (state->encoder_control->chroma_format != KVZ_CSP_400) {
         memcpy(&lcu->top_ref.u[x_min_in_lcu], &hor_buf->u[chroma_offset], chroma_bytes);
         memcpy(&lcu->top_ref.v[x_min_in_lcu], &hor_buf->v[chroma_offset], chroma_bytes);
@@ -928,9 +925,6 @@ static void init_lcu_t(const encoder_state_t * const state, const int x, const i
 
       memcpy(&lcu->left_ref.y[y_min_in_lcu], &ver_buf->y[luma_offset], luma_bytes);
 
-      if (state->encoder_control->cfg.lmcs_enable)
-        for (int i = 0; i < luma_bytes; i++) lcu->left_ref.y[y_min_in_lcu + i] = state->tile->frame->lmcs_aps->m_fwdLUT[lcu->left_ref.y[y_min_in_lcu + i]];
-
       if (state->encoder_control->chroma_format != KVZ_CSP_400) {
         memcpy(&lcu->left_ref.u[y_min_in_lcu], &ver_buf->u[chroma_offset], chroma_bytes);
         memcpy(&lcu->left_ref.v[y_min_in_lcu], &ver_buf->v[chroma_offset], chroma_bytes);
@@ -1026,11 +1020,7 @@ void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, con
 
   // The best decisions through out the LCU got propagated back to depth 0,
   // so copy those back to the frame.
-  if (state->encoder_control->cfg.lmcs_enable)
-    for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; i++) work_tree[0].rec.y[i] = state->tile->frame->lmcs_aps->m_invLUT[work_tree[0].rec.y[i]];
   copy_lcu_to_cu_data(state, x, y, &work_tree[0]);
-  if (state->encoder_control->cfg.lmcs_enable)
-    for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; i++) work_tree[0].rec.y[i] = state->tile->frame->lmcs_aps->m_fwdLUT[work_tree[0].rec.y[i]];
 
   // Copy coeffs to encoder state.
   copy_coeffs(work_tree[0].coeff.y, state->coeff->y, LCU_WIDTH);
diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index a92129da..ba2217d0 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -664,8 +664,18 @@ int kvz_quantize_residual_avx2(encoder_state_t *const state,
   assert(width <= TR_MAX_WIDTH);
   assert(width >= TR_MIN_WIDTH);
 
-  // Get residual. (ref_in - pred_in -> residual)
-  get_residual_avx2(ref_in, pred_in, residual, width, in_stride);
+  int y, x;
+  // ToDo: optimize the LMCS path (currently a scalar loop, unlike get_residual_avx2)
+  if (state->encoder_control->cfg.lmcs_enable && color == COLOR_Y) {
+    for (y = 0; y < width; ++y) {
+      for (x = 0; x < width; ++x) {
+        residual[x + y * width] = (int16_t)(state->tile->frame->lmcs_aps->m_fwdLUT[ref_in[x + y * in_stride]] - pred_in[x + y * in_stride]);
+      }
+    }
+  } else {
+    // Get residual. (ref_in - pred_in -> residual)
+    get_residual_avx2(ref_in, pred_in, residual, width, in_stride);
+  }
 
   // Transform residual. (residual -> coeff)
   if (use_trskip) {
diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c
index a171aff1..62609b34 100644
--- a/src/strategies/generic/quant-generic.c
+++ b/src/strategies/generic/quant-generic.c
@@ -30,6 +30,7 @@
 #include "strategyselector.h"
 #include "transform.h"
 #include "fast_coeff_cost.h"
+#include "reshape.h"
 
 #define QUANT_SHIFT 14
 /**
@@ -206,9 +207,18 @@ int kvz_quantize_residual_generic(encoder_state_t *const state,
   // Get residual. (ref_in - pred_in -> residual)
   {
     int y, x;
-    for (y = 0; y < width; ++y) {
-      for (x = 0; x < width; ++x) {
-        residual[x + y * width] = (int16_t)(ref_in[x + y * in_stride] - pred_in[x + y * in_stride]);
+    if (state->encoder_control->cfg.lmcs_enable && color == COLOR_Y) {
+      for (y = 0; y < width; ++y) {
+        for (x = 0; x < width; ++x) {
+          residual[x + y * width] = (int16_t)(state->tile->frame->lmcs_aps->m_fwdLUT[ref_in[x + y * in_stride]] - pred_in[x + y * in_stride]);
+        }
+      }
+    }
+    else {
+      for (y = 0; y < width; ++y) {
+        for (x = 0; x < width; ++x) {
+          residual[x + y * width] = (int16_t)(ref_in[x + y * in_stride] - pred_in[x + y * in_stride]);
+        }
       }
     }
   }