From fb763f7940edc9a4dfa0a3a76459c39dcf5318f3 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Mon, 12 May 2014 11:35:40 +0300
Subject: [PATCH] Move coefficient generation functions from encoder.c to
 transform.c.

- These functions probably should have been there to begin with.
---
 src/encoder.c   | 402 -----------------------------------------------
 src/encoder.h   |   3 +-
 src/transform.c | 406 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/transform.h |   2 +
 4 files changed, 410 insertions(+), 403 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index a6551d59..53a5c1d7 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -2812,64 +2812,6 @@ void encode_coding_tree(encoder_state * const encoder_state,
   /* end coding_unit */
 }
 
-static void transform_chroma(encoder_state * const encoder_state, cu_info *cur_cu,
-                             int depth, const pixel *base_u, pixel *pred_u,
-                             coefficient *coeff_u, int8_t scan_idx_chroma,
-                             coefficient *pre_quant_coeff, coefficient *block)
-{
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  int base_stride = LCU_WIDTH;
-  int pred_stride = LCU_WIDTH;
-
-  int8_t width_c = LCU_WIDTH >> (depth + 1);
-
-  int i = 0;
-  unsigned ac_sum = 0;
-
-  int y, x;
-
-  for (y = 0; y < width_c; y++) {
-    for (x = 0; x < width_c; x++) {
-      block[i] = ((int16_t)base_u[x + y * (base_stride >> 1)]) -
-                  pred_u[x + y * (pred_stride >> 1)];
-      i++;
-    }
-  }
-
-  transform2d(encoder, block, pre_quant_coeff, width_c, 65535);
-  if (encoder->rdoq_enable) {
-    rdoq(encoder_state, pre_quant_coeff, coeff_u, width_c, width_c, &ac_sum, 2,
-         scan_idx_chroma, cur_cu->type, cur_cu->tr_depth-cur_cu->depth);
-  } else {
-    quant(encoder_state, pre_quant_coeff, coeff_u, width_c, width_c, &ac_sum, 2,
-          scan_idx_chroma, cur_cu->type);
-  }
-}
-
-
-static void reconstruct_chroma(const encoder_state * const encoder_state, cu_info *cur_cu,
-                               int depth, coefficient *coeff_u,
-                               pixel *recbase_u, pixel *pred_u, int color_type,
-                               coefficient *pre_quant_coeff, coefficient *block)
-{
-  int8_t width_c = LCU_WIDTH >> (depth + 1);
-
-  int i, y, x;
-
-  dequant(encoder_state, coeff_u, pre_quant_coeff, width_c, width_c, (int8_t)color_type, cur_cu->type);
-  itransform2d(encoder_state->encoder_control, block, pre_quant_coeff, width_c, 65535);
-
-  i = 0;
-
-  for (y = 0; y < width_c; y++) {
-    for (x = 0; x < width_c; x++) {
-      int16_t val = block[i++] + pred_u[x + y * LCU_WIDTH_C];
-      //TODO: support 10+bits
-      recbase_u[x + y * LCU_WIDTH_C] = (uint8_t)CLIP(0, 255, val);
-    }
-  }
-}
-
 
 coeff_scan_order_t get_scan_order(int8_t cu_type, int intra_mode, int depth)
 {
@@ -2888,350 +2830,6 @@ coeff_scan_order_t get_scan_order(int8_t cu_type, int intra_mode, int depth)
 }
 
 
-int quantize_residual_chroma(encoder_state * const encoder_state,
-                                     cu_info *cur_cu, int luma_depth, color_index color,
-                                     const pixel *base_u, pixel *recbase_u, coefficient *orig_coeff_u)
-{
-  pixel pred_u[LCU_WIDTH*LCU_WIDTH>>2];
-  coefficient coeff_u[LCU_WIDTH*LCU_WIDTH>>2];
-
-  int16_t block[LCU_WIDTH*LCU_WIDTH>>2];
-  int16_t pre_quant_coeff[LCU_WIDTH*LCU_WIDTH>>2];
-
-  const int chroma_depth = (luma_depth == MAX_PU_DEPTH ? luma_depth - 1 : luma_depth);
-  const int8_t width_c = LCU_WIDTH >> (chroma_depth + 1);
-
-  const coeff_scan_order_t scan_idx_chroma = get_scan_order(cur_cu->type, cur_cu->intra[0].mode_chroma, luma_depth);
-
-  int has_coeffs = 0;
-
-  {
-    int y, x;
-    for (y = 0; y < width_c; y++) {
-      for (x = 0; x < width_c; x++) {
-        pred_u[x + y * LCU_WIDTH_C] = recbase_u[x + y * LCU_WIDTH_C];
-      }
-    }
-  }
-
-  transform_chroma(encoder_state, cur_cu, chroma_depth, base_u, pred_u, coeff_u, scan_idx_chroma, pre_quant_coeff, block);
-  {
-    int i;
-    for (i = 0; i < width_c * width_c; i++) {
-      if (coeff_u[i] != 0) {
-        has_coeffs = 1;
-        break;
-      }
-    }
-  }
-  // Copy coefficients, even if they are all zeroes.
-  {
-    int i = 0;
-    int y, x;
-    for (y = 0; y < width_c; y++) {
-      for (x = 0; x < width_c; x++) {
-        orig_coeff_u[x + y * LCU_WIDTH_C] = coeff_u[i];
-        i++;
-      }
-    }
-  }
-  if (has_coeffs) {
-    reconstruct_chroma(encoder_state, cur_cu, chroma_depth,
-                        coeff_u, recbase_u, pred_u, (color == COLOR_U ? 2 : 3),
-                        pre_quant_coeff, block);
-  }
-
-  return has_coeffs;
-}
-
-
-void decide_trskip(encoder_state * const encoder_state, cu_info *cur_cu, int8_t depth, int pu_index,
-                   int16_t *residual, uint32_t *ac_sum)
-{
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  const coeff_scan_order_t scan_idx_luma = get_scan_order(cur_cu->type, cur_cu->intra[pu_index].mode, depth);
-  const int8_t width = LCU_WIDTH >> depth;
-
-  //int16_t block[LCU_WIDTH*LCU_WIDTH>>2];
-  int16_t pre_quant_coeff[LCU_WIDTH*LCU_WIDTH>>2];
-
-  int i;
-  coefficient temp_block[16];  coefficient temp_coeff[16];
-  coefficient temp_block2[16]; coefficient temp_coeff2[16];
-  uint32_t cost = 0,cost2 = 0;
-  uint32_t coeffcost = 0,coeffcost2 = 0;
-
-  // Test for transform skip
-  transformskip(encoder, residual,pre_quant_coeff, width);
-  if (encoder->rdoq_enable) {
-    rdoq(encoder_state, pre_quant_coeff, temp_coeff, 4, 4, ac_sum, 0, scan_idx_luma, cur_cu->type,0);
-  } else {
-    quant(encoder_state, pre_quant_coeff, temp_coeff, 4, 4, ac_sum, 0, scan_idx_luma, cur_cu->type);
-  }
-  dequant(encoder_state, temp_coeff, pre_quant_coeff, 4, 4, 0, cur_cu->type);
-  itransformskip(encoder, temp_block,pre_quant_coeff,width);
-
-  transform2d(encoder, residual,pre_quant_coeff,width,0);
-  if (encoder->rdoq_enable) {
-    rdoq(encoder_state, pre_quant_coeff, temp_coeff2, 4, 4, ac_sum, 0, scan_idx_luma, cur_cu->type,0);
-  } else {
-    quant(encoder_state, pre_quant_coeff, temp_coeff2, 4, 4, ac_sum, 0, scan_idx_luma, cur_cu->type);
-  }
-  dequant(encoder_state, temp_coeff2, pre_quant_coeff, 4, 4, 0, cur_cu->type);
-  itransform2d(encoder, temp_block2,pre_quant_coeff,width,0);
-
-  // SSD between original and reconstructed
-  for (i = 0; i < 16; i++) {
-    int diff = temp_block[i] - residual[i];
-    cost += diff*diff;
-
-    diff = temp_block2[i] - residual[i];
-    cost2 += diff*diff;
-  }
-
-  // Simple RDO
-  if(encoder->rdo == 1) {
-    // SSD between reconstruction and original + sum of coeffs
-    for (i = 0; i < 16; i++) {
-      coeffcost += abs((int)temp_coeff[i]);
-      coeffcost2 += abs((int)temp_coeff2[i]);
-    }
-    cost += (1 + coeffcost + (coeffcost>>1))*((int)encoder_state->global->cur_lambda_cost+0.5);
-    cost2 += (coeffcost2 + (coeffcost2>>1))*((int)encoder_state->global->cur_lambda_cost+0.5);
-    // Full RDO
-  } else if(encoder->rdo == 2) {
-    coeffcost = get_coeff_cost(encoder_state, temp_coeff, 4, 0, scan_idx_luma);
-    coeffcost2 = get_coeff_cost(encoder_state, temp_coeff2, 4, 0, scan_idx_luma);
-
-    cost  += coeffcost*((int)encoder_state->global->cur_lambda_cost+0.5);
-    cost2 += coeffcost2*((int)encoder_state->global->cur_lambda_cost+0.5);
-  }
-
-  cur_cu->intra[pu_index].tr_skip = (cost < cost2);
-}
-
-
-/**
- * This function calculates the residual coefficients for a region of the LCU
- * (defined by x, y and depth) and updates the reconstruction with the
- * kvantized residual.
- *
- * It handles recursion for transform split, but that is currently only work
- * for 64x64 inter to 32x32 transform blocks.
- *
- * Inputs are:
- * - lcu->rec  pixels after prediction for the area
- * - lcu->ref  reference pixels for the area
- * - lcu->cu   for the area
- *
- * Outputs are:
- * - lcu->rec  reconstruction after quantized residual
- * - lcu->coeff  quantized coefficients for the area
- * - lcu->cbf  coded block flags for the area
- * - lcu->cu.intra[].tr_skip  for the area
- */
-void encode_transform_tree(encoder_state * const encoder_state, int32_t x, int32_t y, const uint8_t depth, lcu_t* lcu)
-{
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  // we have 64>>depth transform size
-  const vector2d lcu_px = {x & 0x3f, y & 0x3f};
-  const int pu_index = PU_INDEX(lcu_px.x / 4, lcu_px.y / 4);
-  cu_info *cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x>>3) + (lcu_px.y>>3)*LCU_T_CU_WIDTH];
-  const int8_t width = LCU_WIDTH>>depth;
-  
-  int i;
-  
-  // Tell clang-analyzer what is up. For some reason it can't figure out from
-  // asserting just depth.
-  assert(width == 4 || width == 8 || width == 16 || width == 32 || width == 64);
-
-  // Split transform and increase depth
-  if (depth == 0 || cur_cu->tr_depth > depth) {
-    int offset = width / 2;
-    encode_transform_tree(encoder_state, x,          y,          depth+1, lcu);
-    encode_transform_tree(encoder_state, x + offset, y,          depth+1, lcu);
-    encode_transform_tree(encoder_state, x,          y + offset, depth+1, lcu);
-    encode_transform_tree(encoder_state, x + offset, y + offset, depth+1, lcu);
-
-    // Propagate coded block flags from child CUs to parent CU.
-    if (depth < MAX_DEPTH) {
-      cu_info *cu_a =  &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset)>>3) +  (lcu_px.y>>3)        *LCU_T_CU_WIDTH];
-      cu_info *cu_b =  &lcu->cu[LCU_CU_OFFSET +  (lcu_px.x>>3)           + ((lcu_px.y+offset)>>3)*LCU_T_CU_WIDTH];
-      cu_info *cu_c =  &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset)>>3) + ((lcu_px.y+offset)>>3)*LCU_T_CU_WIDTH];
-      if (cbf_is_set(cu_a->cbf.y, depth+1) || cbf_is_set(cu_b->cbf.y, depth+1) || cbf_is_set(cu_c->cbf.y, depth+1)) {
-        cbf_set(&cur_cu->cbf.y, depth);
-      }
-      if (cbf_is_set(cu_a->cbf.u, depth+1) || cbf_is_set(cu_b->cbf.u, depth+1) || cbf_is_set(cu_c->cbf.u, depth+1)) {
-        cbf_set(&cur_cu->cbf.u, depth);
-      }
-      if (cbf_is_set(cu_a->cbf.v, depth+1) || cbf_is_set(cu_b->cbf.v, depth+1) || cbf_is_set(cu_c->cbf.v, depth+1)) {
-        cbf_set(&cur_cu->cbf.v, depth);
-      }
-    }
-
-    return;
-  }
-
-  {
-    const int luma_offset = lcu_px.x + lcu_px.y * LCU_WIDTH;
-
-    // Pointers to current location in arrays with prediction.
-    pixel *recbase_y = &lcu->rec.y[luma_offset];
-    // Pointers to current location in arrays with reference.
-    const pixel *base_y = &lcu->ref.y[luma_offset];
-    // Pointers to current location in arrays with kvantized coefficients.
-    coefficient *orig_coeff_y = &lcu->coeff.y[luma_offset];
-
-    // Temporary buffers. Not really used for much. Possibly unnecessary.
-    pixel pred_y[LCU_WIDTH*LCU_WIDTH];
-    // Buffers for coefficients.
-    coefficient coeff_y[LCU_WIDTH*LCU_WIDTH];
-
-    // Temporary buffers for kvantization and transformation.
-    int16_t block[LCU_WIDTH*LCU_WIDTH>>2];
-    int16_t pre_quant_coeff[LCU_WIDTH*LCU_WIDTH>>2];
-    
-    uint32_t ac_sum = 0;
-    uint8_t scan_idx_luma   = SCAN_DIAG;
-
-    #if OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD
-    uint32_t residual_sum = 0;
-    #endif
-
-    // Clear coded block flag structures for depths lower than current depth.
-    // This should ensure that the CBF data doesn't get corrupted if this function
-    // is called more than once.
-    cbf_clear(&cur_cu->cbf.y, depth + pu_index);
-    if (pu_index == 0) {
-      cbf_clear(&cur_cu->cbf.u, depth);
-      cbf_clear(&cur_cu->cbf.v, depth);
-    }
-
-    // Pick coeff scan mode according to intra prediction mode.
-    if (cur_cu->type == CU_INTRA) {
-      int chroma_mode = cur_cu->intra[0].mode_chroma;
-      if (chroma_mode == 36) {
-        chroma_mode = cur_cu->intra[pu_index].mode;
-      }
-      scan_idx_luma = get_scan_order(cur_cu->type, cur_cu->intra[pu_index].mode, depth);
-    }
-    
-    // Copy Luma and Chroma to the pred-block
-    for(y = 0; y < width; y++) {
-      for(x = 0; x < width; x++) {
-        pred_y[x+y*LCU_WIDTH]=recbase_y[x+y*LCU_WIDTH];
-      }
-    }
-
-    // Get residual by subtracting prediction
-    i = 0;
-    ac_sum = 0;
-
-    for (y = 0; y < width; y++) {
-      for (x = 0; x < width; x++) {
-        block[i] = ((int16_t)base_y[x + y * LCU_WIDTH]) -
-                   pred_y[x + y * LCU_WIDTH];
-        #if OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD
-        residual_sum += block[i];
-        #endif
-        i++;
-      }
-    }
-    #if OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD
-    #define RESIDUAL_THRESHOLD 500
-    if(residual_sum < RESIDUAL_THRESHOLD/(width)) {
-      memset(block, 0, sizeof(int16_t)*(width)*(width));
-    }
-    #endif
-
-    // For 4x4 blocks, check for transform skip
-    if(width == 4 && encoder->trskip_enable) {
-      decide_trskip(encoder_state, cur_cu, depth, pu_index, block, &ac_sum);
-    }
-
-    // Transform and quant residual to coeffs
-    if(width == 4 && cur_cu->intra[pu_index].tr_skip) {
-      transformskip(encoder, block,pre_quant_coeff,width);
-    } else {
-      transform2d(encoder, block,pre_quant_coeff,width,0);
-    }
-
-    if (encoder->rdoq_enable) {
-      rdoq(encoder_state, pre_quant_coeff, coeff_y, width, width, &ac_sum, 0,
-           scan_idx_luma, cur_cu->type, cur_cu->tr_depth-cur_cu->depth);
-    } else {
-      quant(encoder_state, pre_quant_coeff, coeff_y, width, width, &ac_sum, 0, scan_idx_luma, cur_cu->type);
-    }
-
-    // Check for non-zero coeffs
-    for (i = 0; i < width * width; i++) {
-      if (coeff_y[i] != 0) {
-        // Found one, we can break here
-        cbf_set(&cur_cu->cbf.y, depth + pu_index);
-        break;
-      }
-    }
-
-    // Copy coefficients, even if they are all zeroes. This takes care of the
-    // case where the original coefficients aren't already zeroed.
-    {
-      int i = 0;
-      for (y = 0; y < width; y++) {
-        for (x = 0; x < width; x++) {
-          orig_coeff_y[x + y * LCU_WIDTH] = coeff_y[i];
-          i++;
-        }
-      }
-    }
-
-    if (cbf_is_set(cur_cu->cbf.y, depth + pu_index)) {
-      // Combine inverese quantized coefficients with the prediction to get
-      // reconstructed image.
-      //picture_set_block_residual(cur_pic,x_cu,y_cu,depth,1);
-      int i;
-
-      dequant(encoder_state, coeff_y, pre_quant_coeff, width, width, 0, cur_cu->type);
-      if(width == 4 && cur_cu->intra[pu_index].tr_skip) {
-        itransformskip(encoder, block,pre_quant_coeff,width);
-      } else {
-        itransform2d(encoder, block,pre_quant_coeff,width,0);
-      }
-
-      i = 0;
-
-      for (y = 0; y < width; y++) {
-        for (x = 0; x < width; x++) {
-          int val = block[i++] + pred_y[x + y * LCU_WIDTH];
-          //TODO: support 10+bits
-          recbase_y[x + y * LCU_WIDTH] = (pixel)CLIP(0, 255, val);
-        }
-      }
-    }
-  }
-
-  // If luma is 4x4, do chroma for the 8x8 luma area when handling the top
-  // left PU because the coordinates are correct.
-  if (depth <= MAX_DEPTH || pu_index == 0) {
-    const int chroma_offset = lcu_px.x / 2 + lcu_px.y / 2 * LCU_WIDTH / 2;
-    pixel *recbase_u = &lcu->rec.u[chroma_offset];
-    pixel *recbase_v = &lcu->rec.v[chroma_offset];
-    const pixel *base_u = &lcu->ref.u[chroma_offset];
-    const pixel *base_v = &lcu->ref.v[chroma_offset];
-    coefficient *orig_coeff_u = &lcu->coeff.u[chroma_offset];
-    coefficient *orig_coeff_v = &lcu->coeff.v[chroma_offset];
-
-    if (cur_cu->intra[0].mode_chroma == 36) {
-      cur_cu->intra[0].mode_chroma = cur_cu->intra[0].mode;
-    }
-    if (quantize_residual_chroma(encoder_state, cur_cu, depth, COLOR_U, base_u, recbase_u, orig_coeff_u)) {
-      cbf_set(&cur_cu->cbf.u, depth);
-    }
-    if (quantize_residual_chroma(encoder_state, cur_cu, depth, COLOR_V, base_v, recbase_v, orig_coeff_v)) {
-      cbf_set(&cur_cu->cbf.v, depth);
-    }
-  }
-}
-
 static void encode_transform_unit(encoder_state * const encoder_state,
                                   int x_pu, int y_pu, int depth, int tr_depth)
 {
diff --git a/src/encoder.h b/src/encoder.h
index 2570fb66..f69c067c 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -253,12 +253,13 @@ void encode_last_significant_xy(encoder_state *encoder,
                                 uint8_t type, uint8_t scan);
 void encode_coeff_nxn(encoder_state *encoder, int16_t *coeff, uint8_t width,
                       uint8_t type, int8_t scan_mode, int8_t tr_skip);
-void encode_transform_tree(encoder_state *encoder_state, int32_t x, int32_t y, uint8_t depth, lcu_t* lcu );
 void encode_transform_coeff(encoder_state *encoder_state, int32_t x_cu, int32_t y_cu,
                             int8_t depth, int8_t tr_depth, uint8_t parent_coeff_u, uint8_t parent_coeff_v);
 void encode_block_residual(const encoder_control * const encoder,
                            uint16_t x_ctb, uint16_t y_ctb, uint8_t depth);
 
+coeff_scan_order_t get_scan_order(int8_t cu_type, int intra_mode, int depth);
+
 static const uint8_t g_group_idx[32] = {
   0, 1, 2, 3, 4, 4, 5, 5, 6, 6,
   6, 6, 7, 7, 7, 7, 8, 8, 8, 8,
diff --git a/src/transform.c b/src/transform.c
index dabf971f..8469ebd8 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -26,9 +26,11 @@
 #include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <assert.h>
 
 #include "config.h"
 #include "nal.h"
+#include "rdo.h"
 
 //////////////////////////////////////////////////////////////////////////
 // INITIALIZATIONS
@@ -799,3 +801,407 @@ void dequant(const encoder_state * const encoder_state, int16_t *q_coef, int16_t
   }
 }
 
+// Residual (base_u - pred_u) -> transform2d() -> rdoq() or quant(); the result is written to coeff_u.
+static void transform_chroma(encoder_state * const encoder_state, cu_info *cur_cu,
+                             int depth, const pixel *base_u, pixel *pred_u,
+                             coefficient *coeff_u, int8_t scan_idx_chroma,
+                             coefficient *pre_quant_coeff, coefficient *block)
+{
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  int base_stride = LCU_WIDTH;
+  int pred_stride = LCU_WIDTH;
+  // Side length of the chroma block: LCU_WIDTH >> depth, halved.
+  int8_t width_c = LCU_WIDTH >> (depth + 1);
+
+  int i = 0;
+  unsigned ac_sum = 0;  // Handed to rdoq()/quant() by address; not read afterwards in this function.
+
+  int y, x;
+  // Pack the residual densely (width_c values per row) into block; both inputs are read with stride LCU_WIDTH / 2.
+  for (y = 0; y < width_c; y++) {
+    for (x = 0; x < width_c; x++) {
+      block[i] = ((int16_t)base_u[x + y * (base_stride >> 1)]) -
+                  pred_u[x + y * (pred_stride >> 1)];
+      i++;
+    }
+  }
+  // NOTE(review): 65535 is the last transform2d() argument here while luma passes 0 - its meaning is not visible here.
+  transform2d(encoder, block, pre_quant_coeff, width_c, 65535);
+  if (encoder->rdoq_enable) {  // The literal 2 is passed for both U and V - presumably "chroma"; confirm against rdoq()/quant().
+    rdoq(encoder_state, pre_quant_coeff, coeff_u, width_c, width_c, &ac_sum, 2,
+         scan_idx_chroma, cur_cu->type, cur_cu->tr_depth-cur_cu->depth);
+  } else {
+    quant(encoder_state, pre_quant_coeff, coeff_u, width_c, width_c, &ac_sum, 2,
+          scan_idx_chroma, cur_cu->type);
+  }
+}
+
+// Dequantize coeff_u, inverse transform it and store CLIP(0, 255, pred_u + residual) into recbase_u.
+static void reconstruct_chroma(const encoder_state * const encoder_state, cu_info *cur_cu,
+                               int depth, coefficient *coeff_u,
+                               pixel *recbase_u, pixel *pred_u, int color_type,
+                               coefficient *pre_quant_coeff, coefficient *block)
+{
+  int8_t width_c = LCU_WIDTH >> (depth + 1);
+
+  int i, y, x;
+  // pre_quant_coeff and block are scratch buffers supplied by the caller; block is read back as the residual below.
+  dequant(encoder_state, coeff_u, pre_quant_coeff, width_c, width_c, (int8_t)color_type, cur_cu->type);
+  itransform2d(encoder_state->encoder_control, block, pre_quant_coeff, width_c, 65535);
+
+  i = 0;
+  // block is dense (width_c values per row); pred_u and recbase_u are addressed with stride LCU_WIDTH_C.
+  for (y = 0; y < width_c; y++) {
+    for (x = 0; x < width_c; x++) {
+      int16_t val = block[i++] + pred_u[x + y * LCU_WIDTH_C];
+      //TODO: support 10+bits
+      recbase_u[x + y * LCU_WIDTH_C] = (uint8_t)CLIP(0, 255, val);
+    }
+  }
+}
+
+// Quantize one chroma block: stores coefficients to orig_coeff_u, updates recbase_u, returns 1 if any coefficient is non-zero.
+int quantize_residual_chroma(encoder_state * const encoder_state,
+                             cu_info *cur_cu, int luma_depth, color_index color,
+                             const pixel *base_u, pixel *recbase_u, coefficient *orig_coeff_u)
+{
+  pixel pred_u[LCU_WIDTH*LCU_WIDTH>>2];
+  coefficient coeff_u[LCU_WIDTH*LCU_WIDTH>>2];
+  // Scratch buffers handed to transform_chroma() and reconstruct_chroma().
+  int16_t block[LCU_WIDTH*LCU_WIDTH>>2];
+  int16_t pre_quant_coeff[LCU_WIDTH*LCU_WIDTH>>2];
+  // When luma_depth is MAX_PU_DEPTH the chroma block uses the depth one level above it.
+  const int chroma_depth = (luma_depth == MAX_PU_DEPTH ? luma_depth - 1 : luma_depth);
+  const int8_t width_c = LCU_WIDTH >> (chroma_depth + 1);
+  // NOTE: the scan order is looked up with luma_depth, not chroma_depth.
+  const coeff_scan_order_t scan_idx_chroma = get_scan_order(cur_cu->type, cur_cu->intra[0].mode_chroma, luma_depth);
+
+  int has_coeffs = 0;
+  // Keep a copy of the incoming recbase_u pixels: recbase_u is overwritten by the reconstruction below.
+  {
+    int y, x;
+    for (y = 0; y < width_c; y++) {
+      for (x = 0; x < width_c; x++) {
+        pred_u[x + y * LCU_WIDTH_C] = recbase_u[x + y * LCU_WIDTH_C];
+      }
+    }
+  }
+
+  transform_chroma(encoder_state, cur_cu, chroma_depth, base_u, pred_u, coeff_u, scan_idx_chroma, pre_quant_coeff, block);
+  {
+    int i;
+    for (i = 0; i < width_c * width_c; i++) {
+      if (coeff_u[i] != 0) {
+        has_coeffs = 1;
+        break;
+      }
+    }
+  }
+  // Copy coefficients, even if they are all zeroes (dense coeff_u -> orig_coeff_u with stride LCU_WIDTH_C).
+  {
+    int i = 0;
+    int y, x;
+    for (y = 0; y < width_c; y++) {
+      for (x = 0; x < width_c; x++) {
+        orig_coeff_u[x + y * LCU_WIDTH_C] = coeff_u[i];
+        i++;
+      }
+    }
+  }
+  if (has_coeffs) {  // With all-zero coefficients recbase_u is left untouched.
+    reconstruct_chroma(encoder_state, cur_cu, chroma_depth,
+                        coeff_u, recbase_u, pred_u, (color == COLOR_U ? 2 : 3),
+                        pre_quant_coeff, block);
+  }
+
+  return has_coeffs;
+}
+
+// Compare transform skip against the regular transform for one block and store the choice in cur_cu->intra[pu_index].tr_skip.
+void decide_trskip(encoder_state * const encoder_state, cu_info *cur_cu, int8_t depth, int pu_index,
+                   int16_t *residual, uint32_t *ac_sum)
+{
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  const coeff_scan_order_t scan_idx_luma = get_scan_order(cur_cu->type, cur_cu->intra[pu_index].mode, depth);
+  const int8_t width = LCU_WIDTH >> depth;
+  // NOTE(review): quantization below hard-codes 4x4 and the temp buffers hold 16 values, so width is expected to be 4.
+  //int16_t block[LCU_WIDTH*LCU_WIDTH>>2];
+  int16_t pre_quant_coeff[LCU_WIDTH*LCU_WIDTH>>2];
+
+  int i;
+  coefficient temp_block[16];  coefficient temp_coeff[16];
+  coefficient temp_block2[16]; coefficient temp_coeff2[16];
+  uint32_t cost = 0,cost2 = 0;
+  uint32_t coeffcost = 0,coeffcost2 = 0;
+  // Names without suffix belong to the transform skip candidate, names ending in 2 to the transform2d candidate.
+  // Test for transform skip
+  transformskip(encoder, residual,pre_quant_coeff, width);
+  if (encoder->rdoq_enable) {
+    rdoq(encoder_state, pre_quant_coeff, temp_coeff, 4, 4, ac_sum, 0, scan_idx_luma, cur_cu->type,0);
+  } else {
+    quant(encoder_state, pre_quant_coeff, temp_coeff, 4, 4, ac_sum, 0, scan_idx_luma, cur_cu->type);
+  }
+  dequant(encoder_state, temp_coeff, pre_quant_coeff, 4, 4, 0, cur_cu->type);
+  itransformskip(encoder, temp_block,pre_quant_coeff,width);
+  // Same quantize/dequantize round trip with the regular 2D transform.
+  transform2d(encoder, residual,pre_quant_coeff,width,0);
+  if (encoder->rdoq_enable) {
+    rdoq(encoder_state, pre_quant_coeff, temp_coeff2, 4, 4, ac_sum, 0, scan_idx_luma, cur_cu->type,0);
+  } else {
+    quant(encoder_state, pre_quant_coeff, temp_coeff2, 4, 4, ac_sum, 0, scan_idx_luma, cur_cu->type);
+  }
+  dequant(encoder_state, temp_coeff2, pre_quant_coeff, 4, 4, 0, cur_cu->type);
+  itransform2d(encoder, temp_block2,pre_quant_coeff,width,0);
+
+  // SSD between the input residual and the reconstructed residual of each candidate
+  for (i = 0; i < 16; i++) {
+    int diff = temp_block[i] - residual[i];
+    cost += diff*diff;
+
+    diff = temp_block2[i] - residual[i];
+    cost2 += diff*diff;
+  }
+
+  // Simple RDO
+  if(encoder->rdo == 1) {
+    // Add the sum of absolute coefficients (x1.5) weighted by lambda; the transform skip candidate gets an extra +1.
+    for (i = 0; i < 16; i++) {
+      coeffcost += abs((int)temp_coeff[i]);
+      coeffcost2 += abs((int)temp_coeff2[i]);
+    }
+    cost += (1 + coeffcost + (coeffcost>>1))*((int)encoder_state->global->cur_lambda_cost+0.5);
+    cost2 += (coeffcost2 + (coeffcost2>>1))*((int)encoder_state->global->cur_lambda_cost+0.5);
+    // Full RDO
+  } else if(encoder->rdo == 2) {
+    coeffcost = get_coeff_cost(encoder_state, temp_coeff, 4, 0, scan_idx_luma);
+    coeffcost2 = get_coeff_cost(encoder_state, temp_coeff2, 4, 0, scan_idx_luma);
+
+    cost  += coeffcost*((int)encoder_state->global->cur_lambda_cost+0.5);
+    cost2 += coeffcost2*((int)encoder_state->global->cur_lambda_cost+0.5);
+  }
+  // Transform skip wins only when strictly cheaper; for other encoder->rdo values only the SSD is compared.
+  cur_cu->intra[pu_index].tr_skip = (cost < cost2);
+}
+
+
+/**
+ * This function calculates the residual coefficients for a region of the LCU
+ * (defined by x, y and depth) and updates the reconstruction with the
+ * quantized residual.
+ *
+ * It handles recursion for transform split, but that currently only works
+ * for 64x64 inter to 32x32 transform blocks.
+ *
+ * Inputs are:
+ * - lcu->rec  pixels after prediction for the area
+ * - lcu->ref  reference pixels for the area
+ * - lcu->cu   for the area
+ *
+ * Outputs are:
+ * - lcu->rec  reconstruction after quantized residual
+ * - lcu->coeff  quantized coefficients for the area
+ * - lcu->cu.cbf  coded block flags for the area
+ * - lcu->cu.intra[].tr_skip  for the area
+ */
+void encode_transform_tree(encoder_state * const encoder_state, int32_t x, int32_t y, const uint8_t depth, lcu_t* lcu)
+{
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  // Position inside the LCU (x and y masked to 0..63); the transform block is 64>>depth pixels wide.
+  const vector2d lcu_px = {x & 0x3f, y & 0x3f};
+  const int pu_index = PU_INDEX(lcu_px.x / 4, lcu_px.y / 4);
+  cu_info *cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x>>3) + (lcu_px.y>>3)*LCU_T_CU_WIDTH];
+  const int8_t width = LCU_WIDTH>>depth;
+  // cur_cu is looked up on an 8 pixel grid, while PU_INDEX() is given coordinates in units of 4 pixels.
+  int i;
+  
+  // Tell clang-analyzer what is up. For some reason it can't figure out from
+  // asserting just depth.
+  assert(width == 4 || width == 8 || width == 16 || width == 32 || width == 64);
+
+  // Split transform and increase depth. Depth 0 is always split.
+  if (depth == 0 || cur_cu->tr_depth > depth) {
+    int offset = width / 2;
+    encode_transform_tree(encoder_state, x,          y,          depth+1, lcu);
+    encode_transform_tree(encoder_state, x + offset, y,          depth+1, lcu);
+    encode_transform_tree(encoder_state, x,          y + offset, depth+1, lcu);
+    encode_transform_tree(encoder_state, x + offset, y + offset, depth+1, lcu);
+
+    // Propagate coded block flags from child CUs to parent CU.
+    if (depth < MAX_DEPTH) {
+      cu_info *cu_a =  &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset)>>3) +  (lcu_px.y>>3)        *LCU_T_CU_WIDTH];
+      cu_info *cu_b =  &lcu->cu[LCU_CU_OFFSET +  (lcu_px.x>>3)           + ((lcu_px.y+offset)>>3)*LCU_T_CU_WIDTH];
+      cu_info *cu_c =  &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset)>>3) + ((lcu_px.y+offset)>>3)*LCU_T_CU_WIDTH];
+      if (cbf_is_set(cu_a->cbf.y, depth+1) || cbf_is_set(cu_b->cbf.y, depth+1) || cbf_is_set(cu_c->cbf.y, depth+1)) {
+        cbf_set(&cur_cu->cbf.y, depth);
+      }
+      if (cbf_is_set(cu_a->cbf.u, depth+1) || cbf_is_set(cu_b->cbf.u, depth+1) || cbf_is_set(cu_c->cbf.u, depth+1)) {
+        cbf_set(&cur_cu->cbf.u, depth);
+      }
+      if (cbf_is_set(cu_a->cbf.v, depth+1) || cbf_is_set(cu_b->cbf.v, depth+1) || cbf_is_set(cu_c->cbf.v, depth+1)) {
+        cbf_set(&cur_cu->cbf.v, depth);
+      }
+    }
+
+    return;
+  }
+
+  {
+    const int luma_offset = lcu_px.x + lcu_px.y * LCU_WIDTH;
+
+    // Pointers to current location in arrays with prediction.
+    pixel *recbase_y = &lcu->rec.y[luma_offset];
+    // Pointers to current location in arrays with reference.
+    const pixel *base_y = &lcu->ref.y[luma_offset];
+    // Pointers to current location in arrays with quantized coefficients.
+    coefficient *orig_coeff_y = &lcu->coeff.y[luma_offset];
+
+    // Temporary buffers. Not really used for much. Possibly unnecessary.
+    pixel pred_y[LCU_WIDTH*LCU_WIDTH];
+    // Buffers for coefficients.
+    coefficient coeff_y[LCU_WIDTH*LCU_WIDTH];
+
+    // Temporary buffers for quantization and transformation.
+    int16_t block[LCU_WIDTH*LCU_WIDTH>>2];
+    int16_t pre_quant_coeff[LCU_WIDTH*LCU_WIDTH>>2];
+    // ac_sum is handed to decide_trskip()/rdoq()/quant() by address; SCAN_DIAG stays in effect for non-intra CUs.
+    uint32_t ac_sum = 0;
+    uint8_t scan_idx_luma   = SCAN_DIAG;
+
+    #if OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD
+    uint32_t residual_sum = 0;
+    #endif
+
+    // Clear coded block flag structures for depths lower than current depth.
+    // This should ensure that the CBF data doesn't get corrupted if this function
+    // is called more than once.
+    cbf_clear(&cur_cu->cbf.y, depth + pu_index);
+    if (pu_index == 0) {
+      cbf_clear(&cur_cu->cbf.u, depth);
+      cbf_clear(&cur_cu->cbf.v, depth);
+    }
+
+    // Pick coeff scan mode according to intra prediction mode. NOTE(review): chroma_mode is computed but never used.
+    if (cur_cu->type == CU_INTRA) {
+      int chroma_mode = cur_cu->intra[0].mode_chroma;
+      if (chroma_mode == 36) {
+        chroma_mode = cur_cu->intra[pu_index].mode;
+      }
+      scan_idx_luma = get_scan_order(cur_cu->type, cur_cu->intra[pu_index].mode, depth);
+    }
+    
+    // Copy the luma prediction to pred_y. NOTE: the parameters x and y are reused as loop counters from here on.
+    for(y = 0; y < width; y++) {
+      for(x = 0; x < width; x++) {
+        pred_y[x+y*LCU_WIDTH]=recbase_y[x+y*LCU_WIDTH];
+      }
+    }
+
+    // Get residual by subtracting prediction
+    i = 0;
+    ac_sum = 0;
+
+    for (y = 0; y < width; y++) {
+      for (x = 0; x < width; x++) {
+        block[i] = ((int16_t)base_y[x + y * LCU_WIDTH]) -
+                   pred_y[x + y * LCU_WIDTH];
+        #if OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD
+        residual_sum += block[i];  // NOTE(review): signed residual accumulated into a uint32_t - confirm intent.
+        #endif
+        i++;
+      }
+    }
+    #if OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD
+    #define RESIDUAL_THRESHOLD 500
+    if(residual_sum < RESIDUAL_THRESHOLD/(width)) {
+      memset(block, 0, sizeof(int16_t)*(width)*(width));
+    }
+    #endif
+
+    // For 4x4 blocks, check for transform skip
+    if(width == 4 && encoder->trskip_enable) {
+      decide_trskip(encoder_state, cur_cu, depth, pu_index, block, &ac_sum);
+    }
+
+    // Transform and quant residual to coeffs
+    if(width == 4 && cur_cu->intra[pu_index].tr_skip) {
+      transformskip(encoder, block,pre_quant_coeff,width);
+    } else {
+      transform2d(encoder, block,pre_quant_coeff,width,0);
+    }
+
+    if (encoder->rdoq_enable) {
+      rdoq(encoder_state, pre_quant_coeff, coeff_y, width, width, &ac_sum, 0,
+           scan_idx_luma, cur_cu->type, cur_cu->tr_depth-cur_cu->depth);
+    } else {
+      quant(encoder_state, pre_quant_coeff, coeff_y, width, width, &ac_sum, 0, scan_idx_luma, cur_cu->type);
+    }
+
+    // Check for non-zero coeffs
+    for (i = 0; i < width * width; i++) {
+      if (coeff_y[i] != 0) {
+        // Found one, we can break here
+        cbf_set(&cur_cu->cbf.y, depth + pu_index);
+        break;
+      }
+    }
+
+    // Copy coefficients, even if they are all zeroes. This takes care of the
+    // case where the original coefficients aren't already zeroed.
+    {
+      int i = 0;
+      for (y = 0; y < width; y++) {
+        for (x = 0; x < width; x++) {
+          orig_coeff_y[x + y * LCU_WIDTH] = coeff_y[i];
+          i++;
+        }
+      }
+    }
+
+    if (cbf_is_set(cur_cu->cbf.y, depth + pu_index)) {
+      // Combine inverse quantized coefficients with the prediction to get
+      // reconstructed image.
+      //picture_set_block_residual(cur_pic,x_cu,y_cu,depth,1);
+      int i;
+
+      dequant(encoder_state, coeff_y, pre_quant_coeff, width, width, 0, cur_cu->type);
+      if(width == 4 && cur_cu->intra[pu_index].tr_skip) {
+        itransformskip(encoder, block,pre_quant_coeff,width);
+      } else {
+        itransform2d(encoder, block,pre_quant_coeff,width,0);
+      }
+
+      i = 0;
+
+      for (y = 0; y < width; y++) {
+        for (x = 0; x < width; x++) {
+          int val = block[i++] + pred_y[x + y * LCU_WIDTH];
+          //TODO: support 10+bits
+          recbase_y[x + y * LCU_WIDTH] = (pixel)CLIP(0, 255, val);
+        }
+      }
+    }
+  }
+
+  // If luma is 4x4, do chroma for the 8x8 luma area when handling the top
+  // left PU because the coordinates are correct.
+  if (depth <= MAX_DEPTH || pu_index == 0) {
+    const int chroma_offset = lcu_px.x / 2 + lcu_px.y / 2 * LCU_WIDTH / 2;
+    pixel *recbase_u = &lcu->rec.u[chroma_offset];
+    pixel *recbase_v = &lcu->rec.v[chroma_offset];
+    const pixel *base_u = &lcu->ref.u[chroma_offset];
+    const pixel *base_v = &lcu->ref.v[chroma_offset];
+    coefficient *orig_coeff_u = &lcu->coeff.u[chroma_offset];
+    coefficient *orig_coeff_v = &lcu->coeff.v[chroma_offset];
+
+    if (cur_cu->intra[0].mode_chroma == 36) {  // 36 is replaced in place by the luma mode of PU 0 (presumably "derive from luma"; confirm).
+      cur_cu->intra[0].mode_chroma = cur_cu->intra[0].mode;
+    }
+    if (quantize_residual_chroma(encoder_state, cur_cu, depth, COLOR_U, base_u, recbase_u, orig_coeff_u)) {
+      cbf_set(&cur_cu->cbf.u, depth);
+    }
+    if (quantize_residual_chroma(encoder_state, cur_cu, depth, COLOR_V, base_v, recbase_v, orig_coeff_v)) {
+      cbf_set(&cur_cu->cbf.v, depth);
+    }
+  }
+}
+
diff --git a/src/transform.h b/src/transform.h
index 4cc8870a..f54f773c 100644
--- a/src/transform.h
+++ b/src/transform.h
@@ -46,4 +46,6 @@ void itransform2d(const encoder_control *encoder, int16_t *block,int16_t *coeff,
 
 int32_t get_scaled_qp(int8_t type, int8_t qp, int8_t qp_offset);
 
+void encode_transform_tree(encoder_state *encoder_state, int32_t x, int32_t y, uint8_t depth, lcu_t* lcu);
+
 #endif