diff --git a/src/global.h b/src/global.h
index 635f4115..22c05b4c 100644
--- a/src/global.h
+++ b/src/global.h
@@ -86,7 +86,10 @@ typedef int16_t coefficient;
 
 #define CU_MIN_SIZE_PIXELS (1 << MIN_SIZE) /*!< pow(2, MIN_SIZE) */
 #define LCU_WIDTH (1 << (MIN_SIZE + MAX_DEPTH)) /*!< spec: CtbSizeY */
-#define LCU_WIDTH_C (LCU_WIDTH / 2) /*!< CtbWidthC and CtbHeightC */
+#define LCU_WIDTH_C (LCU_WIDTH / 2) /*!< spec: CtbWidthC and CtbHeightC */
+
+#define TR_MAX_LOG2_SIZE 5 /*!< spec: Log2MaxTrafoSize <= Min(CtbLog2SizeY, 5) */
+#define TR_MAX_WIDTH (1 << 5) /*!< spec: Log2MaxTrafoSize */
 
 #if LCU_WIDTH != 64
   #error "Kvazaar only support LCU_WIDTH == 64"
diff --git a/src/picture.c b/src/picture.c
index 9ac83270..4098c059 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -949,3 +949,20 @@ unsigned calc_sad(const picture *pic, const picture *ref,
     return interpolated_sad(pic, ref, pic_x, pic_y, ref_x, ref_y, block_width, block_height);
   }
 }
+
+unsigned calc_ssd(const pixel *const ref, const pixel *const rec,
+                 const int ref_stride, const int rec_stride,
+                 const int width)
+{
+  int ssd = 0;
+  int y, x;
+
+  for (y = 0; y < width; ++y) {
+    for (x = 0; x < width; ++x) {
+      int diff = ref[x + y * ref_stride] - rec[x + y * rec_stride];
+      ssd += diff * diff;
+    }
+  }
+
+  return ssd;
+}
diff --git a/src/picture.h b/src/picture.h
index 461a1029..0b468ed1 100644
--- a/src/picture.h
+++ b/src/picture.h
@@ -271,6 +271,10 @@ unsigned calc_sad(const picture *pic, const picture *ref,
                   int pic_x, int pic_y, int ref_x, int ref_y,
                   int block_width, int block_height);
 
+unsigned calc_ssd(const pixel *const ref, const pixel *const rec,
+                  const int ref_stride, const int rec_stride,
+                  const int width);
+
 double image_psnr(pixel *frame1, pixel *frame2, int32_t x, int32_t y);
 
 
diff --git a/src/rdo.c b/src/rdo.c
index 7c0fb81d..f9c2f76f 100644
--- a/src/rdo.c
+++ b/src/rdo.c
@@ -70,7 +70,6 @@ uint32_t rdo_cost_intra(encoder_state * const encoder_state, pixel *pred, pixel
     int16_t block[LCU_WIDTH*LCU_WIDTH>>2];
     int16_t temp_block[LCU_WIDTH*LCU_WIDTH>>2];
     coefficient temp_coeff[LCU_WIDTH*LCU_WIDTH>>2];
-    uint32_t ac_sum;
     uint32_t cost = 0;
     uint32_t coeffcost = 0;
     int8_t luma_scan_mode = SCAN_DIAG;
@@ -93,9 +92,9 @@ uint32_t rdo_cost_intra(encoder_state * const encoder_state, pixel *pred, pixel
     }
     transform2d(encoder, block,pre_quant_coeff,width,0);
     if(encoder->rdoq_enable) {
-      rdoq(encoder_state, pre_quant_coeff, temp_coeff, width, width, &ac_sum, 0, luma_scan_mode, CU_INTRA,0);
+      rdoq(encoder_state, pre_quant_coeff, temp_coeff, width, width, 0, luma_scan_mode, CU_INTRA,0);
     } else {
-      quant(encoder_state, pre_quant_coeff, temp_coeff, width, width, &ac_sum, 0, luma_scan_mode, CU_INTRA);
+      quant(encoder_state, pre_quant_coeff, temp_coeff, width, width, 0, luma_scan_mode, CU_INTRA);
     }
     dequant(encoder_state, temp_coeff, pre_quant_coeff, width, width, 0, CU_INTRA);
     itransform2d(encoder, temp_block,pre_quant_coeff,width,0);
@@ -122,6 +121,7 @@ uint32_t rdo_cost_intra(encoder_state * const encoder_state, pixel *pred, pixel
     return cost;
 }
 
+
 /** Calculate actual (or really close to actual) bitcost for coding coefficients
  * \param coeff coefficient array
  * \param width coeff block width
@@ -391,7 +391,7 @@ static void calc_last_bits(encoder_state * const encoder_state, int32_t width, i
  * From HM 12.0
  */
 void  rdoq(encoder_state * const encoder_state, coefficient *coef, coefficient *dest_coeff, int32_t width,
-           int32_t height, uint32_t *abs_sum, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth)
+           int32_t height, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth)
 {
   const encoder_control * const encoder = encoder_state->encoder_control;
   cabac_data * const cabac = &encoder_state->cabac;
@@ -403,6 +403,7 @@ void  rdoq(encoder_state * const encoder_state, coefficient *coef, coefficient *
   int32_t  scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
 
   int32_t qp_scaled = get_scaled_qp(type, encoder_state->global->QP, 0);
+  uint32_t abs_sum = 0;
 
   {
   int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift;
@@ -700,7 +701,7 @@ void  rdoq(encoder_state * const encoder_state, coefficient *coef, coefficient *
   for ( scanpos = 0; scanpos < best_last_idx_p1; scanpos++ ) {
     int32_t blkPos = scan[ scanpos ];
     int32_t level  = dest_coeff[ blkPos ];
-    *abs_sum += level;
+    abs_sum += level;
     dest_coeff[ blkPos ] = (coefficient)(( coef[ blkPos ] < 0 ) ? -level : level);
   }
 
@@ -709,7 +710,7 @@ void  rdoq(encoder_state * const encoder_state, coefficient *coef, coefficient *
     dest_coeff[ scan[ scanpos ] ] = 0;
   }
 #if ENABLE_SIGN_HIDING == 1
-  if(*abs_sum >= 2) {
+  if(abs_sum >= 2) {
     int64_t rd_factor = (int64_t) (
                      g_inv_quant_scales[qp_scaled%6] * g_inv_quant_scales[qp_scaled%6] * (1<<(2*(qp_scaled/6)))
                    /  encoder_state->global->cur_lambda_cost / 16 / (1<<(2*(encoder->bitdepth-8)))
diff --git a/src/rdo.h b/src/rdo.h
index 598018e8..e188c395 100644
--- a/src/rdo.h
+++ b/src/rdo.h
@@ -43,7 +43,7 @@ extern const uint32_t g_go_rice_prefix_len[5];
 
 
 void  rdoq(encoder_state *encoder_state, coefficient *coef, coefficient *dest_coeff, int32_t width,
-           int32_t height, uint32_t *abs_sum, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth);
+           int32_t height, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth);
 
 uint32_t rdo_cost_intra(encoder_state *encoder, pixel* pred, pixel* orig_block, int width, int8_t mode);
 
diff --git a/src/transform.c b/src/transform.c
index 8469ebd8..271b8413 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -626,11 +626,12 @@ void itransform2d(const encoder_control * const encoder,int16_t *block,int16_t *
  *
  */
 void quant(const encoder_state * const encoder_state, int16_t *coef, int16_t *q_coef, int32_t width,
-           int32_t height, uint32_t *ac_sum, int8_t type, int8_t scan_idx, int8_t block_type )
+           int32_t height, int8_t type, int8_t scan_idx, int8_t block_type )
 {
   const encoder_control * const encoder = encoder_state->encoder_control;
   const uint32_t log2_block_size = g_convert_to_bit[ width ] + 2;
   const uint32_t * const scan = g_sig_last_scan[ scan_idx ][ log2_block_size - 1 ];
+  uint32_t ac_sum = 0;
 
   #if ENABLE_SIGN_HIDING == 1
   int32_t delta_u[LCU_WIDTH*LCU_WIDTH>>2];
@@ -662,7 +663,7 @@ void quant(const encoder_state * const encoder_state, int16_t *coef, int16_t *q_
 
     #if ENABLE_SIGN_HIDING == 1
     delta_u[n] = (int32_t)( ((int64_t)abs(coef[n]) * quant_coeff[n] - (level<<q_bits) )>> q_bits8 );
-    *ac_sum += level;
+    ac_sum += level;
     #endif
 
     level *= sign;
@@ -670,7 +671,7 @@ void quant(const encoder_state * const encoder_state, int16_t *coef, int16_t *q_
   }
 
   #if ENABLE_SIGN_HIDING == 1
-  if(*ac_sum >= 2) {
+  if(ac_sum >= 2) {
     #define SCAN_SET_SIZE 16
     #define LOG2_SCAN_SET_SIZE 4
     int32_t n,last_cg = -1, abssum = 0, subset, subpos;
@@ -802,185 +803,174 @@ void dequant(const encoder_state * const encoder_state, int16_t *q_coef, int16_t
 }
 
 
-static void transform_chroma(encoder_state * const encoder_state, cu_info *cur_cu,
-                             int depth, const pixel *base_u, pixel *pred_u,
-                             coefficient *coeff_u, int8_t scan_idx_chroma,
-                             coefficient *pre_quant_coeff, coefficient *block)
+/**
+ * \brief Quantize residual and get both the reconstruction and coeffs.
+ * 
+ * \param width  Transform width.
+ * \param color  Color.
+ * \param scan_order  Coefficient scan order.
+ * \param use_trskip  Whether transform skip is used.
+ * \param stride  Stride for ref_in, pred_in rec_out and coeff_out.
+ * \param ref_in  Reference pixels.
+ * \param pred_in  Predicted pixels.
+ * \param rec_out  Reconstructed pixels.
+ * \param coeff_out  Coefficients used for reconstruction of rec_out.
+ *
+ * \returns  Whether coeff_out contains any non-zero coefficients.
+ */
+int quantize_residual(encoder_state *const encoder_state,
+                      const cu_info *const cur_cu, const int width, const color_index color,
+                      const coeff_scan_order_t scan_order, const int use_trskip, 
+                      const int in_stride, const int out_stride,
+                      const pixel *const ref_in, const pixel *const pred_in, 
+                      pixel *rec_out, coefficient *coeff_out)
 {
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  int base_stride = LCU_WIDTH;
-  int pred_stride = LCU_WIDTH;
-
-  int8_t width_c = LCU_WIDTH >> (depth + 1);
-
-  int i = 0;
-  unsigned ac_sum = 0;
-
-  int y, x;
-
-  for (y = 0; y < width_c; y++) {
-    for (x = 0; x < width_c; x++) {
-      block[i] = ((int16_t)base_u[x + y * (base_stride >> 1)]) -
-                  pred_u[x + y * (pred_stride >> 1)];
-      i++;
-    }
-  }
-
-  transform2d(encoder, block, pre_quant_coeff, width_c, 65535);
-  if (encoder->rdoq_enable) {
-    rdoq(encoder_state, pre_quant_coeff, coeff_u, width_c, width_c, &ac_sum, 2,
-         scan_idx_chroma, cur_cu->type, cur_cu->tr_depth-cur_cu->depth);
-  } else {
-    quant(encoder_state, pre_quant_coeff, coeff_u, width_c, width_c, &ac_sum, 2,
-          scan_idx_chroma, cur_cu->type);
-  }
-}
-
-
-static void reconstruct_chroma(const encoder_state * const encoder_state, cu_info *cur_cu,
-                               int depth, coefficient *coeff_u,
-                               pixel *recbase_u, pixel *pred_u, int color_type,
-                               coefficient *pre_quant_coeff, coefficient *block)
-{
-  int8_t width_c = LCU_WIDTH >> (depth + 1);
-
-  int i, y, x;
-
-  dequant(encoder_state, coeff_u, pre_quant_coeff, width_c, width_c, (int8_t)color_type, cur_cu->type);
-  itransform2d(encoder_state->encoder_control, block, pre_quant_coeff, width_c, 65535);
-
-  i = 0;
-
-  for (y = 0; y < width_c; y++) {
-    for (x = 0; x < width_c; x++) {
-      int16_t val = block[i++] + pred_u[x + y * LCU_WIDTH_C];
-      //TODO: support 10+bits
-      recbase_u[x + y * LCU_WIDTH_C] = (uint8_t)CLIP(0, 255, val);
-    }
-  }
-}
-
-
-int quantize_residual_chroma(encoder_state * const encoder_state,
-                             cu_info *cur_cu, int luma_depth, color_index color,
-                             const pixel *base_u, pixel *recbase_u, coefficient *orig_coeff_u)
-{
-  pixel pred_u[LCU_WIDTH*LCU_WIDTH>>2];
-  coefficient coeff_u[LCU_WIDTH*LCU_WIDTH>>2];
-
-  int16_t block[LCU_WIDTH*LCU_WIDTH>>2];
-  int16_t pre_quant_coeff[LCU_WIDTH*LCU_WIDTH>>2];
-
-  const int chroma_depth = (luma_depth == MAX_PU_DEPTH ? luma_depth - 1 : luma_depth);
-  const int8_t width_c = LCU_WIDTH >> (chroma_depth + 1);
-
-  const coeff_scan_order_t scan_idx_chroma = get_scan_order(cur_cu->type, cur_cu->intra[0].mode_chroma, luma_depth);
+  // Temporary arrays to pass data to and from quant and transform functions.
+  int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
+  coefficient quant_coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
+  coefficient coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
 
   int has_coeffs = 0;
 
+  // Get residual. (ref_in - pred_in -> residual)
   {
     int y, x;
-    for (y = 0; y < width_c; y++) {
-      for (x = 0; x < width_c; x++) {
-        pred_u[x + y * LCU_WIDTH_C] = recbase_u[x + y * LCU_WIDTH_C];
+    for (y = 0; y < width; ++y) {
+      for (x = 0; x < width; ++x) {
+        residual[x + y * width] = (int16_t)(ref_in[x + y * in_stride] - pred_in[x + y * in_stride]);
       }
     }
   }
+  
+  // Transform residual. (residual -> coeff)
+  if (use_trskip) {
+    transformskip(encoder_state->encoder_control, residual, coeff, width);
+  } else {
+    transform2d(encoder_state->encoder_control, residual, coeff, width, (color == COLOR_Y ? 0 : 65535));
+  }
 
-  transform_chroma(encoder_state, cur_cu, chroma_depth, base_u, pred_u, coeff_u, scan_idx_chroma, pre_quant_coeff, block);
+  // Quantize coeffs. (coeff -> quant_coeff)
+  if (encoder_state->encoder_control->rdoq_enable) {
+    rdoq(encoder_state, coeff, quant_coeff, width, width, (color == COLOR_Y ? 0 : 2),
+         scan_order, cur_cu->type, cur_cu->tr_depth-cur_cu->depth);
+  } else {
+    quant(encoder_state, coeff, quant_coeff, width, width, (color == COLOR_Y ? 0 : 2),
+          scan_order, cur_cu->type);
+  }
+
+  // Check if there are any non-zero coefficients.
   {
     int i;
-    for (i = 0; i < width_c * width_c; i++) {
-      if (coeff_u[i] != 0) {
+    for (i = 0; i < width * width; ++i) {
+      if (quant_coeff[i] != 0) {
         has_coeffs = 1;
         break;
       }
     }
   }
-  // Copy coefficients, even if they are all zeroes.
-  {
-    int i = 0;
+
+  // Copy coefficients to coeff_out.
+  picture_blit_coeffs(quant_coeff, coeff_out, width, width, width, out_stride);
+
+  // Do the inverse quantization and transformation and the reconstruction to
+  // rec_out.
+  if (has_coeffs) {
     int y, x;
-    for (y = 0; y < width_c; y++) {
-      for (x = 0; x < width_c; x++) {
-        orig_coeff_u[x + y * LCU_WIDTH_C] = coeff_u[i];
-        i++;
+
+    // Get quantized residual. (quant_coeff -> coeff -> residual)
+    dequant(encoder_state, quant_coeff, coeff, width, width, (color == COLOR_Y ? 0 : (color == COLOR_U ? 2 : 3)), cur_cu->type);
+    if (use_trskip) {
+      itransformskip(encoder_state->encoder_control, residual, coeff, width);
+    } else {
+      itransform2d(encoder_state->encoder_control, residual, coeff, width, (color == COLOR_Y ? 0 : 65535));
+    }
+
+    // Get quantized reconstruction. (residual + pred_in -> rec_out)
+    for (y = 0; y < width; ++y) {
+      for (x = 0; x < width; ++x) {
+        int16_t val = residual[x + y * width] + pred_in[x + y * in_stride];
+        rec_out[x + y * out_stride] = (uint8_t)CLIP(0, 255, val);
+      }
+    }
+  } else if (rec_out != pred_in) {
+    // With no coeffs and rec_out == pred_int we skip copying the coefficients
+    // because the reconstruction is just the prediction.
+    int y, x;
+
+    for (y = 0; y < width; ++y) {
+      for (x = 0; x < width; ++x) {
+        rec_out[x + y * out_stride] = pred_in[x + y * in_stride];
       }
     }
-  }
-  if (has_coeffs) {
-    reconstruct_chroma(encoder_state, cur_cu, chroma_depth,
-                        coeff_u, recbase_u, pred_u, (color == COLOR_U ? 2 : 3),
-                        pre_quant_coeff, block);
   }
 
   return has_coeffs;
 }
 
 
-void decide_trskip(encoder_state * const encoder_state, cu_info *cur_cu, int8_t depth, int pu_index,
-                   int16_t *residual, uint32_t *ac_sum)
+/**
+ * \brief Like quantize_residual except that this uses trskip if that is better.
+ *
+ * Using this function saves one step of quantization and inverse quantization
+ * compared to doing the decision separately from the actual operation.
+ *
+ * \param width  Transform width.
+ * \param color  Color.
+ * \param scan_order  Coefficient scan order.
+ * \param trskip_out  Whether transform skip is used.
+ * \param stride  Stride for ref_in, pred_in rec_out and coeff_out.
+ * \param ref_in  Reference pixels.
+ * \param pred_in  Predicted pixels.
+ * \param rec_out  Reconstructed pixels.
+ * \param coeff_out  Coefficients used for reconstruction of rec_out.
+ *
+ * \returns  Whether coeff_out contains any non-zero coefficients.
+ */
+int quantize_residual_trskip(
+    encoder_state *const encoder_state,
+    const cu_info *const cur_cu, const int width, const color_index color,
+    const coeff_scan_order_t scan_order, int8_t *trskip_out, 
+    const int in_stride, const int out_stride,
+    const pixel *const ref_in, const pixel *const pred_in, 
+    pixel *rec_out, coefficient *coeff_out)
 {
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  const coeff_scan_order_t scan_idx_luma = get_scan_order(cur_cu->type, cur_cu->intra[pu_index].mode, depth);
-  const int8_t width = LCU_WIDTH >> depth;
+  struct {
+    pixel rec[4*4];
+    coefficient coeff[4*4];
+    unsigned cost;
+    int has_coeffs;
+  } skip, noskip, *best;
+  
+  noskip.has_coeffs = quantize_residual(
+      encoder_state, cur_cu, width, color, scan_order,
+      0, in_stride, 4,
+      ref_in, pred_in, noskip.rec, noskip.coeff);
+  noskip.cost = calc_ssd(ref_in, noskip.rec, in_stride, 4, 4);
+  noskip.cost += get_coeff_cost(encoder_state, noskip.coeff, 4, 0, scan_order) * (int32_t)(encoder_state->global->cur_lambda_cost+0.5);
 
-  //int16_t block[LCU_WIDTH*LCU_WIDTH>>2];
-  int16_t pre_quant_coeff[LCU_WIDTH*LCU_WIDTH>>2];
+  skip.has_coeffs = quantize_residual(
+      encoder_state, cur_cu, width, color, scan_order,
+      1, in_stride, 4,
+      ref_in, pred_in, skip.rec, skip.coeff);
+  skip.cost = calc_ssd(ref_in, skip.rec, in_stride, 4, 4);
+  skip.cost += get_coeff_cost(encoder_state, skip.coeff, 4, 0, scan_order) * (int32_t)(encoder_state->global->cur_lambda_cost+0.5);
 
-  int i;
-  coefficient temp_block[16];  coefficient temp_coeff[16];
-  coefficient temp_block2[16]; coefficient temp_coeff2[16];
-  uint32_t cost = 0,cost2 = 0;
-  uint32_t coeffcost = 0,coeffcost2 = 0;
-
-  // Test for transform skip
-  transformskip(encoder, residual,pre_quant_coeff, width);
-  if (encoder->rdoq_enable) {
-    rdoq(encoder_state, pre_quant_coeff, temp_coeff, 4, 4, ac_sum, 0, scan_idx_luma, cur_cu->type,0);
+  if (noskip.cost <= skip.cost) {
+    *trskip_out = 0;
+    best = &noskip;
   } else {
-    quant(encoder_state, pre_quant_coeff, temp_coeff, 4, 4, ac_sum, 0, scan_idx_luma, cur_cu->type);
-  }
-  dequant(encoder_state, temp_coeff, pre_quant_coeff, 4, 4, 0, cur_cu->type);
-  itransformskip(encoder, temp_block,pre_quant_coeff,width);
-
-  transform2d(encoder, residual,pre_quant_coeff,width,0);
-  if (encoder->rdoq_enable) {
-    rdoq(encoder_state, pre_quant_coeff, temp_coeff2, 4, 4, ac_sum, 0, scan_idx_luma, cur_cu->type,0);
-  } else {
-    quant(encoder_state, pre_quant_coeff, temp_coeff2, 4, 4, ac_sum, 0, scan_idx_luma, cur_cu->type);
-  }
-  dequant(encoder_state, temp_coeff2, pre_quant_coeff, 4, 4, 0, cur_cu->type);
-  itransform2d(encoder, temp_block2,pre_quant_coeff,width,0);
-
-  // SSD between original and reconstructed
-  for (i = 0; i < 16; i++) {
-    int diff = temp_block[i] - residual[i];
-    cost += diff*diff;
-
-    diff = temp_block2[i] - residual[i];
-    cost2 += diff*diff;
+    *trskip_out = 1;
+    best = &skip;
   }
 
-  // Simple RDO
-  if(encoder->rdo == 1) {
-    // SSD between reconstruction and original + sum of coeffs
-    for (i = 0; i < 16; i++) {
-      coeffcost += abs((int)temp_coeff[i]);
-      coeffcost2 += abs((int)temp_coeff2[i]);
-    }
-    cost += (1 + coeffcost + (coeffcost>>1))*((int)encoder_state->global->cur_lambda_cost+0.5);
-    cost2 += (coeffcost2 + (coeffcost2>>1))*((int)encoder_state->global->cur_lambda_cost+0.5);
-    // Full RDO
-  } else if(encoder->rdo == 2) {
-    coeffcost = get_coeff_cost(encoder_state, temp_coeff, 4, 0, scan_idx_luma);
-    coeffcost2 = get_coeff_cost(encoder_state, temp_coeff2, 4, 0, scan_idx_luma);
-
-    cost  += coeffcost*((int)encoder_state->global->cur_lambda_cost+0.5);
-    cost2 += coeffcost2*((int)encoder_state->global->cur_lambda_cost+0.5);
+  if (best->has_coeffs || rec_out != pred_in) {
+    // If there is no residual and reconstruction is already in rec_out, 
+    // we can skip this.
+    picture_blit_pixels(best->rec, rec_out, width, width, 4, out_stride);
   }
+  picture_blit_coeffs(best->coeff, coeff_out, width, width, 4, out_stride);
 
-  cur_cu->intra[pu_index].tr_skip = (cost < cost2);
+  return best->has_coeffs;
 }
 
 
@@ -1005,15 +995,12 @@ void decide_trskip(encoder_state * const encoder_state, cu_info *cur_cu, int8_t
  */
 void encode_transform_tree(encoder_state * const encoder_state, int32_t x, int32_t y, const uint8_t depth, lcu_t* lcu)
 {
-  const encoder_control * const encoder = encoder_state->encoder_control;
   // we have 64>>depth transform size
   const vector2d lcu_px = {x & 0x3f, y & 0x3f};
   const int pu_index = PU_INDEX(lcu_px.x / 4, lcu_px.y / 4);
   cu_info *cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x>>3) + (lcu_px.y>>3)*LCU_T_CU_WIDTH];
   const int8_t width = LCU_WIDTH>>depth;
   
-  int i;
-  
   // Tell clang-analyzer what is up. For some reason it can't figure out from
   // asserting just depth.
   assert(width == 4 || width == 8 || width == 16 || width == 32 || width == 64);
@@ -1055,17 +1042,7 @@ void encode_transform_tree(encoder_state * const encoder_state, int32_t x, int32
     // Pointers to current location in arrays with kvantized coefficients.
     coefficient *orig_coeff_y = &lcu->coeff.y[luma_offset];
 
-    // Temporary buffers. Not really used for much. Possibly unnecessary.
-    pixel pred_y[LCU_WIDTH*LCU_WIDTH];
-    // Buffers for coefficients.
-    coefficient coeff_y[LCU_WIDTH*LCU_WIDTH];
-
-    // Temporary buffers for kvantization and transformation.
-    int16_t block[LCU_WIDTH*LCU_WIDTH>>2];
-    int16_t pre_quant_coeff[LCU_WIDTH*LCU_WIDTH>>2];
-    
-    uint32_t ac_sum = 0;
-    uint8_t scan_idx_luma   = SCAN_DIAG;
+    coeff_scan_order_t scan_idx_luma = get_scan_order(cur_cu->type, cur_cu->intra[pu_index].mode, depth);
 
     #if OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD
     uint32_t residual_sum = 0;
@@ -1080,104 +1057,26 @@ void encode_transform_tree(encoder_state * const encoder_state, int32_t x, int32
       cbf_clear(&cur_cu->cbf.v, depth);
     }
 
-    // Pick coeff scan mode according to intra prediction mode.
-    if (cur_cu->type == CU_INTRA) {
-      int chroma_mode = cur_cu->intra[0].mode_chroma;
-      if (chroma_mode == 36) {
-        chroma_mode = cur_cu->intra[pu_index].mode;
-      }
-      scan_idx_luma = get_scan_order(cur_cu->type, cur_cu->intra[pu_index].mode, depth);
-    }
-    
-    // Copy Luma and Chroma to the pred-block
-    for(y = 0; y < width; y++) {
-      for(x = 0; x < width; x++) {
-        pred_y[x+y*LCU_WIDTH]=recbase_y[x+y*LCU_WIDTH];
-      }
-    }
-
-    // Get residual by subtracting prediction
-    i = 0;
-    ac_sum = 0;
-
-    for (y = 0; y < width; y++) {
-      for (x = 0; x < width; x++) {
-        block[i] = ((int16_t)base_y[x + y * LCU_WIDTH]) -
-                   pred_y[x + y * LCU_WIDTH];
-        #if OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD
-        residual_sum += block[i];
-        #endif
-        i++;
-      }
-    }
-    #if OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD
-    #define RESIDUAL_THRESHOLD 500
-    if(residual_sum < RESIDUAL_THRESHOLD/(width)) {
-      memset(block, 0, sizeof(int16_t)*(width)*(width));
-    }
-    #endif
-
-    // For 4x4 blocks, check for transform skip
-    if(width == 4 && encoder->trskip_enable) {
-      decide_trskip(encoder_state, cur_cu, depth, pu_index, block, &ac_sum);
-    }
-
-    // Transform and quant residual to coeffs
-    if(width == 4 && cur_cu->intra[pu_index].tr_skip) {
-      transformskip(encoder, block,pre_quant_coeff,width);
-    } else {
-      transform2d(encoder, block,pre_quant_coeff,width,0);
-    }
-
-    if (encoder->rdoq_enable) {
-      rdoq(encoder_state, pre_quant_coeff, coeff_y, width, width, &ac_sum, 0,
-           scan_idx_luma, cur_cu->type, cur_cu->tr_depth-cur_cu->depth);
-    } else {
-      quant(encoder_state, pre_quant_coeff, coeff_y, width, width, &ac_sum, 0, scan_idx_luma, cur_cu->type);
-    }
-
-    // Check for non-zero coeffs
-    for (i = 0; i < width * width; i++) {
-      if (coeff_y[i] != 0) {
-        // Found one, we can break here
+    if (width == 4 && encoder_state->encoder_control->trskip_enable) {
+      // Try quantization with trskip and use it if it's better.
+      int has_coeffs = quantize_residual_trskip(
+          encoder_state, cur_cu, width, COLOR_Y, scan_idx_luma,
+          &cur_cu->intra[pu_index].tr_skip,
+          LCU_WIDTH, LCU_WIDTH,
+          base_y, recbase_y, recbase_y, orig_coeff_y
+      );
+      if (has_coeffs) {
         cbf_set(&cur_cu->cbf.y, depth + pu_index);
-        break;
       }
-    }
-
-    // Copy coefficients, even if they are all zeroes. This takes care of the
-    // case where the original coefficients aren't already zeroed.
-    {
-      int i = 0;
-      for (y = 0; y < width; y++) {
-        for (x = 0; x < width; x++) {
-          orig_coeff_y[x + y * LCU_WIDTH] = coeff_y[i];
-          i++;
-        }
-      }
-    }
-
-    if (cbf_is_set(cur_cu->cbf.y, depth + pu_index)) {
-      // Combine inverese quantized coefficients with the prediction to get
-      // reconstructed image.
-      //picture_set_block_residual(cur_pic,x_cu,y_cu,depth,1);
-      int i;
-
-      dequant(encoder_state, coeff_y, pre_quant_coeff, width, width, 0, cur_cu->type);
-      if(width == 4 && cur_cu->intra[pu_index].tr_skip) {
-        itransformskip(encoder, block,pre_quant_coeff,width);
-      } else {
-        itransform2d(encoder, block,pre_quant_coeff,width,0);
-      }
-
-      i = 0;
-
-      for (y = 0; y < width; y++) {
-        for (x = 0; x < width; x++) {
-          int val = block[i++] + pred_y[x + y * LCU_WIDTH];
-          //TODO: support 10+bits
-          recbase_y[x + y * LCU_WIDTH] = (pixel)CLIP(0, 255, val);
-        }
+    } else {
+      int has_coeffs = quantize_residual(
+          encoder_state, cur_cu, width, COLOR_Y, scan_idx_luma,
+          0,
+          LCU_WIDTH, LCU_WIDTH,
+          base_y, recbase_y, recbase_y, orig_coeff_y
+      );
+      if (has_coeffs) {
+        cbf_set(&cur_cu->cbf.y, depth + pu_index);
       }
     }
   }
@@ -1185,21 +1084,26 @@ void encode_transform_tree(encoder_state * const encoder_state, int32_t x, int32
   // If luma is 4x4, do chroma for the 8x8 luma area when handling the top
   // left PU because the coordinates are correct.
   if (depth <= MAX_DEPTH || pu_index == 0) {
-    const int chroma_offset = lcu_px.x / 2 + lcu_px.y / 2 * LCU_WIDTH / 2;
+    const int chroma_offset = lcu_px.x / 2 + lcu_px.y / 2 * LCU_WIDTH_C;
     pixel *recbase_u = &lcu->rec.u[chroma_offset];
     pixel *recbase_v = &lcu->rec.v[chroma_offset];
     const pixel *base_u = &lcu->ref.u[chroma_offset];
     const pixel *base_v = &lcu->ref.v[chroma_offset];
     coefficient *orig_coeff_u = &lcu->coeff.u[chroma_offset];
     coefficient *orig_coeff_v = &lcu->coeff.v[chroma_offset];
+    coeff_scan_order_t scan_idx_chroma;
+    int tr_skip = 0;
+    int chroma_depth = (depth == MAX_PU_DEPTH ? depth - 1 : depth);
+    int chroma_width = LCU_WIDTH_C >> chroma_depth;
 
     if (cur_cu->intra[0].mode_chroma == 36) {
       cur_cu->intra[0].mode_chroma = cur_cu->intra[0].mode;
     }
-    if (quantize_residual_chroma(encoder_state, cur_cu, depth, COLOR_U, base_u, recbase_u, orig_coeff_u)) {
+    scan_idx_chroma = get_scan_order(cur_cu->type, cur_cu->intra[0].mode_chroma, depth);
+    if (quantize_residual(encoder_state, cur_cu, chroma_width, COLOR_U, scan_idx_chroma, tr_skip, LCU_WIDTH_C, LCU_WIDTH_C, base_u, recbase_u, recbase_u, orig_coeff_u)) {
       cbf_set(&cur_cu->cbf.u, depth);
     }
-    if (quantize_residual_chroma(encoder_state, cur_cu, depth, COLOR_V, base_v, recbase_v, orig_coeff_v)) {
+    if (quantize_residual(encoder_state, cur_cu, chroma_width, COLOR_V, scan_idx_chroma, tr_skip, LCU_WIDTH_C, LCU_WIDTH_C, base_v, recbase_v, recbase_v, orig_coeff_v)) {
       cbf_set(&cur_cu->cbf.v, depth);
     }
   }
diff --git a/src/transform.h b/src/transform.h
index f54f773c..9773dd12 100644
--- a/src/transform.h
+++ b/src/transform.h
@@ -35,7 +35,7 @@ extern const int16_t g_inv_quant_scales[6];
 
 
 void quant(const encoder_state *encoder_state, int16_t *coef, int16_t *q_coef, int32_t width,
-           int32_t height, uint32_t *ac_sum, int8_t type, int8_t scan_idx, int8_t block_type);
+           int32_t height, int8_t type, int8_t scan_idx, int8_t block_type);
 void dequant(const encoder_state *encoder_state, int16_t *q_coef, int16_t *coef, int32_t width, int32_t height,int8_t type, int8_t block_type);
 
 void transformskip(const encoder_control *encoder, int16_t *block,int16_t *coeff, int8_t block_size);