diff --git a/src/picture.c b/src/picture.c
index 9ac83270..4098c059 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -949,3 +949,20 @@ unsigned calc_sad(const picture *pic, const picture *ref,
     return interpolated_sad(pic, ref, pic_x, pic_y, ref_x, ref_y, block_width, block_height);
   }
 }
+
+/** SSD of two width x width blocks; ref_stride/rec_stride are row pitches in pixels. */
+unsigned calc_ssd(const pixel *const ref, const pixel *const rec,
+                 const int ref_stride, const int rec_stride,
+                 const int width)
+{
+  unsigned ssd = 0;  // Same type as the return value, so no sign conversion on return.
+  int y, x;
+  for (y = 0; y < width; ++y) {
+    for (x = 0; x < width; ++x) {
+      // diff is signed; its square is non-negative, so the cast below is lossless.
+      const int diff = ref[x + y * ref_stride] - rec[x + y * rec_stride];
+      ssd += (unsigned)(diff * diff);
+    }
+  }
+  return ssd;
+}
diff --git a/src/picture.h b/src/picture.h
index 461a1029..0b468ed1 100644
--- a/src/picture.h
+++ b/src/picture.h
@@ -271,6 +271,10 @@ unsigned calc_sad(const picture *pic, const picture *ref,
                   int pic_x, int pic_y, int ref_x, int ref_y,
                   int block_width, int block_height);
 
+/** Sum of squared differences between two width x width pixel blocks. */
+unsigned calc_ssd(const pixel *ref, const pixel *rec,
+                  int ref_stride, int rec_stride, int width);
+
 double image_psnr(pixel *frame1, pixel *frame2, int32_t x, int32_t y);
 
 
diff --git a/src/rdo.c b/src/rdo.c
index 39abd6f4..f9c2f76f 100644
--- a/src/rdo.c
+++ b/src/rdo.c
@@ -121,6 +121,7 @@ uint32_t rdo_cost_intra(encoder_state * const encoder_state, pixel *pred, pixel
     return cost;
 }
 
+
 /** Calculate actual (or really close to actual) bitcost for coding coefficients
  * \param coeff coefficient array
  * \param width coeff block width
diff --git a/src/transform.c b/src/transform.c
index fdb94ea8..714701a5 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -821,7 +821,7 @@ void dequant(const encoder_state * const encoder_state, int16_t *q_coef, int16_t
 int quantize_residual(encoder_state *const encoder_state,
                       const cu_info *const cur_cu, const int width, const color_index color,
                       const coeff_scan_order_t scan_order, const int use_trskip, 
-                      const int stride,
+                      const int in_stride, const int out_stride,
                       const pixel *const ref_in, const pixel *const pred_in, 
                       pixel *rec_out, coefficient *coeff_out)
 {
@@ -837,7 +837,7 @@ int quantize_residual(encoder_state *const encoder_state,
     int y, x;
     for (y = 0; y < width; ++y) {
       for (x = 0; x < width; ++x) {
-        residual[x + y * width] = (int16_t)(ref_in[x + y * stride] - pred_in[x + y * stride]);
+        residual[x + y * width] = (int16_t)(ref_in[x + y * in_stride] - pred_in[x + y * in_stride]);
       }
     }
   }
@@ -870,7 +870,7 @@ int quantize_residual(encoder_state *const encoder_state,
   }
 
   // Copy coefficients to coeff_out.
-  picture_blit_coeffs(quant_coeff, coeff_out, width, width, width, stride);
+  picture_blit_coeffs(quant_coeff, coeff_out, width, width, width, out_stride);
 
   // Do the inverse quantization and transformation and the reconstruction to
   // rec_out.
@@ -888,8 +888,8 @@ int quantize_residual(encoder_state *const encoder_state,
     // Get quantized reconstruction. (residual + pred_in -> rec_out)
     for (y = 0; y < width; ++y) {
       for (x = 0; x < width; ++x) {
-        int16_t val = residual[x + y * width] + pred_in[x + y * stride];
-        rec_out[x + y * stride] = (uint8_t)CLIP(0, 255, val);
+        int16_t val = residual[x + y * width] + pred_in[x + y * in_stride];
+        rec_out[x + y * out_stride] = (uint8_t)CLIP(0, 255, val);
       }
     }
   } else if (rec_out != pred_in) {
@@ -899,7 +899,7 @@ int quantize_residual(encoder_state *const encoder_state,
 
     for (y = 0; y < width; ++y) {
       for (x = 0; x < width; ++x) {
-        rec_out[x + y * stride] = pred_in[x + y * stride];
+        rec_out[x + y * out_stride] = pred_in[x + y * in_stride];
       }
     }
   }
@@ -1058,6 +1058,7 @@ void encode_transform_tree(encoder_state * const encoder_state, int32_t x, int32
       cbf_clear(&cur_cu->cbf.v, depth);
     }
 
+#if 0
     if (width == 4 && encoder_state->encoder_control->trskip_enable) {
       int16_t residual[4*4];
       int x, y;
@@ -1067,17 +1068,73 @@ void encode_transform_tree(encoder_state * const encoder_state, int32_t x, int32
         }
       }
       cur_cu->intra[pu_index].tr_skip = decide_trskip(encoder_state, cur_cu, depth, scan_idx_luma, residual);
+    } else {
+      cur_cu->intra[pu_index].tr_skip = 0;
     }
     {
       int has_coeffs = quantize_residual(
           encoder_state, cur_cu, width, COLOR_Y, scan_idx_luma,
-          cur_cu->intra[pu_index].tr_skip, LCU_WIDTH,
+          cur_cu->intra[pu_index].tr_skip, LCU_WIDTH, LCU_WIDTH,
           base_y, recbase_y, recbase_y, orig_coeff_y
       );
       if (has_coeffs) {
         cbf_set(&cur_cu->cbf.y, depth + pu_index);
       }
     }
+#else
+    if (width == 4 && encoder_state->encoder_control->trskip_enable) {
+      pixel no_trskip_rec[4*4];
+      pixel trskip_rec[4*4];
+      coefficient no_trskip_coeff[4*4];
+      coefficient trskip_coeff[4*4];
+      
+      unsigned no_trskip_cost = 0;
+      unsigned trskip_cost = 0;
+      int no_trskip_has_coeffs;
+      int trskip_has_coeffs;
+
+      no_trskip_has_coeffs = quantize_residual(
+          encoder_state, cur_cu, width, COLOR_Y, scan_idx_luma,
+          0, LCU_WIDTH, 4,
+          base_y, recbase_y, no_trskip_rec, no_trskip_coeff
+      );
+      no_trskip_cost += calc_ssd(base_y, no_trskip_rec, LCU_WIDTH, 4, 4);
+      no_trskip_cost += get_coeff_cost(encoder_state, no_trskip_coeff, 4, 0, scan_idx_luma) * (int32_t)(encoder_state->global->cur_lambda_cost+0.5);
+
+      trskip_has_coeffs = quantize_residual(
+          encoder_state, cur_cu, width, COLOR_Y, scan_idx_luma,
+          1, LCU_WIDTH, 4,
+          base_y, recbase_y, trskip_rec, trskip_coeff
+      );
+      trskip_cost += calc_ssd(base_y, trskip_rec, LCU_WIDTH, 4, 4);
+      trskip_cost += get_coeff_cost(encoder_state, trskip_coeff, 4, 0, scan_idx_luma) * (int32_t)(encoder_state->global->cur_lambda_cost+0.5);
+
+      if (no_trskip_cost <= trskip_cost) {
+        cur_cu->intra[pu_index].tr_skip = 0;
+        picture_blit_pixels(no_trskip_rec, recbase_y, width, width, 4, LCU_WIDTH);
+        picture_blit_coeffs(no_trskip_coeff, orig_coeff_y, width, width, 4, LCU_WIDTH);
+        if (no_trskip_has_coeffs) {
+          cbf_set(&cur_cu->cbf.y, depth + pu_index);
+        }
+      } else {
+        cur_cu->intra[pu_index].tr_skip = 1;
+        picture_blit_pixels(trskip_rec, recbase_y, width, width, 4, LCU_WIDTH);
+        picture_blit_coeffs(trskip_coeff, orig_coeff_y, width, width, 4, LCU_WIDTH);
+        if (trskip_has_coeffs) {
+          cbf_set(&cur_cu->cbf.y, depth + pu_index);
+        }
+      }
+    } else {
+      int has_coeffs = quantize_residual(
+          encoder_state, cur_cu, width, COLOR_Y, scan_idx_luma,
+          0, LCU_WIDTH, LCU_WIDTH,
+          base_y, recbase_y, recbase_y, orig_coeff_y
+      );
+      if (has_coeffs) {
+        cbf_set(&cur_cu->cbf.y, depth + pu_index);
+      }
+    }
+#endif
   }
 
   // If luma is 4x4, do chroma for the 8x8 luma area when handling the top
@@ -1099,10 +1156,10 @@ void encode_transform_tree(encoder_state * const encoder_state, int32_t x, int32
       cur_cu->intra[0].mode_chroma = cur_cu->intra[0].mode;
     }
     scan_idx_chroma = get_scan_order(cur_cu->type, cur_cu->intra[0].mode_chroma, depth);
-    if (quantize_residual(encoder_state, cur_cu, chroma_width, COLOR_U, scan_idx_chroma, tr_skip, LCU_WIDTH_C, base_u, recbase_u, recbase_u, orig_coeff_u)) {
+    if (quantize_residual(encoder_state, cur_cu, chroma_width, COLOR_U, scan_idx_chroma, tr_skip, LCU_WIDTH_C, LCU_WIDTH_C, base_u, recbase_u, recbase_u, orig_coeff_u)) {
       cbf_set(&cur_cu->cbf.u, depth);
     }
-    if (quantize_residual(encoder_state, cur_cu, chroma_width, COLOR_V, scan_idx_chroma, tr_skip, LCU_WIDTH_C, base_v, recbase_v, recbase_v, orig_coeff_v)) {
+    if (quantize_residual(encoder_state, cur_cu, chroma_width, COLOR_V, scan_idx_chroma, tr_skip, LCU_WIDTH_C, LCU_WIDTH_C, base_v, recbase_v, recbase_v, orig_coeff_v)) {
       cbf_set(&cur_cu->cbf.v, depth);
     }
   }