diff --git a/src/encoderstate.c b/src/encoderstate.c
index 4897df15..11f544d0 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -1613,12 +1613,10 @@ void kvz_encode_coeff_nxn(encoder_state_t * const state, coeff_t *coeff, uint8_t
   uint8_t last_coeff_x = 0;
   uint8_t last_coeff_y = 0;
   int32_t i;
-  uint32_t sig_coeffgroup_flag[64];
+  uint32_t sig_coeffgroup_flag[8 * 8] = { 0 };
 
   uint32_t num_nonzero = 0;
-  int32_t scan_pos_last = -1;
   int32_t pos_last = 0;
-  int32_t shift   = 4>>1;
   int8_t be_valid = encoder->sign_hiding;
   int32_t scan_pos_sig;
   int32_t last_scan_set;
@@ -1626,7 +1624,7 @@ void kvz_encode_coeff_nxn(encoder_state_t * const state, coeff_t *coeff, uint8_t
   uint32_t blk_pos, pos_y, pos_x, sig, ctx_sig;
 
   // CONSTANTS
-  const uint32_t num_blk_side    = width >> shift;
+  const uint32_t num_blk_side    = width >> 2;
   const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2;
   const uint32_t *scan           =
     kvz_g_sig_last_scan[scan_mode][log2_block_size - 1];
@@ -1636,35 +1634,42 @@ void kvz_encode_coeff_nxn(encoder_state_t * const state, coeff_t *coeff, uint8_t
   cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_model[type]);
   cabac_ctx_t *baseCtx           = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0]) :
                                  &(cabac->ctx.cu_sig_model_chroma[0]);
-  FILL(sig_coeffgroup_flag, 0);
 
-  // Count non-zero coeffs
-  for (i = 0; i < width * width; i+=4) {
-
-    // Load 4 coeffs
-    uint64_t packed = *(uint64_t*)(&coeff[i]);
-    // Or bits from upper byte to lower
-    packed |= packed >> 8;
-    // Zero upper byte for overflow
-    packed &= 0x00FF00FF00FF00FF;
-    // Any bits in lower byte results in overflow
-    packed += 0x00FF00FF00FF00FF;
-    // Pick only overflow bits, overflow means there were bits in that coeff
-    packed &= 0x0100010001000100;
-    // Add bits of two coeffs via possible overflow
-    packed += 0x00FFFF0000FFFF00;
-    // Preserve only the two numbers of nonzero coeffs
-    packed &= ~0x00FFFF0000FFFF00;
-    // Add these numbers
-    packed += (packed << 32);
-    // Shift to the right position and discard extra bits
-    packed >>= 56;
-
-    num_nonzero += packed;
+  // Scan all coeff groups to find out which of them have coeffs.
+  // Populate sig_coeffgroup_flag with that info.
+  unsigned sig_cg_cnt = 0;
+  for (int cg_y = 0; cg_y < width / 4; ++cg_y) {
+    for (int cg_x = 0; cg_x < width / 4; ++cg_x) {
+      unsigned cg_pos = cg_y * width * 4 + cg_x * 4;
+      for (int coeff_row = 0; coeff_row < 4; ++coeff_row) {
+        // Load four 16-bit coeffs and see if any of them are non-zero.
+        unsigned coeff_pos = cg_pos + coeff_row * width;
+        uint64_t four_coeffs = *(uint64_t*)(&coeff[coeff_pos]);
+        if (four_coeffs) {
+          ++sig_cg_cnt;
+          unsigned cg_pos_y = cg_pos >> log2_block_size;
+          unsigned cg_pos_x = cg_pos - (cg_pos_y << log2_block_size);
+          sig_coeffgroup_flag[(cg_pos_x >> 2) + (cg_pos_y >> 2) * num_blk_side] = 1;
+          break;
+        }
+      }
+    }
   }
 
-  // Transforms with no non-zero coefficients are indicated with CBFs.
-  assert(num_nonzero != 0);
+  // Rest of the code assumes at least one non-zero coeff.
+  assert(sig_cg_cnt > 0);
+
+  // Find the last coeff group by going backwards in scan order.
+  unsigned scan_cg_last = num_blk_side * num_blk_side - 1;
+  while (!sig_coeffgroup_flag[scan_cg[scan_cg_last]]) {
+    --scan_cg_last;
+  }
+
+  // Find the last coeff by going backwards in scan order.
+  unsigned scan_coeff_last = scan_cg_last * 16 + 15;
+  while (!coeff[scan[scan_coeff_last]]) {
+    --scan_coeff_last;
+  }
 
   // transform skip flag
   if(width == 4 && encoder->trskip_enable) {
@@ -1672,23 +1677,8 @@ void kvz_encode_coeff_nxn(encoder_state_t * const state, coeff_t *coeff, uint8_t
     CABAC_BIN(cabac, tr_skip, "transform_skip_flag");
   }
 
-  scan_pos_last = -1;
-
-  // Significance mapping
-  while (num_nonzero > 0) {
-    pos_last = scan[++scan_pos_last];
-#define POSY (pos_last >> log2_block_size)
-#define POSX (pos_last - ( POSY << log2_block_size ))
-
-    if (coeff[pos_last] != 0) {
-      sig_coeffgroup_flag[(num_blk_side * (POSY >> shift) + (POSX >> shift))] = 1;
-    }
-
-    num_nonzero -= (coeff[pos_last] != 0) ? 1 : 0;
-    #undef POSY
-    #undef POSX
-  }
-
+  // Position of the last non-zero coeff within the coeff array.
+  pos_last = scan[scan_coeff_last];
   last_coeff_x = pos_last & (width - 1);
   last_coeff_y = (uint8_t)(pos_last >> log2_block_size);
 
@@ -1696,8 +1686,8 @@ void kvz_encode_coeff_nxn(encoder_state_t * const state, coeff_t *coeff, uint8_t
   kvz_encode_last_significant_xy(state, last_coeff_x, last_coeff_y, width, width,
                              type, scan_mode);
 
-  scan_pos_sig  = scan_pos_last;
-  last_scan_set = (scan_pos_last >> 4);
+  scan_pos_sig  = scan_coeff_last;
+  last_scan_set = scan_cg_last;
 
   // significant_coeff_flag
   for (i = last_scan_set; i >= 0; i--) {
@@ -1713,7 +1703,7 @@ void kvz_encode_coeff_nxn(encoder_state_t * const state, coeff_t *coeff, uint8_t
     int32_t num_non_zero = 0;
     go_rice_param = 0;
 
-    if (scan_pos_sig == scan_pos_last) {
+    if (scan_pos_sig == scan_coeff_last) {
       abs_coeff[0] = abs(coeff[pos_last]);
       coeff_signs  = (coeff[pos_last] < 0);
       num_non_zero = 1;