diff --git a/src/filter.c b/src/filter.c
index d96db710..7ff98ea2 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -268,6 +268,14 @@ static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir
   return (qp_p + qp_q + 1) >> 1;
 }
 
+static INLINE void gather_deblock_pixels(const kvz_pixel *src, int stride, int offset, kvz_pixel result[2][8])
+{
+  for (int i = 0; i < 8; ++i) {
+    result[0][i] = src[(i - 4) * stride];
+    result[1][i] = src[(i - 4) * stride + offset];
+  }
+}
+
 /**
  * \brief Apply the deblocking filter to luma pixels on a single edge.
  *
@@ -331,8 +339,6 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
 
     // For each 4-pixel part in the edge
     for (uint32_t block_idx = 0; block_idx < num_4px_parts; ++block_idx) {
-      int32_t dp0, dq0, dp3, dq3, d0, d3, dp, dq, d;
-
       {
         // CUs on both sides of the edge
         cu_info_t *cu_p;
@@ -432,30 +438,36 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
         tc              = kvz_g_tc_table_8x8[tc_index] * bitdepth_scale;
         thr_cut         = tc * 10;
       }
-      if(!strength) continue;
-      // Check conditions for filtering
-      // TODO: Get rid of these inline defines.
-      #define calc_DP(s,o) abs( (int16_t)s[-o*3] - (int16_t)2*s[-o*2] + (int16_t)s[-o] )
-      #define calc_DQ(s,o) abs( (int16_t)s[0]    - (int16_t)2*s[o]    + (int16_t)s[o*2] )
 
-      dp0 = calc_DP((src+step*(block_idx*4+0)), offset);
-      dq0 = calc_DQ((src+step*(block_idx*4+0)), offset);
-      dp3 = calc_DP((src+step*(block_idx*4+3)), offset);
-      dq3 = calc_DQ((src+step*(block_idx*4+3)), offset);
-      d0 = dp0 + dq0;
-      d3 = dp3 + dq3;
-      dp = dp0 + dp3;
-      dq = dq0 + dq3;
-      d  =  d0 + d3;
+      if (strength == 0) continue;
 
-      if (d < beta) {
+      // Gather the 6 pixels from each line required for the filter on/off
+      // decision. Include 2 more for the weak/strong filtering decision.
+      kvz_pixel b[2][8];
+      if (dir == EDGE_VER) {
+        gather_deblock_pixels(&src[block_idx * 4 * step], 1, 3 * stride, b);
+      } else {
+        gather_deblock_pixels(&src[block_idx * 4 * step], stride, 3, b);
+      }
+
+      int_fast32_t dp0 = abs(b[0][1] - 2 * b[0][2] + b[0][3]);
+      int_fast32_t dq0 = abs(b[0][4] - 2 * b[0][5] + b[0][6]);
+      int_fast32_t dp3 = abs(b[1][1] - 2 * b[1][2] + b[1][3]);
+      int_fast32_t dq3 = abs(b[1][4] - 2 * b[1][5] + b[1][6]);
+      int_fast32_t dp = dp0 + dp3;
+      int_fast32_t dq = dq0 + dq3;
+
+      if (dp + dq < beta) {
         int8_t filter_P = (dp < side_threshold) ? 1 : 0;
         int8_t filter_Q = (dq < side_threshold) ? 1 : 0;
 
         // Strong filtering flag checking
-        #define useStrongFiltering(o,d,s) ( ((abs(s[-o*4]-s[-o]) + abs(s[o*3]-s[0])) < (beta>>3)) && (d<(beta>>2)) && ( abs(s[-o]-s[0]) < ((tc*5+1)>>1)) )
-        int8_t sw = useStrongFiltering(offset, 2*d0, (src+step*(block_idx*4+0))) &&
-                    useStrongFiltering(offset, 2*d3, (src+step*(block_idx*4+3)));
+        int8_t sw = 2 * (dp0 + dq0) < beta >> 2 &&
+                    2 * (dp3 + dq3) < beta >> 2 &&
+                    abs(b[0][3] - b[0][4]) < (5 * tc + 1) >> 1 &&
+                    abs(b[1][3] - b[1][4]) < (5 * tc + 1) >> 1 &&
+                    abs(b[0][0] - b[0][3]) + abs(b[0][4] - b[0][7]) < beta >> 3 &&
+                    abs(b[1][0] - b[1][3]) + abs(b[1][4] - b[1][7]) < beta >> 3;
 
         // Filter four rows/columns
         for (int i = 0; i < 4; i++) {