diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c
index aae28a7c..0ba7c189 100644
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@@ -34,6 +34,77 @@
 #include "strategyselector.h"
 #include "strategies/generic/picture-generic.h"
 
+/**
+ * \brief Calculate Sum of Absolute Differences (SAD)
+ *
+ * Calculate Sum of Absolute Differences (SAD) between two rectangular regions
+ * located at arbitrary points in the picture.
+ *
+ * \param data1   Starting point of the first picture.
+ * \param data2   Starting point of the second picture.
+ * \param width   Width of the region for which SAD is calculated.
+ * \param height  Height of the region for which SAD is calculated.
+ * \param stride1 Stride of the first picture.
+ * \param stride2 Stride of the second picture.
+ *
+ * \returns Sum of Absolute Differences
+ */
+uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const data2,
+                          const int width, const int height, const unsigned stride1, const unsigned stride2)
+{
+  int32_t y, x;
+
+  // Bytes per scanline that fit in whole 256-bit blocks, and the remainder
+  const int largeblock_bytes = width & ~31;
+  const int residual_bytes_1 = width &  31;
+  const int residual_xmms    = residual_bytes_1 >> 4;
+  const int residual_bytes   = residual_bytes_1 &  15;
+
+  const __m128i rds    = _mm_set1_epi8 (residual_bytes);
+  const __m128i ns     = _mm_setr_epi8 (0, 1, 2,  3,  4,  5,  6,  7,
+                                        8, 9, 10, 11, 12, 13, 14, 15);
+  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);
+
+  __m256i avx_inc = _mm256_setzero_si256();
+  __m128i sse_inc = _mm_setzero_si128();
+
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < largeblock_bytes; x += 32) {
+      __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y * stride1 + x)));
+      __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y * stride2 + x)));
+      __m256i curr_sads = _mm256_sad_epu8(a, b);
+      avx_inc = _mm256_add_epi64(avx_inc, curr_sads);
+    }
+    if (residual_xmms) {
+      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y * stride1 + x)));
+      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y * stride2 + x)));
+      __m128i curr_sads = _mm_sad_epu8(a, b);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+      x += 16;
+    }
+    if (residual_bytes) {
+      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y * stride1 + x)));
+      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y * stride2 + x)));
+
+      __m128i b_masked  = _mm_blendv_epi8(a, b, rdmask);
+      __m128i curr_sads = _mm_sad_epu8(a, b_masked);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+    }
+  }
+  __m256i avx_inc_2   = _mm256_permute4x64_epi64(avx_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i avx_inc_3   = _mm256_add_epi64        (avx_inc, avx_inc_2);
+  __m256i avx_inc_4   = _mm256_shuffle_epi32    (avx_inc_3, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i avx_inc_5   = _mm256_add_epi64        (avx_inc_3, avx_inc_4);
+  __m128i avx_inc_128 = _mm256_castsi256_si128  (avx_inc_5);
+
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sse_sads  = _mm_add_epi64    (sse_inc, sse_inc_2);
+  __m128i sads      = _mm_add_epi64    (sse_sads, avx_inc_128);
+
+  // 32 bits are always enough, even for the largest blocks with a SAD of 255
+  // in every pixel, although the accumulated SAD results themselves are 64 bits
+  return _mm_cvtsi128_si32(sads);
+}
 
 /**
  * \brief Calculate SAD for 8x8 bytes in continuous memory.
@@ -1230,6 +1301,11 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
   // simplest code to look at for anyone interested in doing more
   // optimizations, so it's worth it to keep this maintained.
   if (bitdepth == 8){
+
+    // It currently appears that this is actually slower than the SSE4.1
+    // version. Go figure.
+    success &= kvz_strategyselector_register(opaque, "reg_sad", "avx2", 19, &kvz_reg_sad_avx2);
+
     success &= kvz_strategyselector_register(opaque, "sad_8x8", "avx2", 40, &sad_8bit_8x8_avx2);
     success &= kvz_strategyselector_register(opaque, "sad_16x16", "avx2", 40, &sad_8bit_16x16_avx2);
     success &= kvz_strategyselector_register(opaque, "sad_32x32", "avx2", 40, &sad_8bit_32x32_avx2);
@@ -1250,7 +1326,7 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
     success &= kvz_strategyselector_register(opaque, "satd_any_size_quad", "avx2", 40, &satd_any_size_quad_avx2);
 
     success &= kvz_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2);
-    success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "avx2", 40, &inter_recon_bipred_avx2); 
+    success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "avx2", 40, &inter_recon_bipred_avx2);
   }
 #endif
 
diff --git a/src/strategies/sse41/picture-sse41.c b/src/strategies/sse41/picture-sse41.c
index d75b4d58..b5559e33 100644
--- a/src/strategies/sse41/picture-sse41.c
+++ b/src/strategies/sse41/picture-sse41.c
@@ -28,63 +28,43 @@
 #include "strategyselector.h"
 
-unsigned kvz_reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2,
-                           const int width, const int height, const unsigned stride1, const unsigned stride2)
+uint32_t kvz_reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2,
+                           const int32_t width, const int32_t height, const uint32_t stride1,
+                           const uint32_t stride2)
 {
-  int y, x;
-  unsigned sad = 0;
-  __m128i sse_inc = _mm_setzero_si128 ();
-  long long int sse_inc_array[2];
+  int32_t y, x;
+  __m128i sse_inc = _mm_setzero_si128();
+  // Bytes per scanline that fit in whole 128-bit blocks, and the remainder
+  const int32_t largeblock_bytes = width & ~15;
+  const int32_t residual_bytes   = width &  15;
+
+  const __m128i rds    = _mm_set1_epi8 (residual_bytes);
+  const __m128i ns     = _mm_setr_epi8 (0, 1, 2,  3,  4,  5,  6,  7,
+                                        8, 9, 10, 11, 12, 13, 14, 15);
+  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);
+
   for (y = 0; y < height; ++y) {
-    for (x = 0; x <= width-16; x+=16) {
-      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
-      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
-      sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a,b));
+    for (x = 0; x < largeblock_bytes; x += 16) {
+      __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
+      __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
+      __m128i curr_sads = _mm_sad_epu8(a, b);
+      sse_inc = _mm_add_epi32(sse_inc, curr_sads);
     }
-    {
-      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
-      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
-      switch (((width - (width%2)) - x)/2) {
-        case 0:
-          break;
-        case 1:
-          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x01)));
-          break;
-        case 2:
-          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x03)));
-          break;
-        case 3:
-          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x07)));
-          break;
-        case 4:
-          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x0f)));
-          break;
-        case 5:
-          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x1f)));
-          break;
-        case 6:
-          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x3f)));
-          break;
-        case 7:
-          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x7f)));
-          break;
-        default:
-          //Should not happen
-          assert(0);
-      }
-      x = (width - (width%2));
-    }
+    if (residual_bytes) {
+      __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
+      __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
 
-    for (; x < width; ++x) {
-      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
+      __m128i b_masked  = _mm_blendv_epi8(a, b, rdmask);
+      __m128i curr_sads = _mm_sad_epu8(a, b_masked);
+      sse_inc = _mm_add_epi32(sse_inc, curr_sads);
     }
   }
-  _mm_storeu_si128((__m128i*) sse_inc_array, sse_inc);
-  sad += sse_inc_array[0] + sse_inc_array[1];
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
 
-  return sad;
+  return _mm_cvtsi128_si32(sad);
 }
 
 #endif //COMPILE_INTEL_SSE41
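
A note on the residual-byte handling in both rewritten functions: the old compile-time _mm_blend_epi16 switch only covered even residual widths (16-bit pairs) and still needed a scalar loop for a final odd pixel. The new code instead builds one runtime byte mask by comparing the constant vector {0, 1, ..., 15} against residual_bytes; _mm_blendv_epi8(a, b, rdmask) then yields b in the first residual_bytes lanes and a in the rest, so the lanes past the residual compare equal and contribute nothing to _mm_sad_epu8. The standalone sketch below, which is not part of the patch and uses made-up names (scalar_sad, masked_tail_sad), checks that trick against a plain scalar SAD; it only assumes an SSE4.1-capable compiler (built with e.g. gcc -O2 -msse4.1).

/* Standalone sketch: verify the blendv-based tail masking against a scalar SAD.
 * Not part of the patch; function names here are purely illustrative. */
#include <assert.h>
#include <smmintrin.h>   /* SSE4.1: _mm_blendv_epi8 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Plain scalar reference: SAD over the first len bytes. */
static uint32_t scalar_sad(const uint8_t *a, const uint8_t *b, int len)
{
  uint32_t sum = 0;
  for (int i = 0; i < len; i++)
    sum += abs(a[i] - b[i]);
  return sum;
}

/* SAD over the first residual_bytes (< 16) bytes of a 16-byte load. Lanes past
 * the residual are taken from a instead of b, so they add zero to the SAD,
 * which is the same trick the patch uses for partial scanlines. */
static uint32_t masked_tail_sad(const uint8_t *data1, const uint8_t *data2,
                                int residual_bytes)
{
  const __m128i rds    = _mm_set1_epi8((char)residual_bytes);
  const __m128i ns     = _mm_setr_epi8(0, 1, 2,  3,  4,  5,  6,  7,
                                       8, 9, 10, 11, 12, 13, 14, 15);
  /* 0xff in lanes 0..residual_bytes-1, 0x00 elsewhere. */
  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);

  __m128i a        = _mm_loadu_si128((const __m128i *)data1);
  __m128i b        = _mm_loadu_si128((const __m128i *)data2);
  __m128i b_masked = _mm_blendv_epi8(a, b, rdmask);

  /* _mm_sad_epu8 leaves one partial sum in each 64-bit half; add the halves. */
  __m128i sads = _mm_sad_epu8(a, b_masked);
  __m128i swap = _mm_shuffle_epi32(sads, _MM_SHUFFLE(1, 0, 3, 2));
  return _mm_cvtsi128_si32(_mm_add_epi64(sads, swap));
}

int main(void)
{
  uint8_t buf1[16], buf2[16];
  for (int i = 0; i < 16; i++) {
    buf1[i] = (uint8_t)(rand() & 0xff);
    buf2[i] = (uint8_t)(rand() & 0xff);
  }
  for (int tail = 0; tail < 16; tail++)
    assert(masked_tail_sad(buf1, buf2, tail) == scalar_sad(buf1, buf2, tail));
  puts("masked tail SAD matches scalar reference");
  return 0;
}

Because the mask depends only on the region width, both patched functions hoist rds, ns and rdmask out of the row loop and reuse the same mask for every scanline.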