diff --git a/src/Makefile b/src/Makefile
index b7a63518..b2353db0 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -5,7 +5,7 @@ ifeq (, $(ARCH))
endif
SYSTEM = $(shell uname -s)
ASMFLAGS =
-DFLAGS = -O2 -g -Werror -DUSE_TILES=1
+DFLAGS = -O2 -g -Werror -DUSE_TILES=1 -march=native
# ARCH related flags
ifeq ($(ARCH), x86_64)
diff --git a/src/picture-sse2.c b/src/picture-sse2.c
new file mode 100644
index 00000000..f15c259c
--- /dev/null
+++ b/src/picture-sse2.c
@@ -0,0 +1,90 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/*
+ * \file picture-sse2.c
+ */
+
+#include "picture.h"
+#include <immintrin.h>
+
+#ifdef __SSE2__
+static unsigned reg_sad(const pixel * const data1, const pixel * const data2, // SSE2 SAD: sum of |data1[i]-data2[i]| over a width x height block
+ const int width, const int height, const unsigned stride1, const unsigned stride2)
+{
+ int y, x;
+ unsigned sad = 0; // scalar accumulator for tail pixels
+ __m128i sse_inc = _mm_setzero_si128 (); // vector accumulator: two 64-bit partial sums (PSADBW lanes)
+ long long int sse_inc_array[2]; // staging buffer for extracting the two 64-bit lanes at the end
+
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x <= width-16; x+=16) { // main loop: 16 pixels per iteration
+ const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
+ const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
+ sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a,b)); // PSADBW: abs-diff sums into two 64-bit halves
+ }
+
+#ifdef __SSE4_1__
+ { // SSE4.1 tail: handle the remaining (even) 0..14 pixels with one masked SAD
+ const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]); // NOTE(review): unconditionally reads 16 bytes past x, i.e. beyond the block when width-x < 16 — assumes readable padding after each row; confirm buffer allocation guarantees this
+ const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]); // NOTE(review): same over-read concern as above
+ switch (((width - (width%2)) - x)/2) { // count of remaining even pixels / 2 = number of 16-bit words to blend
+ case 0:
+ break; // no even tail pixels; nothing to add
+ case 1:
+ sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x01))); // blend copies b's low k words into a, so SAD is zero in all other lanes
+ break;
+ case 2:
+ sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x03)));
+ break;
+ case 3:
+ sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x07)));
+ break;
+ case 4:
+ sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x0f)));
+ break;
+ case 5:
+ sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x1f)));
+ break;
+ case 6:
+ sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x3f)));
+ break;
+ case 7:
+ sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x7f)));
+ break;
+ default:
+ // unreachable: remainder of a 16-wide loop over an even count is at most 14 pixels (7 words)
+ assert(0);
+ }
+ x = (width - (width%2)); // advance past the blended pixels; a single odd pixel may remain
+ }
+#endif //__SSE4_1__
+
+ for (; x < width; ++x) { // scalar fallback: full tail without SSE4.1, or the one odd pixel with it
+ sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
+ }
+ }
+ _mm_storeu_si128((__m128i*) sse_inc_array, sse_inc); // spill the two 64-bit lane sums
+ sad += sse_inc_array[0] + sse_inc_array[1]; // horizontal reduction into the scalar total
+
+ return sad;
+}
+#else
+#error picture-sse2.c requires __SSE2__
+#endif //__SSE2__
diff --git a/src/picture.c b/src/picture.c
index 87e6fc13..8b00cb61 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -768,6 +768,13 @@ static unsigned hor_sad(const pixel *pic_data, const pixel *ref_data,
return sad;
}
+
+#if defined(__SSE2__)
+#include "picture-sse2.c"
+#elif defined(__ALTIVEC__)
+#include "picture-altivec.c"
+#else
+//Generic implementations
/**
* \brief Calculate Sum of Absolute Differences (SAD)
*
@@ -782,20 +789,21 @@ static unsigned hor_sad(const pixel *pic_data, const pixel *ref_data,
*
* \returns Sum of Absolute Differences
*/
-static unsigned reg_sad(const pixel *data1, const pixel *data2,
- int width, int height, unsigned stride1, unsigned stride2)
+static unsigned reg_sad(const pixel * const data1, const pixel * const data2, // const-qualified to match the SIMD variants' signature
+ const int width, const int height, const unsigned stride1, const unsigned stride2)
 {
 int y, x;
 unsigned sad = 0;
 for (y = 0; y < height; ++y) {
 for (x = 0; x < width; ++x) {
- sad += abs((int)data1[y * stride1 + x] - (int)data2[y * stride2 + x]);
+ sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]); // casts dropped: pixel operands promote to int before subtraction anyway
 }
 }
 return sad;
 }
+#endif
/**
* \brief Handle special cases of comparing blocks that are not completely