Added AVX strategy

Ari Lemmetti 2014-06-16 18:14:11 +03:00
parent ab3845c9c7
commit bdef5384ef
4 changed files with 823 additions and 0 deletions


@@ -120,6 +120,9 @@
<ClCompile Include="..\..\src\strategies\strategies-picture.c">
<Filter>Source Files\strategies</Filter>
</ClCompile>
<ClCompile Include="..\..\src\strategies\picture\picture-avx.c">
<Filter>Source Files\strategies\picture</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\src\global.h">
@@ -236,10 +239,19 @@
<ClInclude Include="..\..\src\strategies\nal\nal-generic.c">
<Filter>Source Files\strategies\nal</Filter>
</ClInclude>
<ClInclude Include="..\..\src\x86\picture_x86.h">
<Filter>Header Files\x86</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<YASM Include="..\..\src\x86\cpu.asm">
<Filter>Source Files\x86</Filter>
</YASM>
<YASM Include="..\..\src\x86\picture_x86.asm">
<Filter>Source Files\x86</Filter>
</YASM>
<YASM Include="..\..\src\x86\x86inc.asm">
<Filter>Source Files\x86</Filter>
</YASM>
</ItemGroup>
</Project>

src/strategies/picture/picture-avx.c Normal file (152 lines added)

@@ -0,0 +1,152 @@
/*****************************************************************************
* This file is part of Kvazaar HEVC encoder.
*
* Copyright (C) 2013-2014 Tampere University of Technology and others (see
* COPYING file).
*
* Kvazaar is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation.
*
* Kvazaar is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
****************************************************************************/
/*
 * \file picture-avx.c
 * \brief AVX strategies for SAD and SATD calculation.
 */

#include "../strategyselector.h"
//#include "../picture.h"
#include "../x86/picture_x86.h"

#include <stdlib.h> // abs() used by kvz_sad_generic
#include <math.h>

#ifdef __GNUC__
__attribute__((__target__("avx")))
#endif
static unsigned kvz_sad_32x32(const pixel *data1, const pixel *data2)
{
  // A packed 32x32 block is 1024 consecutive bytes; each kvz_sad_16x16 call
  // covers 256 of them (8 rows of the 32-pixel-wide block).
  unsigned sad = 0;
  sad += kvz_sad_16x16(data1, data2);
  sad += kvz_sad_16x16(data1 + 8 * 32, data2 + 8 * 32);
  sad += kvz_sad_16x16(data1 + 16 * 32, data2 + 16 * 32);
  sad += kvz_sad_16x16(data1 + 24 * 32, data2 + 24 * 32);
  return sad;
}

static unsigned kvz_sad_32x32_stride(const pixel *data1, const pixel *data2, unsigned stride)
{
  // Split the 32x32 area into four 16x16 quadrants inside the strided frame.
  unsigned sad = 0;
  sad += kvz_sad_16x16_stride(data1, data2, stride);
  sad += kvz_sad_16x16_stride(data1 + 16, data2 + 16, stride);
  sad += kvz_sad_16x16_stride(data1 + 16 * stride, data2 + 16 * stride, stride);
  sad += kvz_sad_16x16_stride(data1 + 16 * stride + 16, data2 + 16 * stride + 16, stride);
  return sad;
}

static unsigned kvz_sad_64x64(const pixel *data1, const pixel *data2)
{
  // A packed 64x64 block is 4096 consecutive bytes; each kvz_sad_32x32 call
  // covers 1024 of them (16 rows of the 64-pixel-wide block).
  unsigned sad = 0;
  sad += kvz_sad_32x32(data1, data2);
  sad += kvz_sad_32x32(data1 + 16 * 64, data2 + 16 * 64);
  sad += kvz_sad_32x32(data1 + 32 * 64, data2 + 32 * 64);
  sad += kvz_sad_32x32(data1 + 48 * 64, data2 + 48 * 64);
  return sad;
}

static unsigned kvz_sad_64x64_stride(const pixel *data1, const pixel *data2, unsigned stride)
{
  // Split the 64x64 area into four 32x32 quadrants inside the strided frame.
  unsigned sad = 0;
  sad += kvz_sad_32x32_stride(data1, data2, stride);
  sad += kvz_sad_32x32_stride(data1 + 32, data2 + 32, stride);
  sad += kvz_sad_32x32_stride(data1 + 32 * stride, data2 + 32 * stride, stride);
  sad += kvz_sad_32x32_stride(data1 + 32 * stride + 32, data2 + 32 * stride + 32, stride);
  return sad;
}

static unsigned kvz_sad_generic(const pixel * const data1, const pixel * const data2,
                                const int width, const int height,
                                const unsigned stride1, const unsigned stride2)
{
  // Plain C fallback for block sizes that have no dedicated assembly routine.
  int y, x;
  unsigned sad = 0;

  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
    }
  }

  return sad;
}

static unsigned reg_sad_avx(const pixel * const data1, const pixel * const data2,
                            const int width, const int height,
                            const unsigned stride1, const unsigned stride2)
{
  // Note: the fixed-size fast paths pass stride1 for both buffers.
  if (width == 4 && height == 4) {
    return kvz_sad_4x4_stride(data1, data2, stride1);
  } else if (width == 8 && height == 8) {
    return kvz_sad_8x8_stride(data1, data2, stride1);
  } else if (width == 16 && height == 16) {
    return kvz_sad_16x16_stride(data1, data2, stride1);
  } else if (width == 32 && height == 32) {
    return kvz_sad_32x32_stride(data1, data2, stride1);
  } else if (width == 64 && height == 64) {
    return kvz_sad_64x64_stride(data1, data2, stride1);
  } else {
    return kvz_sad_generic(data1, data2, width, height, stride1, stride2);
  }
}

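/*
 * Usage sketch (hedged, with made-up buffer names; in practice reg_sad_avx is
 * reached through the registered "reg_sad" strategy pointer rather than being
 * called directly):
 *
 *   pixel cur[64 * 64], ref[64 * 64];
 *   // ... fill both 64-pixel-wide frames ...
 *   unsigned cost = reg_sad_avx(cur, ref, 32, 32, 64, 64);
 *   // A 32x32 block with stride 64 takes the kvz_sad_32x32_stride fast path.
 */
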
// Function macro for defining hadamard calculating functions
// for fixed size blocks. They calculate hadamard for integer
// multiples of 8x8 with the 8x8 hadamard function.
#define KVZ_SATD_NXN(n, pixel_type, suffix) \
static unsigned kvz_satd_ ## suffix ## _ ## n ## x ## n ## _stride( \
    const pixel_type * const block1, const pixel_type * const block2) \
{ \
  unsigned x, y; \
  unsigned sum = 0; \
  for (y = 0; y < (n); y += 8) { \
    unsigned row = y * (n); \
    for (x = 0; x < (n); x += 8) { \
      sum += kvz_satd_8x8_stride(&block1[row + x], (n), &block2[row + x], (n)); \
    } \
  } \
  return sum; \
}

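/*
 * For reference, KVZ_SATD_NXN(16, pixel, 8bit) below effectively expands to:
 *
 *   static unsigned kvz_satd_8bit_16x16_stride(
 *       const pixel * const block1, const pixel * const block2)
 *   {
 *     unsigned x, y;
 *     unsigned sum = 0;
 *     for (y = 0; y < 16; y += 8) {
 *       unsigned row = y * 16;
 *       for (x = 0; x < 16; x += 8) {
 *         sum += kvz_satd_8x8_stride(&block1[row + x], 16, &block2[row + x], 16);
 *       }
 *     }
 *     return sum;
 *   }
 */
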
// Declare these functions to make sure the signature of the macro matches.
static cost_pixel_nxn_func kvz_satd_8bit_16x16_stride;
static cost_pixel_nxn_func kvz_satd_8bit_32x32_stride;
static cost_pixel_nxn_func kvz_satd_8bit_64x64_stride;

// These macros define kvz_satd_8bit_NxN_stride for N = 16, 32, 64.
KVZ_SATD_NXN(16, pixel, 8bit)
KVZ_SATD_NXN(32, pixel, 8bit)
KVZ_SATD_NXN(64, pixel, 8bit)

static int strategy_register_picture_avx(void* opaque) {
  bool success = true;

  success &= strategyselector_register(opaque, "reg_sad", "avx", 30, &reg_sad_avx);

  success &= strategyselector_register(opaque, "sad_8bit_4x4", "avx", 30, &kvz_sad_4x4);
  success &= strategyselector_register(opaque, "sad_8bit_8x8", "avx", 30, &kvz_sad_8x8);
  success &= strategyselector_register(opaque, "sad_8bit_16x16", "avx", 30, &kvz_sad_16x16);
  success &= strategyselector_register(opaque, "sad_8bit_32x32", "avx", 30, &kvz_sad_32x32);
  success &= strategyselector_register(opaque, "sad_8bit_64x64", "avx", 30, &kvz_sad_64x64);

  success &= strategyselector_register(opaque, "satd_8bit_4x4", "avx", 30, &kvz_satd_4x4);
  success &= strategyselector_register(opaque, "satd_8bit_8x8", "avx", 30, &kvz_satd_8x8_stride);
  success &= strategyselector_register(opaque, "satd_8bit_16x16", "avx", 30, &kvz_satd_8bit_16x16_stride);
  success &= strategyselector_register(opaque, "satd_8bit_32x32", "avx", 30, &kvz_satd_8bit_32x32_stride);
  success &= strategyselector_register(opaque, "satd_8bit_64x64", "avx", 30, &kvz_satd_8bit_64x64_stride);

  return success;
}

src/x86/picture_x86.asm Normal file (620 lines added)

@@ -0,0 +1,620 @@
;/*****************************************************************************
;* This file is part of Kvazaar HEVC encoder.
;*
;* Copyright (C) 2013-2014 Tampere University of Technology and others (see
;* COPYING file).
;*
;* Kvazaar is free software: you can redistribute it and/or modify
;* it under the terms of the GNU General Public License version 2 as published
;* by the Free Software Foundation.
;*
;* Kvazaar is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
;****************************************************************************/
%include "x86inc.asm"
SECTION .text
;KVZ_SAD_4X4
;Calculates SAD of the 16 consecutive bytes in memory
;r0 address of the first value (current)
;r1 address of the first value (reference)
cglobal sad_4x4, 2, 2, 2
vmovdqu m0, [r0]
vmovdqu m1, [r1]
vpsadbw m0, m1
vmovhlps m1, m0
vpaddw m0, m1
vmovd eax, m0
RET
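; For reference, a scalar C equivalent of the routine above (hedged sketch;
; vpsadbw produces two partial sums, one per 8-byte half, which the
; vmovhlps/vpaddw pair folds together before vmovd extracts the total):
;   unsigned sad_4x4_ref(const uint8_t *cur, const uint8_t *ref) {
;     unsigned sad = 0;
;     for (int i = 0; i < 16; ++i) sad += abs(cur[i] - ref[i]);
;     return sad;
;   }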
;KVZ_SAD_4X4_STRIDE
;Calculates SAD of a 4x4 block inside a frame with stride
;r0 address of the first value(current)
;r1 address of the first value(reference)
;r2 stride
cglobal sad_4x4_stride, 3, 3, 2
vpinsrd m0, [r0], 0
add r0, r2
vpinsrd m0, [r0], 1
vpinsrd m0, [r0+r2], 2
vpinsrd m0, [r0+r2*2], 3
vpinsrd m1, [r1], 0
add r1, r2
vpinsrd m1, [r1], 1
vpinsrd m1, [r1+r2], 2
vpinsrd m1, [r1+r2*2], 3
vpsadbw m0, m1
vmovhlps m1, m0
vpaddw m0, m1
vmovd eax, m0
RET
;KVZ_SAD_8X8
;Calculates SAD of the 64 consecutive bytes in memory
;r0 address of the first value (current)
;r1 address of the first value (reference)
cglobal sad_8x8, 2, 2, 5
vpxor m0, m0
%rep 2
vmovdqu m1, [r0]
vmovdqu m3, [r0+16]
add r0, 32
vmovdqu m2, [r1]
vmovdqu m4, [r1+16]
add r1, 32
vpsadbw m1, m2
vpsadbw m3, m4
vpaddw m0, m1
vpaddw m0, m3
%endrep
vmovhlps m1, m0
vpaddw m0, m1
vmovd eax, m0
RET
;KVZ_SAD_8X8_STRIDE
;Calculates SAD of an 8x8 block inside a frame with stride
;r0 address of the first value (current)
;r1 address of the first value (reference)
;r2 stride
cglobal sad_8x8_stride, 3, 3, 5
vpxor m0, m0
vmovhpd m1, [r0]
add r0, r2
vmovlpd m1, [r0]
vmovhpd m3, [r0+r2]
vmovlpd m3, [r0+r2*2]
lea r0, [r0+r2*2]
add r0, r2
vmovhpd m2, [r1]
add r1, r2
vmovlpd m2, [r1]
vmovhpd m4, [r1+r2]
vmovlpd m4, [r1+r2*2]
lea r1, [r1+r2*2]
add r1, r2
vpsadbw m1, m2
vpsadbw m3, m4
vpaddw m0, m1
vpaddw m0, m3
vmovhpd m1, [r0]
add r0, r2
vmovlpd m1, [r0]
vmovhpd m3, [r0+r2]
vmovlpd m3, [r0+r2*2]
lea r0, [r0+r2*2]
add r0, r2
vmovhpd m2, [r1]
add r1, r2
vmovlpd m2, [r1]
vmovhpd m4, [r1+r2]
vmovlpd m4, [r1+r2*2]
lea r1, [r1+r2*2]
add r1, r2
vpsadbw m1, m2
vpsadbw m3, m4
vpaddw m0, m1
vpaddw m0, m3
vmovhlps m1, m0
vpaddw m0, m1
vmovd eax, m0
RET
;KVZ_SAD_16X16
;Calculates SAD of the 256 consecutive bytes in memory
;r0 address of the first value (current)
;r1 address of the first value (reference)
cglobal sad_16x16, 2, 2, 5
vpxor m4, m4
%rep 8
; Load 2 rows from rec_buf to m0 and m2
vmovdqu m0, [r0]
vmovdqu m2, [r0 + 16]
add r0, 32
; Load 2 rows from ref_buf to m1 and m3
vmovdqu m1, [r1]
vmovdqu m3, [r1 + 16]
add r1, 32
vpsadbw m0, m1
vpsadbw m2, m3
vpaddw m4, m0
vpaddw m4, m2
%endrep
vmovhlps m0, m4
vpaddw m4, m0
vmovd eax, m4
RET
;KVZ_SAD_16X16_STRIDE
;Calculates SAD of a 16x16 block inside a frame with stride
;r0 address of the first value(current)
;r1 address of the first value(reference)
;r2 stride
cglobal sad_16x16_stride, 3, 3, 5
vpxor m4, m4
%rep 8
; Load 2 rows from rec_buf to m0 and m2
vmovdqu m0, [r0]
vmovdqu m2, [r0 + r2]
lea r0, [r0 + r2*2]
; Load 2 rows from ref_buf to m1 and m3
vmovdqu m1, [r1]
vmovdqu m3, [r1 + r2]
lea r1, [r1 + r2*2]
vpsadbw m0, m1
vpsadbw m2, m3
vpaddw m4, m0
vpaddw m4, m2
%endrep
vmovhlps m0, m4
vpaddw m4, m0
vmovd eax, m4
RET
;KVZ_SATD_4X4
;Calculates SATD of the 16 consecutive bytes in memory
;r0 address of the first value (current)
;r1 address of the first value (reference)
cglobal satd_4x4, 2, 2, 6
vpmovzxbw m0, [r0]
vpmovzxbw m2, [r1]
vpsubw m0, m2
vpmovzxbw m1, [r0+8]
vpmovzxbw m3, [r1+8]
vpsubw m1, m3
;Horizontal phase
;rows 1-2
vphaddw m4, m0, m1
vphsubw m5, m0, m1
vphaddw m0, m4, m5
vphsubw m1, m4, m5
;Vertical phase
vphaddw m4, m0, m1
vphsubw m5, m0, m1
vphaddw m0, m4, m5
vphsubw m1, m4, m5
vpabsw m0, m0
vpabsw m1, m1
vpaddw m0, m1
vphaddw m0, m0
vphaddw m0, m0
vphaddw m0, m0
vpextrw eax, m0, 0
;Uncomment if transformed values not divided elsewhere
;add eax, 1
;shr eax, 1
RET
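; For reference, a scalar C sketch of the SATD computed above (hedged; each
; vphaddw/vphsubw pair is one butterfly stage (a+b, a-b), two stages per
; direction give the 4x4 Hadamard transform of the difference block, and the
; coefficient order differs from the SIMD code, which does not change the
; absolute sum):
;   unsigned satd_4x4_ref(const uint8_t *a, const uint8_t *b) {
;     int d[16], t[16];
;     unsigned sum = 0;
;     for (int i = 0; i < 16; ++i) d[i] = a[i] - b[i];
;     for (int i = 0; i < 4; ++i) {   /* horizontal Hadamard per row */
;       int s0 = d[4*i+0] + d[4*i+1], s1 = d[4*i+0] - d[4*i+1];
;       int s2 = d[4*i+2] + d[4*i+3], s3 = d[4*i+2] - d[4*i+3];
;       t[4*i+0] = s0 + s2; t[4*i+1] = s1 + s3;
;       t[4*i+2] = s0 - s2; t[4*i+3] = s1 - s3;
;     }
;     for (int i = 0; i < 4; ++i) {   /* vertical Hadamard per column, then sum */
;       int s0 = t[i] + t[4+i],    s1 = t[i] - t[4+i];
;       int s2 = t[8+i] + t[12+i], s3 = t[8+i] - t[12+i];
;       sum += abs(s0 + s2) + abs(s1 + s3) + abs(s0 - s2) + abs(s1 - s3);
;     }
;     return sum;   /* like the assembly, no final division by 2 here */
;   }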
;KVZ_SATD_8X8_STRIDE
;Calculates SATD of an 8x8 block inside a frame with stride
;r0 address of the first value (block 1)
;r1 stride of block 1
;r2 address of the first value (block 2)
;r3 stride of block 2
%if ARCH_X86_64
cglobal satd_8x8_stride, 4, 4, 16
%else
cglobal satd_8x8_stride, 4, 4, 8
%endif
vpmovzxbw m0, [r0]
vpmovzxbw m7, [r2]
vpsubw m0, m7
vpmovzxbw m1, [r0+r1]
lea r0, [r0+r1*2]
vpmovzxbw m7, [r2+r3]
lea r2, [r2+r3*2]
vpsubw m1, m7
vpmovzxbw m2, [r0]
vpmovzxbw m7, [r2]
vpsubw m2, m7
vpmovzxbw m3, [r0+r1]
lea r0, [r0+r1*2]
vpmovzxbw m7, [r2+r3]
lea r2, [r2+r3*2]
vpsubw m3, m7
vpmovzxbw m4, [r0]
vpmovzxbw m7, [r2]
vpsubw m4, m7
vpmovzxbw m5, [r0+r1]
lea r0, [r0+r1*2]
vpmovzxbw m7, [r2+r3]
lea r2, [r2+r3*2]
vpsubw m5, m7
vpmovzxbw m6, [r0]
vpmovzxbw m7, [r2]
vpsubw m6, m7
%if ARCH_X86_64
vpmovzxbw m7, [r0+r1]
vpmovzxbw m8, [r2+r3]
vpsubw m7, m8
%else
vpmovzxbw m7, [r2+r3]
movdqu [esp-16], m7
vpmovzxbw m7, [r0+r1]
vpsubw m7, [esp-16]
movdqu [esp-16], m4
movdqu [esp-16*2], m5
movdqu [esp-16*3], m6
movdqu [esp-16*4], m7
lea esp, [esp-16*4]
%endif
;Horizontal phase
%if ARCH_X86_64
vphaddw m8, m0, m1
vphsubw m9, m0, m1
vphaddw m10, m2, m3
vphsubw m11, m2, m3
vphaddw m12, m4, m5
vphsubw m13, m4, m5
vphaddw m14, m6, m7
vphsubw m15, m6, m7
vphaddw m0, m8, m9
vphsubw m1, m8, m9
vphaddw m2, m10, m11
vphsubw m3, m10, m11
vphaddw m4, m12, m13
vphsubw m5, m12, m13
vphaddw m6, m14, m15
vphsubw m7, m14, m15
vphaddw m8, m0, m1
vphsubw m9, m0, m1
vphaddw m10, m2, m3
vphsubw m11, m2, m3
vphaddw m12, m4, m5
vphsubw m13, m4, m5
vphaddw m14, m6, m7
vphsubw m15, m6, m7
%else
vphaddw m4, m0, m1
vphsubw m5, m0, m1
vphaddw m6, m2, m3
vphsubw m7, m2, m3
vphaddw m0, m4, m5
vphsubw m1, m4, m5
vphaddw m2, m6, m7
vphsubw m3, m6, m7
vphaddw m4, m0, m1
vphsubw m5, m0, m1
vphaddw m6, m2, m3
vphsubw m7, m2, m3
movdqu m3, [esp]
movdqu m2, [esp+16]
movdqu m1, [esp+16*2]
movdqu m0, [esp+16*3]
movdqu [esp], m7
movdqu [esp+16*1], m6
movdqu [esp+16*2], m5
movdqu [esp+16*3], m4
vphaddw m4, m0, m1
vphsubw m5, m0, m1
vphaddw m6, m2, m3
vphsubw m7, m2, m3
vphaddw m0, m4, m5
vphsubw m1, m4, m5
vphaddw m2, m6, m7
vphsubw m3, m6, m7
vphaddw m4, m0, m1
vphsubw m5, m0, m1
vphaddw m6, m2, m3
vphsubw m7, m2, m3
%endif
;Vertical phase
%if ARCH_X86_64
vphaddw m0, m8, m9
vphsubw m1, m8, m9
vphaddw m2, m10, m11
vphsubw m3, m10, m11
vphaddw m4, m12, m13
vphsubw m5, m12, m13
vphaddw m6, m14, m15
vphsubw m7, m14, m15
vpmovzxwd m0, m0
vpmovzxwd m1, m1
vpmovzxwd m2, m2
vpmovzxwd m3, m3
vpmovzxwd m4, m4
vpmovzxwd m5, m5
vpmovzxwd m6, m6
vpmovzxwd m7, m7
vpaddd m8, m0, m2
vpaddd m9, m1, m3
vpsubd m10, m0, m2
vpsubd m11, m1, m3
vpaddd m12, m4, m6
vpaddd m13, m5, m7
vpsubd m14, m4, m6
vpsubd m15, m5, m7
vpaddd m0, m8, m12
vpaddd m1, m9, m13
vpaddd m2, m10, m14
vpaddd m3, m11, m15
vpsubd m4, m8, m12
vpsubd m5, m9, m13
vpsubd m6, m10, m14
vpsubd m7, m11, m15
%else
vphaddw m0, m4, m5
vphsubw m1, m4, m5
vphaddw m2, m6, m7
vphsubw m3, m6, m7
vpmovzxwd m0, m0
vpmovzxwd m1, m1
vpmovzxwd m2, m2
vpmovzxwd m3, m3
vpaddd m4, m0, m2
vpaddd m5, m1, m3
vpsubd m6, m0, m2
vpsubd m7, m1, m3
movdqu m3, [esp]
movdqu m2, [esp+16]
movdqu m1, [esp+16*2]
movdqu m0, [esp+16*3]
movdqu [esp], m7
movdqu [esp+16*1], m6
movdqu [esp+16*2], m5
movdqu [esp+16*3], m4
vphaddw m0, m4, m5
vphsubw m1, m4, m5
vphaddw m2, m6, m7
vphsubw m3, m6, m7
vpmovzxwd m0, m0
vpmovzxwd m1, m1
vpmovzxwd m2, m2
vpmovzxwd m3, m3
vpaddd m4, m0, m2
vpaddd m5, m1, m3
vpsubd m6, m0, m2
vpsubd m7, m1, m3
vpaddd m4, m2, [esp-16]
vpaddd m5, m3, [esp]
vpsubd m6, m2, [esp-16]
vpsubd m7, m3, [esp]
vpabsd m4, m4
vpabsd m5, m5
vpabsd m6, m6
vpabsd m7, m7
vpaddd m2, m4, m5
vpaddd m2, m6
vpaddd m2, m7
vpaddd m4, m0, [esp-16*3]
vpaddd m5, m1, [esp-16*2]
vpsubd m6, m0, [esp-16*3]
vpsubd m7, m1, [esp-16*2]
vpabsd m4, m4
vpabsd m5, m5
vpabsd m6, m6
vpabsd m7, m7
vpaddd m0, m4, m5
vpaddd m0, m6
vpaddd m0, m7
vpaddd m0, m2
%endif
%if ARCH_X86_64
vpabsd m0, m0
vpabsd m1, m1
vpabsd m2, m2
vpabsd m3, m3
vpabsd m4, m4
vpabsd m5, m5
vpabsd m6, m6
vpabsd m7, m7
vpaddd m0, m1
vpaddd m0, m2
vpaddd m0, m3
vpaddd m0, m4
vpaddd m0, m5
vpaddd m0, m6
vpaddd m0, m7
%endif
vphaddd m0, m0
vphaddd m0, m0
vpextrd eax, m0, 1
vpinsrd m1, eax, 0
vpaddd m0, m1
vpextrd eax, m0, 1
vpinsrd m1, eax, 0
vpaddd m0, m1
%if ARCH_X86_64 == 0
lea esp, [esp+16*4]
%endif
vmovd eax, m0
;Uncomment if transformed values not divided elsewhere
;add eax, 2
;shr eax, 2
RET

src/x86/picture_x86.h Normal file (39 lines added)

@@ -0,0 +1,39 @@
#ifndef _PICTURE_X86_H_
#define _PICTURE_X86_H_
/*****************************************************************************
* This file is part of Kvazaar HEVC encoder.
*
* Copyright (C) 2013-2014 Tampere University of Technology and others (see
* COPYING file).
*
* Kvazaar is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation.
*
* Kvazaar is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
****************************************************************************/
/*! \file picture_x86.h
    \brief Assembly function declarations for SAD and SATD
*/

#include <stdint.h>     // int32_t in the satd_8x8_stride prototype
#include "../global.h"  // the pixel type used in these prototypes

unsigned kvz_sad_4x4(const pixel*, const pixel*);
unsigned kvz_sad_8x8(const pixel*, const pixel*);
unsigned kvz_sad_16x16(const pixel*, const pixel*);
unsigned kvz_sad_4x4_stride(const pixel *data1, const pixel *data2, unsigned stride);
unsigned kvz_sad_8x8_stride(const pixel *data1, const pixel *data2, unsigned stride);
unsigned kvz_sad_16x16_stride(const pixel *data1, const pixel *data2, unsigned stride);
unsigned kvz_satd_4x4(const pixel *org, const pixel *cur);
unsigned kvz_satd_8x8(const pixel *org, const pixel *cur);
unsigned kvz_satd_8x8_stride(const pixel *org, int32_t org_stride, const pixel *cur, int32_t cur_stride);
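
/* Usage sketch (hedged, hypothetical frame buffers and coordinates): SATD of
 * co-located 8x8 blocks at (x, y) in two frames that both have width `stride`:
 *
 *   unsigned cost = kvz_satd_8x8_stride(org + y * stride + x, stride,
 *                                       cur + y * stride + x, stride);
 */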
#endif