From 02cd17b427989e6d7b5f305e1a626c52d3f768b3 Mon Sep 17 00:00:00 2001
From: Ari Koivula
Date: Mon, 29 Aug 2016 03:52:05 +0300
Subject: [PATCH] Add faster AVX inter SAD for 32x32 and 64x64

Add implementations for these functions that process the image line by
line instead of using the 16x16 function to process it block by block.
The 32x32 is around 30% faster and the 64x64 around 15% faster on
Haswell.

PASS inter_sad: 28.744M x reg_sad(32x32):x86_asm_avx (1014 ticks, 1.014 sec)
PASS inter_sad: 7.882M x reg_sad(64x64):x86_asm_avx (1014 ticks, 1.014 sec)
to
PASS inter_sad: 37.828M x reg_sad(32x32):x86_asm_avx (1014 ticks, 1.014 sec)
PASS inter_sad: 9.081M x reg_sad(64x64):x86_asm_avx (1014 ticks, 1.014 sec)
---
 .../x86_asm/picture-x86-asm-sad.asm          | 80 +++++++++++++++++++
 src/strategies/x86_asm/picture-x86-asm-sad.h |  2 +
 src/strategies/x86_asm/picture-x86-asm.c     | 20 -----
 3 files changed, 82 insertions(+), 20 deletions(-)

diff --git a/src/strategies/x86_asm/picture-x86-asm-sad.asm b/src/strategies/x86_asm/picture-x86-asm-sad.asm
index f2f391f6..beea5545 100644
--- a/src/strategies/x86_asm/picture-x86-asm-sad.asm
+++ b/src/strategies/x86_asm/picture-x86-asm-sad.asm
@@ -291,3 +291,83 @@ cglobal sad_16x16_stride, 3, 3, 5
 
     vmovd       eax, m4
     RET
+
+
+;KVZ_SAD_32x32_STRIDE
+;Calculates SAD of a 32x32 block inside a frame with stride
+;r0 address of the first value(current)
+;r1 address of the first value(reference)
+;r2 stride
+cglobal sad_32x32_stride, 3, 3, 5
+    vpxor       m4, m4
+
+    ; Handle 2 lines per iteration
+    %rep 16
+    vmovdqu     m0, [r0]
+    vmovdqu     m1, [r0 + 16]
+    vmovdqu     m2, [r0 + r2]
+    vmovdqu     m3, [r0 + r2 + 16]
+    lea         r0, [r0 + 2 * r2]
+
+    vpsadbw     m0, [r1]
+    vpsadbw     m1, [r1 + 16]
+    vpsadbw     m2, [r1 + r2]
+    vpsadbw     m3, [r1 + r2 + 16]
+    lea         r1, [r1 + 2 * r2]
+
+    vpaddd      m4, m0
+    vpaddd      m4, m1
+    vpaddd      m4, m2
+    vpaddd      m4, m3
+    %endrep
+
+    vmovhlps    m0, m4
+    vpaddd      m4, m0
+
+    vmovd       eax, m4
+
+    RET
+
+
+;KVZ_SAD_64x64_STRIDE
+;Calculates SAD of a 64x64 block inside a frame with stride
+;r0 address of the first value(current)
+;r1 address of the first value(reference)
+;r2 stride
+cglobal sad_64x64_stride, 3, 4, 5
+    vpxor       m4, m4      ; sum accumulation register
+    mov         r3, 4       ; number of iterations in the loop
+
+Process16Lines:
+    ; Intel optimization manual says to not unroll beyond 500 instructions.
+    ; Didn't seem to have much of an effect on Ivy Bridge or Haswell, but
+    ; smaller is better, when speed is the same, right?
+    %rep 16
+    vmovdqu     m0, [r0]
+    vmovdqu     m1, [r0 + 1*16]
+    vmovdqu     m2, [r0 + 2*16]
+    vmovdqu     m3, [r0 + 3*16]
+
+    vpsadbw     m0, [r1]
+    vpsadbw     m1, [r1 + 1*16]
+    vpsadbw     m2, [r1 + 2*16]
+    vpsadbw     m3, [r1 + 3*16]
+
+    lea         r0, [r0 + r2]
+    lea         r1, [r1 + r2]
+
+    vpaddd      m4, m0
+    vpaddd      m4, m1
+    vpaddd      m4, m2
+    vpaddd      m4, m3
+    %endrep
+
+    dec         r3
+    jnz         Process16Lines
+
+    vmovhlps    m0, m4
+    vpaddd      m4, m0
+
+    vmovd       eax, m4
+
+    RET

diff --git a/src/strategies/x86_asm/picture-x86-asm-sad.h b/src/strategies/x86_asm/picture-x86-asm-sad.h
index 91a41c59..6f108038 100644
--- a/src/strategies/x86_asm/picture-x86-asm-sad.h
+++ b/src/strategies/x86_asm/picture-x86-asm-sad.h
@@ -36,6 +36,8 @@ unsigned kvz_sad_16x16_avx(const kvz_pixel*, const kvz_pixel*);
 unsigned kvz_sad_4x4_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
 unsigned kvz_sad_8x8_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
 unsigned kvz_sad_16x16_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
+unsigned kvz_sad_32x32_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
+unsigned kvz_sad_64x64_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
 
 
 #endif

diff --git a/src/strategies/x86_asm/picture-x86-asm.c b/src/strategies/x86_asm/picture-x86-asm.c
index 4c5be65d..36017c20 100644
--- a/src/strategies/x86_asm/picture-x86-asm.c
+++ b/src/strategies/x86_asm/picture-x86-asm.c
@@ -39,16 +39,6 @@ static unsigned kvz_sad_32x32_avx(const kvz_pixel *data1, const kvz_pixel *data2
   return sad;
 }
 
-static unsigned kvz_sad_32x32_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride)
-{
-  unsigned sad = 0;
-  sad += kvz_sad_16x16_stride_avx(data1, data2, stride);
-  sad += kvz_sad_16x16_stride_avx(data1 + 16, data2 + 16, stride);
-  sad += kvz_sad_16x16_stride_avx(data1 + 16 * stride, data2 + 16 * stride, stride);
-  sad += kvz_sad_16x16_stride_avx(data1 + 16 * stride + 16, data2 + 16 * stride + 16, stride);
-  return sad;
-}
-
 static unsigned kvz_sad_64x64_avx(const kvz_pixel *data1, const kvz_pixel *data2)
 {
   unsigned sad = 0;
@@ -59,16 +49,6 @@ static unsigned kvz_sad_64x64_avx(const kvz_pixel *data1, const kvz_pixel *data2
   return sad;
 }
 
-static unsigned kvz_sad_64x64_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride)
-{
-  unsigned sad = 0;
-  sad += kvz_sad_32x32_stride_avx(data1, data2, stride);
-  sad += kvz_sad_32x32_stride_avx(data1 + 32, data2 + 32, stride);
-  sad += kvz_sad_32x32_stride_avx(data1 + 32 * stride, data2 + 32 * stride, stride);
-  sad += kvz_sad_32x32_stride_avx(data1 + 32 * stride + 32, data2 + 32 * stride + 32, stride);
-  return sad;
-}
-
 static unsigned kvz_sad_other_avx(const kvz_pixel * const data1, const kvz_pixel * const data2,
                                   const int width, const int height, const unsigned stride1, const unsigned stride2)
 {
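
Note for readers who prefer C: the new stride variants compute the same value as the removed C helpers, a plain 32x32 or 64x64 SAD in which the current and reference blocks share a single stride (the assembly advances both r0 and r1 by r2). A minimal scalar sketch of that computation follows, assuming the 8-bit build where kvz_pixel is an unsigned byte; the function name is illustrative and not part of the patch.

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the stride SADs: both blocks are advanced by the same
 * stride, just as the assembly advances r0 (current) and r1 (reference) by r2.
 * Illustrative only; uint8_t stands in for kvz_pixel in the 8-bit build. */
static unsigned reference_sad_stride(const uint8_t *cur, const uint8_t *ref,
                                     unsigned block_size, unsigned stride)
{
  unsigned sad = 0;
  for (unsigned y = 0; y < block_size; ++y) {
    for (unsigned x = 0; x < block_size; ++x) {
      sad += (unsigned)abs((int)cur[y * stride + x] - (int)ref[y * stride + x]);
    }
  }
  return sad;
}

With block_size set to 32 or 64 this is the quantity the reg_sad(32x32) and reg_sad(64x64) benchmarks in the commit message measure.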
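
The inner pattern of the assembly may also be easier to follow as intrinsics: each row is covered by unaligned 16-byte loads, (v)psadbw produces two 64-bit partial sums per 16 bytes, all partial sums accumulate in one register, and the tail folds the high half onto the low half (the vmovhlps + vpaddd pair) before vmovd extracts the total. Below is a sketch of the 32x32 case using SSE2 intrinsics; the function name is illustrative and the code is not part of the patch.

#include <stdint.h>
#include <emmintrin.h>

/* Intrinsics rendering of sad_32x32_stride: one 32-pixel row per iteration.
 * _mm_sad_epu8 is the psadbw that the assembly uses in VEX-encoded form. */
static unsigned sad_32x32_stride_intrin(const uint8_t *cur, const uint8_t *ref,
                                        unsigned stride)
{
  __m128i acc = _mm_setzero_si128();
  for (int y = 0; y < 32; ++y) {
    __m128i c0 = _mm_loadu_si128((const __m128i *)cur);
    __m128i c1 = _mm_loadu_si128((const __m128i *)(cur + 16));
    __m128i r0 = _mm_loadu_si128((const __m128i *)ref);
    __m128i r1 = _mm_loadu_si128((const __m128i *)(ref + 16));
    /* Two 64-bit partial SADs per 16 bytes; accumulate them all in acc. */
    acc = _mm_add_epi32(acc, _mm_sad_epu8(c0, r0));
    acc = _mm_add_epi32(acc, _mm_sad_epu8(c1, r1));
    cur += stride;
    ref += stride;
  }
  /* Fold the high 64 bits onto the low 64 bits, like vmovhlps + vpaddd. */
  acc = _mm_add_epi32(acc, _mm_unpackhi_epi64(acc, acc));
  return (unsigned)_mm_cvtsi128_si32(acc);  /* vmovd eax, m4 */
}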