Add faster AVX inter SAD for 32x32 and 64x64

Add implementations for these functions that process the image line by
line instead of using the 16x16 function to process block by block.

The 32x32 version is around 30% faster and the 64x64 version around 15%
faster on Haswell.

PASS inter_sad: 28.744M x reg_sad(32x32):x86_asm_avx (1014 ticks, 1.014 sec)
PASS inter_sad: 7.882M x reg_sad(64x64):x86_asm_avx (1014 ticks, 1.014 sec)
to
PASS inter_sad: 37.828M x reg_sad(32x32):x86_asm_avx (1014 ticks, 1.014 sec)
PASS inter_sad: 9.081M x reg_sad(64x64):x86_asm_avx (1014 ticks, 1.014 sec)
Ari Koivula 2016-08-29 03:52:05 +03:00
parent f098e46f4f
commit 02cd17b427
3 changed files with 82 additions and 20 deletions
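
For reference, the new kernels compute roughly the following, written here as a plain C sketch (illustrative only; the function name and loop below are not part of the commit). Each line of the block is compared in full before both pointers advance by the stride, rather than delegating to four 16x16 calls:

#include <stdlib.h>

/* Illustrative scalar equivalent of the new 32x32 kernel (hypothetical name). */
static unsigned sad_32x32_stride_c(const unsigned char *cur,
                                   const unsigned char *ref,
                                   unsigned stride)
{
  unsigned sad = 0;
  for (int y = 0; y < 32; ++y) {
    for (int x = 0; x < 32; ++x) {
      sad += abs(cur[x] - ref[x]);
    }
    /* Advance both pointers one full line. */
    cur += stride;
    ref += stride;
  }
  return sad;
}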


@@ -291,3 +291,83 @@ cglobal sad_16x16_stride, 3, 3, 5
vmovd eax, m4
RET
;KVZ_SAD_32x32_STRIDE
;Calculates SAD of a 32x32 block inside a frame with stride
;r0 address of the first value(current)
;r1 address of the first value(reference)
;r2 stride
cglobal sad_32x32_stride, 3, 3, 5
vpxor m4, m4
; Handle 2 lines per iteration
%rep 16
vmovdqu m0, [r0]
vmovdqu m1, [r0 + 16]
vmovdqu m2, [r0 + r2]
vmovdqu m3, [r0 + r2 + 16]
lea r0, [r0 + 2 * r2]
vpsadbw m0, [r1]
vpsadbw m1, [r1 + 16]
vpsadbw m2, [r1 + r2]
vpsadbw m3, [r1 + r2 + 16]
lea r1, [r1 + 2 * r2]
vpaddd m4, m0
vpaddd m4, m1
vpaddd m4, m2
vpaddd m4, m3
%endrep
vmovhlps m0, m4
vpaddd m4, m0
vmovd eax, m4
RET
;KVZ_SAD_64x64_STRIDE
;Calculates SAD of a 64x64 block inside a frame with stride
;r0 address of the first value(current)
;r1 address of the first value(reference)
;r2 stride
cglobal sad_64x64_stride, 3, 4, 5
vpxor m4, m4 ; sum accumulation register
mov r3, 4 ; number of iterations in the loop
Process16Lines:
; Intel optimization manual says to not unroll beyond 500 instructions.
; Didn't seem to have much of an effect on Ivy Bridge or Haswell, but
; smaller is better when speed is the same, right?
%rep 16
vmovdqu m0, [r0]
vmovdqu m1, [r0 + 1*16]
vmovdqu m2, [r0 + 2*16]
vmovdqu m3, [r0 + 3*16]
vpsadbw m0, [r1]
vpsadbw m1, [r1 + 1*16]
vpsadbw m2, [r1 + 2*16]
vpsadbw m3, [r1 + 3*16]
lea r0, [r0 + r2]
lea r1, [r1 + r2]
vpaddd m4, m0
vpaddd m4, m1
vpaddd m4, m2
vpaddd m4, m3
%endrep
dec r3
jnz Process16Lines
vmovhlps m0, m4
vpaddd m4, m0
vmovd eax, m4
RET
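
A rough intrinsics counterpart of the inner step and the tail reduction, as a sketch only (not code from this commit; with AVX enabled these SSE2 intrinsics compile to the v-prefixed forms used above). vpsadbw sums the absolute differences of 16 byte pairs into two 64-bit partial sums, which are accumulated into m4 and finally folded by the vmovhlps/vpaddd/vmovd tail:

#include <emmintrin.h>
#include <stdint.h>

/* Accumulate the SAD of one 16-byte chunk into acc
   (mirrors vmovdqu + vpsadbw + vpaddd). */
static inline __m128i sad_accumulate_16(__m128i acc, const uint8_t *cur, const uint8_t *ref)
{
  __m128i a = _mm_loadu_si128((const __m128i *)cur);
  __m128i b = _mm_loadu_si128((const __m128i *)ref);
  return _mm_add_epi32(acc, _mm_sad_epu8(a, b));
}

/* Fold the high 64-bit partial sum onto the low one (vmovhlps + vpaddd)
   and return the low 32 bits (vmovd eax, m4). */
static inline uint32_t sad_reduce(__m128i acc)
{
  __m128i hi = _mm_unpackhi_epi64(acc, acc);
  return (uint32_t)_mm_cvtsi128_si32(_mm_add_epi32(acc, hi));
}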


@@ -36,6 +36,8 @@ unsigned kvz_sad_16x16_avx(const kvz_pixel*, const kvz_pixel*);
unsigned kvz_sad_4x4_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_8x8_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_16x16_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_32x32_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_64x64_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
#endif
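
A hypothetical call site for the new declarations (the buffer names, offsets and helper are made up, not from the commit): both blocks live inside frames that share the given stride, and the function returns the whole-block SAD directly instead of composing 16x16 results as the removed C wrappers below did.

/* Hypothetical helper: SAD of the co-located 32x32 blocks at (x, y)
   in two frames that use the same stride. */
static unsigned sad_32x32_at(const kvz_pixel *cur_frame, const kvz_pixel *ref_frame,
                             unsigned x, unsigned y, unsigned stride)
{
  return kvz_sad_32x32_stride_avx(cur_frame + y * stride + x,
                                  ref_frame + y * stride + x,
                                  stride);
}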


@@ -39,16 +39,6 @@ static unsigned kvz_sad_32x32_avx(const kvz_pixel *data1, const kvz_pixel *data2
return sad;
}
static unsigned kvz_sad_32x32_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride)
{
unsigned sad = 0;
sad += kvz_sad_16x16_stride_avx(data1, data2, stride);
sad += kvz_sad_16x16_stride_avx(data1 + 16, data2 + 16, stride);
sad += kvz_sad_16x16_stride_avx(data1 + 16 * stride, data2 + 16 * stride, stride);
sad += kvz_sad_16x16_stride_avx(data1 + 16 * stride + 16, data2 + 16 * stride + 16, stride);
return sad;
}
static unsigned kvz_sad_64x64_avx(const kvz_pixel *data1, const kvz_pixel *data2)
{
unsigned sad = 0;
@@ -59,16 +49,6 @@ static unsigned kvz_sad_64x64_avx(const kvz_pixel *data1, const kvz_pixel *data2
return sad;
}
static unsigned kvz_sad_64x64_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride)
{
unsigned sad = 0;
sad += kvz_sad_32x32_stride_avx(data1, data2, stride);
sad += kvz_sad_32x32_stride_avx(data1 + 32, data2 + 32, stride);
sad += kvz_sad_32x32_stride_avx(data1 + 32 * stride, data2 + 32 * stride, stride);
sad += kvz_sad_32x32_stride_avx(data1 + 32 * stride + 32, data2 + 32 * stride + 32, stride);
return sad;
}
static unsigned kvz_sad_other_avx(const kvz_pixel * const data1, const kvz_pixel * const data2,
const int width, const int height, const unsigned stride1, const unsigned stride2)
{