Add faster AVX inter SAD for 32x32 and 64x64

Add implementations for these functions that process the image line by
line instead of using the 16x16 function to process block by block.

The 32x32 version is around 30% faster and the 64x64 version around 15%
faster on Haswell.

PASS inter_sad: 28.744M x reg_sad(32x32):x86_asm_avx (1014 ticks, 1.014 sec)
PASS inter_sad: 7.882M x reg_sad(64x64):x86_asm_avx (1014 ticks, 1.014 sec)
to
PASS inter_sad: 37.828M x reg_sad(32x32):x86_asm_avx (1014 ticks, 1.014 sec)
PASS inter_sad: 9.081M x reg_sad(64x64):x86_asm_avx (1014 ticks, 1.014 sec)
Ari Koivula 2016-08-29 03:52:05 +03:00
parent f098e46f4f
commit 02cd17b427
3 changed files with 82 additions and 20 deletions
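
For reference, the new kernels compute roughly the following, written here as a plain C sketch (illustrative only; the function name and loop below are not part of the commit). Each line of the block is compared in full before both pointers advance by the stride, rather than delegating to four 16x16 calls:

#include <stdlib.h>

/* Illustrative scalar equivalent of the new 32x32 kernel (hypothetical name). */
static unsigned sad_32x32_stride_c(const unsigned char *cur,
                                   const unsigned char *ref,
                                   unsigned stride)
{
  unsigned sad = 0;
  for (int y = 0; y < 32; ++y) {
    for (int x = 0; x < 32; ++x) {
      sad += abs(cur[x] - ref[x]);
    }
    /* Advance both pointers one full line. */
    cur += stride;
    ref += stride;
  }
  return sad;
}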


@@ -291,3 +291,83 @@ cglobal sad_16x16_stride, 3, 3, 5
vmovd eax, m4
RET
;KVZ_SAD_32x32_STRIDE
;Calculates SAD of a 32x32 block inside a frame with stride
;r0 address of the first value(current)
;r1 address of the first value(reference)
;r2 stride
cglobal sad_32x32_stride, 3, 3, 5
vpxor m4, m4
; Handle 2 lines per iteration
%rep 16
vmovdqu m0, [r0]
vmovdqu m1, [r0 + 16]
vmovdqu m2, [r0 + r2]
vmovdqu m3, [r0 + r2 + 16]
lea r0, [r0 + 2 * r2]
vpsadbw m0, [r1]
vpsadbw m1, [r1 + 16]
vpsadbw m2, [r1 + r2]
vpsadbw m3, [r1 + r2 + 16]
lea r1, [r1 + 2 * r2]
vpaddd m4, m0
vpaddd m4, m1
vpaddd m4, m2
vpaddd m4, m3
%endrep
vmovhlps m0, m4
vpaddd m4, m0
vmovd eax, m4
RET
;KVZ_SAD_64x64_STRIDE
;Calculates SAD of a 64x64 block inside a frame with stride
;r0 address of the first value(current)
;r1 address of the first value(reference)
;r2 stride
cglobal sad_64x64_stride, 3, 4, 5
vpxor m4, m4 ; sum accumulation register
mov r3, 4 ; number of iterations in the loop
Process16Lines:
; Intel optimization manual says to not unroll beyond 500 instructions.
; Didn't seem to have much of an effect on Ivy Bridge or Haswell, but
; smaller is better when speed is the same, right?
%rep 16
vmovdqu m0, [r0]
vmovdqu m1, [r0 + 1*16]
vmovdqu m2, [r0 + 2*16]
vmovdqu m3, [r0 + 3*16]
vpsadbw m0, [r1]
vpsadbw m1, [r1 + 1*16]
vpsadbw m2, [r1 + 2*16]
vpsadbw m3, [r1 + 3*16]
lea r0, [r0 + r2]
lea r1, [r1 + r2]
vpaddd m4, m0
vpaddd m4, m1
vpaddd m4, m2
vpaddd m4, m3
%endrep
dec r3
jnz Process16Lines
vmovhlps m0, m4
vpaddd m4, m0
vmovd eax, m4
RET
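
A rough intrinsics counterpart of the inner step and the tail reduction, as a sketch only (not code from this commit; with AVX enabled these SSE2 intrinsics compile to the v-prefixed forms used above). vpsadbw sums the absolute differences of 16 byte pairs into two 64-bit partial sums, which are accumulated into m4 and finally folded by the vmovhlps/vpaddd/vmovd tail:

#include <emmintrin.h>
#include <stdint.h>

/* Accumulate the SAD of one 16-byte chunk into acc
   (mirrors vmovdqu + vpsadbw + vpaddd). */
static inline __m128i sad_accumulate_16(__m128i acc, const uint8_t *cur, const uint8_t *ref)
{
  __m128i a = _mm_loadu_si128((const __m128i *)cur);
  __m128i b = _mm_loadu_si128((const __m128i *)ref);
  return _mm_add_epi32(acc, _mm_sad_epu8(a, b));
}

/* Fold the high 64-bit partial sum onto the low one (vmovhlps + vpaddd)
   and return the low 32 bits (vmovd eax, m4). */
static inline uint32_t sad_reduce(__m128i acc)
{
  __m128i hi = _mm_unpackhi_epi64(acc, acc);
  return (uint32_t)_mm_cvtsi128_si32(_mm_add_epi32(acc, hi));
}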


@@ -36,6 +36,8 @@ unsigned kvz_sad_16x16_avx(const kvz_pixel*, const kvz_pixel*);
unsigned kvz_sad_4x4_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_8x8_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_16x16_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_32x32_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_64x64_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
#endif
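
A hypothetical call site for the new declarations (the buffer names, offsets and helper are made up, not from the commit): both blocks live inside frames that share the given stride, and the function returns the whole-block SAD directly instead of composing 16x16 results as the removed C wrappers below did.

/* Hypothetical helper: SAD of the co-located 32x32 blocks at (x, y)
   in two frames that use the same stride. */
static unsigned sad_32x32_at(const kvz_pixel *cur_frame, const kvz_pixel *ref_frame,
                             unsigned x, unsigned y, unsigned stride)
{
  return kvz_sad_32x32_stride_avx(cur_frame + y * stride + x,
                                  ref_frame + y * stride + x,
                                  stride);
}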


@@ -39,16 +39,6 @@ static unsigned kvz_sad_32x32_avx(const kvz_pixel *data1, const kvz_pixel *data2
return sad;
}
static unsigned kvz_sad_32x32_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride)
{
unsigned sad = 0;
sad += kvz_sad_16x16_stride_avx(data1, data2, stride);
sad += kvz_sad_16x16_stride_avx(data1 + 16, data2 + 16, stride);
sad += kvz_sad_16x16_stride_avx(data1 + 16 * stride, data2 + 16 * stride, stride);
sad += kvz_sad_16x16_stride_avx(data1 + 16 * stride + 16, data2 + 16 * stride + 16, stride);
return sad;
}
static unsigned kvz_sad_64x64_avx(const kvz_pixel *data1, const kvz_pixel *data2)
{
unsigned sad = 0;
@@ -59,16 +49,6 @@ static unsigned kvz_sad_64x64_avx(const kvz_pixel *data1, const kvz_pixel *data2
return sad;
}
static unsigned kvz_sad_64x64_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride)
{
unsigned sad = 0;
sad += kvz_sad_32x32_stride_avx(data1, data2, stride);
sad += kvz_sad_32x32_stride_avx(data1 + 32, data2 + 32, stride);
sad += kvz_sad_32x32_stride_avx(data1 + 32 * stride, data2 + 32 * stride, stride);
sad += kvz_sad_32x32_stride_avx(data1 + 32 * stride + 32, data2 + 32 * stride + 32, stride);
return sad;
}
static unsigned kvz_sad_other_avx(const kvz_pixel * const data1, const kvz_pixel * const data2,
const int width, const int height, const unsigned stride1, const unsigned stride2)
{