Mirror of https://github.com/ultravideo/uvg266.git (synced 2024-11-27 19:24:06 +00:00)
Add faster AVX inter SAD for 32x32 and 64x64
Add implementations for these functions that process the image line by line instead of using the 16x16 function to process block by block. The 32x32 is around 30% faster and the 64x64 around 15% faster on Haswell.

Before:
PASS inter_sad: 28.744M x reg_sad(32x32):x86_asm_avx (1014 ticks, 1.014 sec)
PASS inter_sad: 7.882M x reg_sad(64x64):x86_asm_avx (1014 ticks, 1.014 sec)

After:
PASS inter_sad: 37.828M x reg_sad(32x32):x86_asm_avx (1014 ticks, 1.014 sec)
PASS inter_sad: 9.081M x reg_sad(64x64):x86_asm_avx (1014 ticks, 1.014 sec)
This commit is contained in:
parent f098e46f4f
commit 02cd17b427
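For orientation (not part of the patch), here is a plain-C sketch of what reg_sad over an NxN block with a picture stride computes; the helper name sad_block and its exact prototype are hypothetical:

#include <stdlib.h>

/* Hypothetical scalar reference: sum of absolute differences between an
 * N x N block of the current picture and the co-located block of the
 * reference picture, both walked with the same line stride. */
static unsigned sad_block(const unsigned char *cur, const unsigned char *ref,
                          int width, int height, unsigned stride)
{
  unsigned sad = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      sad += abs((int)cur[x] - (int)ref[x]);
    }
    cur += stride;   /* advance one picture line, not one block row */
    ref += stride;
  }
  return sad;
}

The old code built a 32x32 or 64x64 SAD by calling the 16x16 routine on each quadrant; the new assembly walks the block line by line, which is where the speedup in the numbers above comes from.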
@@ -291,3 +291,83 @@ cglobal sad_16x16_stride, 3, 3, 5
    vmovd eax, m4
    RET


;KVZ_SAD_32x32_STRIDE
;Calculates SAD of a 32x32 block inside a frame with stride
;r0 address of the first value (current)
;r1 address of the first value (reference)
;r2 stride
cglobal sad_32x32_stride, 3, 3, 5
    vpxor m4, m4

    ; Handle 2 lines per iteration
    %rep 16
        vmovdqu m0, [r0]
        vmovdqu m1, [r0 + 16]
        vmovdqu m2, [r0 + r2]
        vmovdqu m3, [r0 + r2 + 16]
        lea r0, [r0 + 2 * r2]

        vpsadbw m0, [r1]
        vpsadbw m1, [r1 + 16]
        vpsadbw m2, [r1 + r2]
        vpsadbw m3, [r1 + r2 + 16]
        lea r1, [r1 + 2 * r2]

        vpaddd m4, m0
        vpaddd m4, m1
        vpaddd m4, m2
        vpaddd m4, m3
    %endrep

    vmovhlps m0, m4
    vpaddd m4, m0

    vmovd eax, m4
    RET


;KVZ_SAD_64x64_STRIDE
;Calculates SAD of a 64x64 block inside a frame with stride
;r0 address of the first value (current)
;r1 address of the first value (reference)
;r2 stride
cglobal sad_64x64_stride, 3, 4, 5
    vpxor m4, m4    ; sum accumulation register
    mov r3, 4       ; number of iterations in the loop

Process16Lines:
    ; Intel optimization manual says to not unroll beyond 500 instructions.
    ; Didn't seem to have much of an effect on Ivy Bridge or Haswell, but
    ; smaller is better when speed is the same, right?
    %rep 16
        vmovdqu m0, [r0]
        vmovdqu m1, [r0 + 1*16]
        vmovdqu m2, [r0 + 2*16]
        vmovdqu m3, [r0 + 3*16]

        vpsadbw m0, [r1]
        vpsadbw m1, [r1 + 1*16]
        vpsadbw m2, [r1 + 2*16]
        vpsadbw m3, [r1 + 3*16]

        lea r0, [r0 + r2]
        lea r1, [r1 + r2]

        vpaddd m4, m0
        vpaddd m4, m1
        vpaddd m4, m2
        vpaddd m4, m3
    %endrep

    dec r3
    jnz Process16Lines

    vmovhlps m0, m4
    vpaddd m4, m0

    vmovd eax, m4
    RET

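For readers more comfortable with intrinsics, the following is a rough C equivalent of the new sad_32x32_stride routine. It is a sketch, not part of the commit; it mirrors the vmovdqu loads, the vpsadbw/vpaddd accumulation, and the final vmovhlps/vpaddd fold above.

#include <emmintrin.h>  /* SSE2: _mm_sad_epu8, _mm_add_epi32, ... */
#include <stdint.h>

/* Sketch only: same computation as the assembly, two 16-byte SADs per
 * 32-pixel row, accumulated into one XMM register. */
static unsigned sad_32x32_stride_sketch(const uint8_t *cur, const uint8_t *ref,
                                        unsigned stride)
{
  __m128i sum = _mm_setzero_si128();                        /* vpxor m4, m4 */

  for (int y = 0; y < 32; ++y) {
    __m128i c0 = _mm_loadu_si128((const __m128i *)(cur + 0));
    __m128i c1 = _mm_loadu_si128((const __m128i *)(cur + 16));
    __m128i r0 = _mm_loadu_si128((const __m128i *)(ref + 0));
    __m128i r1 = _mm_loadu_si128((const __m128i *)(ref + 16));

    /* vpsadbw yields two 64-bit partial sums per register; accumulating
     * with a 32-bit add is safe because the totals stay far below 2^32. */
    sum = _mm_add_epi32(sum, _mm_sad_epu8(c0, r0));
    sum = _mm_add_epi32(sum, _mm_sad_epu8(c1, r1));

    cur += stride;
    ref += stride;
  }

  /* vmovhlps + vpaddd: fold the high 64-bit lane into the low one. */
  sum = _mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum));
  return (unsigned)_mm_cvtsi128_si32(sum);                  /* vmovd eax, m4 */
}

The 64x64 routine above follows the same pattern with four loads per row and an outer counted loop (Process16Lines) instead of a fully unrolled body.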
@@ -36,6 +36,8 @@ unsigned kvz_sad_16x16_avx(const kvz_pixel*, const kvz_pixel*);
unsigned kvz_sad_4x4_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_8x8_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_16x16_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_32x32_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_64x64_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);

#endif

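With these declarations exposed, the assembly routines can be exercised directly from C. Below is a hypothetical sanity check, not part of the commit, that compares the new single-pass 32x32 routine against the four-quadrant 16x16 composition used by the C wrappers removed in the next hunk; the kvz_pixel typedef is an assumption for an 8-bit build.

#include <assert.h>
#include <stdint.h>

typedef uint8_t kvz_pixel;  /* assumption: 8-bit pixel build */

unsigned kvz_sad_16x16_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_32x32_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);

/* Hypothetical check: the single-pass routine must agree with the old
 * block-by-block decomposition into four 16x16 SADs. */
static void check_sad_32x32(const kvz_pixel *cur, const kvz_pixel *ref, unsigned stride)
{
  unsigned quadrants = 0;
  quadrants += kvz_sad_16x16_stride_avx(cur,                    ref,                    stride);
  quadrants += kvz_sad_16x16_stride_avx(cur + 16,               ref + 16,               stride);
  quadrants += kvz_sad_16x16_stride_avx(cur + 16 * stride,      ref + 16 * stride,      stride);
  quadrants += kvz_sad_16x16_stride_avx(cur + 16 * stride + 16, ref + 16 * stride + 16, stride);

  assert(kvz_sad_32x32_stride_avx(cur, ref, stride) == quadrants);
}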
@@ -39,16 +39,6 @@ static unsigned kvz_sad_32x32_avx(const kvz_pixel *data1, const kvz_pixel *data2
  return sad;
}

static unsigned kvz_sad_32x32_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride)
{
  unsigned sad = 0;
  sad += kvz_sad_16x16_stride_avx(data1, data2, stride);
  sad += kvz_sad_16x16_stride_avx(data1 + 16, data2 + 16, stride);
  sad += kvz_sad_16x16_stride_avx(data1 + 16 * stride, data2 + 16 * stride, stride);
  sad += kvz_sad_16x16_stride_avx(data1 + 16 * stride + 16, data2 + 16 * stride + 16, stride);
  return sad;
}

static unsigned kvz_sad_64x64_avx(const kvz_pixel *data1, const kvz_pixel *data2)
{
  unsigned sad = 0;

@@ -59,16 +49,6 @@ static unsigned kvz_sad_64x64_avx(const kvz_pixel *data1, const kvz_pixel *data2
  return sad;
}

static unsigned kvz_sad_64x64_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride)
{
  unsigned sad = 0;
  sad += kvz_sad_32x32_stride_avx(data1, data2, stride);
  sad += kvz_sad_32x32_stride_avx(data1 + 32, data2 + 32, stride);
  sad += kvz_sad_32x32_stride_avx(data1 + 32 * stride, data2 + 32 * stride, stride);
  sad += kvz_sad_32x32_stride_avx(data1 + 32 * stride + 32, data2 + 32 * stride + 32, stride);
  return sad;
}

static unsigned kvz_sad_other_avx(const kvz_pixel * const data1, const kvz_pixel * const data2,
                                  const int width, const int height, const unsigned stride1, const unsigned stride2)
{