mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-24 10:34:05 +00:00
Add support for 4x4 blocks to SATD_ANY_SIZE.
Makes functions satd_any_size_generic and satd_any_size_8bit_avx2 work on blocks whose width and/or height are not multiples of 8.
This commit is contained in:
parent
2ae260e422
commit
bf26661782
|
@ -31,6 +31,7 @@
|
||||||
#include "strategies/strategies-picture.h"
|
#include "strategies/strategies-picture.h"
|
||||||
#include "strategyselector.h"
|
#include "strategyselector.h"
|
||||||
#include "strategies/strategies-common.h"
|
#include "strategies/strategies-common.h"
|
||||||
|
#include "strategies/generic/picture-generic.h"
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -454,6 +455,19 @@ INLINE static void hor_transform_block_dual_avx2(__m256i (*row_diff)[8])
|
||||||
hor_transform_row_dual_avx2((*row_diff) + 7);
|
hor_transform_row_dual_avx2((*row_diff) + 7);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Calculate SATD between two 4x4 blocks inside bigger arrays.
|
||||||
|
*/
|
||||||
|
/**
 * \brief Calculate SATD between two 4x4 sub-blocks inside bigger arrays.
 *
 * \param buf1     first block, top-left pixel
 * \param stride1  row stride of buf1 in pixels
 * \param buf2     second block, top-left pixel
 * \param stride2  row stride of buf2 in pixels
 * \return         sum of absolute Hadamard-transformed differences
 */
static unsigned kvz_satd_4x4_subblock_8bit_avx2(const kvz_pixel * buf1,
                                                const int32_t stride1,
                                                const kvz_pixel * buf2,
                                                const int32_t stride2)
{
  // TODO: AVX2 implementation
  // Until a vectorized version exists, delegate to the scalar routine.
  return kvz_satd_4x4_subblock_generic(buf1, stride1, buf2, stride2);
}
|
||||||
|
|
||||||
static unsigned satd_8x8_subblock_8bit_avx2(const kvz_pixel * buf1, unsigned stride1, const kvz_pixel * buf2, unsigned stride2)
|
static unsigned satd_8x8_subblock_8bit_avx2(const kvz_pixel * buf1, unsigned stride1, const kvz_pixel * buf2, unsigned stride2)
|
||||||
{
|
{
|
||||||
__m128i temp[8];
|
__m128i temp[8];
|
||||||
|
|
|
@ -99,19 +99,13 @@ static unsigned reg_sad_generic(const kvz_pixel * const data1, const kvz_pixel *
|
||||||
return sad;
|
return sad;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \brief Calculate SATD between two 4x4 blocks inside bigger arrays.
|
* \brief Transform differences between two 4x4 blocks.
|
||||||
* From HM 13.0
|
* From HM 13.0
|
||||||
*/
|
*/
|
||||||
static unsigned satd_4x4_generic(const kvz_pixel *piOrg, const kvz_pixel *piCur)
|
static int32_t hadamard_4x4_generic(int32_t diff[4*4])
|
||||||
{
|
{
|
||||||
int32_t k, satd = 0, diff[16], m[16], d[16];
|
int32_t m[4 * 4];
|
||||||
for (k = 0; k < 16; ++k) {
|
|
||||||
diff[k] = piOrg[k] - piCur[k];
|
|
||||||
}
|
|
||||||
|
|
||||||
/*===== hadamard transform =====*/
|
|
||||||
m[0] = diff[0] + diff[12];
|
m[0] = diff[0] + diff[12];
|
||||||
m[1] = diff[1] + diff[13];
|
m[1] = diff[1] + diff[13];
|
||||||
m[2] = diff[2] + diff[14];
|
m[2] = diff[2] + diff[14];
|
||||||
|
@ -129,6 +123,7 @@ static unsigned satd_4x4_generic(const kvz_pixel *piOrg, const kvz_pixel *piCur)
|
||||||
m[14] = diff[2] - diff[14];
|
m[14] = diff[2] - diff[14];
|
||||||
m[15] = diff[3] - diff[15];
|
m[15] = diff[3] - diff[15];
|
||||||
|
|
||||||
|
int32_t d[4 * 4];
|
||||||
d[0] = m[0] + m[4];
|
d[0] = m[0] + m[4];
|
||||||
d[1] = m[1] + m[5];
|
d[1] = m[1] + m[5];
|
||||||
d[2] = m[2] + m[6];
|
d[2] = m[2] + m[6];
|
||||||
|
@ -180,14 +175,45 @@ static unsigned satd_4x4_generic(const kvz_pixel *piOrg, const kvz_pixel *piCur)
|
||||||
d[14] = m[14] + m[15];
|
d[14] = m[14] + m[15];
|
||||||
d[15] = m[15] - m[14];
|
d[15] = m[15] - m[14];
|
||||||
|
|
||||||
for (k = 0; k<16; ++k) {
|
int32_t satd = 0;
|
||||||
satd += abs(d[k]);
|
for (int i = 0; i < 16; i++) {
|
||||||
|
satd += abs(d[i]);
|
||||||
}
|
}
|
||||||
satd = ((satd + 1) >> 1);
|
satd = ((satd + 1) >> 1);
|
||||||
|
|
||||||
return satd;
|
return satd;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Calculate SATD between two 4x4 blocks.
|
||||||
|
*/
|
||||||
|
/**
 * \brief Calculate SATD between two contiguous 4x4 blocks.
 *
 * Both inputs are assumed to be densely packed 16-pixel arrays
 * (stride of 4), as indicated by the flat indexing below.
 */
static unsigned satd_4x4_generic(const kvz_pixel *piOrg, const kvz_pixel *piCur)
{
  // Per-pixel residual, fed to the shared 4x4 Hadamard helper.
  int32_t residual[4 * 4];

  for (int px = 0; px < 4 * 4; ++px) {
    residual[px] = piOrg[px] - piCur[px];
  }

  return hadamard_4x4_generic(residual);
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \brief Calculate SATD between two 4x4 blocks inside bigger arrays.
|
||||||
|
*/
|
||||||
|
/**
 * \brief Calculate SATD between two 4x4 blocks inside bigger arrays.
 *
 * \param buf1     first block, top-left pixel
 * \param stride1  row stride of buf1 in pixels
 * \param buf2     second block, top-left pixel
 * \param stride2  row stride of buf2 in pixels
 * \return         sum of absolute Hadamard-transformed differences
 */
unsigned kvz_satd_4x4_subblock_generic(const kvz_pixel * buf1,
                                       const int32_t stride1,
                                       const kvz_pixel * buf2,
                                       const int32_t stride2)
{
  // Pack the strided 4x4 difference into a dense array, then transform.
  int32_t diff[4 * 4];
  int32_t *out = diff;

  for (int row = 0; row < 4; ++row) {
    const kvz_pixel *src1 = buf1 + row * stride1;
    const kvz_pixel *src2 = buf2 + row * stride2;
    for (int col = 0; col < 4; ++col) {
      *out++ = src1[col] - src2[col];
    }
  }

  return hadamard_4x4_generic(diff);
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* \brief Calculate SATD between two 8x8 blocks inside bigger arrays.
|
* \brief Calculate SATD between two 8x8 blocks inside bigger arrays.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -39,4 +39,9 @@ kvz_pixel kvz_fast_clip_16bit_to_pixel(int16_t value);
|
||||||
// Assumes PIXEL_MAX to be 2^n-1
|
// Assumes PIXEL_MAX to be 2^n-1
|
||||||
kvz_pixel kvz_fast_clip_32bit_to_pixel(int32_t value);
|
kvz_pixel kvz_fast_clip_32bit_to_pixel(int32_t value);
|
||||||
|
|
||||||
|
unsigned kvz_satd_4x4_subblock_generic(const kvz_pixel * buf1,
|
||||||
|
const int32_t stride1,
|
||||||
|
const kvz_pixel * buf2,
|
||||||
|
const int32_t stride2);
|
||||||
|
|
||||||
#endif //STRATEGIES_PICTURE_GENERIC_H_
|
#endif //STRATEGIES_PICTURE_GENERIC_H_
|
||||||
|
|
|
@ -66,11 +66,33 @@ static unsigned satd_ ## n ## x ## n ## _ ## suffix ( \
|
||||||
const kvz_pixel *block2, int stride2) \
|
const kvz_pixel *block2, int stride2) \
|
||||||
{ \
|
{ \
|
||||||
unsigned sum = 0; \
|
unsigned sum = 0; \
|
||||||
|
if (width % 8 != 0) { \
|
||||||
|
/* Process the first column using 4x4 blocks. */ \
|
||||||
|
for (int y = 0; y < height; y += 4) { \
|
||||||
|
sum += kvz_satd_4x4_subblock_ ## suffix(&block1[y * stride1], stride1, \
|
||||||
|
&block2[y * stride2], stride2); \
|
||||||
|
} \
|
||||||
|
block1 += 4; \
|
||||||
|
block2 += 4; \
|
||||||
|
width -= 4; \
|
||||||
|
} \
|
||||||
|
if (height % 8 != 0) { \
|
||||||
|
/* Process the first row using 4x4 blocks. */ \
|
||||||
|
for (int x = 0; x < width; x += 4) { \
|
||||||
|
sum += kvz_satd_4x4_subblock_ ## suffix(&block1[x], stride1, \
|
||||||
|
&block2[x], stride2); \
|
||||||
|
} \
|
||||||
|
block1 += 4 * stride1; \
|
||||||
|
block2 += 4 * stride2; \
|
||||||
|
height -= 4; \
|
||||||
|
} \
|
||||||
|
/* The rest can now be processed with 8x8 blocks. */ \
|
||||||
for (int y = 0; y < height; y += 8) { \
|
for (int y = 0; y < height; y += 8) { \
|
||||||
const kvz_pixel *row1 = &block1[y * stride1]; \
|
const kvz_pixel *row1 = &block1[y * stride1]; \
|
||||||
const kvz_pixel *row2 = &block2[y * stride2]; \
|
const kvz_pixel *row2 = &block2[y * stride2]; \
|
||||||
for (int x = 0; x < width; x += 8) { \
|
for (int x = 0; x < width; x += 8) { \
|
||||||
sum += satd_8x8_subblock_ ## suffix(&row1[x], stride1, &row2[x], stride2); \
|
sum += satd_8x8_subblock_ ## suffix(&row1[x], stride1, \
|
||||||
|
&row2[x], stride2); \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
return sum >> (KVZ_BIT_DEPTH - 8); \
|
return sum >> (KVZ_BIT_DEPTH - 8); \
|
||||||
|
|
Loading…
Reference in a new issue