mirror of https://github.com/ultravideo/uvg266.git

Consistent naming part 1

parent 8f0e96162a
commit a68d73674b

@@ -60,10 +60,10 @@ typedef struct {
 typedef struct {
   int size;
-  kvz_pixel_ip *y;
-  kvz_pixel_ip *u;
-  kvz_pixel_ip *v;
-} yuv_ip_t;
+  kvz_pixel_im *y;
+  kvz_pixel_im *u;
+  kvz_pixel_im *v;
+} yuv_im_t;
 
 kvz_picture *kvz_image_alloc_420(const int32_t width, const int32_t height);
 kvz_picture *kvz_image_alloc(enum kvz_chroma_format chroma_format, const int32_t width, const int32_t height);
 

24  src/inter.c

@@ -115,7 +115,7 @@ static void inter_recon_frac_luma_hi(const encoder_state_t *const state,
                                      int32_t block_width,
                                      int32_t block_height,
                                      const int16_t mv_param[2],
-                                     yuv_ip_t *out,
+                                     yuv_im_t *out,
                                      const unsigned out_stride)
 {
   int mv_frac_x = (mv_param[0] & 3);

@@ -248,7 +248,7 @@ static void inter_recon_frac_chroma_hi(const encoder_state_t *const state,
                                        int32_t pu_w,
                                        int32_t pu_h,
                                        const int16_t mv_param[2],
-                                       yuv_ip_t *out,
+                                       yuv_im_t *out,
                                        const unsigned out_stride)
 {
   int mv_frac_x = (mv_param[0] & 7);

@@ -367,7 +367,7 @@ static void inter_cp_with_ext_border(const kvz_pixel *ref_buf, int ref_stride,
  * \param height          PU height
  * \param mv_param        motion vector
  * \param lcu_px          destination lcu
- * \param lcu_ip          destination of high precision output, or NULL if not needed
+ * \param lcu_im          destination of high precision output, or NULL if not needed
  * \param predict_luma    Enable or disable luma prediction for this call.
  * \param predict_chroma  Enable or disable chroma prediction for this call.
  */

@@ -380,7 +380,7 @@ static unsigned inter_recon_unipred(const encoder_state_t * const state,
                                     int32_t out_stride_luma,
                                     const int16_t mv_param[2],
                                     yuv_t *yuv_px,
-                                    yuv_ip_t *yuv_ip,
+                                    yuv_im_t *yuv_im,
                                     bool predict_luma,
                                     bool predict_chroma)
 {

@@ -403,11 +403,11 @@ static unsigned inter_recon_unipred(const encoder_state_t * const state,
   if (predict_luma) {
     if (fractional_luma) {
       // With a fractional MV, do interpolation.
-      if (state->encoder_control->cfg.bipred && yuv_ip) {
+      if (state->encoder_control->cfg.bipred && yuv_im) {
         inter_recon_frac_luma_hi(state, ref,
                                  pu_x, pu_y,
                                  pu_w, pu_h,
-                                 mv_param, yuv_ip, out_stride_luma);
+                                 mv_param, yuv_im, out_stride_luma);
       }
       else {
         inter_recon_frac_luma(state, ref,

@@ -444,11 +444,11 @@ static unsigned inter_recon_unipred(const encoder_state_t * const state,
     // Generate prediction for chroma.
     if (fractional_luma || fractional_chroma) {
       // With a fractional MV, do interpolation.
-      if (state->encoder_control->cfg.bipred && yuv_ip) {
+      if (state->encoder_control->cfg.bipred && yuv_im) {
         inter_recon_frac_chroma_hi(state, ref,
                                    pu_x, pu_y,
                                    pu_w, pu_h,
-                                   mv_param, yuv_ip, out_stride_c);
+                                   mv_param, yuv_im, out_stride_c);
       } else {
         inter_recon_frac_chroma(state, ref,
                                 pu_x, pu_y,

@@ -516,8 +516,8 @@ void kvz_inter_recon_bipred(const encoder_state_t *const state,
   // Allocate maximum size arrays for interpolated and copied samples
   ALIGNED(64) kvz_pixel px_buf_L0[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE];
   ALIGNED(64) kvz_pixel px_buf_L1[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE];
-  ALIGNED(64) kvz_pixel_ip ip_buf_L0[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE];
-  ALIGNED(64) kvz_pixel_ip ip_buf_L1[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE];
+  ALIGNED(64) kvz_pixel_im ip_buf_L0[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE];
+  ALIGNED(64) kvz_pixel_im ip_buf_L1[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE];
 
   yuv_t px_L0;
   px_L0.size = pu_w * pu_h;

@@ -531,13 +531,13 @@ void kvz_inter_recon_bipred(const encoder_state_t *const state,
   px_L1.u = &px_buf_L1[LCU_LUMA_SIZE];
   px_L1.v = &px_buf_L1[LCU_LUMA_SIZE + LCU_CHROMA_SIZE];
 
-  yuv_ip_t ip_L0;
+  yuv_im_t ip_L0;
   ip_L0.size = pu_w * pu_h;
   ip_L0.y = &ip_buf_L0[0];
   ip_L0.u = &ip_buf_L0[LCU_LUMA_SIZE];
   ip_L0.v = &ip_buf_L0[LCU_LUMA_SIZE + LCU_CHROMA_SIZE];
 
-  yuv_ip_t ip_L1;
+  yuv_im_t ip_L1;
   ip_L1.size = pu_w * pu_h;
   ip_L1.y = &ip_buf_L1[0];
   ip_L1.u = &ip_buf_L1[LCU_LUMA_SIZE];

@@ -97,7 +97,7 @@ typedef uint8_t kvz_pixel;
 typedef uint16_t kvz_pixel;
 #endif
 
-typedef int16_t kvz_pixel_ip;
+typedef int16_t kvz_pixel_im; // For intermediate precision (interpolation/bipred).
 
 /**
  * \brief Opaque data structure representing one instance of the encoder.

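A note on the renamed type: these int16_t samples hold inter-prediction values at an intermediate precision (14-bit regardless of the output bit depth), so that bi-prediction can average the two reference lists before the final rounding. A minimal sketch of the round trip, assuming shift = 15 - KVZ_BIT_DEPTH and offset = 1 << (shift - 1) as used by the scalar averaging code further down; the sample values are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int bit_depth = 8;               /* stand-in for KVZ_BIT_DEPTH */
        const int shift     = 15 - bit_depth;  /* assumed, as in the scalar path */
        const int offset    = 1 << (shift - 1);

        /* A full-pel sample gets lifted to intermediate precision... */
        uint8_t px  = 200;
        int16_t im0 = (int16_t)(px << (14 - bit_depth));
        /* ...while a fractional-pel sample comes out of the interpolation
         * filter already at that precision. */
        int16_t im1 = 12750;

        /* Bipred averaging rounds back down to the output bit depth. */
        int32_t rounded = (im0 + im1 + offset) >> shift;
        printf("%d\n", rounded);  /* (12800 + 12750 + 64) >> 7 == 200 */
        return 0;
    }
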
@@ -947,9 +947,9 @@ static INLINE void bipred_average_px_px_avx2(kvz_pixel *dst,
   }
 }
 
-static INLINE void bipred_average_ip_ip_template_avx2(kvz_pixel *dst,
-                                                      kvz_pixel_ip *ip_L0,
-                                                      kvz_pixel_ip *ip_L1,
+static INLINE void bipred_average_im_im_template_avx2(kvz_pixel *dst,
+                                                      kvz_pixel_im *ip_L0,
+                                                      kvz_pixel_im *ip_L1,
                                                       unsigned pu_w,
                                                       unsigned pu_h,
                                                       unsigned dst_stride)

@@ -1135,9 +1135,9 @@ static INLINE void bipred_average_ip_ip_template_avx2(kvz_pixel *dst,
   }
 }
 
-static void bipred_average_ip_ip_avx2(kvz_pixel *dst,
-                                      kvz_pixel_ip *ip_L0,
-                                      kvz_pixel_ip *ip_L1,
+static void bipred_average_im_im_avx2(kvz_pixel *dst,
+                                      kvz_pixel_im *ip_L0,
+                                      kvz_pixel_im *ip_L1,
                                       unsigned pu_w,
                                       unsigned pu_h,
                                       unsigned dst_stride)

@@ -1145,16 +1145,16 @@ static void bipred_average_ip_ip_avx2(kvz_pixel *dst,
   // Use scalar code for yet unoptimized block sizes (4x4, 2x8)
   if (!(pu_w == 4 && pu_h == 4) && pu_w > 2) {
     switch (pu_w) {
-      case 4: bipred_average_ip_ip_template_avx2(dst, ip_L0, ip_L1, 4, pu_h, dst_stride); break;
-      case 8: bipred_average_ip_ip_template_avx2(dst, ip_L0, ip_L1, 8, pu_h, dst_stride); break;
-      case 16: bipred_average_ip_ip_template_avx2(dst, ip_L0, ip_L1, 16, pu_h, dst_stride); break;
-      case 32: bipred_average_ip_ip_template_avx2(dst, ip_L0, ip_L1, 32, pu_h, dst_stride); break;
-      case 64: bipred_average_ip_ip_template_avx2(dst, ip_L0, ip_L1, 64, pu_h, dst_stride); break;
+      case 4: bipred_average_im_im_template_avx2(dst, ip_L0, ip_L1, 4, pu_h, dst_stride); break;
+      case 8: bipred_average_im_im_template_avx2(dst, ip_L0, ip_L1, 8, pu_h, dst_stride); break;
+      case 16: bipred_average_im_im_template_avx2(dst, ip_L0, ip_L1, 16, pu_h, dst_stride); break;
+      case 32: bipred_average_im_im_template_avx2(dst, ip_L0, ip_L1, 32, pu_h, dst_stride); break;
+      case 64: bipred_average_im_im_template_avx2(dst, ip_L0, ip_L1, 64, pu_h, dst_stride); break;
 
-      case 6: bipred_average_ip_ip_template_avx2(dst, ip_L0, ip_L1, 6, pu_h, dst_stride); break;
-      case 12: bipred_average_ip_ip_template_avx2(dst, ip_L0, ip_L1, 12, pu_h, dst_stride); break;
-      case 24: bipred_average_ip_ip_template_avx2(dst, ip_L0, ip_L1, 24, pu_h, dst_stride); break;
-      case 48: bipred_average_ip_ip_template_avx2(dst, ip_L0, ip_L1, 48, pu_h, dst_stride); break;
+      case 6: bipred_average_im_im_template_avx2(dst, ip_L0, ip_L1, 6, pu_h, dst_stride); break;
+      case 12: bipred_average_im_im_template_avx2(dst, ip_L0, ip_L1, 12, pu_h, dst_stride); break;
+      case 24: bipred_average_im_im_template_avx2(dst, ip_L0, ip_L1, 24, pu_h, dst_stride); break;
+      case 48: bipred_average_im_im_template_avx2(dst, ip_L0, ip_L1, 48, pu_h, dst_stride); break;
       default:
         assert(0 && "Unexpected block width.");
         break;

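The switch above is the usual width-templating trick: every case passes the width as a literal, so the INLINE template is instantiated with a compile-time-constant trip count per block width, and only the genuinely odd shapes (4x4, 2xN) fall through to the scalar tail. A compilable sketch of the pattern, with made-up names:

    #include <stdio.h>

    /* Sketch of the width-templating pattern (illustrative names only). */
    static inline void avg_template(const int *a, const int *b, int *dst,
                                    unsigned w, unsigned h) {
        for (unsigned y = 0; y < h; ++y)
            for (unsigned x = 0; x < w; ++x)  /* w is a literal at every call site */
                dst[y * w + x] = (a[y * w + x] + b[y * w + x] + 1) >> 1;
    }

    static void avg_dispatch(const int *a, const int *b, int *dst,
                             unsigned w, unsigned h) {
        switch (w) {
            case 8:  avg_template(a, b, dst, 8,  h); break;
            case 16: avg_template(a, b, dst, 16, h); break;
            default: break;  /* scalar fallback would go here */
        }
    }

    int main(void) {
        int a[8] = {0, 2, 4, 6, 8, 10, 12, 14};
        int b[8] = {2, 2, 2, 2, 2, 2, 2, 2};
        int dst[8];
        avg_dispatch(a, b, dst, 8, 1);
        printf("%d\n", dst[0]);  /* (0 + 2 + 1) >> 1 == 1 */
        return 0;
    }
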
@@ -1175,9 +1175,9 @@ static void bipred_average_ip_ip_avx2(kvz_pixel *dst,
   }
 }
 
-static INLINE void bipred_average_px_ip_template_avx2(kvz_pixel *dst,
+static INLINE void bipred_average_px_im_template_avx2(kvz_pixel *dst,
                                                       kvz_pixel *px,
-                                                      kvz_pixel_ip *ip,
+                                                      kvz_pixel_im *ip,
                                                       unsigned pu_w,
                                                       unsigned pu_h,
                                                       unsigned dst_stride)

@@ -1201,19 +1201,19 @@ static INLINE void bipred_average_px_ip_template_avx2(kvz_pixel *dst,
     __m256i sample_px_23_16bit = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)&px[i + 16]));
     sample_px_01_16bit = _mm256_slli_epi16(sample_px_01_16bit, 14 - KVZ_BIT_DEPTH);
     sample_px_23_16bit = _mm256_slli_epi16(sample_px_23_16bit, 14 - KVZ_BIT_DEPTH);
-    __m256i sample_ip_01_16bit = _mm256_loadu_si256((__m256i*)&ip[i]);
-    __m256i sample_ip_23_16bit = _mm256_loadu_si256((__m256i*)&ip[i + 16]);
+    __m256i sample_im_01_16bit = _mm256_loadu_si256((__m256i*)&ip[i]);
+    __m256i sample_im_23_16bit = _mm256_loadu_si256((__m256i*)&ip[i + 16]);
 
-    __m256i sample_px_ip_01_lo = _mm256_unpacklo_epi16(sample_px_01_16bit, sample_ip_01_16bit);
-    __m256i sample_px_ip_01_hi = _mm256_unpackhi_epi16(sample_px_01_16bit, sample_ip_01_16bit);
-    __m256i sample_px_ip_23_lo = _mm256_unpacklo_epi16(sample_px_23_16bit, sample_ip_23_16bit);
-    __m256i sample_px_ip_23_hi = _mm256_unpackhi_epi16(sample_px_23_16bit, sample_ip_23_16bit);
+    __m256i sample_px_im_01_lo = _mm256_unpacklo_epi16(sample_px_01_16bit, sample_im_01_16bit);
+    __m256i sample_px_im_01_hi = _mm256_unpackhi_epi16(sample_px_01_16bit, sample_im_01_16bit);
+    __m256i sample_px_im_23_lo = _mm256_unpacklo_epi16(sample_px_23_16bit, sample_im_23_16bit);
+    __m256i sample_px_im_23_hi = _mm256_unpackhi_epi16(sample_px_23_16bit, sample_im_23_16bit);
 
     __m256i all_ones = _mm256_set1_epi16(1);
-    __m256i avg_01_lo = _mm256_madd_epi16(sample_px_ip_01_lo, all_ones);
-    __m256i avg_01_hi = _mm256_madd_epi16(sample_px_ip_01_hi, all_ones);
-    __m256i avg_23_lo = _mm256_madd_epi16(sample_px_ip_23_lo, all_ones);
-    __m256i avg_23_hi = _mm256_madd_epi16(sample_px_ip_23_hi, all_ones);
+    __m256i avg_01_lo = _mm256_madd_epi16(sample_px_im_01_lo, all_ones);
+    __m256i avg_01_hi = _mm256_madd_epi16(sample_px_im_01_hi, all_ones);
+    __m256i avg_23_lo = _mm256_madd_epi16(sample_px_im_23_lo, all_ones);
+    __m256i avg_23_hi = _mm256_madd_epi16(sample_px_im_23_hi, all_ones);
 
     avg_01_lo = _mm256_add_epi32(avg_01_lo, offset);
     avg_01_hi = _mm256_add_epi32(avg_01_hi, offset);

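The unpack-then-madd sequence above is a standard widening-add idiom: interleaving the px and im vectors pairs each px sample with its im counterpart, and _mm256_madd_epi16 against an all-ones vector multiplies each 16-bit element by 1 and sums adjacent pairs into 32-bit lanes, i.e. px + im with no risk of 16-bit overflow. A standalone demonstration of just that idiom (compile with -mavx2; values are illustrative):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        /* px already lifted to intermediate precision, im from the filter. */
        __m256i px = _mm256_set1_epi16(12800);
        __m256i im = _mm256_set1_epi16(12750);

        /* Interleave px/im pairs, then madd with 1s: each 32-bit lane
         * becomes px*1 + im*1, widened so the sum cannot overflow. */
        __m256i lo   = _mm256_unpacklo_epi16(px, im);
        __m256i ones = _mm256_set1_epi16(1);
        __m256i sum  = _mm256_madd_epi16(lo, ones);

        int32_t out[8];
        _mm256_storeu_si256((__m256i *)&out[0], sum);
        printf("%d\n", (int)out[0]);  /* 12800 + 12750 == 25550 */
        return 0;
    }
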
@@ -1255,19 +1255,19 @@ static INLINE void bipred_average_px_ip_template_avx2(kvz_pixel *dst,
     __m256i sample_px_23_16bit = _mm256_cvtepu8_epi16(sample_px_23_8bit);
     sample_px_01_16bit = _mm256_slli_epi16(sample_px_01_16bit, 14 - KVZ_BIT_DEPTH);
     sample_px_23_16bit = _mm256_slli_epi16(sample_px_23_16bit, 14 - KVZ_BIT_DEPTH);
-    __m256i sample_ip_01_16bit = _mm256_loadu_si256((__m256i*)&ip[i]);
-    __m256i sample_ip_23_16bit = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&ip[i + 16]));
+    __m256i sample_im_01_16bit = _mm256_loadu_si256((__m256i*)&ip[i]);
+    __m256i sample_im_23_16bit = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&ip[i + 16]));
 
-    __m256i sample_px_ip_01_lo = _mm256_unpacklo_epi16(sample_px_01_16bit, sample_ip_01_16bit);
-    __m256i sample_px_ip_01_hi = _mm256_unpackhi_epi16(sample_px_01_16bit, sample_ip_01_16bit);
-    __m256i sample_px_ip_23_lo = _mm256_unpacklo_epi16(sample_px_23_16bit, sample_ip_23_16bit);
-    __m256i sample_px_ip_23_hi = _mm256_unpackhi_epi16(sample_px_23_16bit, sample_ip_23_16bit);
+    __m256i sample_px_im_01_lo = _mm256_unpacklo_epi16(sample_px_01_16bit, sample_im_01_16bit);
+    __m256i sample_px_im_01_hi = _mm256_unpackhi_epi16(sample_px_01_16bit, sample_im_01_16bit);
+    __m256i sample_px_im_23_lo = _mm256_unpacklo_epi16(sample_px_23_16bit, sample_im_23_16bit);
+    __m256i sample_px_im_23_hi = _mm256_unpackhi_epi16(sample_px_23_16bit, sample_im_23_16bit);
 
     __m256i all_ones = _mm256_set1_epi16(1);
-    __m256i avg_01_lo = _mm256_madd_epi16(sample_px_ip_01_lo, all_ones);
-    __m256i avg_01_hi = _mm256_madd_epi16(sample_px_ip_01_hi, all_ones);
-    __m256i avg_23_lo = _mm256_madd_epi16(sample_px_ip_23_lo, all_ones);
-    __m256i avg_23_hi = _mm256_madd_epi16(sample_px_ip_23_hi, all_ones);
+    __m256i avg_01_lo = _mm256_madd_epi16(sample_px_im_01_lo, all_ones);
+    __m256i avg_01_hi = _mm256_madd_epi16(sample_px_im_01_hi, all_ones);
+    __m256i avg_23_lo = _mm256_madd_epi16(sample_px_im_23_lo, all_ones);
+    __m256i avg_23_hi = _mm256_madd_epi16(sample_px_im_23_hi, all_ones);
 
     avg_01_lo = _mm256_add_epi32(avg_01_lo, offset);
     avg_01_hi = _mm256_add_epi32(avg_01_hi, offset);

@@ -1304,14 +1304,14 @@ static INLINE void bipred_average_px_ip_template_avx2(kvz_pixel *dst,
     __m128i sample_px_8bit = _mm_loadu_si128((__m128i*)&px[i]);
     __m256i sample_px_16bit = _mm256_cvtepu8_epi16(sample_px_8bit);
     sample_px_16bit = _mm256_slli_epi16(sample_px_16bit, 14 - KVZ_BIT_DEPTH);
-    __m256i sample_ip_16bit = _mm256_loadu_si256((__m256i*)&ip[i]);
+    __m256i sample_im_16bit = _mm256_loadu_si256((__m256i*)&ip[i]);
 
-    __m256i sample_px_ip_lo = _mm256_unpacklo_epi16(sample_px_16bit, sample_ip_16bit);
-    __m256i sample_px_ip_hi = _mm256_unpackhi_epi16(sample_px_16bit, sample_ip_16bit);
+    __m256i sample_px_im_lo = _mm256_unpacklo_epi16(sample_px_16bit, sample_im_16bit);
+    __m256i sample_px_im_hi = _mm256_unpackhi_epi16(sample_px_16bit, sample_im_16bit);
 
     __m256i all_ones = _mm256_set1_epi16(1);
-    __m256i avg_lo = _mm256_madd_epi16(sample_px_ip_lo, all_ones);
-    __m256i avg_hi = _mm256_madd_epi16(sample_px_ip_hi, all_ones);
+    __m256i avg_lo = _mm256_madd_epi16(sample_px_im_lo, all_ones);
+    __m256i avg_hi = _mm256_madd_epi16(sample_px_im_hi, all_ones);
 
     avg_lo = _mm256_add_epi32(avg_lo, offset);
     avg_hi = _mm256_add_epi32(avg_hi, offset);

@@ -1339,14 +1339,14 @@ static INLINE void bipred_average_px_ip_template_avx2(kvz_pixel *dst,
     __m256i mask = _mm256_setr_epi64x(-1, -1, -1, 0);
     __m256i sample_px_16bit = _mm256_cvtepu8_epi16(sample_px_8bit);
     sample_px_16bit = _mm256_slli_epi16(sample_px_16bit, 14 - KVZ_BIT_DEPTH);
-    __m256i sample_ip_16bit = _mm256_maskload_epi64((const long long*)(&ip[i]), mask);
+    __m256i sample_im_16bit = _mm256_maskload_epi64((const long long*)(&ip[i]), mask);
 
-    __m256i sample_px_ip_lo = _mm256_unpacklo_epi16(sample_px_16bit, sample_ip_16bit);
-    __m256i sample_px_ip_hi = _mm256_unpackhi_epi16(sample_px_16bit, sample_ip_16bit);
+    __m256i sample_px_im_lo = _mm256_unpacklo_epi16(sample_px_16bit, sample_im_16bit);
+    __m256i sample_px_im_hi = _mm256_unpackhi_epi16(sample_px_16bit, sample_im_16bit);
 
     __m256i all_ones = _mm256_set1_epi16(1);
-    __m256i avg_lo = _mm256_madd_epi16(sample_px_ip_lo, all_ones);
-    __m256i avg_hi = _mm256_madd_epi16(sample_px_ip_hi, all_ones);
+    __m256i avg_lo = _mm256_madd_epi16(sample_px_im_lo, all_ones);
+    __m256i avg_hi = _mm256_madd_epi16(sample_px_im_hi, all_ones);
 
     avg_lo = _mm256_add_epi32(avg_lo, offset);
     avg_hi = _mm256_add_epi32(avg_hi, offset);

@@ -1376,9 +1376,9 @@ static INLINE void bipred_average_px_ip_template_avx2(kvz_pixel *dst,
   }
 }
 
-static void bipred_average_px_ip_avx2(kvz_pixel *dst,
+static void bipred_average_px_im_avx2(kvz_pixel *dst,
                                       kvz_pixel *px,
-                                      kvz_pixel_ip *ip,
+                                      kvz_pixel_im *ip,
                                       unsigned pu_w,
                                       unsigned pu_h,
                                       unsigned dst_stride)

@@ -1386,16 +1386,16 @@ static void bipred_average_px_ip_avx2(kvz_pixel *dst,
   // Use scalar code for yet unoptimized block sizes (4x4, 2x8)
   if (!(pu_w == 4 && pu_h == 4) && pu_w > 2) {
     switch (pu_w) {
-      case 4: bipred_average_px_ip_template_avx2(dst, px, ip, 4, pu_h, dst_stride); break;
-      case 8: bipred_average_px_ip_template_avx2(dst, px, ip, 8, pu_h, dst_stride); break;
-      case 16: bipred_average_px_ip_template_avx2(dst, px, ip, 16, pu_h, dst_stride); break;
-      case 32: bipred_average_px_ip_template_avx2(dst, px, ip, 32, pu_h, dst_stride); break;
-      case 64: bipred_average_px_ip_template_avx2(dst, px, ip, 64, pu_h, dst_stride); break;
+      case 4: bipred_average_px_im_template_avx2(dst, px, ip, 4, pu_h, dst_stride); break;
+      case 8: bipred_average_px_im_template_avx2(dst, px, ip, 8, pu_h, dst_stride); break;
+      case 16: bipred_average_px_im_template_avx2(dst, px, ip, 16, pu_h, dst_stride); break;
+      case 32: bipred_average_px_im_template_avx2(dst, px, ip, 32, pu_h, dst_stride); break;
+      case 64: bipred_average_px_im_template_avx2(dst, px, ip, 64, pu_h, dst_stride); break;
 
-      case 6: bipred_average_px_ip_template_avx2(dst, px, ip, 6, pu_h, dst_stride); break;
-      case 12: bipred_average_px_ip_template_avx2(dst, px, ip, 12, pu_h, dst_stride); break;
-      case 24: bipred_average_px_ip_template_avx2(dst, px, ip, 24, pu_h, dst_stride); break;
-      case 48: bipred_average_px_ip_template_avx2(dst, px, ip, 48, pu_h, dst_stride); break;
+      case 6: bipred_average_px_im_template_avx2(dst, px, ip, 6, pu_h, dst_stride); break;
+      case 12: bipred_average_px_im_template_avx2(dst, px, ip, 12, pu_h, dst_stride); break;
+      case 24: bipred_average_px_im_template_avx2(dst, px, ip, 24, pu_h, dst_stride); break;
+      case 48: bipred_average_px_im_template_avx2(dst, px, ip, 48, pu_h, dst_stride); break;
       default:
        assert(0 && "Unexpected block width.");
        break;

@@ -1409,8 +1409,8 @@ static void bipred_average_px_ip_avx2(kvz_pixel *dst,
       int y = i / pu_w;
       int x = i % pu_w;
       int16_t sample_px = px[i] << (14 - KVZ_BIT_DEPTH);
-      int16_t sample_ip = ip[i];
-      int32_t rounded = (sample_px + sample_ip + offset) >> shift;
+      int16_t sample_im = ip[i];
+      int32_t rounded = (sample_px + sample_im + offset) >> shift;
       dst[y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(rounded);
     }
   }

@@ -1419,8 +1419,8 @@ static void bipred_average_px_ip_avx2(kvz_pixel *dst,
 static void bipred_average_avx2(lcu_t *const lcu,
                                 const yuv_t *const px_L0,
                                 const yuv_t *const px_L1,
-                                const yuv_ip_t *const ip_L0,
-                                const yuv_ip_t *const ip_L1,
+                                const yuv_im_t *const ip_L0,
+                                const yuv_im_t *const ip_L1,
                                 const unsigned pu_x,
                                 const unsigned pu_y,
                                 const unsigned pu_w,

@@ -1438,12 +1438,12 @@ static void bipred_average_avx2(lcu_t *const lcu,
       bipred_average_px_px_avx2(lcu->rec.y + pb_offset, px_L0->y, px_L1->y, pu_w, pu_h, LCU_WIDTH);
 
     } else if ((ip_flags_L0 & 1) && (ip_flags_L1 & 1)) {
-      bipred_average_ip_ip_avx2(lcu->rec.y + pb_offset, ip_L0->y, ip_L1->y, pu_w, pu_h, LCU_WIDTH);
+      bipred_average_im_im_avx2(lcu->rec.y + pb_offset, ip_L0->y, ip_L1->y, pu_w, pu_h, LCU_WIDTH);
 
     } else {
       kvz_pixel *src_px = (ip_flags_L0 & 1) ? px_L1->y : px_L0->y;
-      kvz_pixel_ip *src_ip = (ip_flags_L0 & 1) ? ip_L0->y : ip_L1->y;
-      bipred_average_px_ip_avx2(lcu->rec.y + pb_offset, src_px, src_ip, pu_w, pu_h, LCU_WIDTH);
+      kvz_pixel_im *src_im = (ip_flags_L0 & 1) ? ip_L0->y : ip_L1->y;
+      bipred_average_px_im_avx2(lcu->rec.y + pb_offset, src_px, src_im, pu_w, pu_h, LCU_WIDTH);
     }
   }
   if (predict_chroma) {

@@ -1456,16 +1456,16 @@ static void bipred_average_avx2(lcu_t *const lcu,
       bipred_average_px_px_avx2(lcu->rec.v + pb_offset, px_L0->v, px_L1->v, pb_w, pb_h, LCU_WIDTH_C);
 
     } else if ((ip_flags_L0 & 2) && (ip_flags_L1 & 2)) {
-      bipred_average_ip_ip_avx2(lcu->rec.u + pb_offset, ip_L0->u, ip_L1->u, pb_w, pb_h, LCU_WIDTH_C);
-      bipred_average_ip_ip_avx2(lcu->rec.v + pb_offset, ip_L0->v, ip_L1->v, pb_w, pb_h, LCU_WIDTH_C);
+      bipred_average_im_im_avx2(lcu->rec.u + pb_offset, ip_L0->u, ip_L1->u, pb_w, pb_h, LCU_WIDTH_C);
+      bipred_average_im_im_avx2(lcu->rec.v + pb_offset, ip_L0->v, ip_L1->v, pb_w, pb_h, LCU_WIDTH_C);
 
     } else {
       kvz_pixel *src_px_u = (ip_flags_L0 & 2) ? px_L1->u : px_L0->u;
-      kvz_pixel_ip *src_ip_u = (ip_flags_L0 & 2) ? ip_L0->u : ip_L1->u;
+      kvz_pixel_im *src_im_u = (ip_flags_L0 & 2) ? ip_L0->u : ip_L1->u;
       kvz_pixel *src_px_v = (ip_flags_L0 & 2) ? px_L1->v : px_L0->v;
-      kvz_pixel_ip *src_ip_v = (ip_flags_L0 & 2) ? ip_L0->v : ip_L1->v;
-      bipred_average_px_ip_avx2(lcu->rec.u + pb_offset, src_px_u, src_ip_u, pb_w, pb_h, LCU_WIDTH_C);
-      bipred_average_px_ip_avx2(lcu->rec.v + pb_offset, src_px_v, src_ip_v, pb_w, pb_h, LCU_WIDTH_C);
+      kvz_pixel_im *src_im_v = (ip_flags_L0 & 2) ? ip_L0->v : ip_L1->v;
+      bipred_average_px_im_avx2(lcu->rec.u + pb_offset, src_px_u, src_im_u, pb_w, pb_h, LCU_WIDTH_C);
+      bipred_average_px_im_avx2(lcu->rec.v + pb_offset, src_px_v, src_im_v, pb_w, pb_h, LCU_WIDTH_C);
     }
   }
 }

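As in the luma branch, the ip_flags_L0 / ip_flags_L1 words behave as a two-bit mask: bit 0 is tested for luma and bit 1 for chroma, apparently marking which planes of that list were produced at intermediate precision. A hypothetical spelling of that encoding (the code itself only tests the literal bits, and these names do not appear in the commit):

    enum {
        IM_FLAG_LUMA   = 1 << 0,  /* hypothetical: luma written to the yuv_im_t buffer */
        IM_FLAG_CHROMA = 1 << 1,  /* hypothetical: chroma written to the yuv_im_t buffer */
    };
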
@@ -568,9 +568,9 @@ static void bipred_average_px_px(kvz_pixel *dst,
   }
 }
 
-static void bipred_average_ip_ip(kvz_pixel *dst,
-                                 kvz_pixel_ip *ip_L0,
-                                 kvz_pixel_ip *ip_L1,
+static void bipred_average_im_im(kvz_pixel *dst,
+                                 kvz_pixel_im *ip_L0,
+                                 kvz_pixel_im *ip_L1,
                                  unsigned pu_w,
                                  unsigned pu_h,
                                  unsigned dst_stride)

@@ -589,9 +589,9 @@ static void bipred_average_ip_ip(kvz_pixel *dst,
   }
 }
 
-static void bipred_average_px_ip(kvz_pixel *dst,
+static void bipred_average_px_im(kvz_pixel *dst,
                                  kvz_pixel *px,
-                                 kvz_pixel_ip *ip,
+                                 kvz_pixel_im *ip,
                                  unsigned pu_w,
                                  unsigned pu_h,
                                  unsigned dst_stride)

@@ -604,8 +604,8 @@ static void bipred_average_px_ip(kvz_pixel *dst,
     int y = i / pu_w;
     int x = i % pu_w;
     int16_t sample_px = px[i] << (14 - KVZ_BIT_DEPTH);
-    int16_t sample_ip = ip[i];
-    int32_t rounded = (sample_px + sample_ip + offset) >> shift;
+    int16_t sample_im = ip[i];
+    int32_t rounded = (sample_px + sample_im + offset) >> shift;
     dst[y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(rounded);
   }
 }

@@ -613,8 +613,8 @@ static void bipred_average_px_ip(kvz_pixel *dst,
 static void bipred_average_generic(lcu_t *const lcu,
                                    const yuv_t *const px_L0,
                                    const yuv_t *const px_L1,
-                                   const yuv_ip_t *const ip_L0,
-                                   const yuv_ip_t *const ip_L1,
+                                   const yuv_im_t *const ip_L0,
+                                   const yuv_im_t *const ip_L1,
                                    const unsigned pu_x,
                                    const unsigned pu_y,
                                    const unsigned pu_w,

@@ -632,12 +632,12 @@ static void bipred_average_generic(lcu_t *const lcu,
       bipred_average_px_px(lcu->rec.y + pb_offset, px_L0->y, px_L1->y, pu_w, pu_h, LCU_WIDTH);
 
     } else if ((ip_flags_L0 & 1) && (ip_flags_L1 & 1)) {
-      bipred_average_ip_ip(lcu->rec.y + pb_offset, ip_L0->y, ip_L1->y, pu_w, pu_h, LCU_WIDTH);
+      bipred_average_im_im(lcu->rec.y + pb_offset, ip_L0->y, ip_L1->y, pu_w, pu_h, LCU_WIDTH);
 
     } else {
       kvz_pixel *src_px = (ip_flags_L0 & 1) ? px_L1->y : px_L0->y;
-      kvz_pixel_ip *src_ip = (ip_flags_L0 & 1) ? ip_L0->y : ip_L1->y;
-      bipred_average_px_ip(lcu->rec.y + pb_offset, src_px, src_ip, pu_w, pu_h, LCU_WIDTH);
+      kvz_pixel_im *src_im = (ip_flags_L0 & 1) ? ip_L0->y : ip_L1->y;
+      bipred_average_px_im(lcu->rec.y + pb_offset, src_px, src_im, pu_w, pu_h, LCU_WIDTH);
     }
   }
   if (predict_chroma) {

@@ -650,16 +650,16 @@ static void bipred_average_generic(lcu_t *const lcu,
       bipred_average_px_px(lcu->rec.v + pb_offset, px_L0->v, px_L1->v, pb_w, pb_h, LCU_WIDTH_C);
 
     } else if ((ip_flags_L0 & 2) && (ip_flags_L1 & 2)) {
-      bipred_average_ip_ip(lcu->rec.u + pb_offset, ip_L0->u, ip_L1->u, pb_w, pb_h, LCU_WIDTH_C);
-      bipred_average_ip_ip(lcu->rec.v + pb_offset, ip_L0->v, ip_L1->v, pb_w, pb_h, LCU_WIDTH_C);
+      bipred_average_im_im(lcu->rec.u + pb_offset, ip_L0->u, ip_L1->u, pb_w, pb_h, LCU_WIDTH_C);
+      bipred_average_im_im(lcu->rec.v + pb_offset, ip_L0->v, ip_L1->v, pb_w, pb_h, LCU_WIDTH_C);
 
     } else {
       kvz_pixel *src_px_u = (ip_flags_L0 & 2) ? px_L1->u : px_L0->u;
-      kvz_pixel_ip *src_ip_u = (ip_flags_L0 & 2) ? ip_L0->u : ip_L1->u;
+      kvz_pixel_im *src_im_u = (ip_flags_L0 & 2) ? ip_L0->u : ip_L1->u;
       kvz_pixel *src_px_v = (ip_flags_L0 & 2) ? px_L1->v : px_L0->v;
-      kvz_pixel_ip *src_ip_v = (ip_flags_L0 & 2) ? ip_L0->v : ip_L1->v;
-      bipred_average_px_ip(lcu->rec.u + pb_offset, src_px_u, src_ip_u, pb_w, pb_h, LCU_WIDTH_C);
-      bipred_average_px_ip(lcu->rec.v + pb_offset, src_px_v, src_ip_v, pb_w, pb_h, LCU_WIDTH_C);
+      kvz_pixel_im *src_im_v = (ip_flags_L0 & 2) ? ip_L0->v : ip_L1->v;
+      bipred_average_px_im(lcu->rec.u + pb_offset, src_px_u, src_im_u, pb_w, pb_h, LCU_WIDTH_C);
+      bipred_average_px_im(lcu->rec.v + pb_offset, src_px_v, src_im_v, pb_w, pb_h, LCU_WIDTH_C);
     }
   }
 }

@@ -136,8 +136,8 @@ typedef uint32_t (hor_sad_func)(const kvz_pixel *pic_data, const kvz_pixel *ref_
 typedef void (inter_recon_bipred_func)(lcu_t * const lcu,
                                        const yuv_t *const px_L0,
                                        const yuv_t *const px_L1,
-                                       const yuv_ip_t *const ip_L0,
-                                       const yuv_ip_t *const ip_L1,
+                                       const yuv_im_t *const ip_L0,
+                                       const yuv_im_t *const ip_L1,
                                        const unsigned pu_x,
                                        const unsigned pu_y,
                                        const unsigned pu_w,

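The generic and AVX2 averaging paths share this function-pointer signature, which is what lets the encoder swap implementations at run time. A simplified, hypothetical sketch of that kind of dispatch (the real selection goes through kvazaar's strategy selector, and the real signature is the one above):

    #include <stdio.h>

    /* Simplified stand-in for inter_recon_bipred_func: an unsigned width
     * replaces the lcu/yuv buffer parameters so the sketch is runnable. */
    typedef void (bipred_func)(unsigned pu_w);

    static void bipred_average_generic_stub(unsigned pu_w) { printf("generic, w=%u\n", pu_w); }
    static void bipred_average_avx2_stub(unsigned pu_w)    { printf("avx2, w=%u\n", pu_w); }

    int main(void) {
        int has_avx2 = 1;  /* would come from CPU feature detection */
        bipred_func *inter_recon_bipred = has_avx2 ? &bipred_average_avx2_stub
                                                   : &bipred_average_generic_stub;
        inter_recon_bipred(16);
        return 0;
    }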