Mirror of https://github.com/ultravideo/uvg266.git

Merge branch 'vaq'

Commit 901c25c0c8

@@ -164,6 +164,8 @@ Video structure:
       --high-tier            : Used with --level. Use high tier bitrate limits
                                instead of the main tier limits during encoding.
                                High tier requires level 4 or higher.
+      --vaq <integer>        : Enable variance adaptive quantization with given
+                               strength, in range 1..20.
 
 Compression tools:
       --(no-)deblock <beta:tc> : Deblocking filter. [0:0]

@@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c])
 #
 # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
 ver_major=6
-ver_minor=1
+ver_minor=2
 ver_release=0
 
 # Prevents configure from adding a lot of defines to the CFLAGS

@@ -195,6 +195,10 @@ Same as \-\-level but warnings instead of errors.
 Used with \-\-level. Use high tier bitrate limits
 instead of the main tier limits during encoding.
 High tier requires level 4 or higher.
+.TP
+\fB\-\-vaq <integer>
+Enable variance adaptive quantization with given
+strength, in range 1..20.
 .SS "Compression tools:"
 .TP

src/cfg.c (10 changed lines):
@@ -143,6 +143,8 @@ int kvz_config_init(kvz_config *cfg)
 
   cfg->me_max_steps = (uint32_t)-1;
 
+  cfg->vaq = 0;
+
   cfg->scaling_list = KVZ_SCALING_LIST_OFF;
 
   cfg->max_merge = 5;

@@ -1305,6 +1307,9 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
   }
   else if (OPT("fast-residual-cost"))
     cfg->fast_residual_cost_limit = atoi(value);
+  else if (OPT("vaq")) {
+    cfg->vaq = (int)atoi(value);
+  }
   else if (OPT("max-merge")) {
     int max_merge = atoi(value);
     if (max_merge < 1 || max_merge > 5) {

@@ -1466,6 +1471,11 @@ int kvz_config_validate(const kvz_config *const cfg)
 {
   int error = 0;
 
+  if (cfg->vaq < 0) {
+    fprintf(stderr, "vaq strength must be positive\n");
+    error = 1;
+  }
+
   if (cfg->width <= 0) {
     fprintf(stderr, "Input error: width must be positive\n");
     error = 1;

@@ -133,6 +133,8 @@ static const struct option long_options[] = {
   { "set-qp-in-cu",     no_argument,       NULL, 0 },
   { "open-gop",         no_argument,       NULL, 0 },
   { "no-open-gop",      no_argument,       NULL, 0 },
+  { "vaq",              required_argument, NULL, 0 },
+  { "no-vaq",           no_argument,       NULL, 0 },
   { "scaling-list",     required_argument, NULL, 0 },
   { "max-merge",        required_argument, NULL, 0 },
   { "early-skip",       no_argument,       NULL, 0 },

@@ -457,6 +459,8 @@ void print_help(void)
       "      --high-tier            : Used with --level. Use high tier bitrate limits\n"
       "                               instead of the main tier limits during encoding.\n"
       "                               High tier requires level 4 or higher.\n"
+      "      --vaq <integer>        : Enable variance adaptive quantization with given\n"
+      "                               strength, in range 1..20.\n"
       "\n"
       /* Word wrap to this width to stay under 80 characters (including ") *************/
       "Compression tools:\n"

@@ -269,6 +269,8 @@ static void encode_transform_coeff(encoder_state_t * const state,
   if (state->must_code_qp_delta) {
     const int qp_pred  = kvz_get_cu_ref_qp(state, x_cu, y_cu, state->last_qp);
     const int qp_delta = cur_cu->qp - qp_pred;
+    assert(KVZ_BIT_DEPTH == 8 && "This range applies only to 8-bit encoding.");
+    assert(qp_delta >= -26 && qp_delta <= 25 && "QP delta not in valid range [-26, 25].");
     const int qp_delta_abs = ABS(qp_delta);
     cabac_data_t* cabac    = &state->cabac;

@@ -376,7 +376,7 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
   // for SMP and AMP partition units.
   encoder->tr_depth_inter = 0;
 
-  if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu) {
+  if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) {
     encoder->max_qp_delta_depth = 0;
   } else {
     encoder->max_qp_delta_depth = -1;

@@ -55,7 +55,7 @@ typedef struct encoder_control_t
     int32_t width_in_lcu;
     int32_t height_in_lcu;
     int32_t real_width;  /*!< \brief real input picture width */
-    int32_t real_height; /*!< \brief real input picture width */
+    int32_t real_height; /*!< \brief real input picture height */
     int64_t pixels_per_pic;
     int8_t source_scan_type;
   } in;

@@ -59,6 +59,7 @@ static int encoder_state_config_frame_init(encoder_state_t * const state) {
   const encoder_control_t * const encoder = state->encoder_control;
   const int num_lcus = encoder->in.width_in_lcu * encoder->in.height_in_lcu;
   state->frame->lcu_stats = calloc(num_lcus, sizeof(lcu_stats_t));
+  state->frame->aq_offsets = MALLOC(double, num_lcus);
 
   for (int y = 0; y < encoder->in.height_in_lcu; y++) {
     for (int x = 0; x < encoder->in.width_in_lcu; x++) {

@@ -92,6 +93,7 @@ static void encoder_state_config_frame_finalize(encoder_state_t * const state) {
 
   kvz_image_list_destroy(state->frame->ref);
   FREE_POINTER(state->frame->lcu_stats);
+  FREE_POINTER(state->frame->aq_offsets);
 }
 
 static int encoder_state_config_tile_init(encoder_state_t * const state,

@@ -37,6 +37,8 @@
 #include "tables.h"
 #include "threadqueue.h"
 
+#include "strategies/strategies-picture.h"
+
 
 int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) {
   int i;

@@ -1223,6 +1225,21 @@ static void normalize_lcu_weights(encoder_state_t * const state)
   }
 }
 
+// Check if lcu is an edge lcu. Returns false if frame dimensions are divisible by 64
+static bool edge_lcu(int id, int lcus_x, int lcus_y, bool xdiv64, bool ydiv64)
+{
+  if (xdiv64 && ydiv64) {
+    return false;
+  }
+  int last_row_first_id = (lcus_y - 1) * lcus_x;
+  if ((id % lcus_x == lcus_x - 1 && !xdiv64) || (id >= last_row_first_id && !ydiv64)) {
+    return true;
+  }
+  else {
+    return false;
+  }
+}
+
 static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_picture* frame) {
   assert(state->type == ENCODER_STATE_TYPE_MAIN);

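A quick sanity check of edge_lcu's indexing, as a standalone sketch (the frame dimensions are hypothetical; the function body is copied from the hunk above):

```c
#include <assert.h>
#include <stdbool.h>

// Copied from the hunk above.
static bool edge_lcu(int id, int lcus_x, int lcus_y, bool xdiv64, bool ydiv64)
{
  if (xdiv64 && ydiv64) {
    return false;
  }
  int last_row_first_id = (lcus_y - 1) * lcus_x;
  if ((id % lcus_x == lcus_x - 1 && !xdiv64) || (id >= last_row_first_id && !ydiv64)) {
    return true;
  }
  else {
    return false;
  }
}

int main(void)
{
  // Hypothetical 264x130 frame: a 5x3 LCU grid, neither dimension divisible by 64.
  assert(!edge_lcu( 0, 5, 3, false, false)); // top-left: interior
  assert( edge_lcu( 4, 5, 3, false, false)); // rightmost column is partial
  assert( edge_lcu(12, 5, 3, false, false)); // bottom row is partial
  assert(!edge_lcu( 4, 5, 3, true,  true )); // 64-divisible frame: never an edge
  return 0;
}
```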
@@ -1236,6 +1253,92 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_picture* frame) {
     state->tile->frame->height
   );
 
+  // Variance adaptive quantization
+  if (cfg->vaq) {
+    const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
+    double d = cfg->vaq * 0.1; // Empirically decided constant. Affects delta-QP strength
+
+    // Calculate frame pixel variance
+    uint32_t len = state->tile->frame->width * state->tile->frame->height;
+    uint32_t c_len = len / 4;
+    double frame_var = kvz_pixel_var(state->tile->frame->source->y, len);
+    if (has_chroma) {
+      frame_var += kvz_pixel_var(state->tile->frame->source->u, c_len);
+      frame_var += kvz_pixel_var(state->tile->frame->source->v, c_len);
+    }
+
+    // Loop through LCUs
+    // For each LCU calculate: d * (log(LCU pixel variance) - log(frame pixel variance))
+    unsigned x_lim = state->tile->frame->width_in_lcu;
+    unsigned y_lim = state->tile->frame->height_in_lcu;
+
+    unsigned id = 0;
+    for (int y = 0; y < y_lim; ++y) {
+      for (int x = 0; x < x_lim; ++x) {
+        kvz_pixel tmp[LCU_LUMA_SIZE];
+        int pxl_x = x * LCU_WIDTH;
+        int pxl_y = y * LCU_WIDTH;
+        int x_max = MIN(pxl_x + LCU_WIDTH, frame->width) - pxl_x;
+        int y_max = MIN(pxl_y + LCU_WIDTH, frame->height) - pxl_y;
+
+        bool xdiv64 = false;
+        bool ydiv64 = false;
+        if (frame->width % 64 == 0) xdiv64 = true;
+        if (frame->height % 64 == 0) ydiv64 = true;
+
+        // Luma variance
+        if (!edge_lcu(id, x_lim, y_lim, xdiv64, ydiv64)) {
+          kvz_pixels_blit(&state->tile->frame->source->y[pxl_x + pxl_y * state->tile->frame->source->stride], tmp,
+                          x_max, y_max, state->tile->frame->source->stride, LCU_WIDTH);
+        } else {
+          // Extend edge pixels for edge lcus
+          for (int y = 0; y < LCU_WIDTH; y++) {
+            for (int x = 0; x < LCU_WIDTH; x++) {
+              int src_y = CLIP(0, frame->height - 1, pxl_y + y);
+              int src_x = CLIP(0, frame->width - 1, pxl_x + x);
+              tmp[y * LCU_WIDTH + x] = state->tile->frame->source->y[src_y * state->tile->frame->source->stride + src_x];
+            }
+          }
+        }
+
+        double lcu_var = kvz_pixel_var(tmp, LCU_LUMA_SIZE);
+
+        if (has_chroma) {
+          // Add chroma variance if not monochrome
+          int32_t c_stride = state->tile->frame->source->stride >> 1;
+          kvz_pixel chromau_tmp[LCU_CHROMA_SIZE];
+          kvz_pixel chromav_tmp[LCU_CHROMA_SIZE];
+          int lcu_chroma_width = LCU_WIDTH >> 1;
+          int c_pxl_x = x * lcu_chroma_width;
+          int c_pxl_y = y * lcu_chroma_width;
+          int c_x_max = MIN(c_pxl_x + lcu_chroma_width, frame->width >> 1) - c_pxl_x;
+          int c_y_max = MIN(c_pxl_y + lcu_chroma_width, frame->height >> 1) - c_pxl_y;
+
+          if (!edge_lcu(id, x_lim, y_lim, xdiv64, ydiv64)) {
+            kvz_pixels_blit(&state->tile->frame->source->u[c_pxl_x + c_pxl_y * c_stride], chromau_tmp, c_x_max, c_y_max, c_stride, lcu_chroma_width);
+            kvz_pixels_blit(&state->tile->frame->source->v[c_pxl_x + c_pxl_y * c_stride], chromav_tmp, c_x_max, c_y_max, c_stride, lcu_chroma_width);
+          }
+          else {
+            for (int y = 0; y < lcu_chroma_width; y++) {
+              for (int x = 0; x < lcu_chroma_width; x++) {
+                int src_y = CLIP(0, (frame->height >> 1) - 1, c_pxl_y + y);
+                int src_x = CLIP(0, (frame->width >> 1) - 1, c_pxl_x + x);
+                chromau_tmp[y * lcu_chroma_width + x] = state->tile->frame->source->u[src_y * c_stride + src_x];
+                chromav_tmp[y * lcu_chroma_width + x] = state->tile->frame->source->v[src_y * c_stride + src_x];
+              }
+            }
+          }
+          lcu_var += kvz_pixel_var(chromau_tmp, LCU_CHROMA_SIZE);
+          lcu_var += kvz_pixel_var(chromav_tmp, LCU_CHROMA_SIZE);
+        }
+
+        state->frame->aq_offsets[id] = d * (log(lcu_var) - log(frame_var));
+        id++;
+      }
+    }
+  }
+  // Variance adaptive quantization - END
+
 // Use this flag to handle closed gop irap picture selection.
 // If set to true, irap is already set and we avoid
 // setting it based on the intra period

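The per-LCU offset is d times the log-ratio of LCU variance to frame variance, so detailed LCUs get a positive delta-QP (coarser quantization) and flat ones a negative delta-QP. A worked example with invented numbers:

```c
// Illustrative only: how an LCU's QP offset follows from the formula
// d * (log(lcu_var) - log(frame_var)) in the hunk above. Variances are made up.
#include <math.h>
#include <stdio.h>

int main(void)
{
  const double d = 8 * 0.1;       // --vaq=8
  const double frame_var = 400.0; // hypothetical whole-frame variance
  const double busy_lcu = 2900.0; // detailed region
  const double flat_lcu = 40.0;   // flat region
  printf("busy: %+.2f\n", d * (log(busy_lcu) - log(frame_var))); // ~ +1.58
  printf("flat: %+.2f\n", d * (log(flat_lcu) - log(frame_var))); // ~ -1.84
  return 0;
}
```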
@@ -160,6 +160,11 @@ typedef struct encoder_state_config_frame_t {
 
   struct encoder_state_t const *previous_layer_state;
 
+  /**
+   * \brief Calculated adaptive QP offset for each LCU.
+   */
+  double *aq_offsets;
+
   /**
    * \brief Whether next NAL is the first NAL in the access unit.
    */

@@ -399,6 +399,8 @@ typedef struct kvz_config
   /** \brief Flag to enable/disable open GOP configuration */
   int8_t open_gop;
 
+  int32_t vaq; /** \brief Enable variance adaptive quantization */
+
   /** \brief Type of scaling lists to use */
   int8_t scaling_list;

@@ -637,8 +637,22 @@ static double get_ctu_bits(encoder_state_t * const state, vector2d_t pos) {
   return avg_bits;
 }
 
+static double qp_to_lambda(encoder_state_t* const state, int qp)
+{
+  const int shift_qp = 12;
+  double lambda = 0.57 * pow(2.0, (qp - shift_qp) / 3.0);
+
+  // NOTE: HM adjusts lambda for inter according to Hadamard usage in ME.
+  //       SATD is currently always enabled for ME, so this has no effect.
+  // bool hadamard_me = true;
+  // if (!hadamard_me && state->frame->slicetype != KVZ_SLICE_I) {
+  //   lambda *= 0.95;
+  // }
+
+  return lambda;
+}
+
 void kvz_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos) {
   double bits = get_ctu_bits(state, pos);
 
   const encoder_control_t * const encoder = state->encoder_control;

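As a numeric check of the lambda formula (arithmetic only, not part of the commit): lambda = 0.57 * 2^((qp - 12) / 3), so every +3 in QP doubles lambda.

```c
// Quick numeric check of qp_to_lambda (illustration only).
#include <math.h>
#include <stdio.h>

int main(void)
{
  for (int qp = 21; qp <= 27; qp += 3) {
    printf("qp=%d lambda=%.2f\n", qp, 0.57 * pow(2.0, (qp - 12) / 3.0));
  }
  // qp=21 -> 4.56, qp=24 -> 9.12, qp=27 -> 18.24
  return 0;
}
```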
@@ -750,6 +764,26 @@ void kvz_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos) {
   ctu->qp     = est_qp;
   ctu->lambda = est_lambda;
   ctu->i_cost = 0;
+
+  // Apply variance adaptive quantization
+  if (encoder->cfg.vaq) {
+    vector2d_t lcu = {
+      pos.x + state->tile->lcu_offset_x,
+      pos.y + state->tile->lcu_offset_y
+    };
+    int id = lcu.x + lcu.y * state->tile->frame->width_in_lcu;
+    int aq_offset = round(state->frame->aq_offsets[id]);
+    state->qp += aq_offset;
+    // Maximum delta QP is clipped between [-26, 25] according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
+    // Since this value will be later combined with qp_pred, clip to half of that instead to be safe
+    state->qp = CLIP(state->frame->QP - 13, state->frame->QP + 12, state->qp);
+    state->qp = CLIP_TO_QP(state->qp);
+    state->lambda = qp_to_lambda(state, state->qp);
+    state->lambda_sqrt = sqrt(state->lambda);
+
+    //ctu->qp = state->qp;
+    //ctu->lambda = state->lambda;
+  }
 }
 
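The clipping keeps the VAQ-adjusted QP within [frame QP - 13, frame QP + 12], half of the spec's [-26, 25] delta-QP window, before clamping to the legal QP range. A hedged illustration (the CLIP macros are redefined here to keep the sketch standalone; hypothetical values):

```c
#include <stdio.h>

// Standalone stand-ins for kvazaar's clipping macros.
#define CLIP(low, high, value) ((value) < (low) ? (low) : ((value) > (high) ? (high) : (value)))
#define CLIP_TO_QP(value) CLIP(0, 51, (value))

int main(void)
{
  const int frame_qp = 22;
  int qp = frame_qp + 16;                      // unusually large positive aq_offset
  qp = CLIP(frame_qp - 13, frame_qp + 12, qp); // -> 34: held inside [QP-13, QP+12]
  qp = CLIP_TO_QP(qp);                         // -> 34: already within [0, 51]
  printf("ctu qp = %d\n", qp);
  return 0;
}
```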
@@ -894,22 +928,6 @@ void kvz_update_after_picture(encoder_state_t * const state) {
   }
 }
 
-
-static double qp_to_lambda(encoder_state_t * const state, int qp)
-{
-  const int shift_qp = 12;
-  double lambda = 0.57 * pow(2.0, (qp - shift_qp) / 3.0);
-
-  // NOTE: HM adjusts lambda for inter according to Hadamard usage in ME.
-  //       SATD is currently always enabled for ME, so this has no effect.
-  // bool hadamard_me = true;
-  // if (!hadamard_me && state->frame->slicetype != KVZ_SLICE_I) {
-  //   lambda *= 0.95;
-  // }
-
-  return lambda;
-}
-
 /**
  * \brief Allocate bits and set lambda and QP for the current picture.
  * \param state the main encoder state

@@ -1049,4 +1067,21 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state,
     state->lambda      = state->frame->lambda;
     state->lambda_sqrt = sqrt(state->frame->lambda);
   }
+
+  // Apply variance adaptive quantization
+  if (ctrl->cfg.vaq) {
+    vector2d_t lcu = {
+      pos.x + state->tile->lcu_offset_x,
+      pos.y + state->tile->lcu_offset_y
+    };
+    int id = lcu.x + lcu.y * state->tile->frame->width_in_lcu;
+    int aq_offset = round(state->frame->aq_offsets[id]);
+    state->qp += aq_offset;
+    // Maximum delta QP is clipped between [-26, 25] according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics
+    // Since this value will be later combined with qp_pred, clip to half of that instead to be safe
+    state->qp = CLIP(state->frame->QP - 13, state->frame->QP + 12, state->qp);
+    state->qp = CLIP_TO_QP(state->qp);
+    state->lambda = qp_to_lambda(state, state->qp);
+    state->lambda_sqrt = sqrt(state->lambda);
+  }
 }

@@ -3,6 +3,30 @@
 
 #include <immintrin.h>
 
+// The calling convention used by MSVC on 32-bit builds will essentially
+// disallow functions to have more than 3 XMM/YMM parameters, because it
+// will not provide more than 8-byte param alignment, and only the first
+// three vector params will be carried in SIMD registers. Now the
+// vectorcall convention could probably be problematic in globally visible
+// functions, but likely not in static ones.
+#if defined _MSC_VER && defined _WIN32 && !defined _WIN64
+  #define FIX_W32 __vectorcall
+#else
+  #define FIX_W32
+#endif
+
+// Non-inline functions defined in this header are likely to trigger a
+// warning for each module including this header that does NOT use them,
+// at least on unix-ish platforms (GCC/Clang both on native Unix and MinGW).
+// Tell 'em we actually want to do that, it's not an accident.
+#if defined __GNUC__ || defined __clang__ || defined __MINGW32__ || defined __MINGW64__
+  #define FIX_UNUSED __attribute__((unused))
+#else
+  #define FIX_UNUSED
+#endif
+
+#define FIX_NOINLINE FIX_W32 FIX_UNUSED
+
 /*
  * Reorder coefficients from raster to scan order
  * Fun fact: Once upon a time, doing this in a loop looked like this:

@@ -111,4 +135,19 @@ static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t
   *last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1;
 }
 
+static int32_t FIX_NOINLINE hsum_8x32b(const __m256i v)
+{
+  __m256i sum1 = v;
+  __m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i sum3 = _mm256_add_epi32       (sum1, sum2);
+  __m256i sum4 = _mm256_shuffle_epi32   (sum3, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i sum5 = _mm256_add_epi32       (sum3, sum4);
+  __m256i sum6 = _mm256_shuffle_epi32   (sum5, _MM_SHUFFLE(2, 3, 0, 1));
+  __m256i sum7 = _mm256_add_epi32       (sum5, sum6);
+
+  __m128i sum8 = _mm256_castsi256_si128 (sum7);
+  int32_t sum9 = _mm_cvtsi128_si32      (sum8);
+  return sum9;
+}
+
 #endif

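The shuffle/add ladder in hsum_8x32b is a three-step tree reduction over the eight 32-bit lanes; its scalar equivalent (illustration only, not part of the commit) is:

```c
#include <stdint.h>

// Scalar equivalent of hsum_8x32b. Unsigned arithmetic is used so that
// wraparound matches the packed 32-bit adds.
static int32_t hsum_8x32b_scalar(const int32_t lanes[8])
{
  uint32_t sum = 0;
  for (int i = 0; i < 8; ++i) {
    sum += (uint32_t)lanes[i];
  }
  return (int32_t)sum;
}
```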
@@ -1051,6 +1051,181 @@ static uint32_t hor_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_dat
                                pic_stride, ref_stride, left, right);
 }
 
+static double pixel_var_avx2_largebuf(const kvz_pixel *buf, const uint32_t len)
+{
+  const float len_f = (float)len;
+  const __m256i zero = _mm256_setzero_si256();
+
+  size_t i;
+  __m256i sums = zero;
+  for (i = 0; i + 31 < len; i += 32) {
+    __m256i curr     = _mm256_loadu_si256((const __m256i *)(buf + i));
+    __m256i curr_sum = _mm256_sad_epu8(curr, zero);
+    sums             = _mm256_add_epi64(sums, curr_sum);
+  }
+  __m128i sum_lo = _mm256_castsi256_si128  (sums);
+  __m128i sum_hi = _mm256_extracti128_si256(sums, 1);
+  __m128i sum_3  = _mm_add_epi64           (sum_lo, sum_hi);
+  __m128i sum_4  = _mm_shuffle_epi32       (sum_3, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sum_5  = _mm_add_epi64           (sum_3, sum_4);
+
+  int64_t sum = _mm_cvtsi128_si64(sum_5);
+
+  // Remaining len mod 32 pixels
+  for (; i < len; ++i) {
+    sum += buf[i];
+  }
+
+  float mean_f = (float)sum / len_f;
+  __m256 mean  = _mm256_set1_ps(mean_f);
+  __m256 accum = _mm256_setzero_ps();
+
+  for (i = 0; i + 31 < len; i += 32) {
+    __m128i curr0 = _mm_loadl_epi64((const __m128i *)(buf + i +  0));
+    __m128i curr1 = _mm_loadl_epi64((const __m128i *)(buf + i +  8));
+    __m128i curr2 = _mm_loadl_epi64((const __m128i *)(buf + i + 16));
+    __m128i curr3 = _mm_loadl_epi64((const __m128i *)(buf + i + 24));
+
+    __m256i curr0_32 = _mm256_cvtepu8_epi32(curr0);
+    __m256i curr1_32 = _mm256_cvtepu8_epi32(curr1);
+    __m256i curr2_32 = _mm256_cvtepu8_epi32(curr2);
+    __m256i curr3_32 = _mm256_cvtepu8_epi32(curr3);
+
+    __m256 curr0_f = _mm256_cvtepi32_ps(curr0_32);
+    __m256 curr1_f = _mm256_cvtepi32_ps(curr1_32);
+    __m256 curr2_f = _mm256_cvtepi32_ps(curr2_32);
+    __m256 curr3_f = _mm256_cvtepi32_ps(curr3_32);
+
+    __m256 curr0_sd = _mm256_sub_ps(curr0_f, mean);
+    __m256 curr1_sd = _mm256_sub_ps(curr1_f, mean);
+    __m256 curr2_sd = _mm256_sub_ps(curr2_f, mean);
+    __m256 curr3_sd = _mm256_sub_ps(curr3_f, mean);
+
+    __m256 curr0_v = _mm256_mul_ps(curr0_sd, curr0_sd);
+    __m256 curr1_v = _mm256_mul_ps(curr1_sd, curr1_sd);
+    __m256 curr2_v = _mm256_mul_ps(curr2_sd, curr2_sd);
+    __m256 curr3_v = _mm256_mul_ps(curr3_sd, curr3_sd);
+
+    __m256 curr01 = _mm256_add_ps(curr0_v, curr1_v);
+    __m256 curr23 = _mm256_add_ps(curr2_v, curr3_v);
+    __m256 curr   = _mm256_add_ps(curr01, curr23);
+    accum         = _mm256_add_ps(accum, curr);
+  }
+  __m256d accum_d  = _mm256_castps_pd     (accum);
+  __m256d accum2_d = _mm256_permute4x64_pd(accum_d, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256  accum2   = _mm256_castpd_ps     (accum2_d);
+
+  __m256 accum3 = _mm256_add_ps    (accum, accum2);
+  __m256 accum4 = _mm256_permute_ps(accum3, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256 accum5 = _mm256_add_ps    (accum3, accum4);
+  __m256 accum6 = _mm256_permute_ps(accum5, _MM_SHUFFLE(2, 3, 0, 1));
+  __m256 accum7 = _mm256_add_ps    (accum5, accum6);
+
+  float var_sum = _mm256_cvtss_f32 (accum7);
+
+  // Remaining len mod 32 pixels
+  for (; i < len; ++i) {
+    float diff = buf[i] - mean_f;
+    var_sum += diff * diff;
+  }
+
+  return var_sum / len_f;
+}
+
+#ifdef INACCURATE_VARIANCE_CALCULATION
+
+// Assumes that u is a power of two
+static INLINE uint32_t ilog2(uint32_t u)
+{
+  return _tzcnt_u32(u);
+}
+
+// A B C D | E F G H (8x32b)
+// ==>
+// A+B C+D | E+F G+H (4x64b)
+static __m256i hsum_epi32_to_epi64(const __m256i v)
+{
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i v_shufd = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
+  __m256i sums_32 = _mm256_add_epi32   (v, v_shufd);
+  __m256i sums_64 = _mm256_blend_epi32 (sums_32, zero, 0xaa);
+  return sums_64;
+}
+
+static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len)
+{
+  assert(sizeof(*buf) == 1);
+  assert((len & 31) == 0);
+
+  // Uses Q8.7 numbers to measure mean and deviation, so variances are Q16.14
+  const uint64_t sum_maxwid   = ilog2(len) + (8 * sizeof(*buf));
+  const __m128i normalize_sum = _mm_cvtsi32_si128(sum_maxwid - 15); // Normalize mean to [0, 32767], so signed 16-bit subtraction never overflows
+  const __m128i debias_sum    = _mm_cvtsi32_si128(1 << (sum_maxwid - 16));
+  const float varsum_to_f     = 1.0f / (float)(1 << (14 + ilog2(len)));
+
+  const bool power_of_two = (len & (len - 1)) == 0;
+  if (sum_maxwid > 32 || sum_maxwid < 15 || !power_of_two) {
+    return pixel_var_avx2_largebuf(buf, len);
+  }
+
+  const __m256i zero      = _mm256_setzero_si256();
+  const __m256i himask_15 = _mm256_set1_epi16(0x7f00);
+
+  size_t i;
+  __m256i sums = zero;
+  for (i = 0; i < len; i += 32) {
+    __m256i curr     = _mm256_loadu_si256((const __m256i *)(buf + i));
+    __m256i curr_sum = _mm256_sad_epu8(curr, zero);
+    sums             = _mm256_add_epi64(sums, curr_sum);
+  }
+  __m128i sum_lo = _mm256_castsi256_si128  (sums);
+  __m128i sum_hi = _mm256_extracti128_si256(sums, 1);
+  __m128i sum_3  = _mm_add_epi64           (sum_lo, sum_hi);
+  __m128i sum_4  = _mm_shuffle_epi32       (sum_3, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sum_5  = _mm_add_epi64           (sum_3, sum_4);
+  __m128i sum_5n = _mm_srl_epi32           (sum_5, normalize_sum);
+          sum_5n = _mm_add_epi32           (sum_5n, debias_sum);
+
+  __m256i sum_n = _mm256_broadcastw_epi16  (sum_5n);
+
+  __m256i accum = zero;
+  for (i = 0; i < len; i += 32) {
+    __m256i curr = _mm256_loadu_si256((const __m256i *)(buf + i));
+
+    __m256i curr0 = _mm256_slli_epi16(curr, 7);
+    __m256i curr1 = _mm256_srli_epi16(curr, 1);
+            curr0 = _mm256_and_si256 (curr0, himask_15);
+            curr1 = _mm256_and_si256 (curr1, himask_15);
+
+    __m256i dev0 = _mm256_sub_epi16(curr0, sum_n);
+    __m256i dev1 = _mm256_sub_epi16(curr1, sum_n);
+
+    __m256i vars0 = _mm256_madd_epi16(dev0, dev0);
+    __m256i vars1 = _mm256_madd_epi16(dev1, dev1);
+
+    __m256i varsum = _mm256_add_epi32(vars0, vars1);
+            varsum = hsum_epi32_to_epi64(varsum);
+    accum          = _mm256_add_epi64(accum, varsum);
+  }
+  __m256i accum2 = _mm256_permute4x64_epi64(accum, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i accum3 = _mm256_add_epi64        (accum, accum2);
+  __m256i accum4 = _mm256_permute4x64_epi64(accum3, _MM_SHUFFLE(2, 3, 1, 0));
+  __m256i v_tot  = _mm256_add_epi64        (accum3, accum4);
+  __m128i vt128  = _mm256_castsi256_si128  (v_tot);
+  uint64_t vars  = _mm_cvtsi128_si64       (vt128);
+
+  return (float)vars * varsum_to_f;
+}
+
+#else // INACCURATE_VARIANCE_CALCULATION
+
+static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len)
+{
+  return pixel_var_avx2_largebuf(buf, len);
+}
+
+#endif // !INACCURATE_VARIANCE_CALCULATION
+
 #endif //COMPILE_INTEL_AVX2
 
 int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)

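On the Q8.7 comment in pixel_var_avx2: pixels are shifted into the upper bits of 16-bit lanes so that subtracting the similarly normalized mean can never overflow, and the squared deviations then land in Q16.14. A standalone illustration of that scaling (values invented, not part of the commit):

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint8_t  p    = 200;
  uint16_t q8_7 = (uint16_t)p << 7;           // 200.0 in Q8.7 == 25600, max is 255<<7 = 32640
  int32_t  dev  = (int32_t)q8_7 - (130 << 7); // deviation from a hypothetical mean of 130, still Q8.7
  int64_t  var  = (int64_t)dev * dev;         // the multiply doubles the fraction bits: Q16.14
  printf("%f\n", (double)var / (1 << 14));    // back to real units: 70 * 70 = 4900
  return 0;
}
```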
@@ -1089,6 +1264,8 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
     success &= kvz_strategyselector_register(opaque, "ver_sad", "avx2", 40, &ver_sad_avx2);
     success &= kvz_strategyselector_register(opaque, "hor_sad", "avx2", 40, &hor_sad_avx2);
 
+    success &= kvz_strategyselector_register(opaque, "pixel_var", "avx2", 40, &pixel_var_avx2);
+
   }
 #endif
   return success;

@@ -26,6 +26,7 @@
 
 // Use a couple generic functions from here as a worst-case fallback
 #include "strategies/generic/sao_shared_generics.h"
+#include "strategies/avx2/avx2_common_functions.h"
 #include "strategies/missing-intel-intrinsics.h"
 #include "cu.h"
 #include "encoder.h"

@@ -34,37 +35,10 @@
 #include "sao.h"
 #include "strategyselector.h"
 
-// The calling convention used by MSVC on 32-bit builds will essentially
-// disallow functions to have more than 3 XMM/YMM parameters, because it
-// will not provide more than 8-byte param alignment, and only the first
-// three vector params will be carried in SIMD registers. Now the
-// vectorcall convention could probably be problematic in globally visible
-// functions, but likely not in static ones.
-#if defined _MSC_VER && defined _WIN32 && !defined _WIN64
-  #define FIX_W32 __vectorcall
-#else
-  #define FIX_W32
-#endif
-
 // These optimizations are based heavily on sao-generic.c.
 // Might be useful to check that if (when) this file
 // is difficult to understand.
 
-static int32_t FIX_W32 hsum_8x32b(const __m256i v)
-{
-  __m256i sum1 = v;
-  __m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2));
-  __m256i sum3 = _mm256_add_epi32       (sum1, sum2);
-  __m256i sum4 = _mm256_shuffle_epi32   (sum3, _MM_SHUFFLE(1, 0, 3, 2));
-  __m256i sum5 = _mm256_add_epi32       (sum3, sum4);
-  __m256i sum6 = _mm256_shuffle_epi32   (sum5, _MM_SHUFFLE(2, 3, 0, 1));
-  __m256i sum7 = _mm256_add_epi32       (sum5, sum6);
-
-  __m128i sum8 = _mm256_castsi256_si128 (sum7);
-  int32_t sum9 = _mm_cvtsi128_si32      (sum8);
-  return sum9;
-}
-
 // Do the SIGN3 operation for the difference a-b
 static INLINE __m256i sign3_diff_epu8(const __m256i a, const __m256i b)
 {

@@ -675,6 +675,32 @@ static uint32_t hor_sad_generic(const kvz_pixel *pic_data, const kvz_pixel *ref_
   return result;
 }
 
+// Calculate pixel value variance. Takes in arrays of kvz_pixel
+static double pixel_var_generic(const kvz_pixel *arr, const uint32_t len)
+{
+  double var = 0;
+  double arr_mean = 0;
+
+  // Calculate array mean
+  int i = 0;
+  double sum = 0;
+
+  for (; i < len; ++i) {
+    sum += arr[i];
+  }
+  arr_mean = sum / (double)len;
+
+  // Calculate array variance
+  for (i = 0; i < len; ++i) {
+    double tmp = (double)arr[i] - arr_mean;
+    var += tmp * tmp;
+  }
+
+  var /= len;
+
+  return var;
+}
+
 int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
 {
   bool success = true;

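A quick check of the generic variance (population variance, dividing by len): for four pixels {0, 0, 255, 255} the mean is 127.5 and every deviation is ±127.5, so the result should be 127.5² = 16256.25. The sketch below lightly condenses the function from the hunk above and assumes kvz_pixel is uint8_t (true for 8-bit builds):

```c
#include <stdint.h>
#include <stdio.h>

typedef uint8_t kvz_pixel; // assumption: 8-bit pixel type

// Condensed from the hunk above.
static double pixel_var_generic(const kvz_pixel *arr, const uint32_t len)
{
  double sum = 0;
  for (uint32_t i = 0; i < len; ++i) {
    sum += arr[i];
  }
  const double arr_mean = sum / (double)len;

  double var = 0;
  for (uint32_t i = 0; i < len; ++i) {
    double tmp = (double)arr[i] - arr_mean;
    var += tmp * tmp;
  }
  return var / len;
}

int main(void)
{
  const kvz_pixel px[4] = { 0, 0, 255, 255 };
  printf("%f\n", pixel_var_generic(px, 4)); // prints 16256.250000
  return 0;
}
```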
@@ -714,5 +740,7 @@ int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
   success &= kvz_strategyselector_register(opaque, "ver_sad", "generic", 0, &ver_sad_generic);
   success &= kvz_strategyselector_register(opaque, "hor_sad", "generic", 0, &hor_sad_generic);
 
+  success &= kvz_strategyselector_register(opaque, "pixel_var", "generic", 0, &pixel_var_generic);
+
   return success;
 }

@@ -67,6 +67,8 @@ get_optimized_sad_func *kvz_get_optimized_sad = 0;
 ver_sad_func *kvz_ver_sad = 0;
 hor_sad_func *kvz_hor_sad = 0;
 
+pixel_var_func *kvz_pixel_var = 0;
+
 
 int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) {
   bool success = true;

@@ -138,6 +138,8 @@ typedef void (inter_recon_bipred_func)(const int hi_prec_luma_rec0,
                                        bool predict_luma,
                                        bool predict_chroma);
 
+typedef double (pixel_var_func)(const kvz_pixel *buf, const uint32_t len);
+
 // Declare function pointers.
 extern reg_sad_func * kvz_reg_sad;

@@ -176,6 +178,8 @@ extern get_optimized_sad_func *kvz_get_optimized_sad;
 extern ver_sad_func *kvz_ver_sad;
 extern hor_sad_func *kvz_hor_sad;
 
+extern pixel_var_func *kvz_pixel_var;
+
 int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth);
 cost_pixel_nxn_func * kvz_pixels_get_satd_func(unsigned n);
 cost_pixel_nxn_func * kvz_pixels_get_sad_func(unsigned n);

@@ -211,6 +215,7 @@ cost_pixel_nxn_multi_func * kvz_pixels_get_sad_dual_func(unsigned n);
   {"get_optimized_sad", (void**) &kvz_get_optimized_sad}, \
   {"ver_sad", (void**) &kvz_ver_sad}, \
   {"hor_sad", (void**) &kvz_hor_sad}, \
+  {"pixel_var", (void**) &kvz_pixel_var}, \

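These picture-strategy hunks follow kvazaar's usual dispatch pattern: a typedef, an extern function pointer, and an entry in the strategy macro list, after which the selector binds kvz_pixel_var to the highest-priority registered implementation (generic at priority 0, AVX2 at 40). A simplified, self-contained sketch of that pattern (not the real strategyselector API):

```c
#include <stdint.h>
#include <stdio.h>

typedef uint8_t kvz_pixel; // assumption: 8-bit pixel type
typedef double (pixel_var_func)(const kvz_pixel *buf, const uint32_t len);

// Dummy bodies standing in for the real implementations.
static double pixel_var_generic(const kvz_pixel *buf, const uint32_t len) { (void)buf; (void)len; return 1.0; }
static double pixel_var_avx2   (const kvz_pixel *buf, const uint32_t len) { (void)buf; (void)len; return 2.0; }

pixel_var_func *kvz_pixel_var = 0;

int main(void)
{
  int have_avx2 = 0; // in kvazaar this comes from runtime CPU detection
  kvz_pixel_var = have_avx2 ? &pixel_var_avx2 : &pixel_var_generic;

  kvz_pixel buf[32] = { 0 };
  printf("%f\n", kvz_pixel_var(buf, 32)); // callers never know which body runs
  return 0;
}
```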
@@ -10,3 +10,6 @@ common_args='264x130 10 -p0 -r1 --threads=2 --wpp --owf=1 --rd=0'
 valgrind_test $common_args --no-rdoq --no-deblock --no-sao --no-signhide --subme=1 --pu-depth-intra=2-3
 valgrind_test $common_args --no-rdoq --no-signhide --subme=0
 valgrind_test $common_args --rdoq --no-deblock --no-sao --subme=0
+valgrind_test $common_args --vaq=8
+valgrind_test $common_args --vaq=8 --bitrate 3500
+valgrind_test $common_args --vaq=8 --rc-algorithm oba --bitrate 3500