Merge branch 'vaq'

Ari Lemmetti 2020-04-03 19:51:17 +03:00
commit 901c25c0c8
20 changed files with 444 additions and 47 deletions

View file

@ -164,6 +164,8 @@ Video structure:
--high-tier : Used with --level. Use high tier bitrate limits
instead of the main tier limits during encoding.
High tier requires level 4 or higher.
--vaq <integer> : Enable variance adaptive quantization with given
strength, in range 1..20.
Compression tools:
--(no-)deblock <beta:tc> : Deblocking filter. [0:0]
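
The new option can also be enabled through the library interface. A minimal sketch follows, assuming the existing libkvazaar entry points kvz_api_get(), config_alloc(), config_init(), config_parse() and config_destroy() (none of which are touched by this commit); the option string "vaq" takes the same 1..20 strength value as the --vaq switch.

#include <kvazaar.h>

/* Sketch only: enable variance adaptive quantization with strength 8
 * through the configuration API instead of the command line. */
static int enable_vaq(kvz_config **out_cfg)
{
  const kvz_api *api = kvz_api_get(8);          /* 8-bit pixel format */
  kvz_config *cfg = api->config_alloc();
  if (!cfg) return 0;
  if (!api->config_init(cfg) ||
      !api->config_parse(cfg, "vaq", "8")) {    /* handled by kvz_config_parse() below */
    api->config_destroy(cfg);
    return 0;
  }
  *out_cfg = cfg;
  return 1;
}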

View file

@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c])
#
# Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
ver_major=6
ver_minor=1
ver_minor=2
ver_release=0
# Prevents configure from adding a lot of defines to the CFLAGS

View file

@ -195,6 +195,10 @@ Same as \-\-level but warnings instead of errors.
Used with \-\-level. Use high tier bitrate limits
instead of the main tier limits during encoding.
High tier requires level 4 or higher.
.TP
\fB\-\-vaq <integer>
Enable variance adaptive quantization with given
strength, in range 1..20.
.SS "Compression tools:"
.TP

View file

@ -143,6 +143,8 @@ int kvz_config_init(kvz_config *cfg)
cfg->me_max_steps = (uint32_t)-1;
cfg->vaq = 0;
cfg->scaling_list = KVZ_SCALING_LIST_OFF;
cfg->max_merge = 5;
@ -1305,6 +1307,9 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
}
else if (OPT("fast-residual-cost"))
cfg->fast_residual_cost_limit = atoi(value);
else if (OPT("vaq")) {
cfg->vaq = (int)atoi(value);
}
else if (OPT("max-merge")) {
int max_merge = atoi(value);
if (max_merge < 1 || max_merge > 5) {
@ -1466,6 +1471,11 @@ int kvz_config_validate(const kvz_config *const cfg)
{
int error = 0;
if (cfg->vaq < 0) {
fprintf(stderr, "vaq strength must be positive\n");
error = 1;
}
if (cfg->width <= 0) {
fprintf(stderr, "Input error: width must be positive\n");
error = 1;

View file

@ -133,6 +133,8 @@ static const struct option long_options[] = {
{ "set-qp-in-cu", no_argument, NULL, 0 },
{ "open-gop", no_argument, NULL, 0 },
{ "no-open-gop", no_argument, NULL, 0 },
{ "vaq", required_argument, NULL, 0 },
{ "no-vaq", no_argument, NULL, 0 },
{ "scaling-list", required_argument, NULL, 0 },
{ "max-merge", required_argument, NULL, 0 },
{ "early-skip", no_argument, NULL, 0 },
@ -457,6 +459,8 @@ void print_help(void)
" --high-tier : Used with --level. Use high tier bitrate limits\n"
" instead of the main tier limits during encoding.\n"
" High tier requires level 4 or higher.\n"
" --vaq <integer> : Enable variance adaptive quantization with given\n"
" strength, in range 1..20.\n"
"\n"
/* Word wrap to this width to stay under 80 characters (including ") *************/
"Compression tools:\n"

View file

@ -269,6 +269,8 @@ static void encode_transform_coeff(encoder_state_t * const state,
if (state->must_code_qp_delta) {
const int qp_pred = kvz_get_cu_ref_qp(state, x_cu, y_cu, state->last_qp);
const int qp_delta = cur_cu->qp - qp_pred;
assert(KVZ_BIT_DEPTH == 8 && "This range applies only to 8-bit encoding.");
assert(qp_delta >= -26 && qp_delta <= 25 && "QP delta not in valid range [-26, 25].");
const int qp_delta_abs = ABS(qp_delta);
cabac_data_t* cabac = &state->cabac;
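
For reference, the range asserted above comes from clause 7.4.9.10 (Transform unit semantics) of H.265: CuQpDeltaVal must lie in [-(26 + QpBdOffsetY / 2), 25 + QpBdOffsetY / 2], where QpBdOffsetY = 6 * (bit depth - 8). For 8-bit encoding QpBdOffsetY is 0, giving exactly [-26, 25]; at higher bit depths the legal range widens, which is why the first assert pins the bit depth to 8.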

View file

@ -376,7 +376,7 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
// for SMP and AMP partition units.
encoder->tr_depth_inter = 0;
if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu) {
if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) {
encoder->max_qp_delta_depth = 0;
} else {
encoder->max_qp_delta_depth = -1;

View file

@ -55,7 +55,7 @@ typedef struct encoder_control_t
int32_t width_in_lcu;
int32_t height_in_lcu;
int32_t real_width; /*!< \brief real input picture width */
int32_t real_height; /*!< \brief real input picture width */
int32_t real_height; /*!< \brief real input picture height */
int64_t pixels_per_pic;
int8_t source_scan_type;
} in;

View file

@ -59,6 +59,7 @@ static int encoder_state_config_frame_init(encoder_state_t * const state) {
const encoder_control_t * const encoder = state->encoder_control;
const int num_lcus = encoder->in.width_in_lcu * encoder->in.height_in_lcu;
state->frame->lcu_stats = calloc(num_lcus, sizeof(lcu_stats_t));
state->frame->aq_offsets = MALLOC(double, num_lcus);
for (int y = 0; y < encoder->in.height_in_lcu; y++) {
for (int x = 0; x < encoder->in.width_in_lcu; x++) {
@ -92,6 +93,7 @@ static void encoder_state_config_frame_finalize(encoder_state_t * const state) {
kvz_image_list_destroy(state->frame->ref);
FREE_POINTER(state->frame->lcu_stats);
FREE_POINTER(state->frame->aq_offsets);
}
static int encoder_state_config_tile_init(encoder_state_t * const state,

View file

@ -37,6 +37,8 @@
#include "tables.h"
#include "threadqueue.h"
#include "strategies/strategies-picture.h"
int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) {
int i;
@ -1223,6 +1225,21 @@ static void normalize_lcu_weights(encoder_state_t * const state)
}
}
// Check if the LCU is an edge LCU. Returns false if both frame dimensions are divisible by 64
static bool edge_lcu(int id, int lcus_x, int lcus_y, bool xdiv64, bool ydiv64)
{
if (xdiv64 && ydiv64) {
return false;
}
int last_row_first_id = (lcus_y - 1) * lcus_x;
if ((id % lcus_x == lcus_x - 1 && !xdiv64) || (id >= last_row_first_id && !ydiv64)) {
return true;
}
else {
return false;
}
}
static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_picture* frame) {
assert(state->type == ENCODER_STATE_TYPE_MAIN);
@ -1236,6 +1253,92 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict
state->tile->frame->height
);
// Variance adaptive quantization
if (cfg->vaq) {
const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
double d = cfg->vaq * 0.1; // Empirically decided constant. Affects delta-QP strength
// Calculate frame pixel variance
uint32_t len = state->tile->frame->width * state->tile->frame->height;
uint32_t c_len = len / 4;
double frame_var = kvz_pixel_var(state->tile->frame->source->y, len);
if (has_chroma) {
frame_var += kvz_pixel_var(state->tile->frame->source->u, c_len);
frame_var += kvz_pixel_var(state->tile->frame->source->v, c_len);
}
// Loop through LCUs
// For each LCU calculate: D * (log(LCU pixel variance) - log(frame pixel variance))
unsigned x_lim = state->tile->frame->width_in_lcu;
unsigned y_lim = state->tile->frame->height_in_lcu;
unsigned id = 0;
for (int y = 0; y < y_lim; ++y) {
for (int x = 0; x < x_lim; ++x) {
kvz_pixel tmp[LCU_LUMA_SIZE];
int pxl_x = x * LCU_WIDTH;
int pxl_y = y * LCU_WIDTH;
int x_max = MIN(pxl_x + LCU_WIDTH, frame->width) - pxl_x;
int y_max = MIN(pxl_y + LCU_WIDTH, frame->height) - pxl_y;
bool xdiv64 = false;
bool ydiv64 = false;
if (frame->width % 64 == 0) xdiv64 = true;
if (frame->height % 64 == 0) ydiv64 = true;
// Luma variance
if (!edge_lcu(id, x_lim, y_lim, xdiv64, ydiv64)) {
kvz_pixels_blit(&state->tile->frame->source->y[pxl_x + pxl_y * state->tile->frame->source->stride], tmp,
x_max, y_max, state->tile->frame->source->stride, LCU_WIDTH);
} else {
// Extend edge pixels for edge lcus
for (int y = 0; y < LCU_WIDTH; y++) {
for (int x = 0; x < LCU_WIDTH; x++) {
int src_y = CLIP(0, frame->height - 1, pxl_y + y);
int src_x = CLIP(0, frame->width - 1, pxl_x + x);
tmp[y * LCU_WIDTH + x] = state->tile->frame->source->y[src_y * state->tile->frame->source->stride + src_x];
}
}
}
double lcu_var = kvz_pixel_var(tmp, LCU_LUMA_SIZE);
if (has_chroma) {
// Add chroma variance if not monochrome
int32_t c_stride = state->tile->frame->source->stride >> 1;
kvz_pixel chromau_tmp[LCU_CHROMA_SIZE];
kvz_pixel chromav_tmp[LCU_CHROMA_SIZE];
int lcu_chroma_width = LCU_WIDTH >> 1;
int c_pxl_x = x * lcu_chroma_width;
int c_pxl_y = y * lcu_chroma_width;
int c_x_max = MIN(c_pxl_x + lcu_chroma_width, frame->width >> 1) - c_pxl_x;
int c_y_max = MIN(c_pxl_y + lcu_chroma_width, frame->height >> 1) - c_pxl_y;
if (!edge_lcu(id, x_lim, y_lim, xdiv64, ydiv64)) {
kvz_pixels_blit(&state->tile->frame->source->u[c_pxl_x + c_pxl_y * c_stride], chromau_tmp, c_x_max, c_y_max, c_stride, lcu_chroma_width);
kvz_pixels_blit(&state->tile->frame->source->v[c_pxl_x + c_pxl_y * c_stride], chromav_tmp, c_x_max, c_y_max, c_stride, lcu_chroma_width);
}
else {
for (int y = 0; y < lcu_chroma_width; y++) {
for (int x = 0; x < lcu_chroma_width; x++) {
int src_y = CLIP(0, (frame->height >> 1) - 1, c_pxl_y + y);
int src_x = CLIP(0, (frame->width >> 1) - 1, c_pxl_x + x);
chromau_tmp[y * lcu_chroma_width + x] = state->tile->frame->source->u[src_y * c_stride + src_x];
chromav_tmp[y * lcu_chroma_width + x] = state->tile->frame->source->v[src_y * c_stride + src_x];
}
}
}
lcu_var += kvz_pixel_var(chromau_tmp, LCU_CHROMA_SIZE);
lcu_var += kvz_pixel_var(chromav_tmp, LCU_CHROMA_SIZE);
}
state->frame->aq_offsets[id] = d * (log(lcu_var) - log(frame_var));
id++;
}
}
}
// Variance adaptive quantization - END
// Use this flag to handle closed gop irap picture selection.
// If set to true, irap is already set and we avoid
// setting it based on the intra period
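
As a standalone illustration of what the loop above stores in frame->aq_offsets (a sketch, not code from this diff; the variances come from kvz_pixel_var as shown above, with chroma variance folded in when present):

#include <math.h>

/* Illustration only: the per-LCU offset later added to the QP when --vaq is on.
 * An LCU with more texture (higher variance) than the frame average gets a
 * positive offset, i.e. coarser quantization; a flat LCU gets a negative one. */
static double vaq_offset(int strength, double lcu_var, double frame_var)
{
  const double d = strength * 0.1;              /* empirical scaling, as in the diff */
  return d * (log(lcu_var) - log(frame_var));
}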

View file

@ -160,6 +160,11 @@ typedef struct encoder_state_config_frame_t {
struct encoder_state_t const *previous_layer_state;
/**
* \brief Calculated adaptive QP offset for each LCU.
*/
double *aq_offsets;
/**
* \brief Whether next NAL is the first NAL in the access unit.
*/

View file

@ -399,6 +399,8 @@ typedef struct kvz_config
/** \brief Flag to enable/disable open GOP configuration */
int8_t open_gop;
int32_t vaq; /** \brief Enable variance adaptive quantization */
/** \brief Type of scaling lists to use */
int8_t scaling_list;

View file

@ -637,8 +637,22 @@ static double get_ctu_bits(encoder_state_t * const state, vector2d_t pos) {
return avg_bits;
}
static double qp_to_lambda(encoder_state_t* const state, int qp)
{
const int shift_qp = 12;
double lambda = 0.57 * pow(2.0, (qp - shift_qp) / 3.0);
// NOTE: HM adjusts lambda for inter according to Hadamard usage in ME.
// SATD is currently always enabled for ME, so this has no effect.
// bool hadamard_me = true;
// if (!hadamard_me && state->frame->slicetype != KVZ_SLICE_I) {
// lambda *= 0.95;
// }
return lambda;
}
void kvz_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos) {
double bits = get_ctu_bits(state, pos);
const encoder_control_t * const encoder = state->encoder_control;
@ -750,6 +764,26 @@ void kvz_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos) {
ctu->qp = est_qp;
ctu->lambda = est_lambda;
ctu->i_cost = 0;
// Apply variance adaptive quantization
if (encoder->cfg.vaq) {
vector2d_t lcu = {
pos.x + state->tile->lcu_offset_x,
pos.y + state->tile->lcu_offset_y
};
int id = lcu.x + lcu.y * state->tile->frame->width_in_lcu;
int aq_offset = round(state->frame->aq_offsets[id]);
state->qp += aq_offset;
// The delta QP must stay within [-26, 25] per clause 7.4.9.10 (Transform unit semantics) of ITU-T H.265.
// Since this value will later be combined with qp_pred, clip to half of that range to be safe.
state->qp = CLIP(state->frame->QP - 13, state->frame->QP + 12, state->qp);
state->qp = CLIP_TO_QP(state->qp);
state->lambda = qp_to_lambda(state, state->qp);
state->lambda_sqrt = sqrt(state->lambda);
//ctu->qp = state->qp;
//ctu->lambda = state->lambda;
}
}
@ -894,22 +928,6 @@ void kvz_update_after_picture(encoder_state_t * const state) {
}
}
static double qp_to_lambda(encoder_state_t * const state, int qp)
{
const int shift_qp = 12;
double lambda = 0.57 * pow(2.0, (qp - shift_qp) / 3.0);
// NOTE: HM adjusts lambda for inter according to Hadamard usage in ME.
// SATD is currently always enabled for ME, so this has no effect.
// bool hadamard_me = true;
// if (!hadamard_me && state->frame->slicetype != KVZ_SLICE_I) {
// lambda *= 0.95;
// }
return lambda;
}
/**
* \brief Allocate bits and set lambda and QP for the current picture.
* \param state the main encoder state
@ -1049,4 +1067,21 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state,
state->lambda = state->frame->lambda;
state->lambda_sqrt = sqrt(state->frame->lambda);
}
// Apply variance adaptive quantization
if (ctrl->cfg.vaq) {
vector2d_t lcu = {
pos.x + state->tile->lcu_offset_x,
pos.y + state->tile->lcu_offset_y
};
int id = lcu.x + lcu.y * state->tile->frame->width_in_lcu;
int aq_offset = round(state->frame->aq_offsets[id]);
state->qp += aq_offset;
// The delta QP must stay within [-26, 25] per clause 7.4.9.10 (Transform unit semantics) of ITU-T H.265.
// Since this value will later be combined with qp_pred, clip to half of that range to be safe.
state->qp = CLIP(state->frame->QP - 13, state->frame->QP + 12, state->qp);
state->qp = CLIP_TO_QP(state->qp);
state->lambda = qp_to_lambda(state, state->qp);
state->lambda_sqrt = sqrt(state->lambda);
}
}
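
To make the clipping above concrete, a small sketch with hypothetical numbers (not from the diff), reusing the CLIP(low, high, value) and CLIP_TO_QP macros seen in the code:

/* Illustration only: with frame QP 27, an aq_offset of +15 gives 42, clipped to
 * 27 + 12 = 39; an offset of -20 gives 7, clipped to 27 - 13 = 14. The [-13, +12]
 * window is half of the legal [-26, 25] delta-QP range, leaving headroom for the
 * later combination with qp_pred in encode_transform_coeff(). */
static int vaq_adjusted_qp(int frame_qp, int lcu_qp, int aq_offset)
{
  int qp = lcu_qp + aq_offset;
  qp = CLIP(frame_qp - 13, frame_qp + 12, qp);
  return CLIP_TO_QP(qp);
}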

View file

@ -3,6 +3,30 @@
#include <immintrin.h>
// The calling convention used by MSVC on 32-bit builds will essentially
// disallow functions to have more than 3 XMM/YMM parameters, because it
// will not provide more than 8-byte param alignment, and only the first
// three vector params will be carried in SIMD registers. Now the
// vectorcall convention could probably be problematic in globally visible
// functions, but likely not in static ones.
#if defined _MSC_VER && defined _WIN32 && !defined _WIN64
#define FIX_W32 __vectorcall
#else
#define FIX_W32
#endif
// Non-inline functions defined in this header are likely to trigger a
// warning for each module including this header that does NOT use them,
// at least on unix-ish platforms (GCC/Clang both on native Unix and MinGW).
// Tell 'em we actually want to do that, it's not an accident.
#if defined __GNUC__ || defined __clang__ || defined __MINGW32__ || defined __MINGW64__
#define FIX_UNUSED __attribute__((unused))
#else
#define FIX_UNUSED
#endif
#define FIX_NOINLINE FIX_W32 FIX_UNUSED
/*
* Reorder coefficients from raster to scan order
* Fun fact: Once upon a time, doing this in a loop looked like this:
@ -111,4 +135,19 @@ static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t
*last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1;
}
static int32_t FIX_NOINLINE hsum_8x32b(const __m256i v)
{
__m256i sum1 = v;
__m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2));
__m256i sum3 = _mm256_add_epi32 (sum1, sum2);
__m256i sum4 = _mm256_shuffle_epi32 (sum3, _MM_SHUFFLE(1, 0, 3, 2));
__m256i sum5 = _mm256_add_epi32 (sum3, sum4);
__m256i sum6 = _mm256_shuffle_epi32 (sum5, _MM_SHUFFLE(2, 3, 0, 1));
__m256i sum7 = _mm256_add_epi32 (sum5, sum6);
__m128i sum8 = _mm256_castsi256_si128 (sum7);
int32_t sum9 = _mm_cvtsi128_si32 (sum8);
return sum9;
}
#endif
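
A quick illustration, not part of the diff, of what the relocated helper computes; it reuses the FIX_UNUSED attribute defined above and assumes <assert.h> is included:

#include <assert.h>

/* hsum_8x32b() returns the sum of the eight 32-bit lanes of its argument. */
static void FIX_UNUSED hsum_8x32b_selftest(void)
{
  assert(hsum_8x32b(_mm256_set1_epi32(3)) == 24);
  assert(hsum_8x32b(_mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8)) == 36);
}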

View file

@ -1051,6 +1051,181 @@ static uint32_t hor_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_dat
pic_stride, ref_stride, left, right);
}
static double pixel_var_avx2_largebuf(const kvz_pixel *buf, const uint32_t len)
{
const float len_f = (float)len;
const __m256i zero = _mm256_setzero_si256();
size_t i;
__m256i sums = zero;
for (i = 0; i + 31 < len; i += 32) {
__m256i curr = _mm256_loadu_si256((const __m256i *)(buf + i));
__m256i curr_sum = _mm256_sad_epu8(curr, zero);
sums = _mm256_add_epi64(sums, curr_sum);
}
__m128i sum_lo = _mm256_castsi256_si128 (sums);
__m128i sum_hi = _mm256_extracti128_si256(sums, 1);
__m128i sum_3 = _mm_add_epi64 (sum_lo, sum_hi);
__m128i sum_4 = _mm_shuffle_epi32 (sum_3, _MM_SHUFFLE(1, 0, 3, 2));
__m128i sum_5 = _mm_add_epi64 (sum_3, sum_4);
int64_t sum = _mm_cvtsi128_si64(sum_5);
// Remaining len mod 32 pixels
for (; i < len; ++i) {
sum += buf[i];
}
float mean_f = (float)sum / len_f;
__m256 mean = _mm256_set1_ps(mean_f);
__m256 accum = _mm256_setzero_ps();
for (i = 0; i + 31 < len; i += 32) {
__m128i curr0 = _mm_loadl_epi64((const __m128i *)(buf + i + 0));
__m128i curr1 = _mm_loadl_epi64((const __m128i *)(buf + i + 8));
__m128i curr2 = _mm_loadl_epi64((const __m128i *)(buf + i + 16));
__m128i curr3 = _mm_loadl_epi64((const __m128i *)(buf + i + 24));
__m256i curr0_32 = _mm256_cvtepu8_epi32(curr0);
__m256i curr1_32 = _mm256_cvtepu8_epi32(curr1);
__m256i curr2_32 = _mm256_cvtepu8_epi32(curr2);
__m256i curr3_32 = _mm256_cvtepu8_epi32(curr3);
__m256 curr0_f = _mm256_cvtepi32_ps (curr0_32);
__m256 curr1_f = _mm256_cvtepi32_ps (curr1_32);
__m256 curr2_f = _mm256_cvtepi32_ps (curr2_32);
__m256 curr3_f = _mm256_cvtepi32_ps (curr3_32);
__m256 curr0_sd = _mm256_sub_ps (curr0_f, mean);
__m256 curr1_sd = _mm256_sub_ps (curr1_f, mean);
__m256 curr2_sd = _mm256_sub_ps (curr2_f, mean);
__m256 curr3_sd = _mm256_sub_ps (curr3_f, mean);
__m256 curr0_v = _mm256_mul_ps (curr0_sd, curr0_sd);
__m256 curr1_v = _mm256_mul_ps (curr1_sd, curr1_sd);
__m256 curr2_v = _mm256_mul_ps (curr2_sd, curr2_sd);
__m256 curr3_v = _mm256_mul_ps (curr3_sd, curr3_sd);
__m256 curr01 = _mm256_add_ps (curr0_v, curr1_v);
__m256 curr23 = _mm256_add_ps (curr2_v, curr3_v);
__m256 curr = _mm256_add_ps (curr01, curr23);
accum = _mm256_add_ps (accum, curr);
}
__m256d accum_d = _mm256_castps_pd (accum);
__m256d accum2_d = _mm256_permute4x64_pd(accum_d, _MM_SHUFFLE(1, 0, 3, 2));
__m256 accum2 = _mm256_castpd_ps (accum2_d);
__m256 accum3 = _mm256_add_ps (accum, accum2);
__m256 accum4 = _mm256_permute_ps (accum3, _MM_SHUFFLE(1, 0, 3, 2));
__m256 accum5 = _mm256_add_ps (accum3, accum4);
__m256 accum6 = _mm256_permute_ps (accum5, _MM_SHUFFLE(2, 3, 0, 1));
__m256 accum7 = _mm256_add_ps (accum5, accum6);
float var_sum = _mm256_cvtss_f32 (accum7);
// Remaining len mod 32 pixels
for (; i < len; ++i) {
float diff = buf[i] - mean_f;
var_sum += diff * diff;
}
return var_sum / len_f;
}
#ifdef INACCURATE_VARIANCE_CALCULATION
// Assumes that u is a power of two
static INLINE uint32_t ilog2(uint32_t u)
{
return _tzcnt_u32(u);
}
// A B C D | E F G H (8x32b)
// ==>
// A+B C+D | E+F G+H (4x64b)
static __m256i hsum_epi32_to_epi64(const __m256i v)
{
const __m256i zero = _mm256_setzero_si256();
__m256i v_shufd = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
__m256i sums_32 = _mm256_add_epi32 (v, v_shufd);
__m256i sums_64 = _mm256_blend_epi32 (sums_32, zero, 0xaa);
return sums_64;
}
static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len)
{
assert(sizeof(*buf) == 1);
assert((len & 31) == 0);
// Uses Q8.7 numbers to measure mean and deviation, so variances are Q16.14
const uint64_t sum_maxwid = ilog2(len) + (8 * sizeof(*buf));
const __m128i normalize_sum = _mm_cvtsi32_si128(sum_maxwid - 15); // Normalize mean to [0, 32767], so signed 16-bit subtraction never overflows
const __m128i debias_sum = _mm_cvtsi32_si128(1 << (sum_maxwid - 16));
const float varsum_to_f = 1.0f / (float)(1 << (14 + ilog2(len)));
const bool power_of_two = (len & (len - 1)) == 0;
if (sum_maxwid > 32 || sum_maxwid < 15 || !power_of_two) {
return pixel_var_avx2_largebuf(buf, len);
}
const __m256i zero = _mm256_setzero_si256();
const __m256i himask_15 = _mm256_set1_epi16(0x7f00);
size_t i;
__m256i sums = zero;
for (i = 0; i < len; i += 32) {
__m256i curr = _mm256_loadu_si256((const __m256i *)(buf + i));
__m256i curr_sum = _mm256_sad_epu8(curr, zero);
sums = _mm256_add_epi64(sums, curr_sum);
}
__m128i sum_lo = _mm256_castsi256_si128 (sums);
__m128i sum_hi = _mm256_extracti128_si256(sums, 1);
__m128i sum_3 = _mm_add_epi64 (sum_lo, sum_hi);
__m128i sum_4 = _mm_shuffle_epi32 (sum_3, _MM_SHUFFLE(1, 0, 3, 2));
__m128i sum_5 = _mm_add_epi64 (sum_3, sum_4);
__m128i sum_5n = _mm_srl_epi32 (sum_5, normalize_sum);
sum_5n = _mm_add_epi32 (sum_5n, debias_sum);
__m256i sum_n = _mm256_broadcastw_epi16 (sum_5n);
__m256i accum = zero;
for (i = 0; i < len; i += 32) {
__m256i curr = _mm256_loadu_si256((const __m256i *)(buf + i));
__m256i curr0 = _mm256_slli_epi16 (curr, 7);
__m256i curr1 = _mm256_srli_epi16 (curr, 1);
curr0 = _mm256_and_si256 (curr0, himask_15);
curr1 = _mm256_and_si256 (curr1, himask_15);
__m256i dev0 = _mm256_sub_epi16 (curr0, sum_n);
__m256i dev1 = _mm256_sub_epi16 (curr1, sum_n);
__m256i vars0 = _mm256_madd_epi16 (dev0, dev0);
__m256i vars1 = _mm256_madd_epi16 (dev1, dev1);
__m256i varsum = _mm256_add_epi32 (vars0, vars1);
varsum = hsum_epi32_to_epi64(varsum);
accum = _mm256_add_epi64 (accum, varsum);
}
__m256i accum2 = _mm256_permute4x64_epi64(accum, _MM_SHUFFLE(1, 0, 3, 2));
__m256i accum3 = _mm256_add_epi64 (accum, accum2);
__m256i accum4 = _mm256_permute4x64_epi64(accum3, _MM_SHUFFLE(2, 3, 1, 0));
__m256i v_tot = _mm256_add_epi64 (accum3, accum4);
__m128i vt128 = _mm256_castsi256_si128 (v_tot);
uint64_t vars = _mm_cvtsi128_si64 (vt128);
return (float)vars * varsum_to_f;
}
#else // INACCURATE_VARIANCE_CALCULATION
static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len)
{
return pixel_var_avx2_largebuf(buf, len);
}
#endif // !INACCURATE_VARIANCE_CALCULATION
#endif //COMPILE_INTEL_AVX2
int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
@ -1089,6 +1264,8 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
success &= kvz_strategyselector_register(opaque, "ver_sad", "avx2", 40, &ver_sad_avx2);
success &= kvz_strategyselector_register(opaque, "hor_sad", "avx2", 40, &hor_sad_avx2);
success &= kvz_strategyselector_register(opaque, "pixel_var", "avx2", 40, &pixel_var_avx2);
}
#endif
return success;
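
The Q8.7 / Q16.14 bookkeeping in the fixed-point path above may be easier to follow in scalar form. The sketch below is an interpretation, not code from this commit; the AVX2 version additionally truncates values to 15 bits, debiases the mean and requires a power-of-two length, falling back to pixel_var_avx2_largebuf otherwise.

#include <stdint.h>

/* Scalar illustration: pixels are promoted to Q8.7, so each squared deviation
 * is Q16.14, and dividing the accumulated sum by (len << 14) recovers the
 * variance in plain pixel^2 units; this is the same rescaling that
 * varsum_to_f = 1.0f / (1 << (14 + ilog2(len))) performs in one multiply. */
static double pixel_var_q14_scalar(const uint8_t *buf, uint32_t len)
{
  uint64_t sum = 0;
  for (uint32_t i = 0; i < len; ++i) sum += buf[i];
  const int32_t mean_q7 = (int32_t)((sum << 7) / len);     /* mean in Q8.7 */

  uint64_t acc = 0;
  for (uint32_t i = 0; i < len; ++i) {
    const int64_t dev = ((int32_t)buf[i] << 7) - mean_q7;  /* deviation, Q8.7 */
    acc += (uint64_t)(dev * dev);                          /* squares, Q16.14 */
  }
  return (double)acc / ((double)len * (1 << 14));
}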

View file

@ -26,6 +26,7 @@
// Use a couple generic functions from here as a worst-case fallback
#include "strategies/generic/sao_shared_generics.h"
#include "strategies/avx2/avx2_common_functions.h"
#include "strategies/missing-intel-intrinsics.h"
#include "cu.h"
#include "encoder.h"
@ -34,37 +35,10 @@
#include "sao.h"
#include "strategyselector.h"
// The calling convention used by MSVC on 32-bit builds will essentially
// disallow functions to have more than 3 XMM/YMM parameters, because it
// will not provide more than 8-byte param alignment, and only the first
// three vector params will be carried in SIMD registers. Now the
// vectorcall convention could probably be problematic in globally visible
// functions, but likely not in static ones.
#if defined _MSC_VER && defined _WIN32 && !defined _WIN64
#define FIX_W32 __vectorcall
#else
#define FIX_W32
#endif
// These optimizations are based heavily on sao-generic.c.
// It may be useful to consult that file if (when) this one
// is difficult to understand.
static int32_t FIX_W32 hsum_8x32b(const __m256i v)
{
__m256i sum1 = v;
__m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2));
__m256i sum3 = _mm256_add_epi32 (sum1, sum2);
__m256i sum4 = _mm256_shuffle_epi32 (sum3, _MM_SHUFFLE(1, 0, 3, 2));
__m256i sum5 = _mm256_add_epi32 (sum3, sum4);
__m256i sum6 = _mm256_shuffle_epi32 (sum5, _MM_SHUFFLE(2, 3, 0, 1));
__m256i sum7 = _mm256_add_epi32 (sum5, sum6);
__m128i sum8 = _mm256_castsi256_si128 (sum7);
int32_t sum9 = _mm_cvtsi128_si32 (sum8);
return sum9;
}
// Do the SIGN3 operation for the difference a-b
static INLINE __m256i sign3_diff_epu8(const __m256i a, const __m256i b)
{

View file

@ -675,6 +675,32 @@ static uint32_t hor_sad_generic(const kvz_pixel *pic_data, const kvz_pixel *ref_
return result;
}
// Calculate pixel value variance. Takes in an array of kvz_pixel.
static double pixel_var_generic(const kvz_pixel *arr, const uint32_t len)
{
double var = 0;
double arr_mean = 0;
// Calculate array mean
int i = 0;
double sum = 0;
for (; i < len; ++i) {
sum += arr[i];
}
arr_mean = sum / (double)len;
// Calculate array variance
for (i = 0; i < len; ++i) {
double tmp = (double)arr[i] - arr_mean;
var += tmp*tmp;
}
var /= len;
return var;
}
int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
{
bool success = true;
@ -714,5 +740,7 @@ int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
success &= kvz_strategyselector_register(opaque, "ver_sad", "generic", 0, &ver_sad_generic);
success &= kvz_strategyselector_register(opaque, "hor_sad", "generic", 0, &hor_sad_generic);
success &= kvz_strategyselector_register(opaque, "pixel_var", "generic", 0, &pixel_var_generic);
return success;
}
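
A quick sanity check for pixel_var_generic(), as an illustration only (the function is static, so such a check would have to live in this same translation unit): half of the samples at 0 and half at 255 give mean 127.5 and variance 127.5^2 = 16256.25.

/* Illustration only, not part of the commit. */
static void pixel_var_generic_example(void)
{
  const kvz_pixel buf[4] = { 0, 0, 255, 255 };
  double var = pixel_var_generic(buf, 4);   /* expect 16256.25 */
  (void)var;
}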

View file

@ -67,6 +67,8 @@ get_optimized_sad_func *kvz_get_optimized_sad = 0;
ver_sad_func *kvz_ver_sad = 0;
hor_sad_func *kvz_hor_sad = 0;
pixel_var_func *kvz_pixel_var = 0;
int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) {
bool success = true;

View file

@ -138,6 +138,8 @@ typedef void (inter_recon_bipred_func)(const int hi_prec_luma_rec0,
bool predict_luma,
bool predict_chroma);
typedef double (pixel_var_func)(const kvz_pixel *buf, const uint32_t len);
// Declare function pointers.
extern reg_sad_func * kvz_reg_sad;
@ -176,6 +178,8 @@ extern get_optimized_sad_func *kvz_get_optimized_sad;
extern ver_sad_func *kvz_ver_sad;
extern hor_sad_func *kvz_hor_sad;
extern pixel_var_func *kvz_pixel_var;
int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth);
cost_pixel_nxn_func * kvz_pixels_get_satd_func(unsigned n);
cost_pixel_nxn_func * kvz_pixels_get_sad_func(unsigned n);
@ -211,6 +215,7 @@ cost_pixel_nxn_multi_func * kvz_pixels_get_sad_dual_func(unsigned n);
{"get_optimized_sad", (void**) &kvz_get_optimized_sad}, \
{"ver_sad", (void**) &kvz_ver_sad}, \
{"hor_sad", (void**) &kvz_hor_sad}, \
{"pixel_var", (void**) &kvz_pixel_var}, \

View file

@ -10,3 +10,6 @@ common_args='264x130 10 -p0 -r1 --threads=2 --wpp --owf=1 --rd=0'
valgrind_test $common_args --no-rdoq --no-deblock --no-sao --no-signhide --subme=1 --pu-depth-intra=2-3
valgrind_test $common_args --no-rdoq --no-signhide --subme=0
valgrind_test $common_args --rdoq --no-deblock --no-sao --subme=0
valgrind_test $common_args --vaq=8
valgrind_test $common_args --vaq=8 --bitrate 3500
valgrind_test $common_args --vaq=8 --rc-algorithm oba --bitrate 3500