Merge branch 'new_presets'

Significant boost to either BDRate, speed or both for every preset.
This commit is contained in:
Ari Koivula 2016-09-29 17:35:35 +03:00
commit 5f5fffb8b5
9 changed files with 405 additions and 256 deletions

View file

@ -89,8 +89,8 @@ matrix:
- env: VALGRIND_TEST="-p4 -r1 --owf=0 --threads=0 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
- env: VALGRIND_TEST="-p4 -r2 --owf=1 --threads=2 --wpp --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
- env: VALGRIND_TEST="-p4 -r2 --owf=0 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
- env: VALGRIND_TEST="-p4 -r2 --owf=1 --threads=2 --tiles-height-split=u2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
- env: VALGRIND_TEST="-p4 -r2 --owf=0 --threads=2 --tiles-height-split=u2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
- env: VALGRIND_TEST="-p4 -r2 --owf=1 --threads=2 --tiles-height-split=u2 --no-wpp --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
- env: VALGRIND_TEST="-p4 -r2 --owf=0 --threads=2 --tiles-height-split=u2 --no-wpp --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
# Tests for rdoq, sao, deblock and signhide and subme.
- env: VALGRIND_TEST="-p0 -r1 --threads=2 --wpp --owf=1 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=1 --pu-depth-intra=2-3"

499
src/cfg.c
View file

@ -45,21 +45,22 @@ int kvz_config_init(kvz_config *cfg)
cfg->framerate = 25; // deprecated and will be removed.
cfg->framerate_num = 0;
cfg->framerate_denom = 1;
cfg->qp = 32;
cfg->intra_period = 0;
cfg->qp = 22;
cfg->intra_period = 64;
cfg->vps_period = 0;
cfg->deblock_enable = 1;
cfg->deblock_beta = 0;
cfg->deblock_tc = 0;
cfg->sao_enable = 1;
cfg->rdoq_enable = 1;
cfg->rdoq_skip = 1;
cfg->signhide_enable = true;
cfg->smp_enable = false;
cfg->amp_enable = false;
cfg->rdo = 1;
cfg->mv_rdo = 0;
cfg->full_intra_search = 0;
cfg->trskip_enable = 1;
cfg->trskip_enable = 0;
cfg->tr_depth_intra = 0;
cfg->ime_algorithm = 0; /* hexbs */
cfg->fme_level = 4;
@ -76,7 +77,8 @@ int kvz_config_init(kvz_config *cfg)
cfg->aud_enable = 0;
cfg->cqmfile = NULL;
cfg->ref_frames = DEFAULT_REF_PIC_COUNT;
cfg->gop_len = 0;
cfg->gop_len = 4;
cfg->gop_lowdelay = true;
cfg->bipred = 0;
cfg->target_bitrate = 0;
cfg->hash = KVZ_HASH_CHECKSUM;
@ -90,20 +92,20 @@ int kvz_config_init(kvz_config *cfg)
cfg->tiles_width_split = NULL;
cfg->tiles_height_split = NULL;
cfg->wpp = 0;
cfg->wpp = 1;
cfg->owf = -1;
cfg->slice_count = 1;
cfg->slice_addresses_in_ts = MALLOC(int32_t, 1);
cfg->slice_addresses_in_ts[0] = 0;
cfg->threads = 0;
cfg->threads = -1;
cfg->cpuid = 1;
// Defaults for what sizes of PUs are tried.
cfg->pu_depth_inter.min = 0; // 0-3
cfg->pu_depth_inter.min = 2; // 0-3
cfg->pu_depth_inter.max = 3; // 0-3
cfg->pu_depth_intra.min = 1; // 0-4
cfg->pu_depth_intra.max = 4; // 0-4
cfg->pu_depth_intra.min = 2; // 0-4
cfg->pu_depth_intra.max = 3; // 0-4
cfg->add_encoder_info = true;
cfg->calc_psnr = true;
@ -111,13 +113,14 @@ int kvz_config_init(kvz_config *cfg)
cfg->mv_constraint = KVZ_MV_CONSTRAIN_NONE;
cfg->crypto_features = KVZ_CRYPTO_OFF;
cfg->me_early_termination = 0;
cfg->rdoq_skip = 0;
cfg->me_early_termination = 1;
cfg->input_format = KVZ_FORMAT_P420;
cfg->input_bitdepth = 8;
cfg->gop_lp_definition.d = 3;
cfg->gop_lp_definition.t = 1;
return 1;
}
@ -309,195 +312,235 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
static const char * const me_early_termination_names[] = { "off", "on", "sensitive", NULL };
static const char * const preset_values[11][32] = {
static const char * const preset_values[11][20*2] = {
{
"ultrafast",
"pu-depth-intra", "2-3",
"pu-depth-inter", "1-3",
"pu-depth-inter", "2-3",
"rd", "0",
"me", "hexbs",
"ref", "1",
"deblock", "1",
"deblock", "0:0",
"signhide", "0",
"subme", "0",
"sao", "0",
"rdoq", "0",
"rdoq-skip", "1",
"transform-skip", "0",
"full-intra-search", "0",
"mv-rdo", "0",
"smp", "0",
"amp", "0",
"cu-split-termination", "zero",
"me-early-termination", "sensitive",
"gop", "lp-g4d3t1",
NULL
},
{
"superfast",
"pu-depth-intra", "2-3",
"pu-depth-inter", "2-3",
"rd", "0",
"me", "hexbs",
"ref", "1",
"deblock", "0:0",
"signhide", "0",
"subme", "0",
"sao", "1",
"rdoq", "0",
"rdoq-skip", "1",
"transform-skip", "0",
"full-intra-search", "0",
"mv-rdo", "0",
"smp", "0",
"amp", "0",
"cu-split-termination", "zero",
"me-early-termination", "sensitive",
"gop", "lp-g4d3t1",
NULL
},
{
"veryfast",
"pu-depth-intra", "2-3",
"pu-depth-inter", "2-3",
"rd", "0",
"me", "hexbs",
"ref", "1",
"deblock", "0:0",
"signhide", "0",
"subme", "2",
"sao", "1",
"rdoq", "0",
"rdoq-skip", "1",
"transform-skip", "0",
"full-intra-search", "0",
"mv-rdo", "0",
"smp", "0",
"amp", "0",
"cu-split-termination", "zero",
"me-early-termination", "sensitive",
"gop", "lp-g4d3t1",
NULL
},
{
"faster",
"pu-depth-intra", "2-3",
"pu-depth-inter", "1-3",
"rd", "1",
"me", "hexbs",
"ref", "1",
"deblock", "0:0",
"signhide", "0",
"subme", "2",
"sao", "1",
"rdoq", "0",
"rdoq-skip", "1",
"transform-skip", "0",
"full-intra-search", "0",
"mv-rdo", "0",
"smp", "0",
"amp", "0",
"cu-split-termination", "zero",
"me-early-termination", "sensitive",
"gop", "lp-g4d3t1",
NULL
},
{
"fast",
"pu-depth-intra", "2-3",
"pu-depth-inter", "1-3",
"rd", "1",
"me", "hexbs",
"ref", "1",
"deblock", "0:0",
"signhide", "0",
"subme", "4",
"sao", "1",
"rdoq", "0",
"rdoq-skip", "1",
"transform-skip", "0",
"full-intra-search", "0",
"mv-rdo", "0",
"smp", "0",
"amp", "0",
"cu-split-termination", "zero",
"me-early-termination", "on",
"gop", "lp-g4d3t1",
NULL
},
{
"medium",
"pu-depth-intra", "1-3",
"pu-depth-inter", "1-3",
"rd", "1",
"me", "hexbs",
"ref", "1",
"deblock", "1",
"deblock", "0:0",
"signhide", "0",
"subme", "0",
"sao", "0",
"rdoq", "0",
"transform-skip", "0",
"full-intra-search", "0",
"mv-rdo", "0",
"smp", "0",
"amp", "0",
NULL
},
{
"veryfast",
"pu-depth-intra", "1-3",
"pu-depth-inter", "0-3",
"rd", "1",
"me", "hexbs",
"ref", "2",
"deblock", "1",
"signhide", "0",
"subme", "0",
"sao", "0",
"rdoq", "0",
"transform-skip", "0",
"full-intra-search", "0",
"mv-rdo", "0",
"smp", "0",
"amp", "0",
NULL
},
{
"faster",
"pu-depth-intra", "1-3",
"pu-depth-inter", "0-3",
"rd", "1",
"me", "hexbs",
"ref", "2",
"deblock", "1",
"signhide", "1",
"subme", "0",
"sao", "0",
"rdoq", "0",
"transform-skip", "0",
"full-intra-search", "0",
"mv-rdo", "0",
"smp", "0",
"amp", "0",
NULL
},
{
"fast",
"pu-depth-intra", "1-3",
"pu-depth-inter", "0-3",
"rd", "1",
"me", "hexbs",
"ref", "2",
"deblock", "1",
"signhide", "1",
"subme", "4",
"sao", "0",
"rdoq", "0",
"transform-skip", "0",
"full-intra-search", "0",
"mv-rdo", "0",
"smp", "0",
"amp", "0",
NULL
},
{
"medium",
"pu-depth-intra", "1-4",
"pu-depth-inter", "0-3",
"rd", "1",
"me", "hexbs",
"ref", "3",
"deblock", "1",
"signhide", "1",
"subme", "4",
"sao", "0",
"rdoq", "0",
"sao", "1",
"rdoq", "1",
"rdoq-skip", "1",
"transform-skip", "0",
"full-intra-search", "0",
"mv-rdo", "0",
"smp", "0",
"amp", "0",
"cu-split-termination", "zero",
"me-early-termination", "on",
"gop", "lp-g4d3t1",
NULL
},
{
"slow",
"pu-depth-intra", "1-4",
"pu-depth-inter", "0-3",
"rd", "2",
"pu-depth-intra", "1-3",
"pu-depth-inter", "1-3",
"rd", "1",
"me", "hexbs",
"ref", "3",
"deblock", "1",
"signhide", "1",
"subme", "4",
"sao", "1",
"rdoq", "0",
"transform-skip", "0",
"full-intra-search", "0",
"mv-rdo", "0",
"smp", "0",
"amp", "0",
NULL
},
{
"slower",
"pu-depth-intra", "1-4",
"pu-depth-inter", "0-3",
"rd", "2",
"me", "tz",
"ref", "4",
"deblock", "1",
"ref", "2",
"deblock", "0:0",
"signhide", "1",
"subme", "4",
"sao", "1",
"rdoq", "1",
"rdoq-skip", "1",
"transform-skip", "0",
"full-intra-search", "0",
"mv-rdo", "0",
"smp", "0",
"amp", "0",
"cu-split-termination", "zero",
"me-early-termination", "on",
"gop", "lp-g4d2t1",
NULL
},
{
"slower",
"pu-depth-intra", "1-3",
"pu-depth-inter", "0-3",
"rd", "1",
"me", "hexbs",
"ref", "2",
"deblock", "0:0",
"signhide", "1",
"subme", "4",
"sao", "1",
"rdoq", "1",
"rdoq-skip", "1",
"transform-skip", "0",
"full-intra-search", "0",
"mv-rdo", "0",
"smp", "0",
"amp", "0",
"cu-split-termination", "zero",
"me-early-termination", "on",
"gop", "lp-g4d2t1",
NULL
},
{
"veryslow",
"pu-depth-intra", "1-4",
"pu-depth-inter", "0-3",
"rd", "2",
"me", "tz",
"ref", "4",
"deblock", "1",
"rd", "1",
"me", "hexbs",
"ref", "3",
"deblock", "0:0",
"signhide", "1",
"subme", "4",
"sao", "1",
"rdoq", "1",
"transform-skip", "1",
"rdoq-skip", "1",
"transform-skip", "0",
"full-intra-search", "0",
"mv-rdo", "1",
"mv-rdo", "0",
"smp", "0",
"amp", "0",
"cu-split-termination", "zero",
"me-early-termination", "on",
"gop", "lp-g4d2t1",
NULL
},
{
"placebo",
"pu-depth-intra", "0-4",
"pu-depth-intra", "1-4",
"pu-depth-inter", "0-3",
"rd", "3",
"rd", "1",
"me", "tz",
"ref", "6",
"deblock", "1",
"ref", "4",
"deblock", "0:0",
"signhide", "1",
"subme", "4",
"sao", "1",
"rdoq", "1",
"rdoq-skip", "0",
"transform-skip", "1",
"full-intra-search", "1",
"full-intra-search", "0",
"mv-rdo", "1",
"smp", "1",
"amp", "1",
"cu-split-termination", "off",
"me-early-termination", "off",
"gop", "lp-g4d2t1",
NULL
},
{ NULL }
@ -683,105 +726,32 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
struct {
unsigned g; // length
unsigned d; // depth
unsigned r; // references
unsigned t; // temporal
} gop = { 0, 0, 0, 0 };
} gop = { 0, 0, 0 };
if (sscanf(value, "lp-g%ud%ur%ut%u", &gop.g, &gop.d, &gop.r, &gop.t) != 4) {
fprintf(stderr, "Error in GOP syntax. Example: lp-g8d4r2t2\n");
// Parse --gop=lp-g#d#t#
if (sscanf(value, "lp-g%ud%ut%u", &gop.g, &gop.d, &gop.t) != 3) {
fprintf(stderr, "Error in GOP syntax. Example: lp-g8d4t2\n");
return 0;
}
if (gop.g < 1 || gop.g > 32) {
fprintf(stderr, "gop.g must be between 1 and 32.\n");
return 0;
}
if (gop.d < 1 || gop.d > 8) {
fprintf(stderr, "gop.d must be between 1 and 8.\n");
}
if (gop.r < 1 || gop.r > 15) {
fprintf(stderr, "gop.d must be between 1 and 15.\n");
return 0;
}
if (gop.t < 1 || gop.t > 15) {
fprintf(stderr, "gop.t must be between 1 and 32.\n");
fprintf(stderr, "gop.t must be between 1 and 15.\n");
return 0;
}
// Initialize modulos for testing depth.
// The picture belong to the lowest depth in which (poc % modulo) == 0.
unsigned depth_modulos[8] = { 0 };
for (int d = 0; d < gop.d; ++d) {
depth_modulos[gop.d - 1 - d] = 1 << d;
}
depth_modulos[0] = gop.g;
cfg->gop_lowdelay = 1;
cfg->gop_lowdelay = true;
cfg->gop_len = gop.g;
for (int g = 1; g <= gop.g; ++g) {
kvz_gop_config *gop_pic = &cfg->gop[g - 1];
// Find gop depth for picture.
int gop_layer = 0;
while (gop_layer < gop.d && (g % depth_modulos[gop_layer])) {
++gop_layer;
}
gop_pic->poc_offset = g;
gop_pic->layer = gop_layer + 1;
gop_pic->qp_offset = gop_layer + 1;
gop_pic->ref_pos_count = 0;
gop_pic->ref_neg_count = gop.r;
gop_pic->is_ref = 0;
// Set first ref to point to previous frame, and the rest to previous
// key-frames.
// If gop.t > 1, have (poc % gop.t) == 0 point gop.t frames away,
// instead of the previous frame. Set the frames in between to
// point to the nearest frame with a lower gop-depth.
if (gop.t > 1) {
if (gop_pic->poc_offset % gop.t == 0) {
gop_pic->ref_neg[0] = gop.t;
} else {
int r = gop_pic->poc_offset - 1;
while (r > 0) {
if (cfg->gop[r].layer < gop_pic->layer) break;
--r;
}
// Var r is now 0 or index of the pic with layer < depth.
if (cfg->gop[r].layer < gop_pic->layer) {
gop_pic->ref_neg[0] = gop_pic->poc_offset - cfg->gop[r].poc_offset;
cfg->gop[r].is_ref = 1;
} else {
// No ref was found, just refer to the previous key-frame.
gop_pic->ref_neg[0] = gop_pic->poc_offset % gop.g;
}
}
} else {
gop_pic->ref_neg[0] = 1;
if (gop_pic->poc_offset >= 2) {
cfg->gop[gop_pic->poc_offset - 2].is_ref = 1;
}
}
int keyframe = gop_pic->poc_offset;
for (int i = 1; i < gop_pic->ref_neg_count; ++i) {
while (keyframe == gop_pic->ref_neg[i - 1]) {
keyframe += gop.g;
}
gop_pic->ref_neg[i] = keyframe;
}
gop_pic->qp_factor = 0.4624; // from HM
}
for (int g = 0; g < gop.g; ++g) {
kvz_gop_config *gop_pic = &cfg->gop[g];
if (!gop_pic->is_ref) {
gop_pic->qp_factor = 0.68 * 1.31; // derived from HM
}
}
// Key-frame is always a reference.
cfg->gop[gop.g - 1].is_ref = 1;
cfg->gop[gop.g - 1].qp_factor = 0.578; // from HM
cfg->gop_lp_definition.d = gop.d;
cfg->gop_lp_definition.t = gop.t;
} else if (atoi(value) == 8) {
cfg->gop_lowdelay = 0;
// GOP
@ -821,10 +791,6 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
fprintf(stderr, "Input error: unsupported gop length, must be 0 or 8\n");
return 0;
}
if (cfg->gop_len && cfg->tmvp_enable) {
cfg->tmvp_enable = false;
fprintf(stderr, "Disabling TMVP because GOP is used.\n");
}
}
else if OPT("bipred")
cfg->bipred = atobool(value);
@ -985,6 +951,97 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
return 1;
}
/**
 * \brief Build the low-delay P GOP structure into cfg->gop.
 *
 * Reads the GOP length from cfg->gop_len, the depth and temporal step from
 * cfg->gop_lp_definition and the number of references from cfg->ref_frames,
 * then fills cfg->gop[0 .. gop_len - 1]. Also sets cfg->gop_lowdelay.
 *
 * Does nothing when cfg->gop_len is not positive. A depth larger than the
 * internal depth table is clamped to the size of that table.
 *
 * \param cfg  configuration that is updated in place
 */
void kvz_config_process_lp_gop(kvz_config *cfg)
{
  // The key-frame is stored at cfg->gop[gop_len - 1] and gop_len is used as
  // a divisor below, so there is nothing valid to build for an empty GOP.
  if (cfg->gop_len <= 0) return;

  struct {
    unsigned g; // length
    unsigned d; // depth
    unsigned t; // temporal
  } gop;

  gop.g = cfg->gop_len;
  gop.d = cfg->gop_lp_definition.d;
  gop.t = cfg->gop_lp_definition.t;

  // Initialize modulos for testing depth.
  // The picture belongs to the lowest depth in which (poc % modulo) == 0.
  unsigned depth_modulos[8] = { 0 };
  const unsigned max_depth = sizeof(depth_modulos) / sizeof(depth_modulos[0]);
  if (gop.d > max_depth) {
    // Never index past the end of depth_modulos.
    gop.d = max_depth;
  }
  for (unsigned d = 0; d < gop.d; ++d) {
    depth_modulos[gop.d - 1 - d] = 1u << d;
  }
  depth_modulos[0] = gop.g;

  cfg->gop_lowdelay = 1;

  for (unsigned g = 1; g <= gop.g; ++g) {
    kvz_gop_config *gop_pic = &cfg->gop[g - 1];

    // Find gop depth for picture.
    unsigned gop_layer = 1;
    while (gop_layer < gop.d && (g % depth_modulos[gop_layer - 1])) {
      ++gop_layer;
    }

    gop_pic->poc_offset = g;
    gop_pic->layer = gop_layer;
    gop_pic->qp_offset = gop_layer;
    gop_pic->ref_pos_count = 0;
    gop_pic->ref_neg_count = cfg->ref_frames;
    gop_pic->is_ref = 0;

    // Set first ref to point to previous frame, and the rest to previous
    // key-frames.
    // If gop.t > 1, have (poc % gop.t) == 0 point gop.t frames away,
    // instead of the previous frame. Set the frames in between to
    // point to the nearest frame with a lower gop-depth.
    if (gop.t > 1) {
      if (gop_pic->poc_offset % gop.t == 0) {
        gop_pic->ref_neg[0] = gop.t;
      } else {
        int r = gop_pic->poc_offset - 1;
        while (r > 0) {
          if (cfg->gop[r].layer < gop_pic->layer) break;
          --r;
        }
        // Var r is now 0 or index of the pic with layer < depth.
        if (cfg->gop[r].layer < gop_pic->layer) {
          gop_pic->ref_neg[0] = gop_pic->poc_offset - cfg->gop[r].poc_offset;
          cfg->gop[r].is_ref = 1;
        } else {
          // No ref was found, just refer to the previous key-frame.
          // For the key-frame itself the modulo is 0, which would make the
          // picture reference itself; the previous key-frame is a whole
          // GOP away in that case.
          const unsigned delta = gop_pic->poc_offset % gop.g;
          gop_pic->ref_neg[0] = delta ? delta : gop.g;
        }
      }
    } else {
      gop_pic->ref_neg[0] = 1;
      if (gop_pic->poc_offset >= 2) {
        // The previous picture in the GOP is referenced by this one.
        cfg->gop[gop_pic->poc_offset - 2].is_ref = 1;
      }
    }

    // Remaining references point at earlier key-frames, skipping any
    // distance that is already taken by the preceding reference.
    int keyframe = gop_pic->poc_offset;
    for (int i = 1; i < gop_pic->ref_neg_count; ++i) {
      while (keyframe == gop_pic->ref_neg[i - 1]) {
        keyframe += gop.g;
      }
      gop_pic->ref_neg[i] = keyframe;
    }

    gop_pic->qp_factor = 0.4624;  // from HM
  }

  // Pictures that nothing refers to get a different qp_factor.
  for (unsigned g = 0; g < gop.g; ++g) {
    kvz_gop_config *gop_pic = &cfg->gop[g];
    if (!gop_pic->is_ref) {
      gop_pic->qp_factor = 0.68 * 1.31;  // derived from HM
    }
  }

  // Key-frame is always a reference.
  cfg->gop[gop.g - 1].is_ref = 1;
  cfg->gop[gop.g - 1].qp_factor = 0.578;  // from HM
}
/**
* \brief Check that configuration is sensible.
*
@ -1028,11 +1085,11 @@ int kvz_config_validate(const kvz_config *const cfg)
error = 1;
}
if (cfg->gop_len &&
cfg->intra_period &&
cfg->intra_period % cfg->gop_len != 0) {
if (cfg->gop_len && cfg->intra_period && !cfg->gop_lowdelay &&
cfg->intra_period % cfg->gop_len != 0)
{
fprintf(stderr,
"Input error: intra period (%d) not a multiple of gop length (%d)\n",
"Input error: intra period (%d) not a multiple of B-gop length (%d)\n",
cfg->intra_period,
cfg->gop_len);
error = 1;

View file

@ -36,6 +36,7 @@ kvz_config *kvz_config_alloc(void);
int kvz_config_init(kvz_config *cfg);
int kvz_config_destroy(kvz_config *cfg);
int kvz_config_parse(kvz_config *cfg, const char *name, const char *value);
void kvz_config_process_lp_gop(kvz_config *cfg);
int kvz_config_validate(const kvz_config *cfg);
#endif

View file

@ -24,6 +24,7 @@
#include <stdlib.h>
#include "cfg.h"
#include "strategyselector.h"
static int encoder_control_init_gop_layer_weights(encoder_control_t * const);
@ -36,51 +37,90 @@ static int size_of_wpp_ends(int threads)
static int select_owf_auto(const kvz_config *const cfg)
{
if (cfg->wpp) {
// If wpp is on, select owf such that less than 15% of the
// frame is covered by the are threads can not work at the same time.
if (cfg->intra_period == 1) {
if (cfg->wpp) {
// If wpp is on, select owf such that less than 15% of the frame is
// covered by the area where threads cannot work at the same time.
const int lcu_width = CEILDIV(cfg->width, LCU_WIDTH);
const int lcu_height = CEILDIV(cfg->height, LCU_WIDTH);
// Find the largest number of threads per frame that satisfies the
// condition: wpp start/stop inefficiency takes up less than 15%
// of frame area.
int threads_per_frame = 1;
const int wpp_treshold = lcu_width * lcu_height * 15 / 100;
while ((threads_per_frame + 1) * 2 < lcu_width &&
threads_per_frame + 1 < lcu_height &&
size_of_wpp_ends(threads_per_frame + 1) < wpp_treshold) {
++threads_per_frame;
}
const int threads = MAX(cfg->threads, 1);
const int frames = CEILDIV(threads, threads_per_frame);
// Convert from number of parallel frames to number of additional frames.
return CLIP(0, threads - 1, frames - 1);
} else {
// If wpp is not on, select owf such that there is enough
// tiles for twice the number of threads.
int tiles_per_frame = cfg->tiles_width_count * cfg->tiles_height_count;
int threads = (cfg->threads > 1 ? cfg->threads : 1);
int frames = CEILDIV(threads * 4, tiles_per_frame);
// Limit number of frames to 1.25x the number of threads for the case
// where there is only 1 tile per frame.
frames = CLIP(1, threads * 4 / 3, frames);
return frames - 1;
}
} else {
// Try and estimate a good number of parallel frames for inter.
const int lcu_width = CEILDIV(cfg->width, LCU_WIDTH);
const int lcu_height = CEILDIV(cfg->height, LCU_WIDTH);
int threads_per_frame = MIN(lcu_width / 2, lcu_height);
int threads = cfg->threads;
// Find the largest number of threads per frame that satifies the
// the condition: wpp start/stop inefficiency takes up less than 15%
// of frame area.
int threads_per_frame = 1;
const int wpp_treshold = lcu_width * lcu_height * 15 / 100;
while ((threads_per_frame + 1) * 2 < lcu_width &&
threads_per_frame + 1 < lcu_height &&
size_of_wpp_ends(threads_per_frame + 1) < wpp_treshold)
{
++threads_per_frame;
// If all threads fit into one frame, at least two parallel frames should
// be used to reduce the effect of WPP spin-up and wind-down.
int frames = 1;
while (threads > 0 && threads_per_frame > 0) {
frames += 1;
threads -= threads_per_frame;
threads_per_frame -= 2;
}
const int threads = MAX(cfg->threads, 1);
const int frames = CEILDIV(threads, threads_per_frame);
// Convert from number of parallel frames to number of additional frames.
return CLIP(0, threads - 1, frames - 1);
} else {
// If wpp is not on, select owf such that there is enough
// tiles for twice the number of threads.
int tiles_per_frame= cfg->tiles_width_count * cfg->tiles_height_count;
int threads = (cfg->threads > 1 ? cfg->threads : 1);
int frames = CEILDIV(threads * 4, tiles_per_frame);
// Limit number of frames to 1.25x the number of threads for the case
// where there is only 1 tile per frame.
frames = CLIP(1, threads * 4 / 3, frames);
return frames - 1;
if (cfg->gop_lowdelay && cfg->gop_lp_definition.t > 1) {
// Temporal skipping makes every other frame very fast to encode so
// more parallel frames should be used.
frames *= 2;
}
return CLIP(0, cfg->threads * 2 - 1, frames - 1);
}
}
/**
 * \brief Choose the default number of threads from the detected CPU counts.
 *
 * Reads kvz_g_hardware_flags.physical_cpu_count and
 * kvz_g_hardware_flags.logical_cpu_count. Counts that are zero or negative
 * are treated as unknown, so a failed CPU count query can not turn into a
 * huge unsigned thread count.
 *
 * \return number of threads to use, always at least 1
 */
static unsigned cfg_num_threads(void)
{
  const int physical = kvz_g_hardware_flags.physical_cpu_count;
  const int logical = kvz_g_hardware_flags.logical_cpu_count;

  if (physical <= 0) {
    // The physical count is unknown. Use the logical count when it is
    // known, for example a single logical CPU that reports hyperthreading.
    if (logical > 0) return (unsigned)logical;

    // Default to 4 if we don't know the number of CPUs.
    return 4;
  }

  // Logical CPUs that are not backed by a physical core of their own.
  const int fake_cpus = logical > physical ? logical - physical : 0;

  // 1.5 times the number of physical cores seems to be a good compromise
  // when hyperthreading is available on Haswell.
  return (unsigned)(physical + fake_cpus / 2);
}
/**
* \brief Allocate and initialize an encoder control structure.
*
* \param cfg encoder configuration
* \return initialized encoder control or NULL on failure
*/
encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg) {
encoder_control_t* kvz_encoder_control_init(kvz_config *const cfg) {
encoder_control_t *encoder = NULL;
if (!cfg) {
@ -88,6 +128,20 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg) {
goto init_failed;
}
if (cfg->threads == -1) {
cfg->threads = cfg_num_threads();
}
if (cfg->gop_len > 0) {
if (cfg->tmvp_enable) {
cfg->tmvp_enable = false;
fprintf(stderr, "Disabling TMVP because GOP is used.\n");
}
if (cfg->gop_lowdelay) {
kvz_config_process_lp_gop(cfg);
}
}
// Make sure that the parameters make sense.
if (!kvz_config_validate(cfg)) {
goto init_failed;

View file

@ -155,7 +155,7 @@ typedef struct encoder_control_t
} encoder_control_t;
encoder_control_t* kvz_encoder_control_init(const kvz_config *cfg);
encoder_control_t* kvz_encoder_control_init(kvz_config *cfg);
void kvz_encoder_control_free(encoder_control_t *encoder);
void kvz_encoder_control_input_init(encoder_control_t *encoder, int32_t width, int32_t height);

View file

@ -75,7 +75,9 @@ static kvz_encoder * kvazaar_open(const kvz_config *cfg)
goto kvazaar_open_failure;
}
encoder->control = kvz_encoder_control_init(cfg);
// FIXME: const qualifier discarded. I don't want to change kvazaar_open
// but I really need to change cfg.
encoder->control = kvz_encoder_control_init((kvz_config*)cfg);
if (!encoder->control) {
goto kvazaar_open_failure;
}

View file

@ -312,6 +312,11 @@ typedef struct kvz_config
enum kvz_input_format input_format; /*!< \brief Use Temporal Motion Vector Predictors. */
int32_t input_bitdepth; /*!< \brief Use Temporal Motion Vector Predictors. */
struct {
unsigned d; // depth
unsigned t; // temporal
} gop_lp_definition;
} kvz_config;
/**

View file

@ -24,6 +24,15 @@
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include <windows.h>
#elif MACOS
#include <sys/param.h>
#include <sys/sysctl.h>
#else
#include <unistd.h>
#endif
hardware_flags_t kvz_g_hardware_flags;
hardware_flags_t kvz_g_strategies_in_use;
hardware_flags_t kvz_g_strategies_available;
@ -410,6 +419,7 @@ static void set_hardware_flags(int32_t cpuid) {
CPUID1_EDX_MMX = 1 << 23,
CPUID1_EDX_SSE = 1 << 25,
CPUID1_EDX_SSE2 = 1 << 26,
CPUID1_EDX_HYPER_THREADING = 1 << 28,
};
enum {
CPUID1_ECX_SSE3 = 1 << 0,
@ -430,6 +440,21 @@ static void set_hardware_flags(int32_t cpuid) {
// Dig CPU features with cpuid
get_cpuid(1, 0, &cpuid1);
#ifdef _WIN32
SYSTEM_INFO systeminfo;
GetSystemInfo(&systeminfo);
kvz_g_hardware_flags.logical_cpu_count = systeminfo.dwNumberOfProcessors;
#else
kvz_g_hardware_flags.logical_cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
#endif
kvz_g_hardware_flags.physical_cpu_count = kvz_g_hardware_flags.logical_cpu_count;
kvz_g_hardware_flags.intel_flags.hyper_threading = cpuid1.edx & CPUID1_EDX_HYPER_THREADING;
if (kvz_g_hardware_flags.intel_flags.hyper_threading) {
kvz_g_hardware_flags.physical_cpu_count /= 2;
}
// EDX
if (cpuid1.edx & CPUID1_EDX_MMX) kvz_g_hardware_flags.intel_flags.mmx = 1;

View file

@ -63,6 +63,8 @@ typedef struct {
int sse42;
int avx;
int avx2;
bool hyper_threading;
} intel_flags;
struct {
@ -72,6 +74,9 @@ typedef struct {
struct {
int neon;
} arm_flags;
int logical_cpu_count;
int physical_cpu_count;
} hardware_flags_t;
extern hardware_flags_t kvz_g_hardware_flags;