Rewrite owf=auto code to be more general.

- Change the definition to be a bit more general. The mapping from resolution
  to owf frames stays mostly the same, but unusual resolutions should now be
  handled better.
- Move everything to config module.
- Fix handling of tiles. It had a bug where owf for tiles was always
  threads * 4/3 - 1. Works as intended now.
This commit is contained in:
Ari Koivula 2014-12-05 14:42:09 +02:00
parent 33b32de6c9
commit 51b5692121
4 changed files with 60 additions and 42 deletions

View file

@ -602,3 +602,59 @@ int config_validate(config *cfg)
}
return 1;
}
// Size of the start and end regions of a WPP frame in which the given number
// of threads cannot all be running at once. The formula comes from the shape
// of that region and equals 4 * threads^2 - 2 * threads.
// NOTE(review): the result is compared against an LCU count by the caller,
// so the unit is presumably LCUs - confirm.
int size_of_wpp_ends(int threads)
{
  const int doubled = 2 * threads;
  return doubled * (doubled - 1);
}
// Choose a value for cfg->owf when --owf=auto was requested.
//
// With WPP enabled, the frame size in LCUs decides how many threads a single
// frame can keep busy, and owf is set so that all threads have work.
// Without WPP, owf is derived from the number of tiles per frame.
//
// cfg: configuration to read (threads, wpp, width, height, tile counts);
//      cfg->owf is overwritten.
// Returns 1; there is no failure path. The chosen value is also reported
// on stderr.
int config_set_owf_auto(config *cfg)
{
  // Thread counts below 1 are treated as a single thread.
  const int thread_count = (cfg->threads > 1 ? cfg->threads : 1);

  if (!cfg->wpp) {
    // No WPP: pick owf so that there are enough tiles in flight for twice
    // the number of threads.
    // NOTE(review): the "+ 1" suggests the tile counts store the number of
    // splits rather than the number of tiles - confirm against the parser.
    const int tile_columns =
        (cfg->tiles_width_count > 0 ? cfg->tiles_width_count + 1 : 1);
    const int tile_rows =
        (cfg->tiles_height_count > 0 ? cfg->tiles_height_count + 1 : 1);
    const int tiles_in_frame = tile_columns * tile_rows;

    // Keep the frame count between 1 and threads * 4 / 3 (integer division).
    // The upper limit matters when there is only one tile per frame.
    const int frame_cap = thread_count * 4 / 3;
    const int wanted_frames = CEILDIV(thread_count * 2, tiles_in_frame);
    const int parallel_frames = CLIP(1, frame_cap, wanted_frames);

    // owf counts additional frames, not the total number of parallel frames.
    cfg->owf = parallel_frames - 1;
  } else {
    // WPP: frame dimensions in LCUs, rounded up so partial LCUs count.
    const int lcus_wide = CEILDIV(cfg->width, LCU_WIDTH);
    const int lcus_high = CEILDIV(cfg->height, LCU_WIDTH);

    // The area in which threads can not all work at the same time must stay
    // below 15% of the frame area.
    const int ends_budget = lcus_wide * lcus_high * 15 / 100;

    // Find the largest number of threads per frame that satisfies the
    // budget while also staying below half the frame width and below the
    // frame height (both in LCUs).
    int per_frame = 1;
    for (;;) {
      const int candidate = per_frame + 1;
      if (candidate * 2 >= lcus_wide ||
          candidate >= lcus_high ||
          size_of_wpp_ends(candidate) >= ends_budget)
      {
        break;
      }
      per_frame = candidate;
    }

    const int parallel_frames = CEILDIV(thread_count, per_frame);

    // Convert from number of parallel frames to number of additional frames.
    cfg->owf = CLIP(0, thread_count - 1, parallel_frames - 1);
  }

  fprintf(stderr, "--owf=auto value set to %d.\n", cfg->owf);

  return 1;
}

View file

@ -89,5 +89,6 @@ int config_init(config *cfg);
int config_destroy(config *cfg);
int config_read(config *cfg,int argc, char *argv[]);
int config_validate(config *cfg);
// Resolve --owf=auto: choose cfg->owf from the thread count and either the
// frame size in LCUs (WPP on) or the number of tiles per frame (WPP off).
// Prints the chosen value to stderr and returns 1.
int config_set_owf_auto(config *cfg);
#endif

View file

@ -195,49 +195,9 @@ int main(int argc, char *argv[])
}
if (cfg->owf == -1) {
if (cfg->wpp) {
// If --owf=auto and wpp is on, select owf according to the lesser dimension.
// An ok rule for all intra seems to be to always have at least 4 wpp
// streams per thread. For a single frame that would mean that all threads
// are working for at least half of the frame.
int lcu_width = (cfg->width + LCU_WIDTH - 1) / LCU_WIDTH;
int lcu_height = (cfg->height + LCU_WIDTH - 1) / LCU_WIDTH;
int min_dimension = MIN(lcu_width, lcu_height);
int threads = (cfg->threads > 1 ? cfg->threads : 1);
// Find the largest number of threads per frame that satifies the
// the condition that there are 4 wpp streams per thread.
int threads_per_frame = 1;
while (min_dimension / (threads_per_frame + 1) >= 4) {
++threads_per_frame;
if (!config_set_owf_auto(cfg)) {
goto exit_failure;
}
// Get ceil(threads / threads_per_frame).
int frames = (threads + threads_per_frame - 1) / threads_per_frame;
cfg->owf = CLIP(0, threads - 1, frames - 1);
} else {
// If --owf=auto and wpp is not on, select owf such that there is enough
// tiles for twice the number of threads. That should make sure there are
// always some tiles to work on.
int tiles_per_frame = 1;
if (cfg->tiles_width_split != NULL) {
tiles_per_frame *= cfg->tiles_width_count;
}
if (cfg->tiles_height_split != NULL) {
tiles_per_frame *= cfg->tiles_height_count;
}
int threads = (cfg->threads > 1 ? cfg->threads : 1);
// Get ceil(threads * 2 / tiles_per_frame).
int frames = (threads * 2 + tiles_per_frame - 1) / tiles_per_frame;
// Limit number of frames to 1.25x the number of threads for the case
// where there is only 1 tile per frame.
frames = CLIP(1, threads * 4 / 3, frames);
cfg->owf = frames - 1;
}
fprintf(stderr, "--owf=auto value set to %d.\n", cfg->owf);
}
// Do more validation to make sure the parameters we have make sense.

View file

@ -124,6 +124,7 @@ typedef int16_t coefficient;
// Width of a CU at the given depth: LCU_WIDTH halved once per depth level.
// The argument is parenthesized so that expressions with operators of lower
// precedence than >> (e.g. "a & 1") are shifted as a whole.
#define CU_WIDTH_FROM_DEPTH(depth) (LCU_WIDTH >> (depth))
// True when min_val <= val <= max_val (both bounds inclusive).
// Note: val is evaluated twice.
#define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val))
// Combines the parities of x_pu and y_pu into one index: x parity in bit 0,
// y parity in bit 1 (0..3 for non-negative coordinates).
#define PU_INDEX(x_pu, y_pu) (((x_pu) % 2) + 2 * ((y_pu) % 2))
// Integer division of x by y rounded up. Correct for non-negative x and
// positive y. Note: y is evaluated twice.
#define CEILDIV(x,y) (((x) + (y) - 1) / (y))
#define LOG2_LCU_WIDTH 6
// CU_TO_PIXEL = y * lcu_width * pic_width + x * lcu_width