From 51b5692121213bfe5c76efff418fbf9e84f7c95b Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Fri, 5 Dec 2014 14:42:09 +0200 Subject: [PATCH] Rewrite owf=auto code to be more general. - Change the definition to be a bit more general. The mapping from resolution to owf frames stays mostly the same, but should handle weird resolutions better. - Move everything to config module. - Fix handling of tiles. It had a bug where owf for tiles was always threads * 4/3 - 1. Works as intended now. --- src/config.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/config.h | 1 + src/encmain.c | 44 ++-------------------------------------- src/global.h | 1 + 4 files changed, 60 insertions(+), 42 deletions(-) diff --git a/src/config.c b/src/config.c index 85151b60..412b0798 100644 --- a/src/config.c +++ b/src/config.c @@ -602,3 +602,59 @@ int config_validate(config *cfg) } return 1; } + +int size_of_wpp_ends(int threads) +{ + // Based on the shape of the area where all threads can't yet run in parallel. + return 4 * threads * threads - 2 * threads; +} + +int config_set_owf_auto(config *cfg) +{ + if (cfg->wpp) { + // If wpp is on, select owf such that less than 15% of the + // frame is covered by the area where threads cannot work at the same time. + const int lcu_width = CEILDIV(cfg->width, LCU_WIDTH); + const int lcu_height = CEILDIV(cfg->height, LCU_WIDTH); + + // Find the largest number of threads per frame that satisfies the + // condition: wpp start/stop inefficiency takes up less than 15% + // of frame area. + int threads_per_frame = 1; + const int wpp_treshold = lcu_width * lcu_height * 15 / 100; + while ((threads_per_frame + 1) * 2 < lcu_width && + threads_per_frame + 1 < lcu_height && + size_of_wpp_ends(threads_per_frame + 1) < wpp_treshold) + { + ++threads_per_frame; + } + + const int threads = (cfg->threads > 1 ? 
cfg->threads : 1); + const int frames = CEILDIV(threads, threads_per_frame); + + // Convert from number of parallel frames to number of additional frames. + cfg->owf = CLIP(0, threads - 1, frames - 1); + } else { + // If wpp is not on, select owf such that there are enough + // tiles for twice the number of threads. + + int tiles_per_frame = 1; + if (cfg->tiles_width_count > 0) { + tiles_per_frame *= cfg->tiles_width_count + 1; + } + if (cfg->tiles_height_count > 0) { + tiles_per_frame *= cfg->tiles_height_count + 1; + } + int threads = (cfg->threads > 1 ? cfg->threads : 1); + int frames = CEILDIV(threads * 2, tiles_per_frame); + + // Limit number of frames to 4/3 times the number of threads for the case + // where there is only 1 tile per frame. + frames = CLIP(1, threads * 4 / 3, frames); + cfg->owf = frames - 1; + } + + fprintf(stderr, "--owf=auto value set to %d.\n", cfg->owf); + + return 1; +} diff --git a/src/config.h b/src/config.h index e6974c52..39e3b232 100644 --- a/src/config.h +++ b/src/config.h @@ -89,5 +89,6 @@ int config_init(config *cfg); int config_destroy(config *cfg); int config_read(config *cfg,int argc, char *argv[]); int config_validate(config *cfg); +int config_set_owf_auto(config *cfg); #endif diff --git a/src/encmain.c b/src/encmain.c index ff8fe97f..439b89a7 100644 --- a/src/encmain.c +++ b/src/encmain.c @@ -195,49 +195,9 @@ int main(int argc, char *argv[]) } if (cfg->owf == -1) { - if (cfg->wpp) { - // If --owf=auto and wpp is on, select owf according to the lesser dimension. - // An ok rule for all intra seems to be to always have at least 4 wpp - // streams per thread. For a single frame that would mean that all threads - // are working for at least half of the frame. - int lcu_width = (cfg->width + LCU_WIDTH - 1) / LCU_WIDTH; - int lcu_height = (cfg->height + LCU_WIDTH - 1) / LCU_WIDTH; - int min_dimension = MIN(lcu_width, lcu_height); - int threads = (cfg->threads > 1 ? 
cfg->threads : 1); - - // Find the largest number of threads per frame that satifies the - // the condition that there are 4 wpp streams per thread. - int threads_per_frame = 1; - while (min_dimension / (threads_per_frame + 1) >= 4) { - ++threads_per_frame; - } - - // Get ceil(threads / threads_per_frame). - int frames = (threads + threads_per_frame - 1) / threads_per_frame; - cfg->owf = CLIP(0, threads - 1, frames - 1); - } else { - // If --owf=auto and wpp is not on, select owf such that there is enough - // tiles for twice the number of threads. That should make sure there are - // always some tiles to work on. - - int tiles_per_frame = 1; - if (cfg->tiles_width_split != NULL) { - tiles_per_frame *= cfg->tiles_width_count; - } - if (cfg->tiles_height_split != NULL) { - tiles_per_frame *= cfg->tiles_height_count; - } - int threads = (cfg->threads > 1 ? cfg->threads : 1); - // Get ceil(threads * 2 / tiles_per_frame). - int frames = (threads * 2 + tiles_per_frame - 1) / tiles_per_frame; - - // Limit number of frames to 1.25x the number of threads for the case - // where there is only 1 tile per frame. - frames = CLIP(1, threads * 4 / 3, frames); - cfg->owf = frames - 1; + if (!config_set_owf_auto(cfg)) { + goto exit_failure; } - - fprintf(stderr, "--owf=auto value set to %d.\n", cfg->owf); } // Do more validation to make sure the parameters we have make sense. diff --git a/src/global.h b/src/global.h index 22a2ecb6..773d2560 100644 --- a/src/global.h +++ b/src/global.h @@ -124,6 +124,7 @@ typedef int16_t coefficient; #define CU_WIDTH_FROM_DEPTH(depth) (LCU_WIDTH >> depth) #define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val)) #define PU_INDEX(x_pu, y_pu) (((x_pu) % 2) + 2 * ((y_pu) % 2)) +#define CEILDIV(x,y) (((x) + (y) - 1) / (y)) #define LOG2_LCU_WIDTH 6 // CU_TO_PIXEL = y * lcu_width * pic_width + x * lcu_width