Merge pull request #79 from lfasnacht/thread_visualizer

Better log file format for visualizer
This commit is contained in:
Ari Koivula 2014-08-11 14:26:19 +03:00
commit a38c3241e8
8 changed files with 102 additions and 50 deletions

View file

@ -729,21 +729,21 @@ static void encoder_state_write_bitstream_main(encoder_state * const main_state)
main_state->global->is_radl_frame ? NAL_IDR_W_RADL : NAL_TRAIL_R, 0, long_start_code);
}
{
PERFORMANCE_MEASURE_START();
PERFORMANCE_MEASURE_START(_DEBUG_PERF_FRAME_LEVEL);
for (i = 0; main_state->children[i].encoder_control; ++i) {
//Append bitstream to main stream
bitstream_append(&main_state->stream, &main_state->children[i].stream);
//FIXME: Move this...
bitstream_clear(&main_state->children[i].stream);
}
PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_append,frame=%d,encoder_type=%c", main_state->global->frame, main_state->type);
PERFORMANCE_MEASURE_END(_DEBUG_PERF_FRAME_LEVEL, main_state->encoder_control->threadqueue, "type=write_bitstream_append,frame=%d,encoder_type=%c", main_state->global->frame, main_state->type);
}
{
PERFORMANCE_MEASURE_START();
PERFORMANCE_MEASURE_START(_DEBUG_PERF_FRAME_LEVEL);
// Calculate checksum
add_checksum(main_state);
PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_checksum,frame=%d,encoder_type=%c", main_state->global->frame, main_state->type);
PERFORMANCE_MEASURE_END(_DEBUG_PERF_FRAME_LEVEL, main_state->encoder_control->threadqueue, "type=write_bitstream_checksum,frame=%d,encoder_type=%c", main_state->global->frame, main_state->type);
}
assert(main_state->tile->frame->poc == main_state->global->poc);

View file

@ -316,29 +316,32 @@ static void encoder_state_encode_leaf(encoder_state * const encoder_state) {
//If we're not using wavefronts, or we have a WAVEFRONT_ROW which is the single child of its parent, then we should not use parallelism
if (encoder_state->type != ENCODER_STATE_TYPE_WAVEFRONT_ROW || (encoder_state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW && !encoder_state->parent->children[1].encoder_control)) {
for (i = 0; i < encoder_state->lcu_order_count; ++i) {
PERFORMANCE_MEASURE_START();
PERFORMANCE_MEASURE_START(_DEBUG_PERF_ENCODE_LCU);
encoder_state_worker_encode_lcu(&encoder_state->lcu_order[i]);
#ifdef _DEBUG
{
const lcu_order_element * const lcu = &encoder_state->lcu_order[i];
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=search_lcu,frame=%d,tile=%d,slice=%d,position_x=%d,position_y=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, lcu->position.x + encoder_state->tile->lcu_offset_x, lcu->position.y + encoder_state->tile->lcu_offset_y);
PERFORMANCE_MEASURE_END(_DEBUG_PERF_ENCODE_LCU, encoder_state->encoder_control->threadqueue, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, lcu->position_px.x + encoder_state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + encoder_state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + encoder_state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + encoder_state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1);
}
#endif //_DEBUG
}
if (encoder->sao_enable) {
PERFORMANCE_MEASURE_START();
PERFORMANCE_MEASURE_START(_DEBUG_PERF_SAO_RECONSTRUCT_FRAME);
sao_reconstruct_frame(encoder_state);
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, encoder_state->lcu_order[0].position.y + encoder_state->tile->lcu_offset_y, encoder_state->lcu_order[encoder_state->lcu_order_count-1].position.y + encoder_state->tile->lcu_offset_y);
PERFORMANCE_MEASURE_END(_DEBUG_PERF_SAO_RECONSTRUCT_FRAME, encoder_state->encoder_control->threadqueue, "type=sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, encoder_state->lcu_order[0].position.y + encoder_state->tile->lcu_offset_y, encoder_state->lcu_order[encoder_state->lcu_order_count-1].position.y + encoder_state->tile->lcu_offset_y,
encoder_state->tile->lcu_offset_x * LCU_WIDTH, encoder_state->tile->frame->width + encoder_state->tile->lcu_offset_x * LCU_WIDTH - 1,
encoder_state->tile->lcu_offset_y * LCU_WIDTH, encoder_state->tile->frame->height + encoder_state->tile->lcu_offset_y * LCU_WIDTH - 1
);
}
} else {
for (i = 0; i < encoder_state->lcu_order_count; ++i) {
const lcu_order_element * const lcu = &encoder_state->lcu_order[i];
#ifdef _DEBUG
char job_description[256];
sprintf(job_description, "type=search_lcu,frame=%d,tile=%d,slice=%d,row=%d,position_x=%d,position_y=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, encoder_state->wfrow->lcu_offset_y, lcu->position.x + encoder_state->tile->lcu_offset_x, lcu->position.y + encoder_state->tile->lcu_offset_y);
sprintf(job_description, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, lcu->position_px.x + encoder_state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + encoder_state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + encoder_state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + encoder_state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1);
#else
char* job_description = NULL;
#endif
@ -392,14 +395,14 @@ static void encoder_state_worker_encode_children(void * opaque) {
encoder_state_encode(sub_state);
if (sub_state->is_leaf) {
if (sub_state->type != ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
PERFORMANCE_MEASURE_START();
PERFORMANCE_MEASURE_START(_DEBUG_PERF_WRITE_BITSTREAM_LEAF);
encoder_state_write_bitstream_leaf(sub_state);
PERFORMANCE_MEASURE_END(sub_state->encoder_control->threadqueue, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,row=%d-%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->lcu_order[0].position.y + sub_state->tile->lcu_offset_y, sub_state->lcu_order[sub_state->lcu_order_count-1].position.y + sub_state->tile->lcu_offset_y);
PERFORMANCE_MEASURE_END(_DEBUG_PERF_WRITE_BITSTREAM_LEAF, sub_state->encoder_control->threadqueue, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->lcu_order[0].position_px.x + sub_state->tile->lcu_offset_x * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.x + sub_state->lcu_order[sub_state->lcu_order_count-1].size.x + sub_state->tile->lcu_offset_x * LCU_WIDTH - 1, sub_state->lcu_order[0].position_px.y + sub_state->tile->lcu_offset_y * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.y + sub_state->lcu_order[sub_state->lcu_order_count-1].size.y + sub_state->tile->lcu_offset_y * LCU_WIDTH - 1);
} else {
threadqueue_job *job;
#ifdef _DEBUG
char job_description[256];
sprintf(job_description, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,row=%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->wfrow->lcu_offset_y);
sprintf(job_description, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->lcu_order[0].position_px.x + sub_state->tile->lcu_offset_x * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.x + sub_state->lcu_order[sub_state->lcu_order_count-1].size.x + sub_state->tile->lcu_offset_x * LCU_WIDTH - 1, sub_state->lcu_order[0].position_px.y + sub_state->tile->lcu_offset_y * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.y + sub_state->lcu_order[sub_state->lcu_order_count-1].size.y + sub_state->tile->lcu_offset_y * LCU_WIDTH - 1);
#else
char* job_description = NULL;
#endif
@ -516,13 +519,15 @@ static void encoder_state_encode(encoder_state * const main_state) {
char job_description[256];
switch (main_state->children[i].type) {
case ENCODER_STATE_TYPE_TILE:
sprintf(job_description, "frame=%d,tile=%d,row=%d-%d,position_x=%d,position_y=%d", main_state->children[i].global->frame, main_state->children[i].tile->id, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].tile->lcu_offset_x, main_state->children[i].tile->lcu_offset_y);
sprintf(job_description, "type=encode_child,frame=%d,tile=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", main_state->children[i].global->frame, main_state->children[i].tile->id, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y,
main_state->children[i].lcu_order[0].position_px.x + main_state->children[i].tile->lcu_offset_x * LCU_WIDTH, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position_px.x + main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].size.x + main_state->children[i].tile->lcu_offset_x * LCU_WIDTH - 1,
main_state->children[i].lcu_order[0].position_px.y + main_state->children[i].tile->lcu_offset_y * LCU_WIDTH, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position_px.y + main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].size.y + main_state->children[i].tile->lcu_offset_y * LCU_WIDTH - 1);
break;
case ENCODER_STATE_TYPE_SLICE:
sprintf(job_description, "frame=%d,slice=%d,start_in_ts=%d", main_state->children[i].global->frame, main_state->children[i].slice->id, main_state->children[i].slice->start_in_ts);
sprintf(job_description, "type=encode_child,frame=%d,slice=%d,start_in_ts=%d", main_state->children[i].global->frame, main_state->children[i].slice->id, main_state->children[i].slice->start_in_ts);
break;
default:
sprintf(job_description, "frame=%d,invalid", main_state->children[i].global->frame);
sprintf(job_description, "type=encode_child,frame=%d,invalid", main_state->children[i].global->frame);
break;
}
#else
@ -554,7 +559,7 @@ static void encoder_state_encode(encoder_state * const main_state) {
threadqueue_job *job;
#ifdef _DEBUG
char job_description[256];
sprintf(job_description, "type=sao,frame=%d,tile=%d,position_y=%d", main_state->global->frame, main_state->tile->id, y + main_state->tile->lcu_offset_y);
sprintf(job_description, "type=sao,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d", main_state->global->frame, main_state->tile->id, main_state->tile->lcu_offset_x * LCU_WIDTH, main_state->tile->lcu_offset_x * LCU_WIDTH + main_state->tile->frame->width - 1, (main_state->tile->lcu_offset_y + y) * LCU_WIDTH, MIN(main_state->tile->lcu_offset_y * LCU_WIDTH + main_state->tile->frame->height, (main_state->tile->lcu_offset_y + y + 1) * LCU_WIDTH)-1);
#else
char* job_description = NULL;
#endif
@ -683,14 +688,14 @@ static void _encode_one_frame_add_bitstream_deps(const encoder_state * const enc
void encode_one_frame(encoder_state * const main_state)
{
{
PERFORMANCE_MEASURE_START();
PERFORMANCE_MEASURE_START(_DEBUG_PERF_FRAME_LEVEL);
encoder_state_new_frame(main_state);
PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=new_frame,frame=%d,poc=%d", main_state->global->frame, main_state->global->poc);
PERFORMANCE_MEASURE_END(_DEBUG_PERF_FRAME_LEVEL, main_state->encoder_control->threadqueue, "type=new_frame,frame=%d,poc=%d", main_state->global->frame, main_state->global->poc);
}
{
PERFORMANCE_MEASURE_START();
PERFORMANCE_MEASURE_START(_DEBUG_PERF_FRAME_LEVEL);
encoder_state_encode(main_state);
PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=encode,frame=%d", main_state->global->frame);
PERFORMANCE_MEASURE_END(_DEBUG_PERF_FRAME_LEVEL, main_state->encoder_control->threadqueue, "type=encode,frame=%d", main_state->global->frame);
}
//threadqueue_flush(main_state->encoder_control->threadqueue);
{

View file

@ -184,6 +184,16 @@ typedef int16_t coefficient;
#define MAX_TR_DYNAMIC_RANGE 15
// Debug bitmask.
// Each flag selects one family of performance probes: a probe is active when
// (_DEBUG & flag) is non-zero, so _DEBUG may carry any OR of these values.
// They stay preprocessor macros (not an enum) because they are also
// evaluated inside #if directives.
#define _DEBUG_PERF_FRAME_LEVEL           (1 << 0) // 0x0001: per-frame stages (new_frame, encode, bitstream append/checksum)
#define _DEBUG_PERF_JOB                   (1 << 1) // 0x0002: thread queue job logging
#define _DEBUG_PERF_ENCODE_LCU            (1 << 2) // 0x0004: timing around encoding of a single LCU
#define _DEBUG_PERF_SAO_RECONSTRUCT_FRAME (1 << 3) // 0x0008: timing around sao_reconstruct_frame
#define _DEBUG_PERF_WRITE_BITSTREAM_LEAF  (1 << 4) // 0x0010: timing around encoder_state_write_bitstream_leaf
#define _DEBUG_PERF_SEARCH_CU             (1 << 5) // 0x0020: timing of search_cu
#define _DEBUG_PERF_SEARCH_PIXELS         (1 << 6) // 0x0040: timing of individual SAD evaluations in the motion search
//Constants
typedef enum { COLOR_Y = 0, COLOR_U, COLOR_V, NUM_COLORS } color_index;
enum { SLICE_B = 0, SLICE_P = 1, SLICE_I = 2 };

View file

@ -209,13 +209,18 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
const vector2d *pattern = &large_hexbs[i];
unsigned cost;
{
PERFORMANCE_MEASURE_START();
PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
cost = image_calc_sad(pic, ref, orig->x, orig->y,
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y,
block_width, block_width, max_lcu_below);
cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sad,frame=%d,ref=%d,x=%d,y=%d,ref_x=%d,ref_y=%d,width=%d,height=%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y, block_width, block_width);
PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
orig->x + mv.x + pattern->x,
orig->x + mv.x + pattern->x + block_width,
orig->y + mv.y + pattern->y,
orig->y + mv.y + pattern->y + block_width);
}
if (cost < best_cost) {
@ -229,13 +234,17 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
if (!(mv.x == 0 && mv.y == 0)) {
unsigned cost;
{
PERFORMANCE_MEASURE_START();
PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
cost = image_calc_sad(pic, ref, orig->x, orig->y,
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y,
block_width, block_width, max_lcu_below);
cost += calc_mvd_cost(encoder_state, 0, 0, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sad,frame=%d,ref=%d,x=%d,y=%d,ref_x=%d,ref_y=%d,width=%d,height=%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y, block_width, block_width);
PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=00vector,frame=%d,tile=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
orig->x,
orig->x + block_width,
orig->y,
orig->y + block_width);
}
// If the 0,0 is better, redo the hexagon around that point.
@ -250,13 +259,17 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
const vector2d *pattern = &large_hexbs[i];
unsigned cost;
{
PERFORMANCE_MEASURE_START();
PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
cost = image_calc_sad(pic, ref, orig->x, orig->y,
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + pattern->x,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + pattern->y,
block_width, block_width, max_lcu_below);
cost += calc_mvd_cost(encoder_state, pattern->x, pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sad,frame=%d,ref=%d,x=%d,y=%d,ref_x=%d,ref_y=%d,width=%d,height=%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + pattern->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + pattern->y, block_width, block_width);
PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=large_hexbs_around00,frame=%d,tile=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
orig->x + pattern->x,
orig->x + pattern->x + block_width,
orig->y + pattern->y,
orig->y + pattern->y + block_width);
}
if (cost < best_cost) {
@ -290,13 +303,17 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
const vector2d *offset = &large_hexbs[start + i];
unsigned cost;
{
PERFORMANCE_MEASURE_START();
PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
cost = image_calc_sad(pic, ref, orig->x, orig->y,
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
orig->x + mv.x + offset->x,
orig->y + mv.y + offset->y,
block_width, block_width, max_lcu_below);
cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sad,frame=%d,ref=%d,x=%d,y=%d,ref_x=%d,ref_y=%d,width=%d,height=%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, block_width, block_width);
PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=large_hexbs_iterative,frame=%d,tile=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
orig->x + mv.x + offset->x,
orig->x + mv.x + offset->x + block_width,
orig->y + mv.y + offset->y,
orig->y + mv.y + offset->y + block_width);
}
if (cost < best_cost) {
@ -318,13 +335,17 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
const vector2d *offset = &small_hexbs[i];
unsigned cost;
{
PERFORMANCE_MEASURE_START();
PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
cost = image_calc_sad(pic, ref, orig->x, orig->y,
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
block_width, block_width, max_lcu_below);
cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sad,frame=%d,ref=%d,x=%d,y=%d,ref_x=%d,ref_y=%d,width=%d,height=%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, block_width, block_width);
PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=small_hexbs,frame=%d,tile=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
orig->x + mv.x + offset->x,
orig->x + mv.x + offset->x + block_width,
orig->y + mv.y + offset->y,
orig->y + mv.y + offset->y + block_width);
}
if (cost > 0 && cost < best_cost) {
@ -1126,10 +1147,10 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
int cost = MAX_INT;
cu_info *cur_cu;
int x_local = (x&0x3f), y_local = (y&0x3f);
#if _DEBUG
#ifdef _DEBUG
int debug_split = 0;
#endif
PERFORMANCE_MEASURE_START();
PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_CU);
// Stop recursion if the CU is completely outside the frame.
if (x >= frame->width || y >= frame->height) {
@ -1237,7 +1258,12 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
}
}
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=search_cu,frame=%d,tile=%d,slice=%d,x=%d,y=%d,depth=%d,split=%d,cur_cu_is_intra=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, x, y, depth, debug_split, (cur_cu->type==CU_INTRA)?1:0);
PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_CU, encoder_state->encoder_control->threadqueue, "type=search_cu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d,depth=%d,split=%d,cur_cu_is_intra=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id,
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + x,
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + x + (LCU_WIDTH >> depth),
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + y,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + y + (LCU_WIDTH >> depth),
depth, debug_split, (cur_cu->type==CU_INTRA)?1:0);
return cost;
}

View file

@ -411,9 +411,9 @@ static void partial_butterfly_inverse_32_avx2(int16_t *src, int16_t *dst,
#define DCT_NXN_AVX2(n) \
static void dct_ ## n ## x ## n ## _avx2(int8_t bitdepth, int16_t *block, int16_t *coeff) { \
\
int16_t tmp[ ## n ## * ## n ##]; \
int32_t shift_1st = g_convert_to_bit[ ## n ## ] + 1 + (bitdepth - 8); \
int32_t shift_2nd = g_convert_to_bit[ ## n ## ] + 8; \
int16_t tmp[ n * n ]; \
int32_t shift_1st = g_convert_to_bit[ n ] + 1 + (bitdepth - 8); \
int32_t shift_2nd = g_convert_to_bit[ n ] + 8; \
\
partial_butterfly_ ## n ## _avx2(block, tmp, shift_1st); \
partial_butterfly_ ## n ## _avx2(tmp, coeff, shift_2nd); \
@ -422,7 +422,7 @@ static void dct_ ## n ## x ## n ## _avx2(int8_t bitdepth, int16_t *block, int16_
#define IDCT_NXN_AVX2(n) \
static void idct_ ## n ## x ## n ## _avx2(int8_t bitdepth, int16_t *block, int16_t *coeff) { \
\
int16_t tmp[ ## n ## * ## n ##]; \
int16_t tmp[ n * n ]; \
int32_t shift_1st = 7; \
int32_t shift_2nd = 12 - (bitdepth - 8); \
\

View file

@ -407,9 +407,9 @@ static void partial_butterfly_inverse_32_generic(int16_t *src, int16_t *dst,
#define DCT_NXN_GENERIC(n) \
static void dct_ ## n ## x ## n ## _generic(int8_t bitdepth, int16_t *block, int16_t *coeff) { \
\
int16_t tmp[ ## n ## * ## n ##]; \
int32_t shift_1st = g_convert_to_bit[ ## n ## ] + 1 + (bitdepth - 8); \
int32_t shift_2nd = g_convert_to_bit[ ## n ## ] + 8; \
int16_t tmp[ n * n ]; \
int32_t shift_1st = g_convert_to_bit[ n ] + 1 + (bitdepth - 8); \
int32_t shift_2nd = g_convert_to_bit[ n ] + 8; \
\
partial_butterfly_ ## n ## _generic(block, tmp, shift_1st); \
partial_butterfly_ ## n ## _generic(tmp, coeff, shift_2nd); \
@ -418,7 +418,7 @@ static void dct_ ## n ## x ## n ## _generic(int8_t bitdepth, int16_t *block, int
#define IDCT_NXN_GENERIC(n) \
static void idct_ ## n ## x ## n ## _generic(int8_t bitdepth, int16_t *block, int16_t *coeff) { \
\
int16_t tmp[ ## n ## * ## n ##]; \
int16_t tmp[ n * n ]; \
int32_t shift_1st = 7; \
int32_t shift_2nd = 12 - (bitdepth - 8); \
\

View file

@ -290,6 +290,7 @@ int threadqueue_init(threadqueue_queue * const threadqueue, int thread_count, in
static void threadqueue_free_job(threadqueue_queue * const threadqueue, int i)
{
#ifdef _DEBUG
#if _DEBUG & _DEBUG_PERF_JOB
int j;
GET_TIME(&threadqueue->queue[i]->debug_clock_dequeue);
fprintf(threadqueue->debug_log, "%p\t%d\t%lf\t+%lf\t+%lf\t+%lf\t%s\n", threadqueue->queue[i], threadqueue->queue[i]->debug_worker_id, CLOCK_T_AS_DOUBLE(threadqueue->queue[i]->debug_clock_enqueue), CLOCK_T_DIFF(threadqueue->queue[i]->debug_clock_enqueue, threadqueue->queue[i]->debug_clock_start), CLOCK_T_DIFF(threadqueue->queue[i]->debug_clock_start, threadqueue->queue[i]->debug_clock_stop), CLOCK_T_DIFF(threadqueue->queue[i]->debug_clock_stop, threadqueue->queue[i]->debug_clock_dequeue), threadqueue->queue[i]->debug_description);
@ -299,6 +300,7 @@ static void threadqueue_free_job(threadqueue_queue * const threadqueue, int i)
}
FREE_POINTER(threadqueue->queue[i]->debug_description);
#endif
#endif
FREE_POINTER(threadqueue->queue[i]->rdepends);
@ -315,6 +317,7 @@ static void threadqueue_free_jobs(threadqueue_queue * const threadqueue) {
threadqueue->queue_count = 0;
threadqueue->queue_start = 0;
#ifdef _DEBUG
#if _DEBUG & _DEBUG_PERF_JOB
{
CLOCK_T time;
GET_TIME(&time);
@ -322,6 +325,7 @@ static void threadqueue_free_jobs(threadqueue_queue * const threadqueue) {
fprintf(threadqueue->debug_log, "\t\t-\t-\t%lf\t-\tFLUSH\n", CLOCK_T_AS_DOUBLE(time));
}
#endif
#endif
}
int threadqueue_finalize(threadqueue_queue * const threadqueue) {
@ -491,9 +495,9 @@ threadqueue_job * threadqueue_submit(threadqueue_queue * const threadqueue, void
//No lock here... this should be constant
if (threadqueue->threads_count == 0) {
//FIXME: This should be improved in order to handle dependencies
PERFORMANCE_MEASURE_START();
PERFORMANCE_MEASURE_START(_DEBUG_PERF_JOB);
fptr(arg);
PERFORMANCE_MEASURE_END(threadqueue, "%s", debug_description);
PERFORMANCE_MEASURE_END(_DEBUG_PERF_JOB, threadqueue, "%s", debug_description);
return NULL;
}

View file

@ -118,12 +118,19 @@ int threadqueue_finalize(threadqueue_queue * threadqueue);
#ifdef _DEBUG
int threadqueue_log(threadqueue_queue * threadqueue, const CLOCK_T *start, const CLOCK_T *stop, const char* debug_description);
//This macro HAS TO BE at the beginning of a block
#define PERFORMANCE_MEASURE_START() CLOCK_T start, stop; GET_TIME(&start)
#define PERFORMANCE_MEASURE_END(threadqueue, str, ...) do {GET_TIME(&stop); {char job_description[256]; sprintf(job_description, (str), __VA_ARGS__); threadqueue_log((threadqueue), &start, &stop, job_description); }} while (0)
// ATTR_UNUSED marks a declaration as intentionally unused. On GNU-compatible
// compilers it expands to __attribute__((unused)); elsewhere it expands to
// nothing. The guard must test __GNUC__ (the macro those compilers predefine);
// the previous spelling `_GNUC` is never defined, which left the attribute
// permanently disabled.
#ifdef __GNUC__
#define ATTR_UNUSED __attribute__((unused))
#else
// NOTE(review): the two zero-argument no-op macros below look like leftovers
// of the pre-mask PERFORMANCE_MEASURE API -- confirm whether they belong here.
#define PERFORMANCE_MEASURE_START() do {} while (0)
#define PERFORMANCE_MEASURE_END(threadqueue, str, ...) do {} while (0)
#define ATTR_UNUSED
#endif
// PERFORMANCE_MEASURE_START(mask) declares the local timestamps `start` and
// `stop` and, when (_DEBUG & mask) is non-zero, records the start time.
// Because it expands to a declaration, this macro HAS TO BE at the beginning
// of a block, and it can appear only once per block.
#define PERFORMANCE_MEASURE_START(mask) CLOCK_T start ATTR_UNUSED, stop ATTR_UNUSED; if (_DEBUG & (mask)) GET_TIME(&start)
// PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) closes the measurement
// opened in the same block: when (_DEBUG & mask) is non-zero it records the
// stop time, formats the printf-style description (str, ...) and passes it with
// both timestamps to threadqueue_log(). The formatted description must fit in
// the 256-byte job_description buffer. `mask` is parenthesized so that
// composite masks such as (A | B) are tested as a whole. The definition must
// not end in a line continuation, otherwise the following preprocessor line
// would be absorbed into the macro body.
#define PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) do {if (_DEBUG & (mask)) { GET_TIME(&stop); {char job_description[256]; sprintf(job_description, (str), __VA_ARGS__); threadqueue_log((threadqueue), &start, &stop, job_description);}}} while (0)
#else
#define PERFORMANCE_MEASURE_START(mask) do {} while (0)
#define PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) do {} while (0)
#endif
/* Constraints: