From f1b303a2d2b89f7e164b5b74254db3eb4c4f9718 Mon Sep 17 00:00:00 2001 From: Laurent Fasnacht Date: Mon, 11 Aug 2014 09:53:06 +0200 Subject: [PATCH 1/4] Fix compilation errors --- src/strategies/avx2/dct-avx2.c | 8 ++++---- src/strategies/generic/dct-generic.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index e79a1176..1e79cce8 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -411,9 +411,9 @@ static void partial_butterfly_inverse_32_avx2(int16_t *src, int16_t *dst, #define DCT_NXN_AVX2(n) \ static void dct_ ## n ## x ## n ## _avx2(int8_t bitdepth, int16_t *block, int16_t *coeff) { \ \ - int16_t tmp[ ## n ## * ## n ##]; \ - int32_t shift_1st = g_convert_to_bit[ ## n ## ] + 1 + (bitdepth - 8); \ - int32_t shift_2nd = g_convert_to_bit[ ## n ## ] + 8; \ + int16_t tmp[ n * n ]; \ + int32_t shift_1st = g_convert_to_bit[ n ] + 1 + (bitdepth - 8); \ + int32_t shift_2nd = g_convert_to_bit[ n ] + 8; \ \ partial_butterfly_ ## n ## _avx2(block, tmp, shift_1st); \ partial_butterfly_ ## n ## _avx2(tmp, coeff, shift_2nd); \ @@ -422,7 +422,7 @@ static void dct_ ## n ## x ## n ## _avx2(int8_t bitdepth, int16_t *block, int16_ #define IDCT_NXN_AVX2(n) \ static void idct_ ## n ## x ## n ## _avx2(int8_t bitdepth, int16_t *block, int16_t *coeff) { \ \ - int16_t tmp[ ## n ## * ## n ##]; \ + int16_t tmp[ n * n ]; \ int32_t shift_1st = 7; \ int32_t shift_2nd = 12 - (bitdepth - 8); \ \ diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index 3313eb27..cf581c13 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -407,9 +407,9 @@ static void partial_butterfly_inverse_32_generic(int16_t *src, int16_t *dst, #define DCT_NXN_GENERIC(n) \ static void dct_ ## n ## x ## n ## _generic(int8_t bitdepth, int16_t *block, int16_t *coeff) { \ \ - int16_t tmp[ ## n ## * ## n ##]; \ - int32_t shift_1st = g_convert_to_bit[ ## n ## ] + 1 + (bitdepth - 8); \ - int32_t shift_2nd = g_convert_to_bit[ ## n ## ] + 8; \ + int16_t tmp[ n * n ]; \ + int32_t shift_1st = g_convert_to_bit[ n ] + 1 + (bitdepth - 8); \ + int32_t shift_2nd = g_convert_to_bit[ n ] + 8; \ \ partial_butterfly_ ## n ## _generic(block, tmp, shift_1st); \ partial_butterfly_ ## n ## _generic(tmp, coeff, shift_2nd); \ @@ -418,7 +418,7 @@ static void dct_ ## n ## x ## n ## _generic(int8_t bitdepth, int16_t *block, int #define IDCT_NXN_GENERIC(n) \ static void idct_ ## n ## x ## n ## _generic(int8_t bitdepth, int16_t *block, int16_t *coeff) { \ \ - int16_t tmp[ ## n ## * ## n ##]; \ + int16_t tmp[ n * n ]; \ int32_t shift_1st = 7; \ int32_t shift_2nd = 12 - (bitdepth - 8); \ \ From 8502f3d8504793d9d6debf8634a085baf837de7a Mon Sep 17 00:00:00 2001 From: Laurent Fasnacht Date: Mon, 11 Aug 2014 11:35:36 +0200 Subject: [PATCH 2/4] Improve logging --- src/encoder_state-bitstream.c | 8 +++--- src/encoderstate.c | 37 ++++++++++++++----------- src/global.h | 10 +++++++ src/search.c | 51 ++++++++++++++++++++++++----------- src/threadqueue.c | 8 ++++-- src/threadqueue.h | 17 ++++++++---- 6 files changed, 89 insertions(+), 42 deletions(-) diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c index 95c486e3..7169e7f7 100644 --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c @@ -729,21 +729,21 @@ static void encoder_state_write_bitstream_main(encoder_state * const main_state) main_state->global->is_radl_frame ? NAL_IDR_W_RADL : NAL_TRAIL_R, 0, long_start_code); } { - PERFORMANCE_MEASURE_START(); + PERFORMANCE_MEASURE_START(_DEBUG_PERF_FRAME_LEVEL); for (i = 0; main_state->children[i].encoder_control; ++i) { //Append bitstream to main stream bitstream_append(&main_state->stream, &main_state->children[i].stream); //FIXME: Move this... bitstream_clear(&main_state->children[i].stream); } - PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_append,frame=%d,encoder_type=%c", main_state->global->frame, main_state->type); + PERFORMANCE_MEASURE_END(_DEBUG_PERF_FRAME_LEVEL, main_state->encoder_control->threadqueue, "type=write_bitstream_append,frame=%d,encoder_type=%c", main_state->global->frame, main_state->type); } { - PERFORMANCE_MEASURE_START(); + PERFORMANCE_MEASURE_START(_DEBUG_PERF_FRAME_LEVEL); // Calculate checksum add_checksum(main_state); - PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_checksum,frame=%d,encoder_type=%c", main_state->global->frame, main_state->type); + PERFORMANCE_MEASURE_END(_DEBUG_PERF_FRAME_LEVEL, main_state->encoder_control->threadqueue, "type=write_bitstream_checksum,frame=%d,encoder_type=%c", main_state->global->frame, main_state->type); } assert(main_state->tile->frame->poc == main_state->global->poc); diff --git a/src/encoderstate.c b/src/encoderstate.c index 284b5ade..59147fd6 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -316,29 +316,32 @@ static void encoder_state_encode_leaf(encoder_state * const encoder_state) { //If we're not using wavefronts, or we have a WAVEFRONT_ROW which is the single child of its parent, than we should not use parallelism if (encoder_state->type != ENCODER_STATE_TYPE_WAVEFRONT_ROW || (encoder_state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW && !encoder_state->parent->children[1].encoder_control)) { for (i = 0; i < encoder_state->lcu_order_count; ++i) { - PERFORMANCE_MEASURE_START(); + PERFORMANCE_MEASURE_START(_DEBUG_PERF_ENCODE_LCU); encoder_state_worker_encode_lcu(&encoder_state->lcu_order[i]); #ifdef _DEBUG { const lcu_order_element * const lcu = &encoder_state->lcu_order[i]; - PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=search_lcu,frame=%d,tile=%d,slice=%d,position_x=%d,position_y=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, lcu->position.x + encoder_state->tile->lcu_offset_x, lcu->position.y + encoder_state->tile->lcu_offset_y); + PERFORMANCE_MEASURE_END(_DEBUG_PERF_ENCODE_LCU, encoder_state->encoder_control->threadqueue, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, lcu->position_px.x + encoder_state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + encoder_state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + encoder_state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + encoder_state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1); } #endif //_DEBUG } if (encoder->sao_enable) { - PERFORMANCE_MEASURE_START(); + PERFORMANCE_MEASURE_START(_DEBUG_PERF_SAO_RECONSTRUCT_FRAME); sao_reconstruct_frame(encoder_state); - PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, encoder_state->lcu_order[0].position.y + encoder_state->tile->lcu_offset_y, encoder_state->lcu_order[encoder_state->lcu_order_count-1].position.y + encoder_state->tile->lcu_offset_y); + PERFORMANCE_MEASURE_END(_DEBUG_PERF_SAO_RECONSTRUCT_FRAME, encoder_state->encoder_control->threadqueue, "type=sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, encoder_state->lcu_order[0].position.y + encoder_state->tile->lcu_offset_y, encoder_state->lcu_order[encoder_state->lcu_order_count-1].position.y + encoder_state->tile->lcu_offset_y, + encoder_state->tile->lcu_offset_x * LCU_WIDTH, encoder_state->tile->frame->width + encoder_state->tile->lcu_offset_x * LCU_WIDTH - 1, + encoder_state->tile->lcu_offset_y * LCU_WIDTH, encoder_state->tile->frame->height + encoder_state->tile->lcu_offset_y * LCU_WIDTH - 1 + ); } } else { for (i = 0; i < encoder_state->lcu_order_count; ++i) { const lcu_order_element * const lcu = &encoder_state->lcu_order[i]; #ifdef _DEBUG char job_description[256]; - sprintf(job_description, "type=search_lcu,frame=%d,tile=%d,slice=%d,row=%d,position_x=%d,position_y=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, encoder_state->wfrow->lcu_offset_y, lcu->position.x + encoder_state->tile->lcu_offset_x, lcu->position.y + encoder_state->tile->lcu_offset_y); + sprintf(job_description, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, lcu->position_px.x + encoder_state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + encoder_state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + encoder_state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + encoder_state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1); #else char* job_description = NULL; #endif @@ -392,14 +395,14 @@ static void encoder_state_worker_encode_children(void * opaque) { encoder_state_encode(sub_state); if (sub_state->is_leaf) { if (sub_state->type != ENCODER_STATE_TYPE_WAVEFRONT_ROW) { - PERFORMANCE_MEASURE_START(); + PERFORMANCE_MEASURE_START(_DEBUG_PERF_WRITE_BITSTREAM_LEAF); encoder_state_write_bitstream_leaf(sub_state); - PERFORMANCE_MEASURE_END(sub_state->encoder_control->threadqueue, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,row=%d-%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->lcu_order[0].position.y + sub_state->tile->lcu_offset_y, sub_state->lcu_order[sub_state->lcu_order_count-1].position.y + sub_state->tile->lcu_offset_y); + PERFORMANCE_MEASURE_END(_DEBUG_PERF_WRITE_BITSTREAM_LEAF, sub_state->encoder_control->threadqueue, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->lcu_order[0].position_px.x + sub_state->tile->lcu_offset_x * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.x + sub_state->lcu_order[sub_state->lcu_order_count-1].size.x + sub_state->tile->lcu_offset_x * LCU_WIDTH - 1, sub_state->lcu_order[0].position_px.y + sub_state->tile->lcu_offset_y * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.y + sub_state->lcu_order[sub_state->lcu_order_count-1].size.y + sub_state->tile->lcu_offset_y * LCU_WIDTH - 1); } else { threadqueue_job *job; #ifdef _DEBUG char job_description[256]; - sprintf(job_description, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,row=%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->wfrow->lcu_offset_y); + sprintf(job_description, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->lcu_order[0].position_px.x + sub_state->tile->lcu_offset_x * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.x + sub_state->lcu_order[sub_state->lcu_order_count-1].size.x + sub_state->tile->lcu_offset_x * LCU_WIDTH - 1, sub_state->lcu_order[0].position_px.y + sub_state->tile->lcu_offset_y * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.y + sub_state->lcu_order[sub_state->lcu_order_count-1].size.y + sub_state->tile->lcu_offset_y * LCU_WIDTH - 1); #else char* job_description = NULL; #endif @@ -516,13 +519,15 @@ static void encoder_state_encode(encoder_state * const main_state) { char job_description[256]; switch (main_state->children[i].type) { case ENCODER_STATE_TYPE_TILE: - sprintf(job_description, "frame=%d,tile=%d,row=%d-%d,position_x=%d,position_y=%d", main_state->children[i].global->frame, main_state->children[i].tile->id, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].tile->lcu_offset_x, main_state->children[i].tile->lcu_offset_y); + sprintf(job_description, "type=encode_child,frame=%d,tile=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", main_state->children[i].global->frame, main_state->children[i].tile->id, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y, + main_state->children[i].lcu_order[0].position_px.x + main_state->children[i].tile->lcu_offset_x * LCU_WIDTH, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position_px.x + main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].size.x + main_state->children[i].tile->lcu_offset_x * LCU_WIDTH - 1, + main_state->children[i].lcu_order[0].position_px.y + main_state->children[i].tile->lcu_offset_y * LCU_WIDTH, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position_px.y + main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].size.y + main_state->children[i].tile->lcu_offset_y * LCU_WIDTH - 1); break; case ENCODER_STATE_TYPE_SLICE: - sprintf(job_description, "frame=%d,slice=%d,start_in_ts=%d", main_state->children[i].global->frame, main_state->children[i].slice->id, main_state->children[i].slice->start_in_ts); + sprintf(job_description, "type=encode_child,frame=%d,slice=%d,start_in_ts=%d", main_state->children[i].global->frame, main_state->children[i].slice->id, main_state->children[i].slice->start_in_ts); break; default: - sprintf(job_description, "frame=%d,invalid", main_state->children[i].global->frame); + sprintf(job_description, "type=encode_child,frame=%d,invalid", main_state->children[i].global->frame); break; } #else @@ -554,7 +559,7 @@ static void encoder_state_encode(encoder_state * const main_state) { threadqueue_job *job; #ifdef _DEBUG char job_description[256]; - sprintf(job_description, "type=sao,frame=%d,tile=%d,position_y=%d", main_state->global->frame, main_state->tile->id, y + main_state->tile->lcu_offset_y); + sprintf(job_description, "type=sao,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d", main_state->global->frame, main_state->tile->id, main_state->tile->lcu_offset_x * LCU_WIDTH, main_state->tile->lcu_offset_x * LCU_WIDTH + main_state->tile->frame->width - 1, (main_state->tile->lcu_offset_y + y) * LCU_WIDTH, MIN(main_state->tile->lcu_offset_y * LCU_WIDTH + main_state->tile->frame->height, (main_state->tile->lcu_offset_y + y + 1) * LCU_WIDTH)-1); #else char* job_description = NULL; #endif @@ -683,14 +688,14 @@ static void _encode_one_frame_add_bitstream_deps(const encoder_state * const enc void encode_one_frame(encoder_state * const main_state) { { - PERFORMANCE_MEASURE_START(); + PERFORMANCE_MEASURE_START(_DEBUG_PERF_FRAME_LEVEL); encoder_state_new_frame(main_state); - PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=new_frame,frame=%d,poc=%d", main_state->global->frame, main_state->global->poc); + PERFORMANCE_MEASURE_END(_DEBUG_PERF_FRAME_LEVEL, main_state->encoder_control->threadqueue, "type=new_frame,frame=%d,poc=%d", main_state->global->frame, main_state->global->poc); } { - PERFORMANCE_MEASURE_START(); + PERFORMANCE_MEASURE_START(_DEBUG_PERF_FRAME_LEVEL); encoder_state_encode(main_state); - PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=encode,frame=%d", main_state->global->frame); + PERFORMANCE_MEASURE_END(_DEBUG_PERF_FRAME_LEVEL, main_state->encoder_control->threadqueue, "type=encode,frame=%d", main_state->global->frame); } //threadqueue_flush(main_state->encoder_control->threadqueue); { diff --git a/src/global.h b/src/global.h index c7369a7e..b18e2527 100644 --- a/src/global.h +++ b/src/global.h @@ -184,6 +184,16 @@ typedef int16_t coefficient; #define MAX_TR_DYNAMIC_RANGE 15 + +//DEBUG BITMASK +#define _DEBUG_PERF_FRAME_LEVEL 0x0001 +#define _DEBUG_PERF_JOB 0x0002 +#define _DEBUG_PERF_ENCODE_LCU 0x0004 +#define _DEBUG_PERF_SAO_RECONSTRUCT_FRAME 0x0008 +#define _DEBUG_PERF_WRITE_BITSTREAM_LEAF 0x0010 +#define _DEBUG_PERF_SEARCH_PIXELS 0x0020 +#define _DEBUG_PERF_SEARCH_CU 0x0040 + //Constants typedef enum { COLOR_Y = 0, COLOR_U, COLOR_V, NUM_COLORS } color_index; enum { SLICE_B = 0, SLICE_P = 1, SLICE_I = 2 }; diff --git a/src/search.c b/src/search.c index 86eafc38..cdcdc807 100644 --- a/src/search.c +++ b/src/search.c @@ -209,13 +209,18 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign const vector2d *pattern = &large_hexbs[i]; unsigned cost; { - PERFORMANCE_MEASURE_START(); + PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS); cost = image_calc_sad(pic, ref, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y, block_width, block_width, max_lcu_below); cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); - PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sad,frame=%d,ref=%d,x=%d,y=%d,ref_x=%d,ref_y=%d,width=%d,height=%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y, block_width, block_width); + + PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width, + orig->x + mv.x + pattern->x, + orig->x + mv.x + pattern->x + block_width, + orig->y + mv.y + pattern->y, + orig->y + mv.y + pattern->y + block_width); } if (cost < best_cost) { @@ -229,13 +234,17 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign if (!(mv.x == 0 && mv.y == 0)) { unsigned cost; { - PERFORMANCE_MEASURE_START(); + PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS); cost = image_calc_sad(pic, ref, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y, block_width, block_width, max_lcu_below); cost += calc_mvd_cost(encoder_state, 0, 0, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); - PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sad,frame=%d,ref=%d,x=%d,y=%d,ref_x=%d,ref_y=%d,width=%d,height=%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y, block_width, block_width); + PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=00vector,frame=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width, + orig->x, + orig->x + block_width, + orig->y, + orig->y + block_width); } // If the 0,0 is better, redo the hexagon around that point. @@ -250,13 +259,17 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign const vector2d *pattern = &large_hexbs[i]; unsigned cost; { - PERFORMANCE_MEASURE_START(); + PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS); cost = image_calc_sad(pic, ref, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + pattern->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + pattern->y, block_width, block_width, max_lcu_below); cost += calc_mvd_cost(encoder_state, pattern->x, pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); - PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sad,frame=%d,ref=%d,x=%d,y=%d,ref_x=%d,ref_y=%d,width=%d,height=%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + pattern->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + pattern->y, block_width, block_width); + PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=large_hexbs_around00,frame=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width, + orig->x + pattern->x, + orig->x + pattern->x + block_width, + orig->y + pattern->y, + orig->y + pattern->y + block_width); } if (cost < best_cost) { @@ -290,13 +303,17 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign const vector2d *offset = &large_hexbs[start + i]; unsigned cost; { - PERFORMANCE_MEASURE_START(); + PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS); cost = image_calc_sad(pic, ref, orig->x, orig->y, - (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, - (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, + orig->x + mv.x + offset->x, + orig->y + mv.y + offset->y, block_width, block_width, max_lcu_below); cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); - PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sad,frame=%d,ref=%d,x=%d,y=%d,ref_x=%d,ref_y=%d,width=%d,height=%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, block_width, block_width); + PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=large_hexbs_iterative,frame=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width, + orig->x + mv.x + offset->x, + orig->x + mv.x + offset->x + block_width, + orig->y + mv.y + offset->y, + orig->y + mv.y + offset->y + block_width); } if (cost < best_cost) { @@ -318,13 +335,17 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign const vector2d *offset = &small_hexbs[i]; unsigned cost; { - PERFORMANCE_MEASURE_START(); + PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS); cost = image_calc_sad(pic, ref, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, block_width, block_width, max_lcu_below); cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); - PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sad,frame=%d,ref=%d,x=%d,y=%d,ref_x=%d,ref_y=%d,width=%d,height=%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, block_width, block_width); + PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=small_hexbs,frame=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width, + orig->x + mv.x + offset->x, + orig->x + mv.x + offset->x + block_width, + orig->y + mv.y + offset->y, + orig->y + mv.y + offset->y + block_width); } if (cost > 0 && cost < best_cost) { @@ -1126,10 +1147,10 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept int cost = MAX_INT; cu_info *cur_cu; int x_local = (x&0x3f), y_local = (y&0x3f); -#if _DEBUG +#ifdef _DEBUG int debug_split = 0; #endif - PERFORMANCE_MEASURE_START(); + PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_CU); // Stop recursion if the CU is completely outside the frame. if (x >= frame->width || y >= frame->height) { @@ -1237,7 +1258,7 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept } } - PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=search_cu,frame=%d,tile=%d,slice=%d,x=%d,y=%d,depth=%d,split=%d,cur_cu_is_intra=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, x, y, depth, debug_split, (cur_cu->type==CU_INTRA)?1:0); + PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_CU, encoder_state->encoder_control->threadqueue, "type=search_cu,frame=%d,tile=%d,slice=%d,x=%d,y=%d,depth=%d,split=%d,cur_cu_is_intra=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, x, y, depth, debug_split, (cur_cu->type==CU_INTRA)?1:0); return cost; } diff --git a/src/threadqueue.c b/src/threadqueue.c index 8c5086ba..eb97e937 100644 --- a/src/threadqueue.c +++ b/src/threadqueue.c @@ -290,6 +290,7 @@ int threadqueue_init(threadqueue_queue * const threadqueue, int thread_count, in static void threadqueue_free_job(threadqueue_queue * const threadqueue, int i) { #ifdef _DEBUG +#if _DEBUG & _DEBUG_PERF_JOB int j; GET_TIME(&threadqueue->queue[i]->debug_clock_dequeue); fprintf(threadqueue->debug_log, "%p\t%d\t%lf\t+%lf\t+%lf\t+%lf\t%s\n", threadqueue->queue[i], threadqueue->queue[i]->debug_worker_id, CLOCK_T_AS_DOUBLE(threadqueue->queue[i]->debug_clock_enqueue), CLOCK_T_DIFF(threadqueue->queue[i]->debug_clock_enqueue, threadqueue->queue[i]->debug_clock_start), CLOCK_T_DIFF(threadqueue->queue[i]->debug_clock_start, threadqueue->queue[i]->debug_clock_stop), CLOCK_T_DIFF(threadqueue->queue[i]->debug_clock_stop, threadqueue->queue[i]->debug_clock_dequeue), threadqueue->queue[i]->debug_description); @@ -299,6 +300,7 @@ static void threadqueue_free_job(threadqueue_queue * const threadqueue, int i) } FREE_POINTER(threadqueue->queue[i]->debug_description); +#endif #endif FREE_POINTER(threadqueue->queue[i]->rdepends); @@ -315,6 +317,7 @@ static void threadqueue_free_jobs(threadqueue_queue * const threadqueue) { threadqueue->queue_count = 0; threadqueue->queue_start = 0; #ifdef _DEBUG +#if _DEBUG & _DEBUG_PERF_JOB { CLOCK_T time; GET_TIME(&time); @@ -322,6 +325,7 @@ static void threadqueue_free_jobs(threadqueue_queue * const threadqueue) { fprintf(threadqueue->debug_log, "\t\t-\t-\t%lf\t-\tFLUSH\n", CLOCK_T_AS_DOUBLE(time)); } #endif +#endif } int threadqueue_finalize(threadqueue_queue * const threadqueue) { @@ -491,9 +495,9 @@ threadqueue_job * threadqueue_submit(threadqueue_queue * const threadqueue, void //No lock here... this should be constant if (threadqueue->threads_count == 0) { //FIXME: This should be improved in order to handle dependencies - PERFORMANCE_MEASURE_START(); + PERFORMANCE_MEASURE_START(_DEBUG_PERF_JOB); fptr(arg); - PERFORMANCE_MEASURE_END(threadqueue, "%s", debug_description); + PERFORMANCE_MEASURE_END(_DEBUG_PERF_JOB, threadqueue, "%s", debug_description); return NULL; } diff --git a/src/threadqueue.h b/src/threadqueue.h index 36e9917c..37a14a71 100644 --- a/src/threadqueue.h +++ b/src/threadqueue.h @@ -118,12 +118,19 @@ int threadqueue_finalize(threadqueue_queue * threadqueue); #ifdef _DEBUG int threadqueue_log(threadqueue_queue * threadqueue, const CLOCK_T *start, const CLOCK_T *stop, const char* debug_description); -//This macro HAS TO BE at the beginning of a block -#define PERFORMANCE_MEASURE_START() CLOCK_T start, stop; GET_TIME(&start) -#define PERFORMANCE_MEASURE_END(threadqueue, str, ...) do {GET_TIME(&stop); {char job_description[256]; sprintf(job_description, (str), __VA_ARGS__); threadqueue_log((threadqueue), &start, &stop, job_description); }} while (0) +#ifdef _GNUC + #define ATTR_UNUSED __attribute__((unused)) #else -#define PERFORMANCE_MEASURE_START() do {} while (0) -#define PERFORMANCE_MEASURE_END(threadqueue, str, ...) do {} while (0) + #define ATTR_UNUSED +#endif + +//This macro HAS TO BE at the beginning of a block +#define PERFORMANCE_MEASURE_START(mask) CLOCK_T start ATTR_UNUSED, stop ATTR_UNUSED; if (_DEBUG & mask) GET_TIME(&start) +#define PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) do {if (_DEBUG & mask) { GET_TIME(&stop); {char job_description[256]; sprintf(job_description, (str), __VA_ARGS__); threadqueue_log((threadqueue), &start, &stop, job_description);}}} while (0) \ + +#else +#define PERFORMANCE_MEASURE_START(mask) do {} while (0) +#define PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) do {} while (0) #endif /* Constraints: From 6a937de9b2c72f800822825f2435409df1b188bc Mon Sep 17 00:00:00 2001 From: Laurent Fasnacht Date: Mon, 11 Aug 2014 11:46:21 +0200 Subject: [PATCH 3/4] Fix search_cu log --- src/global.h | 4 ++-- src/search.c | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/global.h b/src/global.h index b18e2527..d669b694 100644 --- a/src/global.h +++ b/src/global.h @@ -191,8 +191,8 @@ typedef int16_t coefficient; #define _DEBUG_PERF_ENCODE_LCU 0x0004 #define _DEBUG_PERF_SAO_RECONSTRUCT_FRAME 0x0008 #define _DEBUG_PERF_WRITE_BITSTREAM_LEAF 0x0010 -#define _DEBUG_PERF_SEARCH_PIXELS 0x0020 -#define _DEBUG_PERF_SEARCH_CU 0x0040 +#define _DEBUG_PERF_SEARCH_CU 0x0020 +#define _DEBUG_PERF_SEARCH_PIXELS 0x0040 //Constants typedef enum { COLOR_Y = 0, COLOR_U, COLOR_V, NUM_COLORS } color_index; diff --git a/src/search.c b/src/search.c index cdcdc807..26fb94d8 100644 --- a/src/search.c +++ b/src/search.c @@ -1258,7 +1258,12 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept } } - PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_CU, encoder_state->encoder_control->threadqueue, "type=search_cu,frame=%d,tile=%d,slice=%d,x=%d,y=%d,depth=%d,split=%d,cur_cu_is_intra=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, x, y, depth, debug_split, (cur_cu->type==CU_INTRA)?1:0); + PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_CU, encoder_state->encoder_control->threadqueue, "type=search_cu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d,depth=%d,split=%d,cur_cu_is_intra=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, + (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + x, + (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + x + (LCU_WIDTH >> depth), + (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + y, + (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + y + (LCU_WIDTH >> depth), + depth, debug_split, (cur_cu->type==CU_INTRA)?1:0); return cost; } From f9bffe35a5a5fba1972ce3fc1badb389052b2834 Mon Sep 17 00:00:00 2001 From: Laurent Fasnacht Date: Mon, 11 Aug 2014 11:55:31 +0200 Subject: [PATCH 4/4] Log tile id in sad perf log --- src/search.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/search.c b/src/search.c index 26fb94d8..b3b84c9b 100644 --- a/src/search.c +++ b/src/search.c @@ -216,7 +216,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign block_width, block_width, max_lcu_below); cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); - PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width, + PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width, orig->x + mv.x + pattern->x, orig->x + mv.x + pattern->x + block_width, orig->y + mv.y + pattern->y, @@ -240,7 +240,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y, block_width, block_width, max_lcu_below); cost += calc_mvd_cost(encoder_state, 0, 0, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); - PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=00vector,frame=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width, + PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=00vector,frame=%d,tile=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width, orig->x, orig->x + block_width, orig->y, @@ -265,7 +265,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + pattern->y, block_width, block_width, max_lcu_below); cost += calc_mvd_cost(encoder_state, pattern->x, pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); - PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=large_hexbs_around00,frame=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width, + PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=large_hexbs_around00,frame=%d,tile=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width, orig->x + pattern->x, orig->x + pattern->x + block_width, orig->y + pattern->y, @@ -309,7 +309,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign orig->y + mv.y + offset->y, block_width, block_width, max_lcu_below); cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); - PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=large_hexbs_iterative,frame=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width, + PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=large_hexbs_iterative,frame=%d,tile=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width, orig->x + mv.x + offset->x, orig->x + mv.x + offset->x + block_width, orig->y + mv.y + offset->y, @@ -341,7 +341,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, block_width, block_width, max_lcu_below); cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); - PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=small_hexbs,frame=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width, + PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=small_hexbs,frame=%d,tile=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width, orig->x + mv.x + offset->x, orig->x + mv.x + offset->x + block_width, orig->y + mv.y + offset->y,