mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-24 10:34:05 +00:00
Merge pull request #79 from lfasnacht/thread_visualizer
Better log file format for visualizer
This commit is contained in:
commit
a38c3241e8
|
@ -729,21 +729,21 @@ static void encoder_state_write_bitstream_main(encoder_state * const main_state)
|
|||
main_state->global->is_radl_frame ? NAL_IDR_W_RADL : NAL_TRAIL_R, 0, long_start_code);
|
||||
}
|
||||
{
|
||||
PERFORMANCE_MEASURE_START();
|
||||
PERFORMANCE_MEASURE_START(_DEBUG_PERF_FRAME_LEVEL);
|
||||
for (i = 0; main_state->children[i].encoder_control; ++i) {
|
||||
//Append bitstream to main stream
|
||||
bitstream_append(&main_state->stream, &main_state->children[i].stream);
|
||||
//FIXME: Move this...
|
||||
bitstream_clear(&main_state->children[i].stream);
|
||||
}
|
||||
PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_append,frame=%d,encoder_type=%c", main_state->global->frame, main_state->type);
|
||||
PERFORMANCE_MEASURE_END(_DEBUG_PERF_FRAME_LEVEL, main_state->encoder_control->threadqueue, "type=write_bitstream_append,frame=%d,encoder_type=%c", main_state->global->frame, main_state->type);
|
||||
}
|
||||
|
||||
{
|
||||
PERFORMANCE_MEASURE_START();
|
||||
PERFORMANCE_MEASURE_START(_DEBUG_PERF_FRAME_LEVEL);
|
||||
// Calculate checksum
|
||||
add_checksum(main_state);
|
||||
PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_checksum,frame=%d,encoder_type=%c", main_state->global->frame, main_state->type);
|
||||
PERFORMANCE_MEASURE_END(_DEBUG_PERF_FRAME_LEVEL, main_state->encoder_control->threadqueue, "type=write_bitstream_checksum,frame=%d,encoder_type=%c", main_state->global->frame, main_state->type);
|
||||
}
|
||||
|
||||
assert(main_state->tile->frame->poc == main_state->global->poc);
|
||||
|
|
|
@ -316,29 +316,32 @@ static void encoder_state_encode_leaf(encoder_state * const encoder_state) {
|
|||
//If we're not using wavefronts, or we have a WAVEFRONT_ROW which is the single child of its parent, then we should not use parallelism
|
||||
if (encoder_state->type != ENCODER_STATE_TYPE_WAVEFRONT_ROW || (encoder_state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW && !encoder_state->parent->children[1].encoder_control)) {
|
||||
for (i = 0; i < encoder_state->lcu_order_count; ++i) {
|
||||
PERFORMANCE_MEASURE_START();
|
||||
PERFORMANCE_MEASURE_START(_DEBUG_PERF_ENCODE_LCU);
|
||||
|
||||
encoder_state_worker_encode_lcu(&encoder_state->lcu_order[i]);
|
||||
|
||||
#ifdef _DEBUG
|
||||
{
|
||||
const lcu_order_element * const lcu = &encoder_state->lcu_order[i];
|
||||
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=search_lcu,frame=%d,tile=%d,slice=%d,position_x=%d,position_y=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, lcu->position.x + encoder_state->tile->lcu_offset_x, lcu->position.y + encoder_state->tile->lcu_offset_y);
|
||||
PERFORMANCE_MEASURE_END(_DEBUG_PERF_ENCODE_LCU, encoder_state->encoder_control->threadqueue, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, lcu->position_px.x + encoder_state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + encoder_state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + encoder_state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + encoder_state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1);
|
||||
}
|
||||
#endif //_DEBUG
|
||||
}
|
||||
|
||||
if (encoder->sao_enable) {
|
||||
PERFORMANCE_MEASURE_START();
|
||||
PERFORMANCE_MEASURE_START(_DEBUG_PERF_SAO_RECONSTRUCT_FRAME);
|
||||
sao_reconstruct_frame(encoder_state);
|
||||
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, encoder_state->lcu_order[0].position.y + encoder_state->tile->lcu_offset_y, encoder_state->lcu_order[encoder_state->lcu_order_count-1].position.y + encoder_state->tile->lcu_offset_y);
|
||||
PERFORMANCE_MEASURE_END(_DEBUG_PERF_SAO_RECONSTRUCT_FRAME, encoder_state->encoder_control->threadqueue, "type=sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, encoder_state->lcu_order[0].position.y + encoder_state->tile->lcu_offset_y, encoder_state->lcu_order[encoder_state->lcu_order_count-1].position.y + encoder_state->tile->lcu_offset_y,
|
||||
encoder_state->tile->lcu_offset_x * LCU_WIDTH, encoder_state->tile->frame->width + encoder_state->tile->lcu_offset_x * LCU_WIDTH - 1,
|
||||
encoder_state->tile->lcu_offset_y * LCU_WIDTH, encoder_state->tile->frame->height + encoder_state->tile->lcu_offset_y * LCU_WIDTH - 1
|
||||
);
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < encoder_state->lcu_order_count; ++i) {
|
||||
const lcu_order_element * const lcu = &encoder_state->lcu_order[i];
|
||||
#ifdef _DEBUG
|
||||
char job_description[256];
|
||||
sprintf(job_description, "type=search_lcu,frame=%d,tile=%d,slice=%d,row=%d,position_x=%d,position_y=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, encoder_state->wfrow->lcu_offset_y, lcu->position.x + encoder_state->tile->lcu_offset_x, lcu->position.y + encoder_state->tile->lcu_offset_y);
|
||||
sprintf(job_description, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, lcu->position_px.x + encoder_state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + encoder_state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + encoder_state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + encoder_state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1);
|
||||
#else
|
||||
char* job_description = NULL;
|
||||
#endif
|
||||
|
@ -392,14 +395,14 @@ static void encoder_state_worker_encode_children(void * opaque) {
|
|||
encoder_state_encode(sub_state);
|
||||
if (sub_state->is_leaf) {
|
||||
if (sub_state->type != ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
|
||||
PERFORMANCE_MEASURE_START();
|
||||
PERFORMANCE_MEASURE_START(_DEBUG_PERF_WRITE_BITSTREAM_LEAF);
|
||||
encoder_state_write_bitstream_leaf(sub_state);
|
||||
PERFORMANCE_MEASURE_END(sub_state->encoder_control->threadqueue, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,row=%d-%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->lcu_order[0].position.y + sub_state->tile->lcu_offset_y, sub_state->lcu_order[sub_state->lcu_order_count-1].position.y + sub_state->tile->lcu_offset_y);
|
||||
PERFORMANCE_MEASURE_END(_DEBUG_PERF_WRITE_BITSTREAM_LEAF, sub_state->encoder_control->threadqueue, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->lcu_order[0].position_px.x + sub_state->tile->lcu_offset_x * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.x + sub_state->lcu_order[sub_state->lcu_order_count-1].size.x + sub_state->tile->lcu_offset_x * LCU_WIDTH - 1, sub_state->lcu_order[0].position_px.y + sub_state->tile->lcu_offset_y * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.y + sub_state->lcu_order[sub_state->lcu_order_count-1].size.y + sub_state->tile->lcu_offset_y * LCU_WIDTH - 1);
|
||||
} else {
|
||||
threadqueue_job *job;
|
||||
#ifdef _DEBUG
|
||||
char job_description[256];
|
||||
sprintf(job_description, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,row=%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->wfrow->lcu_offset_y);
|
||||
sprintf(job_description, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->lcu_order[0].position_px.x + sub_state->tile->lcu_offset_x * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.x + sub_state->lcu_order[sub_state->lcu_order_count-1].size.x + sub_state->tile->lcu_offset_x * LCU_WIDTH - 1, sub_state->lcu_order[0].position_px.y + sub_state->tile->lcu_offset_y * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.y + sub_state->lcu_order[sub_state->lcu_order_count-1].size.y + sub_state->tile->lcu_offset_y * LCU_WIDTH - 1);
|
||||
#else
|
||||
char* job_description = NULL;
|
||||
#endif
|
||||
|
@ -516,13 +519,15 @@ static void encoder_state_encode(encoder_state * const main_state) {
|
|||
char job_description[256];
|
||||
switch (main_state->children[i].type) {
|
||||
case ENCODER_STATE_TYPE_TILE:
|
||||
sprintf(job_description, "frame=%d,tile=%d,row=%d-%d,position_x=%d,position_y=%d", main_state->children[i].global->frame, main_state->children[i].tile->id, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].tile->lcu_offset_x, main_state->children[i].tile->lcu_offset_y);
|
||||
sprintf(job_description, "type=encode_child,frame=%d,tile=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", main_state->children[i].global->frame, main_state->children[i].tile->id, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y,
|
||||
main_state->children[i].lcu_order[0].position_px.x + main_state->children[i].tile->lcu_offset_x * LCU_WIDTH, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position_px.x + main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].size.x + main_state->children[i].tile->lcu_offset_x * LCU_WIDTH - 1,
|
||||
main_state->children[i].lcu_order[0].position_px.y + main_state->children[i].tile->lcu_offset_y * LCU_WIDTH, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position_px.y + main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].size.y + main_state->children[i].tile->lcu_offset_y * LCU_WIDTH - 1);
|
||||
break;
|
||||
case ENCODER_STATE_TYPE_SLICE:
|
||||
sprintf(job_description, "frame=%d,slice=%d,start_in_ts=%d", main_state->children[i].global->frame, main_state->children[i].slice->id, main_state->children[i].slice->start_in_ts);
|
||||
sprintf(job_description, "type=encode_child,frame=%d,slice=%d,start_in_ts=%d", main_state->children[i].global->frame, main_state->children[i].slice->id, main_state->children[i].slice->start_in_ts);
|
||||
break;
|
||||
default:
|
||||
sprintf(job_description, "frame=%d,invalid", main_state->children[i].global->frame);
|
||||
sprintf(job_description, "type=encode_child,frame=%d,invalid", main_state->children[i].global->frame);
|
||||
break;
|
||||
}
|
||||
#else
|
||||
|
@ -554,7 +559,7 @@ static void encoder_state_encode(encoder_state * const main_state) {
|
|||
threadqueue_job *job;
|
||||
#ifdef _DEBUG
|
||||
char job_description[256];
|
||||
sprintf(job_description, "type=sao,frame=%d,tile=%d,position_y=%d", main_state->global->frame, main_state->tile->id, y + main_state->tile->lcu_offset_y);
|
||||
sprintf(job_description, "type=sao,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d", main_state->global->frame, main_state->tile->id, main_state->tile->lcu_offset_x * LCU_WIDTH, main_state->tile->lcu_offset_x * LCU_WIDTH + main_state->tile->frame->width - 1, (main_state->tile->lcu_offset_y + y) * LCU_WIDTH, MIN(main_state->tile->lcu_offset_y * LCU_WIDTH + main_state->tile->frame->height, (main_state->tile->lcu_offset_y + y + 1) * LCU_WIDTH)-1);
|
||||
#else
|
||||
char* job_description = NULL;
|
||||
#endif
|
||||
|
@ -683,14 +688,14 @@ static void _encode_one_frame_add_bitstream_deps(const encoder_state * const enc
|
|||
void encode_one_frame(encoder_state * const main_state)
|
||||
{
|
||||
{
|
||||
PERFORMANCE_MEASURE_START();
|
||||
PERFORMANCE_MEASURE_START(_DEBUG_PERF_FRAME_LEVEL);
|
||||
encoder_state_new_frame(main_state);
|
||||
PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=new_frame,frame=%d,poc=%d", main_state->global->frame, main_state->global->poc);
|
||||
PERFORMANCE_MEASURE_END(_DEBUG_PERF_FRAME_LEVEL, main_state->encoder_control->threadqueue, "type=new_frame,frame=%d,poc=%d", main_state->global->frame, main_state->global->poc);
|
||||
}
|
||||
{
|
||||
PERFORMANCE_MEASURE_START();
|
||||
PERFORMANCE_MEASURE_START(_DEBUG_PERF_FRAME_LEVEL);
|
||||
encoder_state_encode(main_state);
|
||||
PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=encode,frame=%d", main_state->global->frame);
|
||||
PERFORMANCE_MEASURE_END(_DEBUG_PERF_FRAME_LEVEL, main_state->encoder_control->threadqueue, "type=encode,frame=%d", main_state->global->frame);
|
||||
}
|
||||
//threadqueue_flush(main_state->encoder_control->threadqueue);
|
||||
{
|
||||
|
|
10
src/global.h
10
src/global.h
|
@ -184,6 +184,16 @@ typedef int16_t coefficient;
|
|||
|
||||
#define MAX_TR_DYNAMIC_RANGE 15
|
||||
|
||||
|
||||
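//Debug bitmask: flags tested against _DEBUG (as "_DEBUG & mask") by PERFORMANCE_MEASURE_START/END to select which measurements are logged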
|
||||
#define _DEBUG_PERF_FRAME_LEVEL 0x0001
|
||||
#define _DEBUG_PERF_JOB 0x0002
|
||||
#define _DEBUG_PERF_ENCODE_LCU 0x0004
|
||||
#define _DEBUG_PERF_SAO_RECONSTRUCT_FRAME 0x0008
|
||||
#define _DEBUG_PERF_WRITE_BITSTREAM_LEAF 0x0010
|
||||
#define _DEBUG_PERF_SEARCH_CU 0x0020
|
||||
#define _DEBUG_PERF_SEARCH_PIXELS 0x0040
|
||||
|
||||
//Constants
|
||||
typedef enum { COLOR_Y = 0, COLOR_U, COLOR_V, NUM_COLORS } color_index;
|
||||
enum { SLICE_B = 0, SLICE_P = 1, SLICE_I = 2 };
|
||||
|
|
56
src/search.c
56
src/search.c
|
@ -209,13 +209,18 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
|
|||
const vector2d *pattern = &large_hexbs[i];
|
||||
unsigned cost;
|
||||
{
|
||||
PERFORMANCE_MEASURE_START();
|
||||
PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
|
||||
cost = image_calc_sad(pic, ref, orig->x, orig->y,
|
||||
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x,
|
||||
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y,
|
||||
block_width, block_width, max_lcu_below);
|
||||
cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
|
||||
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sad,frame=%d,ref=%d,x=%d,y=%d,ref_x=%d,ref_y=%d,width=%d,height=%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y, block_width, block_width);
|
||||
|
||||
PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
|
||||
orig->x + mv.x + pattern->x,
|
||||
orig->x + mv.x + pattern->x + block_width,
|
||||
orig->y + mv.y + pattern->y,
|
||||
orig->y + mv.y + pattern->y + block_width);
|
||||
}
|
||||
|
||||
if (cost < best_cost) {
|
||||
|
@ -229,13 +234,17 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
|
|||
if (!(mv.x == 0 && mv.y == 0)) {
|
||||
unsigned cost;
|
||||
{
|
||||
PERFORMANCE_MEASURE_START();
|
||||
PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
|
||||
cost = image_calc_sad(pic, ref, orig->x, orig->y,
|
||||
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x,
|
||||
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y,
|
||||
block_width, block_width, max_lcu_below);
|
||||
cost += calc_mvd_cost(encoder_state, 0, 0, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
|
||||
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sad,frame=%d,ref=%d,x=%d,y=%d,ref_x=%d,ref_y=%d,width=%d,height=%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y, block_width, block_width);
|
||||
PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=00vector,frame=%d,tile=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
|
||||
orig->x,
|
||||
orig->x + block_width,
|
||||
orig->y,
|
||||
orig->y + block_width);
|
||||
}
|
||||
|
||||
// If the 0,0 is better, redo the hexagon around that point.
|
||||
|
@ -250,13 +259,17 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
|
|||
const vector2d *pattern = &large_hexbs[i];
|
||||
unsigned cost;
|
||||
{
|
||||
PERFORMANCE_MEASURE_START();
|
||||
PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
|
||||
cost = image_calc_sad(pic, ref, orig->x, orig->y,
|
||||
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + pattern->x,
|
||||
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + pattern->y,
|
||||
block_width, block_width, max_lcu_below);
|
||||
cost += calc_mvd_cost(encoder_state, pattern->x, pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
|
||||
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sad,frame=%d,ref=%d,x=%d,y=%d,ref_x=%d,ref_y=%d,width=%d,height=%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + pattern->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + pattern->y, block_width, block_width);
|
||||
PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=large_hexbs_around00,frame=%d,tile=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
|
||||
orig->x + pattern->x,
|
||||
orig->x + pattern->x + block_width,
|
||||
orig->y + pattern->y,
|
||||
orig->y + pattern->y + block_width);
|
||||
}
|
||||
|
||||
if (cost < best_cost) {
|
||||
|
@ -290,13 +303,17 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
|
|||
const vector2d *offset = &large_hexbs[start + i];
|
||||
unsigned cost;
|
||||
{
|
||||
PERFORMANCE_MEASURE_START();
|
||||
PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
|
||||
cost = image_calc_sad(pic, ref, orig->x, orig->y,
|
||||
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
|
||||
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
|
||||
orig->x + mv.x + offset->x,
|
||||
orig->y + mv.y + offset->y,
|
||||
block_width, block_width, max_lcu_below);
|
||||
cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
|
||||
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sad,frame=%d,ref=%d,x=%d,y=%d,ref_x=%d,ref_y=%d,width=%d,height=%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, block_width, block_width);
|
||||
PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=large_hexbs_iterative,frame=%d,tile=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
|
||||
orig->x + mv.x + offset->x,
|
||||
orig->x + mv.x + offset->x + block_width,
|
||||
orig->y + mv.y + offset->y,
|
||||
orig->y + mv.y + offset->y + block_width);
|
||||
}
|
||||
|
||||
if (cost < best_cost) {
|
||||
|
@ -318,13 +335,17 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
|
|||
const vector2d *offset = &small_hexbs[i];
|
||||
unsigned cost;
|
||||
{
|
||||
PERFORMANCE_MEASURE_START();
|
||||
PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
|
||||
cost = image_calc_sad(pic, ref, orig->x, orig->y,
|
||||
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
|
||||
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
|
||||
block_width, block_width, max_lcu_below);
|
||||
cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
|
||||
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sad,frame=%d,ref=%d,x=%d,y=%d,ref_x=%d,ref_y=%d,width=%d,height=%d", encoder_state->global->frame, ref->poc - encoder_state->global->poc, orig->x, orig->y, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, block_width, block_width);
|
||||
PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, encoder_state->encoder_control->threadqueue, "type=sad,step=small_hexbs,frame=%d,tile=%d,ref=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", encoder_state->global->frame, encoder_state->tile->id, ref->poc - encoder_state->global->poc, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
|
||||
orig->x + mv.x + offset->x,
|
||||
orig->x + mv.x + offset->x + block_width,
|
||||
orig->y + mv.y + offset->y,
|
||||
orig->y + mv.y + offset->y + block_width);
|
||||
}
|
||||
|
||||
if (cost > 0 && cost < best_cost) {
|
||||
|
@ -1126,10 +1147,10 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
|
|||
int cost = MAX_INT;
|
||||
cu_info *cur_cu;
|
||||
int x_local = (x&0x3f), y_local = (y&0x3f);
|
||||
#if _DEBUG
|
||||
#ifdef _DEBUG
|
||||
int debug_split = 0;
|
||||
#endif
|
||||
PERFORMANCE_MEASURE_START();
|
||||
PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_CU);
|
||||
|
||||
// Stop recursion if the CU is completely outside the frame.
|
||||
if (x >= frame->width || y >= frame->height) {
|
||||
|
@ -1237,7 +1258,12 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
|
|||
}
|
||||
}
|
||||
|
||||
PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=search_cu,frame=%d,tile=%d,slice=%d,x=%d,y=%d,depth=%d,split=%d,cur_cu_is_intra=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, x, y, depth, debug_split, (cur_cu->type==CU_INTRA)?1:0);
|
||||
PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_CU, encoder_state->encoder_control->threadqueue, "type=search_cu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d,depth=%d,split=%d,cur_cu_is_intra=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id,
|
||||
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + x,
|
||||
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + x + (LCU_WIDTH >> depth),
|
||||
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + y,
|
||||
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + y + (LCU_WIDTH >> depth),
|
||||
depth, debug_split, (cur_cu->type==CU_INTRA)?1:0);
|
||||
|
||||
return cost;
|
||||
}
|
||||
|
|
|
@ -411,9 +411,9 @@ static void partial_butterfly_inverse_32_avx2(int16_t *src, int16_t *dst,
|
|||
#define DCT_NXN_AVX2(n) \
|
||||
static void dct_ ## n ## x ## n ## _avx2(int8_t bitdepth, int16_t *block, int16_t *coeff) { \
|
||||
\
|
||||
int16_t tmp[ ## n ## * ## n ##]; \
|
||||
int32_t shift_1st = g_convert_to_bit[ ## n ## ] + 1 + (bitdepth - 8); \
|
||||
int32_t shift_2nd = g_convert_to_bit[ ## n ## ] + 8; \
|
||||
int16_t tmp[ n * n ]; \
|
||||
int32_t shift_1st = g_convert_to_bit[ n ] + 1 + (bitdepth - 8); \
|
||||
int32_t shift_2nd = g_convert_to_bit[ n ] + 8; \
|
||||
\
|
||||
partial_butterfly_ ## n ## _avx2(block, tmp, shift_1st); \
|
||||
partial_butterfly_ ## n ## _avx2(tmp, coeff, shift_2nd); \
|
||||
|
@ -422,7 +422,7 @@ static void dct_ ## n ## x ## n ## _avx2(int8_t bitdepth, int16_t *block, int16_
|
|||
#define IDCT_NXN_AVX2(n) \
|
||||
static void idct_ ## n ## x ## n ## _avx2(int8_t bitdepth, int16_t *block, int16_t *coeff) { \
|
||||
\
|
||||
int16_t tmp[ ## n ## * ## n ##]; \
|
||||
int16_t tmp[ n * n ]; \
|
||||
int32_t shift_1st = 7; \
|
||||
int32_t shift_2nd = 12 - (bitdepth - 8); \
|
||||
\
|
||||
|
|
|
@ -407,9 +407,9 @@ static void partial_butterfly_inverse_32_generic(int16_t *src, int16_t *dst,
|
|||
#define DCT_NXN_GENERIC(n) \
|
||||
static void dct_ ## n ## x ## n ## _generic(int8_t bitdepth, int16_t *block, int16_t *coeff) { \
|
||||
\
|
||||
int16_t tmp[ ## n ## * ## n ##]; \
|
||||
int32_t shift_1st = g_convert_to_bit[ ## n ## ] + 1 + (bitdepth - 8); \
|
||||
int32_t shift_2nd = g_convert_to_bit[ ## n ## ] + 8; \
|
||||
int16_t tmp[ n * n ]; \
|
||||
int32_t shift_1st = g_convert_to_bit[ n ] + 1 + (bitdepth - 8); \
|
||||
int32_t shift_2nd = g_convert_to_bit[ n ] + 8; \
|
||||
\
|
||||
partial_butterfly_ ## n ## _generic(block, tmp, shift_1st); \
|
||||
partial_butterfly_ ## n ## _generic(tmp, coeff, shift_2nd); \
|
||||
|
@ -418,7 +418,7 @@ static void dct_ ## n ## x ## n ## _generic(int8_t bitdepth, int16_t *block, int
|
|||
#define IDCT_NXN_GENERIC(n) \
|
||||
static void idct_ ## n ## x ## n ## _generic(int8_t bitdepth, int16_t *block, int16_t *coeff) { \
|
||||
\
|
||||
int16_t tmp[ ## n ## * ## n ##]; \
|
||||
int16_t tmp[ n * n ]; \
|
||||
int32_t shift_1st = 7; \
|
||||
int32_t shift_2nd = 12 - (bitdepth - 8); \
|
||||
\
|
||||
|
|
|
@ -290,6 +290,7 @@ int threadqueue_init(threadqueue_queue * const threadqueue, int thread_count, in
|
|||
static void threadqueue_free_job(threadqueue_queue * const threadqueue, int i)
|
||||
{
|
||||
#ifdef _DEBUG
|
||||
#if _DEBUG & _DEBUG_PERF_JOB
|
||||
int j;
|
||||
GET_TIME(&threadqueue->queue[i]->debug_clock_dequeue);
|
||||
fprintf(threadqueue->debug_log, "%p\t%d\t%lf\t+%lf\t+%lf\t+%lf\t%s\n", threadqueue->queue[i], threadqueue->queue[i]->debug_worker_id, CLOCK_T_AS_DOUBLE(threadqueue->queue[i]->debug_clock_enqueue), CLOCK_T_DIFF(threadqueue->queue[i]->debug_clock_enqueue, threadqueue->queue[i]->debug_clock_start), CLOCK_T_DIFF(threadqueue->queue[i]->debug_clock_start, threadqueue->queue[i]->debug_clock_stop), CLOCK_T_DIFF(threadqueue->queue[i]->debug_clock_stop, threadqueue->queue[i]->debug_clock_dequeue), threadqueue->queue[i]->debug_description);
|
||||
|
@ -299,6 +300,7 @@ static void threadqueue_free_job(threadqueue_queue * const threadqueue, int i)
|
|||
}
|
||||
|
||||
FREE_POINTER(threadqueue->queue[i]->debug_description);
|
||||
#endif
|
||||
#endif
|
||||
FREE_POINTER(threadqueue->queue[i]->rdepends);
|
||||
|
||||
|
@ -315,6 +317,7 @@ static void threadqueue_free_jobs(threadqueue_queue * const threadqueue) {
|
|||
threadqueue->queue_count = 0;
|
||||
threadqueue->queue_start = 0;
|
||||
#ifdef _DEBUG
|
||||
#if _DEBUG & _DEBUG_PERF_JOB
|
||||
{
|
||||
CLOCK_T time;
|
||||
GET_TIME(&time);
|
||||
|
@ -322,6 +325,7 @@ static void threadqueue_free_jobs(threadqueue_queue * const threadqueue) {
|
|||
fprintf(threadqueue->debug_log, "\t\t-\t-\t%lf\t-\tFLUSH\n", CLOCK_T_AS_DOUBLE(time));
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
int threadqueue_finalize(threadqueue_queue * const threadqueue) {
|
||||
|
@ -491,9 +495,9 @@ threadqueue_job * threadqueue_submit(threadqueue_queue * const threadqueue, void
|
|||
//No lock here... this should be constant
|
||||
if (threadqueue->threads_count == 0) {
|
||||
//FIXME: This should be improved in order to handle dependencies
|
||||
PERFORMANCE_MEASURE_START();
|
||||
PERFORMANCE_MEASURE_START(_DEBUG_PERF_JOB);
|
||||
fptr(arg);
|
||||
PERFORMANCE_MEASURE_END(threadqueue, "%s", debug_description);
|
||||
PERFORMANCE_MEASURE_END(_DEBUG_PERF_JOB, threadqueue, "%s", debug_description);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
|
@ -118,12 +118,19 @@ int threadqueue_finalize(threadqueue_queue * threadqueue);
|
|||
#ifdef _DEBUG
|
||||
int threadqueue_log(threadqueue_queue * threadqueue, const CLOCK_T *start, const CLOCK_T *stop, const char* debug_description);
|
||||
|
||||
//This macro HAS TO BE at the beginning of a block
|
||||
#define PERFORMANCE_MEASURE_START() CLOCK_T start, stop; GET_TIME(&start)
|
||||
#define PERFORMANCE_MEASURE_END(threadqueue, str, ...) do {GET_TIME(&stop); {char job_description[256]; sprintf(job_description, (str), __VA_ARGS__); threadqueue_log((threadqueue), &start, &stop, job_description); }} while (0)
|
||||
#ifdef _GNUC
|
||||
#define ATTR_UNUSED __attribute__((unused))
|
||||
#else
|
||||
#define PERFORMANCE_MEASURE_START() do {} while (0)
|
||||
#define PERFORMANCE_MEASURE_END(threadqueue, str, ...) do {} while (0)
|
||||
#define ATTR_UNUSED
|
||||
#endif
|
||||
|
||||
//This macro HAS TO BE at the beginning of a block
|
||||
#define PERFORMANCE_MEASURE_START(mask) CLOCK_T start ATTR_UNUSED, stop ATTR_UNUSED; if (_DEBUG & mask) GET_TIME(&start)
|
||||
#define PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) do {if (_DEBUG & mask) { GET_TIME(&stop); {char job_description[256]; sprintf(job_description, (str), __VA_ARGS__); threadqueue_log((threadqueue), &start, &stop, job_description);}}} while (0) \
|
||||
|
||||
#else
|
||||
#define PERFORMANCE_MEASURE_START(mask) do {} while (0)
|
||||
#define PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) do {} while (0)
|
||||
#endif
|
||||
|
||||
/* Constraints:
|
||||
|
|
Loading…
Reference in a new issue