From de6faf623d54d64ac69bc876749fe567001ef632 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 16 Jan 2014 17:13:48 +0200 Subject: [PATCH 1/9] Imported entropy bits array from HM and added macro to access it --- src/cabac.h | 2 ++ src/context.c | 14 ++++++++++++++ src/context.h | 2 ++ src/encoder.c | 4 ++-- 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/cabac.h b/src/cabac.h index 7f7b98c5..c0b9f4cd 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -75,6 +75,8 @@ void cabac_write_unary_max_symbol_ep(cabac_data *data, unsigned symbol, unsigned #define CTX_MPS(ctx) (ctx->uc_state & 1) #define CTX_UPDATE_LPS(ctx) { (ctx)->uc_state = g_auc_next_state_lps[ (ctx)->uc_state ]; } #define CTX_UPDATE_MPS(ctx) { (ctx)->uc_state = g_auc_next_state_mps[ (ctx)->uc_state ]; } +#define CTX_ENTROPY_BITS(ctx,val) entropy_bits[(ctx)->uc_state ^ val] + #ifdef VERBOSE #define CABAC_BIN(data, value, name) { \ uint32_t prev_state = (data)->ctx->uc_state; \ diff --git a/src/context.c b/src/context.c index f58433bd..ee9429c1 100644 --- a/src/context.c +++ b/src/context.c @@ -240,3 +240,17 @@ int32_t context_get_sig_ctx_inc(int32_t pattern_sig_ctx, uint32_t scan_idx, int3 return (( texture_type == 0 && ((pos_x>>2) + (pos_y>>2)) > 0 ) ? 
3 : 0) + offset + cnt; } +/* + * Entropy bits to estimate coded bits in RDO / RDOQ (From HM 12.0) + */ +const uint32_t entropy_bits[128] = +{ + 0x08000, 0x08000, 0x076da, 0x089a0, 0x06e92, 0x09340, 0x0670a, 0x09cdf, 0x06029, 0x0a67f, 0x059dd, 0x0b01f, 0x05413, 0x0b9bf, 0x04ebf, 0x0c35f, + 0x049d3, 0x0ccff, 0x04546, 0x0d69e, 0x0410d, 0x0e03e, 0x03d22, 0x0e9de, 0x0397d, 0x0f37e, 0x03619, 0x0fd1e, 0x032ee, 0x106be, 0x02ffa, 0x1105d, + 0x02d37, 0x119fd, 0x02aa2, 0x1239d, 0x02836, 0x12d3d, 0x025f2, 0x136dd, 0x023d1, 0x1407c, 0x021d2, 0x14a1c, 0x01ff2, 0x153bc, 0x01e2f, 0x15d5c, + 0x01c87, 0x166fc, 0x01af7, 0x1709b, 0x0197f, 0x17a3b, 0x0181d, 0x183db, 0x016d0, 0x18d7b, 0x01595, 0x1971b, 0x0146c, 0x1a0bb, 0x01354, 0x1aa5a, + 0x0124c, 0x1b3fa, 0x01153, 0x1bd9a, 0x01067, 0x1c73a, 0x00f89, 0x1d0da, 0x00eb7, 0x1da79, 0x00df0, 0x1e419, 0x00d34, 0x1edb9, 0x00c82, 0x1f759, + 0x00bda, 0x200f9, 0x00b3c, 0x20a99, 0x00aa5, 0x21438, 0x00a17, 0x21dd8, 0x00990, 0x22778, 0x00911, 0x23118, 0x00898, 0x23ab8, 0x00826, 0x24458, + 0x007ba, 0x24df7, 0x00753, 0x25797, 0x006f2, 0x26137, 0x00696, 0x26ad7, 0x0063f, 0x27477, 0x005ed, 0x27e17, 0x0059f, 0x287b6, 0x00554, 0x29156, + 0x0050e, 0x29af6, 0x004cc, 0x2a497, 0x0048d, 0x2ae35, 0x00451, 0x2b7d6, 0x00418, 0x2c176, 0x003e2, 0x2cb15, 0x003af, 0x2d4b5, 0x0037f, 0x2de55 +}; diff --git a/src/context.h b/src/context.h index dd2a6ab0..8c949c8b 100644 --- a/src/context.h +++ b/src/context.h @@ -214,4 +214,6 @@ static const uint8_t INIT_ABS_FLAG[3][6] = }; +const uint32_t entropy_bits[ 128 ]; + #endif diff --git a/src/encoder.c b/src/encoder.c index 7474dfa7..d872b9e7 100644 --- a/src/encoder.c +++ b/src/encoder.c @@ -633,7 +633,7 @@ void encode_seq_parameter_set(encoder_control* encoder) //TODO: VUI? 
//encode_VUI(encoder); - WRITE_U(encoder->stream, 0, 1, "sps_extension_flag"); + WRITE_U(encoder->stream, 0, 1, "sps_extension_flag"); } void encode_vid_parameter_set(encoder_control* encoder) @@ -668,7 +668,7 @@ void encode_vid_parameter_set(encoder_control* encoder) //IF timing info //END IF - WRITE_U(encoder->stream, 0, 1, "vps_extension_flag"); + WRITE_U(encoder->stream, 0, 1, "vps_extension_flag"); } void encode_VUI(encoder_control* encoder) From 9f70bf74f00dbde267184ae5e29b02e92790c649 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Mon, 20 Jan 2014 16:34:11 +0200 Subject: [PATCH 2/9] Imported and converted RDOQ from HM 12.0, NOT WORKING YET --- build/VS2010/HEVC_encoder.vcxproj | 2 + build/VS2010/HEVC_encoder.vcxproj.filters | 6 + src/global.h | 10 + src/rdo.c | 575 ++++++++++++++++++++++ src/rdo.h | 47 ++ src/transform.h | 1 + 6 files changed, 641 insertions(+) create mode 100644 src/rdo.c create mode 100644 src/rdo.h diff --git a/build/VS2010/HEVC_encoder.vcxproj b/build/VS2010/HEVC_encoder.vcxproj index 76e9fd7f..3416e7ad 100644 --- a/build/VS2010/HEVC_encoder.vcxproj +++ b/build/VS2010/HEVC_encoder.vcxproj @@ -84,6 +84,7 @@ + @@ -101,6 +102,7 @@ + diff --git a/build/VS2010/HEVC_encoder.vcxproj.filters b/build/VS2010/HEVC_encoder.vcxproj.filters index 560b3c4a..b0124e9c 100644 --- a/build/VS2010/HEVC_encoder.vcxproj.filters +++ b/build/VS2010/HEVC_encoder.vcxproj.filters @@ -72,6 +72,9 @@ Source Files + + Source Files + @@ -125,6 +128,9 @@ Header Files + + Header Files + diff --git a/src/global.h b/src/global.h index 0d174daa..6b3868d6 100644 --- a/src/global.h +++ b/src/global.h @@ -142,4 +142,14 @@ typedef int16_t coefficient; #define FREE_POINTER(pointer) { free(pointer); pointer = NULL; } #define MOVE_POINTER(dst_pointer,src_pointer) { dst_pointer = src_pointer; src_pointer = NULL; } +#ifndef MAX_INT +#define MAX_INT 0x7FFFFFFF +#endif +#ifndef MAX_INT64 +#define MAX_INT64 0x7FFFFFFFFFFFFFFFLL +#endif +#ifndef MAX_DOUBLE +#define MAX_DOUBLE 
1.7e+308 +#endif + #endif \ No newline at end of file diff --git a/src/rdo.c b/src/rdo.c new file mode 100644 index 00000000..b8c1be4e --- /dev/null +++ b/src/rdo.c @@ -0,0 +1,575 @@ +/** + * \file + * + * \author Marko Viitanen ( fador@iki.fi ), + * Tampere University of Technology, + * Department of Pervasive Computing. + * \author Ari Koivula ( ari@koivu.la ), + * Tampere University of Technology, + * Department of Pervasive Computing. + */ + +#include +#include +#include + +#include "rdo.h" +#include "transform.h" +#include "context.h" +#include "cabac.h" + +#define QUANT_SHIFT 14 +#define MAX_TR_DYNAMIC_RANGE 15 +#define SCAN_SET_SIZE 16 +#define LOG2_SCAN_SET_SIZE 4 +#define SBH_THRESHOLD 4 + + + + +int32_t get_ic_rate( uint32_t abs_level, uint16_t ctx_num_one,uint16_t ctx_num_abs, + uint16_t abs_go_rice, uint32_t c1_idx, uint32_t c2_idx, int8_t type) +{ + int32_t iRate = 0; + uint32_t baseLevel = (c1_idx < C1FLAG_NUMBER)? (2 + (c2_idx < C2FLAG_NUMBER)) : 1; + cabac_ctx *base_one_ctx = (type == 0) ? &g_cu_one_model_luma[0] : &g_cu_one_model_chroma[0]; + cabac_ctx *base_abs_ctx = (type == 0) ? &g_cu_abs_model_luma[0] : &g_cu_abs_model_chroma[0]; + + if(!abs_level) return 0; + + if (abs_level >= baseLevel) { + uint32_t symbol = abs_level - baseLevel; + uint32_t max_vlc = g_go_rice_range[ abs_go_rice ]; + uint16_t pref_len,num_bins; + + if (symbol > max_vlc) { //Exp. 
Golomb + int32_t iEGS = 1; + uint32_t uiMax = 2; + abs_level = symbol - max_vlc; + for(; abs_level >= uiMax; uiMax <<= 1, iEGS += 2 ); + iRate += iEGS << 15; + symbol = MIN( symbol, ( max_vlc + 1 ) ); + } + + pref_len = (uint16_t)(symbol >> abs_go_rice) + 1; + num_bins = MIN( pref_len, g_go_rice_prefix_len[ abs_go_rice ] ) + abs_go_rice; + + iRate += num_bins << 15; + + if (c1_idx < C1FLAG_NUMBER) { + iRate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1); + if (c2_idx < C2FLAG_NUMBER) { + iRate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],1); + } + } + } else if( abs_level == 1 ) { + iRate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],0); + } else if( abs_level == 2 ) { + iRate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1); + iRate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],0); + } + return iRate; +} + +/** Get the best level in RD sense + * \param coded_cost reference to coded cost + * \param coded_cost0 reference to cost when coefficient is 0 + * \param coded_cost_sig reference to cost of significant coefficient + * \param level_double reference to unscaled quantized level + * \param max_abs_level scaled quantized level + * \param ctx_num_sig current ctxInc for coeff_abs_significant_flag + * \param ctx_num_one current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC) + * \param ctx_num_abs current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC) + * \param abs_go_rice current Rice parameter for coeff_abs_level_minus3 + * \param q_bits quantization step size + * \param temp correction factor + * \param last indicates if the coefficient is the last significant + * \returns best quantized transform level for given scan position + * This method calculates the best quantized transform level for a given scan position. 
+ * From HM 12.0 + */ +uint32_t get_coded_level ( encoder_control* encoder, double *coded_cost, double *coded_cost0, double *coded_cost_sig, + int32_t level_double, uint32_t max_abs_level, + uint16_t ctx_num_sig, uint16_t ctx_num_one, uint16_t ctx_num_abs, + uint16_t abs_go_rice, + uint32_t c1_idx, uint32_t c2_idx, + int32_t q_bits,double temp, int8_t last, int8_t type) +{ + double cur_cost_sig = 0; + uint32_t best_abs_level = 0; + int32_t abs_level; + int32_t min_abs_level; + cabac_ctx* base_sig_model = type?g_cu_sig_model_chroma:g_cu_sig_model_luma; + + if( !last && max_abs_level < 3 ) { + *coded_cost_sig = g_lambda_cost[encoder->QP] * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 0); + *coded_cost = *coded_cost0 + *coded_cost_sig; + if (max_abs_level == 0) return best_abs_level; + } else { + *coded_cost = MAX_DOUBLE; + } + + if( !last ) { + cur_cost_sig = g_lambda_cost[encoder->QP] * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 1); + } + + min_abs_level = ( max_abs_level > 1 ? max_abs_level - 1 : 1 ); + for (abs_level = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) { + double err = (double)(level_double - ( abs_level << q_bits ) ); + double cur_cost = err * err * temp + get_ic_rate( abs_level, ctx_num_one, ctx_num_abs, abs_go_rice, c1_idx, c2_idx, type); + cur_cost += cur_cost_sig; + + if( cur_cost < *coded_cost ) { + best_abs_level = abs_level; + *coded_cost = cur_cost; + *coded_cost_sig = cur_cost_sig; + } + } + + return best_abs_level; +} + +/** RDOQ with CABAC + * \returns void + * Rate distortion optimized quantization for entropy + * coding engines using probability models like CABAC + * From HM 12.0 + */ +void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, int32_t width, + int32_t height, uint32_t *abs_sum, int8_t type, int8_t scan_idx, int8_t block_type, int8_t scan_mode, int8_t tr_depth) +{ + uint32_t log2_tr_size = g_convert_to_bit[ width ] + 2; + int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - g_bitdepth - 
log2_tr_size; // Represents scaling through forward transform + uint32_t go_rice_param = 0; + uint32_t log2_block_size = g_convert_to_bit[ width ] + 2; + uint32_t max_num_coeff = width * height; + int32_t scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]); + int32_t qp_base = encoder->QP; + + int32_t qp_scaled; + int32_t qp_offset = 0; + + if(type == 0) { + qp_scaled = qp_base + qp_offset; + } else { + qp_scaled = CLIP(-qp_offset, 57, qp_base); + if(qp_scaled < 0) { + qp_scaled = qp_scaled + qp_offset; + } else { + qp_scaled = g_chroma_scale[qp_scaled] + qp_offset; + } + } + + { + int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift; + int32_t add = ((encoder->in.cur_pic->slicetype == SLICE_I) ? 171 : 85) << (q_bits - 9); + + int32_t *quant_coeff_org = g_quant_coeff[log2_tr_size-2][scalinglist_type][qp_scaled%6]; + int32_t *quant_coeff = quant_coeff_org; + + double *err_scale_org = NULL;//getErrScaleCoeff(scalingListType,uiLog2TrSize-2,m_cQP.m_iRem); + double *err_scale = err_scale_org; + + double block_uncoded_cost = 0; + + double cost_coeff [ 32 * 32 ]; + double cost_sig [ 32 * 32 ]; + double cost_coeff0[ 32 * 32 ]; + + int32_t rate_inc_up [ 32 * 32 ]; + int32_t rate_inc_down [ 32 * 32 ]; + int32_t sig_rate_delta[ 32 * 32 ]; + int32_t delta_u [ 32 * 32 ]; + + + const uint32_t *scan_cg = NULL; + const int32_t shift = 4>>1; + const uint32_t cg_size = 16; + const uint32_t num_blk_side = width >> shift; + double cost_coeffgroup_sig[ 64 ]; + uint32_t sig_coeffgroup_flag[ 64 ]; + + int32_t cg_last_scanpos = -1; + + uint32_t ctx_set = 0; + int32_t c1 = 1; + int32_t c2 = 0; + double base_cost = 0; + int32_t last_scanpos = -1; + + uint32_t c1_idx = 0; + uint32_t c2_idx = 0; + int32_t base_level; + + uint32_t *scan = g_sig_last_scan[ scan_idx ][ log2_block_size - 1 ]; + + + uint32_t cg_num = width * height >> 4; + int32_t scanpos; + + cabac_ctx *base_coeff_group_ctx = &g_cu_sig_coeff_group_model[type]; + cabac_ctx *baseCtx = (type == 0) 
? &g_cu_sig_model_luma[0] : &g_cu_sig_model_chroma[0]; + cabac_ctx *base_one_ctx = (type == 0) ? &g_cu_one_model_luma[0] : &g_cu_one_model_chroma[0]; + + + double best_cost = 0; + int32_t ctx_cbf = 0; + int32_t best_last_idx_p1 = 0; + int8_t found_last = 0; + int32_t cg_scanpos, scanpos_in_cg; + + coeffgroup_rd_stats rd_stats; + + memset( cost_coeff, 0, sizeof(double) * max_num_coeff ); + memset( cost_sig, 0, sizeof(double) * max_num_coeff ); + memset( rate_inc_up, 0, sizeof(int32_t) * max_num_coeff ); + memset( rate_inc_down, 0, sizeof(int32_t) * max_num_coeff ); + memset( sig_rate_delta, 0, sizeof(int32_t) * max_num_coeff ); + memset( delta_u, 0, sizeof(int32_t) * max_num_coeff ); + + + memset( cost_coeffgroup_sig, 0, sizeof(double) * 64 ); + memset( sig_coeffgroup_flag, 0, sizeof(uint32_t) * 64 ); + + scan_cg = g_sig_last_scan[scan_mode][log2_block_size > 3 ? log2_block_size - 3 : 0]; + + if (log2_block_size == 3) { + scan_cg = g_sig_last_scan_8x8[scan_mode]; + } else if (log2_block_size == 5) { + scan_cg = g_sig_last_scan_32x32; + } + + for (cg_scanpos = cg_num-1; cg_scanpos >= 0; cg_scanpos--) { + uint32_t cg_blkpos = scan_cg[ cg_scanpos ]; + uint32_t cg_pos_y = cg_blkpos / num_blk_side; + uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * num_blk_side); + int32_t scanpos_in_cg; + + int32_t pattern_sig_ctx = context_calc_pattern_sig_ctx(sig_coeffgroup_flag, + cg_pos_x, cg_pos_y, width); + + memset( &rd_stats, 0, sizeof (coeffgroup_rd_stats)); + for (scanpos_in_cg = cg_size-1; scanpos_in_cg >= 0; scanpos_in_cg--) { + uint32_t blkpos; + int32_t q; + double temp, err; + int32_t level_double; + uint32_t max_abs_level; + + scanpos = cg_scanpos*cg_size + scanpos_in_cg; + blkpos = scan[scanpos]; + q = quant_coeff[blkpos]; + temp = err_scale[blkpos]; + level_double = coef[blkpos]; + level_double = MIN(abs(level_double) * q , MAX_INT - (1 << (q_bits - 1))); + max_abs_level = (level_double + (1 << (q_bits - 1))) >> q_bits; + + err = (double)level_double; + cost_coeff0[ scanpos 
] = err * err * temp; + block_uncoded_cost += cost_coeff0[ scanpos ]; + dest_coeff[ blkpos ] = max_abs_level; + + if ( max_abs_level > 0 && last_scanpos < 0 ) { + last_scanpos = scanpos; + ctx_set = (scanpos > 0 && type == 0) ? 2 : 0; + cg_last_scanpos = cg_scanpos; + } + + if ( last_scanpos >= 0 ) { + //===== coefficient level estimation ===== + int32_t level; + uint32_t one_ctx = 4 * ctx_set + c1; + uint32_t abs_ctx = ctx_set + c2; + + if( scanpos == last_scanpos ) { + level = get_coded_level(encoder, &cost_coeff[ scanpos ], &cost_coeff0[ scanpos ], &cost_sig[ scanpos ], + level_double, max_abs_level, 0, one_ctx, abs_ctx, go_rice_param, + c1_idx, c2_idx, q_bits, temp, 1, type ); + } else { + uint32_t pos_y = blkpos >> log2_block_size; + uint32_t pos_x = blkpos - ( pos_y << log2_block_size ); + uint16_t ctx_sig = context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y, + log2_block_size, width, type); + level = get_coded_level(encoder, &cost_coeff[ scanpos ], &cost_coeff0[ scanpos ], &cost_sig[ scanpos ], + level_double, max_abs_level, ctx_sig, one_ctx, abs_ctx, go_rice_param, + c1_idx, c2_idx, q_bits, temp, 0, type ); + sig_rate_delta[ blkpos ] = CTX_ENTROPY_BITS(&baseCtx[ctx_sig],1) - CTX_ENTROPY_BITS(&baseCtx[ctx_sig],0); + } + delta_u[ blkpos ] = (level_double - ((int32_t)level << q_bits)) >> (q_bits-8); + if( level > 0 ) { + int32_t rate_now = get_ic_rate( level, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type); + rate_inc_up [blkpos] = get_ic_rate( level+1, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type) - rate_now; + rate_inc_down[blkpos] = get_ic_rate( level-1, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type) - rate_now; + } else { // level == 0 + rate_inc_up[blkpos] = CTX_ENTROPY_BITS(&base_one_ctx[one_ctx],0); + } + dest_coeff[blkpos] = level; + base_cost += cost_coeff[scanpos]; + + base_level = (c1_idx < C1FLAG_NUMBER) ? 
(2 + (c2_idx < C2FLAG_NUMBER)) : 1; + if( level >= base_level ) { + if(level > 3*(1<= 1) c1_idx ++; + + //===== update bin model ===== + if (level > 1) { + c1 = 0; + c2 += (c2 < 2); + c2_idx ++; + } else if( (c1 < 3) && (c1 > 0) && level) { + c1++; + } + + //===== context set update ===== + if ((scanpos % SCAN_SET_SIZE == 0) && scanpos > 0) { + c2 = 0; + go_rice_param = 0; + + c1_idx = 0; + c2_idx = 0; + ctx_set = (scanpos == SCAN_SET_SIZE || type!=0) ? 0 : 2; + if( c1 == 0 ) { + ctx_set++; + } + c1 = 1; + } + } else { + base_cost += cost_coeff0[scanpos]; + } + rd_stats.sig_cost += cost_sig[scanpos]; + if (scanpos_in_cg == 0 ) { + rd_stats.sig_cost_0 = cost_sig[scanpos]; + } + if (dest_coeff[ blkpos ] ) { + sig_coeffgroup_flag[ cg_blkpos ] = 1; + rd_stats.coded_level_and_dist += cost_coeff[scanpos] - cost_sig[scanpos]; + rd_stats.uncoded_dist += cost_coeff0[scanpos]; + if ( scanpos_in_cg != 0 ) { + rd_stats.nnz_before_pos0++; + } + } + } //end for (scanpos_in_cg) + + if (cg_last_scanpos >= 0) { + if( cg_scanpos ) { + if (sig_coeffgroup_flag[ cg_blkpos ] == 0) { + uint32_t ctx_sig = context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, + cg_pos_y, width); + cost_coeffgroup_sig[ cg_scanpos ] = CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); + base_cost += cost_coeffgroup_sig[ cg_scanpos ] - rd_stats.sig_cost; + + } else { + if (cg_scanpos < cg_last_scanpos) {//skip the last coefficient group, which will be handled together with last position below. 
+ double cost_zero_cg; + uint32_t ctx_sig; + if (rd_stats.nnz_before_pos0 == 0) { + base_cost -= rd_stats.sig_cost_0; + rd_stats.sig_cost -= rd_stats.sig_cost_0; + } + // rd-cost if SigCoeffGroupFlag = 0, initialization + cost_zero_cg = base_cost; + + // add SigCoeffGroupFlag cost to total cost + ctx_sig = context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, + cg_pos_y, width); + if (cg_scanpos < cg_last_scanpos) { + cost_coeffgroup_sig[cg_scanpos] = CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],1); + base_cost += cost_coeffgroup_sig[cg_scanpos]; + cost_zero_cg += CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); + } + + // try to convert the current coeff group from non-zero to all-zero + cost_zero_cg += rd_stats.uncoded_dist; // distortion for resetting non-zero levels to zero levels + cost_zero_cg -= rd_stats.coded_level_and_dist; // distortion and level cost for keeping all non-zero levels + cost_zero_cg -= rd_stats.sig_cost; // sig cost for all coeffs, including zero levels and non-zero levels + + // if we can save cost, change this block to all-zero block + if (cost_zero_cg < base_cost) { + int32_t scanpos_in_cg; + sig_coeffgroup_flag[ cg_blkpos ] = 0; + base_cost = cost_zero_cg; + if (cg_scanpos < cg_last_scanpos) { + cost_coeffgroup_sig[ cg_scanpos ] = CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); + } + // reset coeffs to 0 in this block + for (scanpos_in_cg = cg_size-1; scanpos_in_cg >= 0; scanpos_in_cg--) { + uint32_t blkpos; + scanpos = cg_scanpos*cg_size + scanpos_in_cg; + blkpos = scan[ scanpos ]; + + if (dest_coeff[ blkpos ]) { + dest_coeff[ blkpos ] = 0; + cost_coeff[ scanpos ] = cost_coeff0[ scanpos ]; + cost_sig [ scanpos ] = 0; + } + } + } // end if ( cost_all_zeros < base_cost ) + } + } // end if (sig_coeffgroup_flag[ cg_blkpos ] == 0) + } else { + sig_coeffgroup_flag[ cg_blkpos ] = 1; + } + } + } //end for (cg_scanpos) + + //===== estimate last position ===== + if (last_scanpos < 0) return; + + + if( block_type != CU_INTRA && 
!type/* && pcCU->getTransformIdx( uiAbsPartIdx ) == 0*/ ) { + best_cost = block_uncoded_cost + g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&g_cu_qt_root_cbf_model,0); + base_cost += g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&g_cu_qt_root_cbf_model,1); + } else { + cabac_ctx* base_cbf_model = type?g_qt_cbf_model_chroma:g_qt_cbf_model_luma; + ctx_cbf = ( type ? tr_depth : !tr_depth); + best_cost = block_uncoded_cost + g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0); + base_cost += g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1); + } + + for (cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) { + uint32_t cg_blkpos = scan_cg[cg_scanpos]; + + base_cost -= cost_coeffgroup_sig[cg_scanpos]; + if (sig_coeffgroup_flag[ cg_blkpos ]) { + for (scanpos_in_cg = cg_size-1; scanpos_in_cg >= 0; scanpos_in_cg--) { + uint32_t blkpos; + scanpos = cg_scanpos*cg_size + scanpos_in_cg; + if (scanpos > last_scanpos) continue; + blkpos = scan[scanpos]; + + if( dest_coeff[ blkpos ] ) { + uint32_t pos_y = blkpos >> log2_block_size; + uint32_t pos_x = blkpos - ( pos_y << log2_block_size ); + + double cost_last = 0.0;//scan_idx == SCAN_VER ? xGetRateLast( pos_y, pos_x ) : xGetRateLast( pos_x, pos_y ); + double totalCost = base_cost + cost_last - cost_sig[ scanpos ]; + + if( totalCost < best_cost ) { + best_last_idx_p1 = scanpos + 1; + best_cost = totalCost; + } + if( dest_coeff[ blkpos ] > 1 ) { + found_last = 1; + break; + } + base_cost -= cost_coeff[ scanpos ]; + base_cost += cost_coeff0[ scanpos ]; + } else { + base_cost -= cost_sig[ scanpos ]; + } + } //end for + if (found_last) break; + } // end if (sig_coeffgroup_flag[ cg_blkpos ]) + } // end for + + for ( scanpos = 0; scanpos < best_last_idx_p1; scanpos++ ) { + int32_t blkPos = scan[ scanpos ]; + int32_t level = dest_coeff[ blkPos ]; + *abs_sum += level; + dest_coeff[ blkPos ] = ( coef[ blkPos ] < 0 ) ? 
-level : level; + } + + //===== clean uncoded coefficients ===== + for ( scanpos = best_last_idx_p1; scanpos <= last_scanpos; scanpos++ ) { + dest_coeff[ scan[ scanpos ] ] = 0; + } +#if ENABLE_SIGN_HIDING == 1 + if(*abs_sum >= 2) { + int64_t rd_factor = (int64_t) ( + g_inv_quant_scales[qp_scaled%6] * g_inv_quant_scales[qp_scaled%6] * (1<<(2*(qp_scaled/6))) + / g_lambda_cost[encoder->QP] / 16 / (1<<(2*(g_bitdepth-8))) + + 0.5); + int32_t lastCG = -1; + int32_t absSum = 0; + int32_t n,subset; + + for (subset = (width*height-1) >> LOG2_SCAN_SET_SIZE; subset >= 0; subset--) { + int32_t subPos = subset << LOG2_SCAN_SET_SIZE; + int32_t firstNZPosInCG=SCAN_SET_SIZE, lastNZPosInCG = -1; + absSum = 0; + + for(n = SCAN_SET_SIZE-1; n >= 0; --n ) { + if( dest_coeff[ scan[ n + subPos ]] ) { + lastNZPosInCG = n; + break; + } + } + + for(n = 0; n =0 && lastCG==-1) lastCG = 1; + + if (lastNZPosInCG-firstNZPosInCG >= SBH_THRESHOLD ) { + uint32_t signbit = (dest_coeff[scan[subPos+firstNZPosInCG]]>0?0:1); + if( signbit!=(absSum&0x1) ) { // hide but need tune + // calculate the cost + int64_t minCostInc = MAX_INT64, curCost=MAX_INT64; + int32_t minPos =-1, finalChange=0, curChange=0; + + for( n = (lastCG==1?lastNZPosInCG:SCAN_SET_SIZE-1) ; n >= 0; --n ) { + uint32_t blkpos = scan[ n + subPos ]; + if(dest_coeff[ blkpos ] != 0 ) { + int64_t costUp = rd_factor * (-delta_u[blkpos]) + rate_inc_up[blkpos]; + int64_t costDown = rd_factor * ( delta_u[blkpos]) + rate_inc_down[blkpos] + - ( abs(dest_coeff[blkpos])==1?((1<<15)+sig_rate_delta[blkpos]):0 ); + + if(lastCG==1 && lastNZPosInCG==n && abs(dest_coeff[blkpos])==1) { + costDown -= (4<<15); + } + + if(costUp= 0) ? 
0 : 1) != signbit ) curCost = MAX_INT64; + } + } + + if( curCost=0) { + dest_coeff[minPos] += finalChange; + } else { + dest_coeff[minPos] -= finalChange; + } + } + } + if(lastCG==1) lastCG = 0; + } + } +#endif + } +} diff --git a/src/rdo.h b/src/rdo.h new file mode 100644 index 00000000..f8ac4c87 --- /dev/null +++ b/src/rdo.h @@ -0,0 +1,47 @@ +#ifndef RDO_H_ +#define RDO_H_ +/** + * \file + * \brief Handling Rate-Distortion Optimization related functionality + * + * \author Marko Viitanen ( fador@iki.fi ), + * Tampere University of Technology, + * Department of Pervasive Computing. + * \author Ari Koivula ( ari@koivu.la ), + * Tampere University of Technology, + * Department of Pervasive Computing. + */ + +#include "global.h" + +#include "encoder.h" + + +typedef struct +{ + double coded_level_and_dist; + double uncoded_dist; + double sig_cost; + double sig_cost_0; + int32_t nnz_before_pos0; +} coeffgroup_rd_stats; + +const uint32_t g_go_rice_range[5] = { 7, 14, 26, 46, 78 }; +const uint32_t g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 }; + + +void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, int32_t width, + int32_t height, uint32_t *abs_sum, int8_t type, int8_t scan_idx, int8_t block_type, int8_t scan_mode, int8_t tr_depth); + + +int32_t get_ic_rate( uint32_t abs_level, uint16_t ctx_num_one,uint16_t ctx_num_abs, + uint16_t abs_go_rice, uint32_t c1_idx, uint32_t c2_idx, int8_t type); +uint32_t get_coded_level ( encoder_control* encoder, double* coded_cost, double* coded_cost0, double* coded_cost_sig, + int32_t level_double, uint32_t max_abs_level, + uint16_t ctx_num_sig, uint16_t ctx_num_one, uint16_t ctx_num_abs, + uint16_t abs_go_rice, + uint32_t c1_idx, uint32_t c2_idx, + int32_t q_bits,double temp, int8_t last, int8_t type); + + +#endif diff --git a/src/transform.h b/src/transform.h index 1767da55..960f410b 100644 --- a/src/transform.h +++ b/src/transform.h @@ -32,6 +32,7 @@ extern int32_t* g_quant_coeff[4][6][6]; extern const 
int32_t g_quant_intra_default_8x8[64]; extern const uint8_t g_chroma_scale[58]; +extern const int16_t g_inv_quant_scales[6]; void quant(encoder_control *encoder, int16_t *coef, int16_t *q_coef, int32_t width, From f447b92755739a7dd876f053cf5ba91bb26ae3ca Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 22 Jan 2014 14:11:55 +0200 Subject: [PATCH 3/9] Added error scaling list calculation from HM 12.0 --- src/transform.c | 33 +++++++++++++++++++++++++++++++-- src/transform.h | 4 ++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/transform.c b/src/transform.c index 5d7b5c59..06e89831 100644 --- a/src/transform.c +++ b/src/transform.c @@ -151,6 +151,7 @@ const uint8_t g_chroma_scale[58]= int32_t *g_quant_coeff[4][6][6]; int32_t *g_de_quant_coeff[4][6][6]; +double *g_error_scale[4][6][6]; const uint8_t g_scaling_list_num[4] = { 6, 6, 6, 2}; const uint16_t g_scaling_list_size[4] = { 16, 64, 256,1024}; @@ -177,6 +178,7 @@ void scalinglist_init() if (!(sizeId == 3 && listId == 3)) { g_quant_coeff[sizeId][listId][qp] = (int32_t*)calloc(g_scaling_list_size[sizeId], sizeof(int32_t)); g_de_quant_coeff[sizeId][listId][qp] = (int32_t*)calloc(g_scaling_list_size[sizeId], sizeof(int32_t)); + g_error_scale[sizeId][listId][qp] = (double*)calloc(g_scaling_list_size[sizeId], sizeof(double)); } } } @@ -185,6 +187,7 @@ void scalinglist_init() for (qp = 0; qp < 6; qp++) { g_quant_coeff[3][3][qp] = g_quant_coeff[3][1][qp]; g_de_quant_coeff[3][3][qp] = g_de_quant_coeff[3][1][qp]; + g_error_scale[3][3][qp] = g_error_scale[3][1][qp]; } } @@ -202,6 +205,7 @@ void scalinglist_destroy() if (!(sizeId == 3 && listId == 3)) { free( g_quant_coeff[sizeId][listId][qp]); free(g_de_quant_coeff[sizeId][listId][qp]); + free( g_error_scale[sizeId][listId][qp]); } } } @@ -238,6 +242,7 @@ void scalinglist_process() for (qp = 0; qp < SCALING_LIST_REM_NUM; qp++) { scalinglist_set(list_ptr, list, size, qp); + scalinglist_set_err_scale(list, size, qp); } } } @@ -246,9 +251,32 @@ void 
scalinglist_process() } +/** set error scale coefficients + * \param list List ID + * \param size Size + * \param qp Quantization parameter + */ +#define MAX_TR_DYNAMIC_RANGE 15 +void scalinglist_set_err_scale(uint32_t list,uint32_t size, uint32_t qp) +{ + uint32_t log2_tr_size = g_convert_to_bit[ g_scaling_list_size_x[size] ] + 2; + int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - g_bitdepth - log2_tr_size; // Represents scaling through forward transform + + uint32_t i,max_num_coeff = g_scaling_list_size[size]; + int32_t *quantcoeff = g_quant_coeff[size][list][qp]; + double *err_scale = g_error_scale[size][list][qp]; + + // Compensate for scaling of bitcount in Lagrange cost function + double scale = (double)(1<<15); + // Compensate for scaling through forward transform + scale = scale*pow(2.0,-2.0*transform_shift); + for(i=0;i + extern int32_t* g_quant_coeff[4][6][6]; +extern double* g_error_scale[4][6][6]; extern const int32_t g_quant_intra_default_8x8[64]; extern const uint8_t g_chroma_scale[58]; extern const int16_t g_inv_quant_scales[6]; @@ -47,6 +50,7 @@ void scalinglist_process_enc( int32_t *coeff, int32_t *quant_coeff, int32_t quan uint32_t height,uint32_t width, uint32_t ratio, int32_t size_num, uint32_t dc, uint8_t flat); void scalinglist_process(); void scalinglist_set(int32_t *coeff, uint32_t list_id, uint32_t size_id, uint32_t qp); +void scalinglist_set_err_scale(uint32_t list,uint32_t size, uint32_t qp); void scalinglist_destroy(); #endif From 144d5293b1f97a0cb1a3f9f7997a0f3d1917b014 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 22 Jan 2014 14:12:46 +0200 Subject: [PATCH 4/9] Implemented RDOQ function get_rate_last() --- src/rdo.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 8 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index b8c1be4e..07f98e39 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -114,21 +114,73 @@ uint32_t get_coded_level ( encoder_control* encoder, double *coded_cost, double 
} min_abs_level = ( max_abs_level > 1 ? max_abs_level - 1 : 1 ); - for (abs_level = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) { - double err = (double)(level_double - ( abs_level << q_bits ) ); - double cur_cost = err * err * temp + get_ic_rate( abs_level, ctx_num_one, ctx_num_abs, abs_go_rice, c1_idx, c2_idx, type); - cur_cost += cur_cost_sig; + for (abs_level = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) { + double err = (double)(level_double - ( abs_level << q_bits ) ); + double cur_cost = err * err * temp + get_ic_rate( abs_level, ctx_num_one, ctx_num_abs, abs_go_rice, c1_idx, c2_idx, type); + cur_cost += cur_cost_sig; if( cur_cost < *coded_cost ) { - best_abs_level = abs_level; + best_abs_level = abs_level; *coded_cost = cur_cost; - *coded_cost_sig = cur_cost_sig; + *coded_cost_sig = cur_cost_sig; } } return best_abs_level; } + +/** Calculates the cost of signaling the last significant coefficient in the block + * \param pos_x X coordinate of the last significant coefficient + * \param pos_y Y coordinate of the last significant coefficient + * \returns cost of last significant coefficient + * \param uiWidth width of the transform unit (TU) + * + * From HM 12.0 +*/ +double get_rate_last(encoder_control* encoder, const uint32_t pos_x, const uint32_t pos_y, int32_t* last_x_bits, int32_t* last_y_bits) +{ + uint32_t ctx_x = g_group_idx[pos_x]; + uint32_t ctx_y = g_group_idx[pos_y]; + double uiCost = last_x_bits[ ctx_x ] + last_y_bits[ ctx_y ]; + if( ctx_x > 3 ) { + uiCost += 32768.0 * ((ctx_x-2)>>1); + } + if( ctx_y > 3 ) { + uiCost += 32768.0 * ((ctx_y-2)>>1); + } + return g_lambda_cost[encoder->QP]*uiCost; +} + +void calc_last_bits(int32_t width, int32_t height, int8_t type, int32_t* last_x_bits, int32_t* last_y_bits) +{ + int32_t bits_x = 0, bits_y = 0; + int32_t blk_size_offset_x, blk_size_offset_y, shiftX, shiftY; + int32_t ctx; + + cabac_ctx *base_ctx_x = (type ? 
g_cu_ctx_last_x_chroma : g_cu_ctx_last_x_luma); + cabac_ctx *base_ctx_y = (type ? g_cu_ctx_last_y_chroma : g_cu_ctx_last_y_luma); + + blk_size_offset_x = type ? 0: (g_convert_to_bit[ width ] *3 + ((g_convert_to_bit[ width ] +1)>>2)); + blk_size_offset_y = type ? 0: (g_convert_to_bit[ height ]*3 + ((g_convert_to_bit[ height ]+1)>>2)); + shiftX = type ? g_convert_to_bit[ width ] :((g_convert_to_bit[ width ]+3)>>2); + shiftY = type ? g_convert_to_bit[ height ] :((g_convert_to_bit[ height ]+3)>>2); + + + for (ctx = 0; ctx < g_group_idx[ width - 1 ]; ctx++) { + int32_t ctx_offset = blk_size_offset_x + (ctx >>shiftX); + last_x_bits[ ctx ] = bits_x + CTX_ENTROPY_BITS(&base_ctx_x[ ctx_offset ],0); + bits_x += CTX_ENTROPY_BITS(&base_ctx_x[ ctx_offset ],1); + } + last_x_bits[ctx] = bits_x; + for (ctx = 0; ctx < g_group_idx[ height - 1 ]; ctx++) { + int32_t ctx_offset = blk_size_offset_y + (ctx >>shiftY); + last_y_bits[ ctx ] = bits_y + CTX_ENTROPY_BITS(&base_ctx_y[ ctx_offset ],0); + bits_y += CTX_ENTROPY_BITS(&base_ctx_y[ ctx_offset ],1); + } + last_y_bits[ctx] = bits_y; +} + /** RDOQ with CABAC * \returns void * Rate distortion optimized quantization for entropy @@ -167,7 +219,7 @@ void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, int32_t *quant_coeff_org = g_quant_coeff[log2_tr_size-2][scalinglist_type][qp_scaled%6]; int32_t *quant_coeff = quant_coeff_org; - double *err_scale_org = NULL;//getErrScaleCoeff(scalingListType,uiLog2TrSize-2,m_cQP.m_iRem); + double *err_scale_org = g_error_scale[scalinglist_type][log2_tr_size-2][qp_scaled%6]; double *err_scale = err_scale_org; double block_uncoded_cost = 0; @@ -220,6 +272,12 @@ void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, coeffgroup_rd_stats rd_stats; + int32_t last_x_bits[32],last_y_bits[32]; + + calc_last_bits(width, height, type,last_x_bits, last_y_bits); + + + memset( cost_coeff, 0, sizeof(double) * max_num_coeff ); memset( cost_sig, 0, sizeof(double) * 
max_num_coeff ); memset( rate_inc_up, 0, sizeof(int32_t) * max_num_coeff ); @@ -444,7 +502,7 @@ void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, uint32_t pos_y = blkpos >> log2_block_size; uint32_t pos_x = blkpos - ( pos_y << log2_block_size ); - double cost_last = 0.0;//scan_idx == SCAN_VER ? xGetRateLast( pos_y, pos_x ) : xGetRateLast( pos_x, pos_y ); + double cost_last = scan_idx == SCAN_VER ? get_rate_last(encoder, pos_y, pos_x,last_x_bits,last_y_bits) : get_rate_last(encoder, pos_x, pos_y, last_x_bits,last_y_bits ); double totalCost = base_cost + cost_last - cost_sig[ scanpos ]; if( totalCost < best_cost ) { From 80b3b4a6e0febd752061bab324226ca467fe4bac Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 22 Jan 2014 17:50:51 +0200 Subject: [PATCH 5/9] Added missing lambda parameter to some RDOQ costs and moved go_rice arrays from header file --- src/rdo.c | 42 +++++++++++++++++------------------------- src/rdo.h | 6 +++--- 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index 07f98e39..55049db2 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -25,7 +25,8 @@ #define SBH_THRESHOLD 4 - +const uint32_t g_go_rice_range[5] = { 7, 14, 26, 46, 78 }; +const uint32_t g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 }; int32_t get_ic_rate( uint32_t abs_level, uint16_t ctx_num_one,uint16_t ctx_num_abs, uint16_t abs_go_rice, uint32_t c1_idx, uint32_t c2_idx, int8_t type) @@ -115,7 +116,7 @@ uint32_t get_coded_level ( encoder_control* encoder, double *coded_cost, double min_abs_level = ( max_abs_level > 1 ? 
max_abs_level - 1 : 1 ); for (abs_level = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) { - double err = (double)(level_double - ( abs_level << q_bits ) ); + double err = (double)(level_double - ( abs_level << q_bits ) ); double cur_cost = err * err * temp + get_ic_rate( abs_level, ctx_num_one, ctx_num_abs, abs_go_rice, c1_idx, c2_idx, type); cur_cost += cur_cost_sig; @@ -188,7 +189,7 @@ void calc_last_bits(int32_t width, int32_t height, int8_t type, int32_t* last_x_ * From HM 12.0 */ void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, int32_t width, - int32_t height, uint32_t *abs_sum, int8_t type, int8_t scan_idx, int8_t block_type, int8_t scan_mode, int8_t tr_depth) + int32_t height, uint32_t *abs_sum, int8_t type, int8_t block_type, int8_t scan_mode, int8_t tr_depth) { uint32_t log2_tr_size = g_convert_to_bit[ width ] + 2; int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - g_bitdepth - log2_tr_size; // Represents scaling through forward transform @@ -214,13 +215,9 @@ void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, { int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift; - int32_t add = ((encoder->in.cur_pic->slicetype == SLICE_I) ? 
171 : 85) << (q_bits - 9); - - int32_t *quant_coeff_org = g_quant_coeff[log2_tr_size-2][scalinglist_type][qp_scaled%6]; - int32_t *quant_coeff = quant_coeff_org; - - double *err_scale_org = g_error_scale[scalinglist_type][log2_tr_size-2][qp_scaled%6]; - double *err_scale = err_scale_org; + + int32_t *quant_coeff = g_quant_coeff[log2_tr_size-2][scalinglist_type][qp_scaled%6]; + double *err_scale = g_error_scale[log2_tr_size-2][scalinglist_type][qp_scaled%6]; double block_uncoded_cost = 0; @@ -253,7 +250,7 @@ void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, uint32_t c2_idx = 0; int32_t base_level; - uint32_t *scan = g_sig_last_scan[ scan_idx ][ log2_block_size - 1 ]; + uint32_t *scan = g_sig_last_scan[ scan_mode ][ log2_block_size - 1 ]; uint32_t cg_num = width * height >> 4; @@ -263,28 +260,23 @@ void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, cabac_ctx *baseCtx = (type == 0) ? &g_cu_sig_model_luma[0] : &g_cu_sig_model_chroma[0]; cabac_ctx *base_one_ctx = (type == 0) ? 
&g_cu_one_model_luma[0] : &g_cu_one_model_chroma[0]; - double best_cost = 0; int32_t ctx_cbf = 0; int32_t best_last_idx_p1 = 0; int8_t found_last = 0; int32_t cg_scanpos, scanpos_in_cg; - coeffgroup_rd_stats rd_stats; + coeffgroup_rd_stats rd_stats; int32_t last_x_bits[32],last_y_bits[32]; - calc_last_bits(width, height, type,last_x_bits, last_y_bits); - - - - memset( cost_coeff, 0, sizeof(double) * max_num_coeff ); - memset( cost_sig, 0, sizeof(double) * max_num_coeff ); + + memset( cost_coeff, 0, sizeof(double) * max_num_coeff ); + memset( cost_sig, 0, sizeof(double) * max_num_coeff ); memset( rate_inc_up, 0, sizeof(int32_t) * max_num_coeff ); memset( rate_inc_down, 0, sizeof(int32_t) * max_num_coeff ); memset( sig_rate_delta, 0, sizeof(int32_t) * max_num_coeff ); memset( delta_u, 0, sizeof(int32_t) * max_num_coeff ); - memset( cost_coeffgroup_sig, 0, sizeof(double) * 64 ); memset( sig_coeffgroup_flag, 0, sizeof(uint32_t) * 64 ); @@ -416,7 +408,7 @@ void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, if (sig_coeffgroup_flag[ cg_blkpos ] == 0) { uint32_t ctx_sig = context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, cg_pos_y, width); - cost_coeffgroup_sig[ cg_scanpos ] = CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); + cost_coeffgroup_sig[ cg_scanpos ] = g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); base_cost += cost_coeffgroup_sig[ cg_scanpos ] - rd_stats.sig_cost; } else { @@ -434,9 +426,9 @@ void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, ctx_sig = context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, cg_pos_y, width); if (cg_scanpos < cg_last_scanpos) { - cost_coeffgroup_sig[cg_scanpos] = CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],1); + cost_coeffgroup_sig[cg_scanpos] = g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],1); base_cost += cost_coeffgroup_sig[cg_scanpos]; - cost_zero_cg += 
CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); + cost_zero_cg += g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); } // try to convert the current coeff group from non-zero to all-zero @@ -450,7 +442,7 @@ void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, sig_coeffgroup_flag[ cg_blkpos ] = 0; base_cost = cost_zero_cg; if (cg_scanpos < cg_last_scanpos) { - cost_coeffgroup_sig[ cg_scanpos ] = CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); + cost_coeffgroup_sig[ cg_scanpos ] = g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); } // reset coeffs to 0 in this block for (scanpos_in_cg = cg_size-1; scanpos_in_cg >= 0; scanpos_in_cg--) { @@ -502,7 +494,7 @@ void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, uint32_t pos_y = blkpos >> log2_block_size; uint32_t pos_x = blkpos - ( pos_y << log2_block_size ); - double cost_last = scan_idx == SCAN_VER ? get_rate_last(encoder, pos_y, pos_x,last_x_bits,last_y_bits) : get_rate_last(encoder, pos_x, pos_y, last_x_bits,last_y_bits ); + double cost_last = (scan_mode == SCAN_VER) ? 
get_rate_last(encoder, pos_y, pos_x,last_x_bits,last_y_bits) : get_rate_last(encoder, pos_x, pos_y, last_x_bits,last_y_bits ); double totalCost = base_cost + cost_last - cost_sig[ scanpos ]; if( totalCost < best_cost ) { diff --git a/src/rdo.h b/src/rdo.h index f8ac4c87..8354332c 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -26,12 +26,12 @@ typedef struct int32_t nnz_before_pos0; } coeffgroup_rd_stats; -const uint32_t g_go_rice_range[5] = { 7, 14, 26, 46, 78 }; -const uint32_t g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 }; +extern const uint32_t g_go_rice_range[5]; +extern const uint32_t g_go_rice_prefix_len[5]; void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, int32_t width, - int32_t height, uint32_t *abs_sum, int8_t type, int8_t scan_idx, int8_t block_type, int8_t scan_mode, int8_t tr_depth); + int32_t height, uint32_t *abs_sum, int8_t type, int8_t block_type, int8_t scan_mode, int8_t tr_depth); int32_t get_ic_rate( uint32_t abs_level, uint16_t ctx_num_one,uint16_t ctx_num_abs, From 83a1e9a555218143ef512a4cb2d1bbc6a03788f5 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Mon, 27 Jan 2014 14:36:10 +0200 Subject: [PATCH 6/9] Added rdo to Makefile --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index eb4489a8..43dcba6e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -28,7 +28,7 @@ LDFLAGS = -lm LD = gcc YASM = yasm ASMOBJS = test64.o -OBJS = interface_main.o encmain.o bitstream.o cabac.o config.o context.o debug.o encoder.o filter.o inter.o intra.o nal.o picture.o sao.o search.o transform.o +OBJS = interface_main.o encmain.o bitstream.o cabac.o config.o context.o debug.o encoder.o filter.o inter.o intra.o nal.o picture.o rdo.o sao.o search.o transform.o PROG = ./kvazaar PROGS = $(PROG) From 0cdd9d032f740321dd49cef85668a9ae526c2cf0 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Mon, 27 Jan 2014 14:39:56 +0200 Subject: [PATCH 7/9] Added GPLv2 headers to rdo.c/.h --- 
src/rdo.c | 28 ++++++++++++++++++++-------- src/rdo.h | 28 ++++++++++++++++++++-------- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index 55049db2..2c97d5a7 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -1,12 +1,24 @@ -/** - * \file +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. * - * \author Marko Viitanen ( fador@iki.fi ), - * Tampere University of Technology, - * Department of Pervasive Computing. - * \author Ari Koivula ( ari@koivu.la ), - * Tampere University of Technology, - * Department of Pervasive Computing. + * Copyright (C) 2013-2014 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Kvazaar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/* + * \file */ #include diff --git a/src/rdo.h b/src/rdo.h index 8354332c..88589202 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -1,15 +1,27 @@ #ifndef RDO_H_ #define RDO_H_ -/** +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2014 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation.
+ * + * Kvazaar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/* * \file * \brief Handling Rate-Distortion Optimization related functionality - * - * \author Marko Viitanen ( fador@iki.fi ), - * Tampere University of Technology, - * Department of Pervasive Computing. - * \author Ari Koivula ( ari@koivu.la ), - * Tampere University of Technology, - * Department of Pervasive Computing. */ #include "global.h" From 5e759b8e1dae2ffb00aae634fd0538ded39ba058 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Tue, 28 Jan 2014 11:00:17 +0200 Subject: [PATCH 8/9] Fix for RDOQ, added missing cost function --- src/rdo.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++--------- src/rdo.h | 4 ++- 2 files changed, 76 insertions(+), 15 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index 2c97d5a7..47b32dc5 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -40,18 +40,75 @@ const uint32_t g_go_rice_range[5] = { 7, 14, 26, 46, 78 }; const uint32_t g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 }; + +#define COEF_REMAIN_BIN_REDUCTION 3 +/** Calculates the cost for specific absolute transform level + * \param abs_level scaled quantized level + * \param ctx_num_one current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC) + * \param ctx_num_abs current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC) + * \param abs_go_rice Rice parameter for coeff_abs_level_minus3 + * \returns cost of given absolute transform level + * From HM 12.0 + */ +double get_ic_rate_cost (uint32_t abs_level, + uint16_t ctx_num_one, + uint16_t ctx_num_abs, + uint16_t abs_go_rice, +
uint32_t c1_idx, + uint32_t c2_idx, + int8_t type + ) +{ + double rate = 32768.0; + uint32_t base_level = (c1_idx < C1FLAG_NUMBER)? (2 + (c2_idx < C2FLAG_NUMBER)) : 1; + cabac_ctx *base_one_ctx = (type == 0) ? &g_cu_one_model_luma[0] : &g_cu_one_model_chroma[0]; + cabac_ctx *base_abs_ctx = (type == 0) ? &g_cu_abs_model_luma[0] : &g_cu_abs_model_chroma[0]; + + if ( abs_level >= base_level ) { + uint32_t symbol = abs_level - base_level; + uint32_t length; + if (symbol < (COEF_REMAIN_BIN_REDUCTION << abs_go_rice)) { + length = symbol>>abs_go_rice; + rate += (length+1+abs_go_rice)<< 15; + } else { + length = abs_go_rice; + symbol = symbol - ( COEF_REMAIN_BIN_REDUCTION << abs_go_rice); + while (symbol >= (1<= baseLevel) { - uint32_t symbol = abs_level - baseLevel; + if (abs_level >= base_level) { + uint32_t symbol = abs_level - base_level; uint32_t max_vlc = g_go_rice_range[ abs_go_rice ]; uint16_t pref_len,num_bins; @@ -60,28 +117,28 @@ int32_t get_ic_rate( uint32_t abs_level, uint16_t ctx_num_one,uint16_t ctx_num_a uint32_t uiMax = 2; abs_level = symbol - max_vlc; for(; abs_level >= uiMax; uiMax <<= 1, iEGS += 2 ); - iRate += iEGS << 15; + rate += iEGS << 15; symbol = MIN( symbol, ( max_vlc + 1 ) ); } pref_len = (uint16_t)(symbol >> abs_go_rice) + 1; num_bins = MIN( pref_len, g_go_rice_prefix_len[ abs_go_rice ] ) + abs_go_rice; - iRate += num_bins << 15; + rate += num_bins << 15; if (c1_idx < C1FLAG_NUMBER) { - iRate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1); + rate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1); if (c2_idx < C2FLAG_NUMBER) { - iRate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],1); + rate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],1); } } } else if( abs_level == 1 ) { - iRate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],0); + rate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],0); } else if( abs_level == 2 ) { - iRate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1); - iRate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],0); + rate 
+= CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1); + rate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],0); } - return iRate; + return rate; } /** Get the best level in RD sense @@ -129,7 +186,9 @@ uint32_t get_coded_level ( encoder_control* encoder, double *coded_cost, double min_abs_level = ( max_abs_level > 1 ? max_abs_level - 1 : 1 ); for (abs_level = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) { double err = (double)(level_double - ( abs_level << q_bits ) ); - double cur_cost = err * err * temp + get_ic_rate( abs_level, ctx_num_one, ctx_num_abs, abs_go_rice, c1_idx, c2_idx, type); + double cur_cost = err * err * temp + g_lambda_cost[encoder->QP] * + get_ic_rate_cost( abs_level, ctx_num_one, ctx_num_abs, + abs_go_rice, c1_idx, c2_idx, type); cur_cost += cur_cost_sig; if( cur_cost < *coded_cost ) { @@ -201,7 +260,7 @@ void calc_last_bits(int32_t width, int32_t height, int8_t type, int32_t* last_x_ * From HM 12.0 */ void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, int32_t width, - int32_t height, uint32_t *abs_sum, int8_t type, int8_t block_type, int8_t scan_mode, int8_t tr_depth) + int32_t height, uint32_t *abs_sum, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth) { uint32_t log2_tr_size = g_convert_to_bit[ width ] + 2; int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - g_bitdepth - log2_tr_size; // Represents scaling through forward transform diff --git a/src/rdo.h b/src/rdo.h index 88589202..7c6c46c4 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -43,11 +43,13 @@ extern const uint32_t g_go_rice_prefix_len[5]; void rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, int32_t width, - int32_t height, uint32_t *abs_sum, int8_t type, int8_t block_type, int8_t scan_mode, int8_t tr_depth); + int32_t height, uint32_t *abs_sum, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth); int32_t get_ic_rate( uint32_t abs_level, uint16_t ctx_num_one,uint16_t ctx_num_abs, uint16_t 
abs_go_rice, uint32_t c1_idx, uint32_t c2_idx, int8_t type); +double get_ic_rate_cost (uint32_t abs_level, uint16_t ctx_num_one, uint16_t ctx_num_abs, + uint16_t abs_go_rice, uint32_t c1_idx, uint32_t c2_idx, int8_t type); uint32_t get_coded_level ( encoder_control* encoder, double* coded_cost, double* coded_cost0, double* coded_cost_sig, int32_t level_double, uint32_t max_abs_level, uint16_t ctx_num_sig, uint16_t ctx_num_one, uint16_t ctx_num_abs, From 7a21b9b769aa26d4652b89dd80e1e42286dcdc3a Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Tue, 28 Jan 2014 12:14:43 +0200 Subject: [PATCH 9/9] Changed lambda calculation to fix RDOQ issues, RDOQ is now in use by default. --- src/encoder.c | 27 ++++++++++++++++++++++++--- src/global.h | 2 ++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/encoder.c b/src/encoder.c index d872b9e7..f396042f 100644 --- a/src/encoder.c +++ b/src/encoder.c @@ -40,6 +40,7 @@ #include "filter.h" #include "search.h" #include "sao.h" +#include "rdo.h" int16_t g_lambda_cost[55]; uint32_t* g_sig_last_scan[3][7]; @@ -200,7 +201,6 @@ void init_tables(void) // Lambda cost // TODO: cleanup - //g_lambda_cost = (int16_t*)malloc(sizeof(int16_t)*55); for (i = 0; i < 55; i++) { if (i < 12) { g_lambda_cost[i] = 0; @@ -208,7 +208,14 @@ void init_tables(void) g_lambda_cost[i] = (int16_t)sqrt(0.57 * pow(2.0, (i - 12) / 3)); } - //g_lambda_cost[i] = g_lambda_cost[i]*g_lambda_cost[i]; + /** + * While working on RDOQ it was clear that the current lambda cost is wrong (compared to HM) + * so the cost is now lambda*lambda to fix some of those issues. + * This is not the final solution and this should be fixed by calculating the lambda like HM. 
+ * TODO: fix lambda cost calculation + * - Marko Viitanen (Fador) + **/ + g_lambda_cost[i] = g_lambda_cost[i]*g_lambda_cost[i]; } } @@ -317,7 +324,7 @@ void encode_one_frame(encoder_control* encoder) bitstream_clear_buffer(encoder->stream); } else { cabac_start(&cabac); - encoder->in.cur_pic->slicetype = SLICE_P; + encoder->in.cur_pic->slicetype = SLICE_I; encoder->in.cur_pic->type = NAL_TRAIL_R; scalinglist_process(); search_slice_data(encoder); @@ -1488,7 +1495,11 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu, // Transform and quant residual to coeffs transform2d(block,pre_quant_coeff,width,0); + #if RDOQ == 1 + rdoq(encoder, pre_quant_coeff, coeff_y, width, width, &ac_sum, 0, scan_idx_luma, cur_cu->type,cur_cu->tr_depth-cur_cu->depth); + #else quant(encoder, pre_quant_coeff, coeff_y, width, width, &ac_sum, 0, scan_idx_luma, cur_cu->type); + #endif // Check for non-zero coeffs for (i = 0; i < width * width; i++) { @@ -1547,8 +1558,13 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu, } transform2d(block,pre_quant_coeff,LCU_WIDTH>>(depth+1),65535); + #if RDOQ == 1 + rdoq(encoder, pre_quant_coeff, coeff_u, width >> 1, width >> 1, &ac_sum, 2, + scan_idx_chroma, cur_cu->type, cur_cu->tr_depth-cur_cu->depth); + #else quant(encoder, pre_quant_coeff, coeff_u, width >> 1, width >> 1, &ac_sum, 2, scan_idx_chroma, cur_cu->type); + #endif for (i = 0; i < width *width >> 2; i++) { if (coeff_u[i] != 0) { @@ -1571,8 +1587,13 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu, } transform2d(block,pre_quant_coeff,LCU_WIDTH>>(depth+1),65535); + #if RDOQ == 1 + rdoq(encoder, pre_quant_coeff, coeff_v, width >> 1, width >> 1, &ac_sum, 3, + scan_idx_chroma, cur_cu->type, cur_cu->tr_depth-cur_cu->depth); + #else quant(encoder, pre_quant_coeff, coeff_v, width >> 1, width >> 1, &ac_sum, 3, scan_idx_chroma, cur_cu->type); + #endif for (i = 0; i < width *width >> 2; i++) { if (coeff_v[i] != 
0) { diff --git a/src/global.h b/src/global.h index 6b3868d6..a079ebe3 100644 --- a/src/global.h +++ b/src/global.h @@ -77,6 +77,8 @@ typedef int16_t coefficient; #define OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD 0 /*!< skip residual coding when it's under _some_ threshold */ +#define RDOQ 1 /*!< Rate-Distortion Optimized Quantization */ + /* END OF CONFIG VARIABLES */ #define LCU_LUMA_SIZE (LCU_WIDTH * LCU_WIDTH)