From de6faf623d54d64ac69bc876749fe567001ef632 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Thu, 16 Jan 2014 17:13:48 +0200
Subject: [PATCH 1/9] Imported entropy bits array from HM and added macro to
 access it

---
 src/cabac.h   |  2 ++
 src/context.c | 14 ++++++++++++++
 src/context.h |  2 ++
 src/encoder.c |  4 ++--
 4 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/cabac.h b/src/cabac.h
index 7f7b98c5..c0b9f4cd 100644
--- a/src/cabac.h
+++ b/src/cabac.h
@@ -75,6 +75,8 @@ void cabac_write_unary_max_symbol_ep(cabac_data *data, unsigned symbol, unsigned
 #define CTX_MPS(ctx) (ctx->uc_state & 1)
 #define CTX_UPDATE_LPS(ctx) { (ctx)->uc_state = g_auc_next_state_lps[ (ctx)->uc_state ]; }
 #define CTX_UPDATE_MPS(ctx) { (ctx)->uc_state = g_auc_next_state_mps[ (ctx)->uc_state ]; }
+#define CTX_ENTROPY_BITS(ctx,val) entropy_bits[(ctx)->uc_state ^ (val)] /* estimated bits for coding bin val (0 or 1) with ctx */
+
 #ifdef VERBOSE
   #define CABAC_BIN(data, value, name) { \
     uint32_t prev_state = (data)->ctx->uc_state; \
diff --git a/src/context.c b/src/context.c
index f58433bd..ee9429c1 100644
--- a/src/context.c
+++ b/src/context.c
@@ -240,3 +240,17 @@ int32_t context_get_sig_ctx_inc(int32_t pattern_sig_ctx, uint32_t scan_idx, int3
   return (( texture_type == 0 && ((pos_x>>2) + (pos_y>>2)) > 0 ) ? 3 : 0) + offset + cnt;
 }
 
+/* Entropy bits to estimate coded bits in RDO / RDOQ (From HM 12.0).
+ * Indexed by (uc_state ^ bin) through CTX_ENTROPY_BITS; values look like
+ * fixed point with 15 fractional bits (0x08000 == 1 bit) -- TODO confirm. */
+const uint32_t entropy_bits[128] =
+{
+  0x08000, 0x08000, 0x076da, 0x089a0, 0x06e92, 0x09340, 0x0670a, 0x09cdf, 0x06029, 0x0a67f, 0x059dd, 0x0b01f, 0x05413, 0x0b9bf, 0x04ebf, 0x0c35f,
+  0x049d3, 0x0ccff, 0x04546, 0x0d69e, 0x0410d, 0x0e03e, 0x03d22, 0x0e9de, 0x0397d, 0x0f37e, 0x03619, 0x0fd1e, 0x032ee, 0x106be, 0x02ffa, 0x1105d,
+  0x02d37, 0x119fd, 0x02aa2, 0x1239d, 0x02836, 0x12d3d, 0x025f2, 0x136dd, 0x023d1, 0x1407c, 0x021d2, 0x14a1c, 0x01ff2, 0x153bc, 0x01e2f, 0x15d5c,
+  0x01c87, 0x166fc, 0x01af7, 0x1709b, 0x0197f, 0x17a3b, 0x0181d, 0x183db, 0x016d0, 0x18d7b, 0x01595, 0x1971b, 0x0146c, 0x1a0bb, 0x01354, 0x1aa5a,
+  0x0124c, 0x1b3fa, 0x01153, 0x1bd9a, 0x01067, 0x1c73a, 0x00f89, 0x1d0da, 0x00eb7, 0x1da79, 0x00df0, 0x1e419, 0x00d34, 0x1edb9, 0x00c82, 0x1f759,
+  0x00bda, 0x200f9, 0x00b3c, 0x20a99, 0x00aa5, 0x21438, 0x00a17, 0x21dd8, 0x00990, 0x22778, 0x00911, 0x23118, 0x00898, 0x23ab8, 0x00826, 0x24458,
+  0x007ba, 0x24df7, 0x00753, 0x25797, 0x006f2, 0x26137, 0x00696, 0x26ad7, 0x0063f, 0x27477, 0x005ed, 0x27e17, 0x0059f, 0x287b6, 0x00554, 0x29156,
+  0x0050e, 0x29af6, 0x004cc, 0x2a497, 0x0048d, 0x2ae35, 0x00451, 0x2b7d6, 0x00418, 0x2c176, 0x003e2, 0x2cb15, 0x003af, 0x2d4b5, 0x0037f, 0x2de55
+};
diff --git a/src/context.h b/src/context.h
index dd2a6ab0..8c949c8b 100644
--- a/src/context.h
+++ b/src/context.h
@@ -214,4 +214,6 @@ static const uint8_t INIT_ABS_FLAG[3][6] =
 };
 
 
+extern const uint32_t entropy_bits[ 128 ]; /* defined in context.c */
+
 #endif
diff --git a/src/encoder.c b/src/encoder.c
index 7474dfa7..d872b9e7 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -633,7 +633,7 @@ void encode_seq_parameter_set(encoder_control* encoder)
   //TODO: VUI?
   //encode_VUI(encoder);
   
-	WRITE_U(encoder->stream, 0, 1, "sps_extension_flag");
+  WRITE_U(encoder->stream, 0, 1, "sps_extension_flag");
 }
 
 void encode_vid_parameter_set(encoder_control* encoder)
@@ -668,7 +668,7 @@ void encode_vid_parameter_set(encoder_control* encoder)
   //IF timing info
   //END IF
 
-	WRITE_U(encoder->stream, 0, 1, "vps_extension_flag");
+  WRITE_U(encoder->stream, 0, 1, "vps_extension_flag");
 }
 
 void encode_VUI(encoder_control* encoder)

From 9f70bf74f00dbde267184ae5e29b02e92790c649 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Mon, 20 Jan 2014 16:34:11 +0200
Subject: [PATCH 2/9] Imported and converted RDOQ from HM 12.0, NOT WORKING YET

---
 build/VS2010/HEVC_encoder.vcxproj         |   2 +
 build/VS2010/HEVC_encoder.vcxproj.filters |   6 +
 src/global.h                              |  10 +
 src/rdo.c                                 | 575 ++++++++++++++++++++++
 src/rdo.h                                 |  47 ++
 src/transform.h                           |   1 +
 6 files changed, 641 insertions(+)
 create mode 100644 src/rdo.c
 create mode 100644 src/rdo.h

diff --git a/build/VS2010/HEVC_encoder.vcxproj b/build/VS2010/HEVC_encoder.vcxproj
index 76e9fd7f..3416e7ad 100644
--- a/build/VS2010/HEVC_encoder.vcxproj
+++ b/build/VS2010/HEVC_encoder.vcxproj
@@ -84,6 +84,7 @@
     <ClCompile Include="..\..\src\intra.c" />
     <ClCompile Include="..\..\src\nal.c" />
     <ClCompile Include="..\..\src\picture.c" />
+    <ClCompile Include="..\..\src\rdo.c" />
     <ClCompile Include="..\..\src\sao.c" />
     <ClCompile Include="..\..\src\search.c" />
     <ClCompile Include="..\..\src\transform.c" />
@@ -101,6 +102,7 @@
     <ClInclude Include="..\..\src\intra.h" />
     <ClInclude Include="..\..\src\nal.h" />
     <ClInclude Include="..\..\src\picture.h" />
+    <ClInclude Include="..\..\src\rdo.h" />
     <ClInclude Include="..\..\src\sao.h" />
     <ClInclude Include="..\..\src\search.h" />
     <ClInclude Include="..\..\src\transform.h" />
diff --git a/build/VS2010/HEVC_encoder.vcxproj.filters b/build/VS2010/HEVC_encoder.vcxproj.filters
index 560b3c4a..b0124e9c 100644
--- a/build/VS2010/HEVC_encoder.vcxproj.filters
+++ b/build/VS2010/HEVC_encoder.vcxproj.filters
@@ -72,6 +72,9 @@
     <ClCompile Include="..\..\src\sao.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\rdo.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\global.h">
@@ -125,6 +128,9 @@
     <ClInclude Include="..\..\src\sao.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\src\rdo.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <YASM Include="..\..\src\x86\test.asm">
diff --git a/src/global.h b/src/global.h
index 0d174daa..6b3868d6 100644
--- a/src/global.h
+++ b/src/global.h
@@ -142,4 +142,14 @@ typedef int16_t coefficient;
 #define FREE_POINTER(pointer) { free(pointer); pointer = NULL; }
 #define MOVE_POINTER(dst_pointer,src_pointer) { dst_pointer = src_pointer; src_pointer = NULL; }
 
+#ifndef MAX_INT
+#define MAX_INT 0x7FFFFFFF /* 2^31 - 1, largest int32_t value */
+#endif
+#ifndef MAX_INT64
+#define MAX_INT64 0x7FFFFFFFFFFFFFFFLL /* 2^63 - 1, largest int64_t value */
+#endif
+#ifndef MAX_DOUBLE
+#define MAX_DOUBLE 1.7e+308 /* very large double used as an initial "worst" cost */
+#endif
+
 #endif
\ No newline at end of file
diff --git a/src/rdo.c b/src/rdo.c
new file mode 100644
index 00000000..b8c1be4e
--- /dev/null
+++ b/src/rdo.c
@@ -0,0 +1,575 @@
+/**
+ * \file
+ * 
+ * \author Marko Viitanen ( fador@iki.fi ), 
+ *         Tampere University of Technology,
+ *         Department of Pervasive Computing.
+ * \author Ari Koivula ( ari@koivu.la ), 
+ *         Tampere University of Technology,
+ *         Department of Pervasive Computing.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "rdo.h"
+#include "transform.h"
+#include "context.h"
+#include "cabac.h"
+
+#define QUANT_SHIFT          14 /* base term of q_bits in rdoq() */
+#define MAX_TR_DYNAMIC_RANGE 15 /* used to derive transform_shift in rdoq() */
+#define SCAN_SET_SIZE        16 /* coefficients per scan set; context state resets on this boundary */
+#define LOG2_SCAN_SET_SIZE    4 /* log2(SCAN_SET_SIZE) */
+#define SBH_THRESHOLD         4 /* min distance between first and last nonzero position for sign bit hiding */
+
+
+
+
+int32_t get_ic_rate( uint32_t abs_level, uint16_t ctx_num_one,uint16_t ctx_num_abs,
+                     uint16_t abs_go_rice, uint32_t c1_idx, uint32_t c2_idx, int8_t type)
+{
+  /* Rate estimate for coding abs_level: greater1/greater2 flags plus remainder; each bypass bin counts as 1 << 15. */
+  const uint32_t escape_base = (c1_idx < C1FLAG_NUMBER) ? (2 + (c2_idx < C2FLAG_NUMBER)) : 1;
+  cabac_ctx *one_models = (type == 0) ? &g_cu_one_model_luma[0] : &g_cu_one_model_chroma[0];
+  cabac_ctx *abs_models = (type == 0) ? &g_cu_abs_model_luma[0] : &g_cu_abs_model_chroma[0];
+  int32_t rate = 0;
+
+  if (abs_level == 0) return 0; // a zero level has no level-coding cost
+
+  if (abs_level < escape_base) { // the context coded flags alone describe the level
+    if (abs_level == 1) {
+      rate += CTX_ENTROPY_BITS(&one_models[ctx_num_one],0);
+    } else if (abs_level == 2) {
+      rate += CTX_ENTROPY_BITS(&one_models[ctx_num_one],1);
+      rate += CTX_ENTROPY_BITS(&abs_models[ctx_num_abs],0);
+    }
+    return rate;
+  }
+
+  {
+    uint32_t remainder = abs_level - escape_base;
+    const uint32_t rice_limit = g_go_rice_range[ abs_go_rice ];
+    uint16_t prefix, bins;
+
+    if (remainder > rice_limit) { // Exp. Golomb escape
+      int32_t  eg_bits = 1;
+      uint32_t bound;
+      abs_level = remainder - rice_limit;
+      for (bound = 2; abs_level >= bound; bound <<= 1) eg_bits += 2;
+      rate     += eg_bits << 15;
+      remainder = MIN( remainder, ( rice_limit + 1 ) );
+    }
+
+    prefix = (uint16_t)(remainder >> abs_go_rice) + 1;
+    bins   = MIN( prefix, g_go_rice_prefix_len[ abs_go_rice ] ) + abs_go_rice;
+    rate  += bins << 15;
+    if (c1_idx < C1FLAG_NUMBER) rate += CTX_ENTROPY_BITS(&one_models[ctx_num_one],1);
+    if (c1_idx < C1FLAG_NUMBER && c2_idx < C2FLAG_NUMBER) rate += CTX_ENTROPY_BITS(&abs_models[ctx_num_abs],1);
+  }
+  return rate;
+}
+
+/** Get the best level in RD sense (from HM 12.0)
+ * \param encoder encoder control; its QP selects the lambda from g_lambda_cost
+ * \param coded_cost out: cost of the chosen level
+ * \param coded_cost0 in: cost when the coefficient is quantized to 0
+ * \param coded_cost_sig out: significance flag cost of the chosen level
+ * \param level_double unscaled quantized level
+ * \param max_abs_level scaled quantized level; it and max_abs_level-1 are tried
+ * \param ctx_num_sig current ctxInc for coeff_abs_significant_flag
+ * \param ctx_num_one current ctxInc for coeff_abs_level_greater1
+ * \param ctx_num_abs current ctxInc for coeff_abs_level_greater2
+ * \param abs_go_rice current Rice parameter for coeff_abs_level_minus3
+ * \param c1_idx,c2_idx counters forwarded unchanged to get_ic_rate()
+ * \param q_bits quantization shift; \param temp factor applied to the squared error
+ * \param last nonzero for the last significant coefficient (no sig flag cost)
+ * \param type 0 selects the luma context models, nonzero the chroma ones
+ * \returns best quantized transform level for given scan position
+ */
+uint32_t get_coded_level ( encoder_control* encoder, double *coded_cost, double *coded_cost0, double *coded_cost_sig,
+                           int32_t level_double, uint32_t max_abs_level,
+                           uint16_t ctx_num_sig, uint16_t ctx_num_one, uint16_t ctx_num_abs,
+                           uint16_t abs_go_rice,
+                           uint32_t c1_idx, uint32_t c2_idx,
+                           int32_t q_bits,double temp, int8_t last, int8_t type)
+{
+  double cur_cost_sig   = 0;
+  uint32_t best_abs_level = 0;
+  int32_t abs_level;
+  int32_t min_abs_level;
+  cabac_ctx* base_sig_model = type?g_cu_sig_model_chroma:g_cu_sig_model_luma;
+
+  if( !last && max_abs_level < 3 ) {
+    *coded_cost_sig = g_lambda_cost[encoder->QP] * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 0);
+    *coded_cost     = *coded_cost0 + *coded_cost_sig;
+    if (max_abs_level == 0) return best_abs_level;
+  } else {
+    *coded_cost = MAX_DOUBLE;
+  }
+
+  if( !last ) {
+    cur_cost_sig = g_lambda_cost[encoder->QP] * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 1);
+  }
+
+  min_abs_level    = ( max_abs_level > 1 ? max_abs_level - 1 : 1 );
+  for (abs_level  = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) {
+    double err         = (double)(level_double  - ( abs_level << q_bits ) );
+    // Rate bits are weighted by lambda like the significance costs above, so all terms share one unit.
+    double cur_cost    = err * err * temp + g_lambda_cost[encoder->QP] * get_ic_rate( abs_level, ctx_num_one, ctx_num_abs, abs_go_rice, c1_idx, c2_idx, type);
+    cur_cost          += cur_cost_sig;
+    if( cur_cost < *coded_cost ) {
+      best_abs_level    = abs_level;
+      *coded_cost     = cur_cost;
+      *coded_cost_sig  = cur_cost_sig;
+    }
+  }
+
+  return best_abs_level;
+}
+
+/** RDOQ with CABAC (from HM 12.0)
+ * Rate distortion optimized quantization for entropy coding engines using
+ * probability models like CABAC. Reads transform coefficients from coef,
+ * writes the chosen signed levels to dest_coeff and adds their absolute
+ * values to *abs_sum. NOTE: err_scale is still a NULL placeholder here.
+ */
+void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, int32_t width,
+           int32_t height, uint32_t *abs_sum, int8_t type, int8_t scan_idx, int8_t block_type, int8_t scan_mode, int8_t tr_depth)
+{
+  uint32_t log2_tr_size    = g_convert_to_bit[ width ] + 2;
+  int32_t  transform_shift = MAX_TR_DYNAMIC_RANGE - g_bitdepth - log2_tr_size;  // Represents scaling through forward transform
+  uint32_t go_rice_param   = 0;
+  uint32_t log2_block_size = g_convert_to_bit[ width ] + 2;
+  uint32_t max_num_coeff   = width * height;
+  int32_t  scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
+  int32_t  qp_base         = encoder->QP;
+
+  int32_t  qp_scaled;
+  int32_t  qp_offset = 0;
+
+  if(type == 0) {
+    qp_scaled = qp_base + qp_offset;
+  } else {
+    qp_scaled = CLIP(-qp_offset, 57, qp_base);
+    if(qp_scaled < 0) {
+      qp_scaled = qp_scaled + qp_offset;
+    } else {
+      qp_scaled = g_chroma_scale[qp_scaled] + qp_offset;
+    }
+  }
+
+  {
+  int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift;
+  int32_t add    = ((encoder->in.cur_pic->slicetype == SLICE_I) ? 171 : 85) << (q_bits - 9);
+
+  int32_t *quant_coeff_org = g_quant_coeff[log2_tr_size-2][scalinglist_type][qp_scaled%6];
+  int32_t *quant_coeff     = quant_coeff_org;
+
+  double *err_scale_org = NULL;// FIXME: placeholder for getErrScaleCoeff(scalingListType,uiLog2TrSize-2,m_cQP.m_iRem); dereferenced below
+  double *err_scale     = err_scale_org;
+
+  double block_uncoded_cost = 0;
+
+  double cost_coeff [ 32 * 32 ];
+  double cost_sig   [ 32 * 32 ];
+  double cost_coeff0[ 32 * 32 ];
+
+  int32_t rate_inc_up   [ 32 * 32 ];
+  int32_t rate_inc_down [ 32 * 32 ];
+  int32_t sig_rate_delta[ 32 * 32 ];
+  int32_t delta_u       [ 32 * 32 ];
+
+
+  const uint32_t *scan_cg = NULL;
+  const int32_t  shift   = 4>>1;
+  const uint32_t cg_size = 16;
+  const uint32_t num_blk_side    = width >> shift;
+  double   cost_coeffgroup_sig[ 64 ];
+  uint32_t sig_coeffgroup_flag[ 64 ];
+
+  int32_t  cg_last_scanpos = -1;
+
+  uint32_t    ctx_set        = 0;
+  int32_t     c1             = 1;
+  int32_t     c2             = 0;
+  double      base_cost      = 0;
+  int32_t     last_scanpos   = -1;
+
+  uint32_t    c1_idx     = 0;
+  uint32_t    c2_idx     = 0;
+  int32_t     base_level;
+
+  uint32_t *scan = g_sig_last_scan[ scan_idx ][ log2_block_size - 1 ];
+
+
+  uint32_t cg_num = width * height >> 4;
+  int32_t  scanpos;
+
+  cabac_ctx *base_coeff_group_ctx = &g_cu_sig_coeff_group_model[type];
+  cabac_ctx *baseCtx              = (type == 0) ? &g_cu_sig_model_luma[0] : &g_cu_sig_model_chroma[0];
+  cabac_ctx *base_one_ctx = (type == 0) ? &g_cu_one_model_luma[0] : &g_cu_one_model_chroma[0];
+
+
+  double  best_cost        = 0;
+  int32_t ctx_cbf          = 0;
+  int32_t best_last_idx_p1 = 0;
+  int8_t found_last        = 0;
+  int32_t cg_scanpos, scanpos_in_cg;
+
+  coeffgroup_rd_stats rd_stats;
+
+  memset( cost_coeff, 0, sizeof(double) *  max_num_coeff );
+  memset( cost_sig,   0, sizeof(double) *  max_num_coeff );
+  memset( rate_inc_up,    0, sizeof(int32_t) *  max_num_coeff );
+  memset( rate_inc_down,  0, sizeof(int32_t) *  max_num_coeff );
+  memset( sig_rate_delta, 0, sizeof(int32_t) *  max_num_coeff );
+  memset( delta_u,        0, sizeof(int32_t) *  max_num_coeff );
+
+
+  memset( cost_coeffgroup_sig,   0, sizeof(double)   * 64 );
+  memset( sig_coeffgroup_flag,   0, sizeof(uint32_t) * 64 );
+
+  scan_cg = g_sig_last_scan[scan_mode][log2_block_size > 3 ? log2_block_size - 3 : 0];
+
+  if (log2_block_size == 3) {
+    scan_cg = g_sig_last_scan_8x8[scan_mode];
+  } else if (log2_block_size == 5) {
+    scan_cg = g_sig_last_scan_32x32;
+  }
+
+  for (cg_scanpos = cg_num-1; cg_scanpos >= 0; cg_scanpos--) {
+    uint32_t cg_blkpos = scan_cg[ cg_scanpos ];
+    uint32_t cg_pos_y   = cg_blkpos / num_blk_side;
+    uint32_t cg_pos_x   = cg_blkpos - (cg_pos_y * num_blk_side);
+    int32_t  scanpos_in_cg;
+
+    int32_t pattern_sig_ctx = context_calc_pattern_sig_ctx(sig_coeffgroup_flag,
+                                                           cg_pos_x, cg_pos_y, width);
+
+    memset( &rd_stats, 0, sizeof (coeffgroup_rd_stats));
+    for (scanpos_in_cg = cg_size-1; scanpos_in_cg >= 0; scanpos_in_cg--)  {
+      uint32_t blkpos;
+      int32_t q;
+      double temp, err;
+      int32_t level_double;
+      uint32_t max_abs_level;
+
+      scanpos = cg_scanpos*cg_size + scanpos_in_cg;
+      blkpos          = scan[scanpos];
+      q  = quant_coeff[blkpos];
+      temp = err_scale[blkpos];
+      level_double        = coef[blkpos];
+      level_double        = (int32_t)MIN((int64_t)abs(level_double) * q , (int64_t)(MAX_INT - (1 << (q_bits - 1)))); // multiply in 64 bits so the clamp sees the true product
+      max_abs_level       = (level_double + (1 << (q_bits - 1))) >> q_bits;
+
+      err               = (double)level_double;
+      cost_coeff0[ scanpos ]  = err * err * temp;
+      block_uncoded_cost      += cost_coeff0[ scanpos ];
+      dest_coeff[ blkpos ]    = max_abs_level;
+
+      if ( max_abs_level > 0 && last_scanpos < 0 ) {
+        last_scanpos             = scanpos;
+        ctx_set                  = (scanpos > 0 && type == 0) ? 2 : 0;
+        cg_last_scanpos          = cg_scanpos;
+      }
+
+      if ( last_scanpos >= 0 ) {
+        //===== coefficient level estimation =====
+        int32_t  level;
+        uint32_t  one_ctx = 4 * ctx_set + c1;
+        uint32_t  abs_ctx = ctx_set + c2;
+
+        if( scanpos == last_scanpos ) {
+          level            = get_coded_level(encoder, &cost_coeff[ scanpos ], &cost_coeff0[ scanpos ], &cost_sig[ scanpos ],
+                                               level_double, max_abs_level, 0, one_ctx, abs_ctx, go_rice_param,
+                                               c1_idx, c2_idx, q_bits, temp, 1, type );
+        } else {
+          uint32_t  pos_y    = blkpos >> log2_block_size;
+          uint32_t  pos_x    = blkpos - ( pos_y << log2_block_size );
+          uint16_t  ctx_sig  = context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y,
+                                                       log2_block_size, width, type);
+          level              = get_coded_level(encoder, &cost_coeff[ scanpos ], &cost_coeff0[ scanpos ], &cost_sig[ scanpos ],
+                                               level_double, max_abs_level, ctx_sig, one_ctx, abs_ctx, go_rice_param,
+                                               c1_idx, c2_idx, q_bits, temp, 0, type );
+          sig_rate_delta[ blkpos ] = CTX_ENTROPY_BITS(&baseCtx[ctx_sig],1) - CTX_ENTROPY_BITS(&baseCtx[ctx_sig],0);
+        }
+        delta_u[ blkpos ] = (level_double - ((int32_t)level << q_bits)) >> (q_bits-8);
+        if( level > 0 ) {
+          int32_t rate_now = get_ic_rate( level, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type);
+          rate_inc_up  [blkpos] = get_ic_rate( level+1, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type) - rate_now;
+          rate_inc_down[blkpos] = get_ic_rate( level-1, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type) - rate_now;
+        } else { // level == 0
+          rate_inc_up[blkpos] = CTX_ENTROPY_BITS(&base_one_ctx[one_ctx],0);
+        }
+        dest_coeff[blkpos] = level;
+        base_cost         += cost_coeff[scanpos];
+
+        base_level = (c1_idx < C1FLAG_NUMBER) ? (2 + (c2_idx < C2FLAG_NUMBER)) : 1;
+        if( level >= base_level ) {
+          if(level  > 3*(1<<go_rice_param)) {
+            go_rice_param = MIN(go_rice_param + 1, 4);
+          }
+        }
+        if (level >= 1) c1_idx ++;
+
+        //===== update bin model =====
+        if (level > 1) {
+          c1 = 0;
+          c2 += (c2 < 2);
+          c2_idx ++;
+        } else if( (c1 < 3) && (c1 > 0) && level) {
+          c1++;
+        }
+
+        //===== context set update =====
+        if ((scanpos % SCAN_SET_SIZE == 0) && scanpos > 0) {
+          c2                = 0;
+          go_rice_param     = 0;
+
+          c1_idx   = 0;
+          c2_idx   = 0;
+          ctx_set = (scanpos == SCAN_SET_SIZE || type!=0) ? 0 : 2;
+          if( c1 == 0 ) {
+            ctx_set++;
+          }
+          c1 = 1;
+        }
+      } else {
+        base_cost += cost_coeff0[scanpos];
+      }
+      rd_stats.sig_cost += cost_sig[scanpos];
+      if (scanpos_in_cg == 0 ) {
+        rd_stats.sig_cost_0 = cost_sig[scanpos];
+      }
+      if (dest_coeff[ blkpos ] )  {
+        sig_coeffgroup_flag[ cg_blkpos ] = 1;
+        rd_stats.coded_level_and_dist += cost_coeff[scanpos] - cost_sig[scanpos];
+        rd_stats.uncoded_dist += cost_coeff0[scanpos];
+        if ( scanpos_in_cg != 0 ) {
+          rd_stats.nnz_before_pos0++;
+        }
+      }
+    } //end for (scanpos_in_cg)
+
+    if (cg_last_scanpos >= 0) {
+      if( cg_scanpos ) {
+        if (sig_coeffgroup_flag[ cg_blkpos ] == 0) {
+          uint32_t ctx_sig  = context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
+                                                          cg_pos_y, width);
+          cost_coeffgroup_sig[ cg_scanpos ] = g_lambda_cost[encoder->QP] * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); // lambda-weighted like the cbf costs below
+          base_cost += cost_coeffgroup_sig[ cg_scanpos ]  - rd_stats.sig_cost;
+
+        } else {
+          if (cg_scanpos < cg_last_scanpos) {//skip the last coefficient group, which will be handled together with last position below.
+            double cost_zero_cg;
+            uint32_t ctx_sig;
+            if (rd_stats.nnz_before_pos0 == 0) {
+              base_cost -= rd_stats.sig_cost_0;
+              rd_stats.sig_cost -= rd_stats.sig_cost_0;
+            }
+            // rd-cost if SigCoeffGroupFlag = 0, initialization
+            cost_zero_cg = base_cost;
+
+            // add SigCoeffGroupFlag cost to total cost
+            ctx_sig  = context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
+                                                            cg_pos_y, width);
+            if (cg_scanpos < cg_last_scanpos) {
+              cost_coeffgroup_sig[cg_scanpos] = g_lambda_cost[encoder->QP] * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],1);
+              base_cost    += cost_coeffgroup_sig[cg_scanpos];
+              cost_zero_cg += g_lambda_cost[encoder->QP] * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
+            }
+
+            // try to convert the current coeff group from non-zero to all-zero
+            cost_zero_cg += rd_stats.uncoded_dist;          // distortion for resetting non-zero levels to zero levels
+            cost_zero_cg -= rd_stats.coded_level_and_dist;  // distortion and level cost for keeping all non-zero levels
+            cost_zero_cg -= rd_stats.sig_cost;              // sig cost for all coeffs, including zero levels and non-zero levels
+
+            // if we can save cost, change this block to all-zero block
+            if (cost_zero_cg < base_cost) {
+              int32_t scanpos_in_cg;
+              sig_coeffgroup_flag[ cg_blkpos ] = 0;
+              base_cost = cost_zero_cg;
+              if (cg_scanpos < cg_last_scanpos) {
+                cost_coeffgroup_sig[ cg_scanpos ] = g_lambda_cost[encoder->QP] * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
+              }
+              // reset coeffs to 0 in this block
+              for (scanpos_in_cg = cg_size-1; scanpos_in_cg >= 0; scanpos_in_cg--) {
+                uint32_t blkpos;
+                scanpos      = cg_scanpos*cg_size + scanpos_in_cg;
+                blkpos = scan[ scanpos ];
+
+                if (dest_coeff[ blkpos ]) {
+                  dest_coeff[ blkpos ]  = 0;
+                  cost_coeff[ scanpos ] = cost_coeff0[ scanpos ];
+                  cost_sig  [ scanpos ] = 0;
+                }
+              }
+            } // end if ( cost_zero_cg < base_cost )
+          }
+        } // end if (sig_coeffgroup_flag[ cg_blkpos ] == 0)
+      } else {
+        sig_coeffgroup_flag[ cg_blkpos ] = 1;
+      }
+    }
+  } //end for (cg_scanpos)
+
+  //===== estimate last position =====
+  if (last_scanpos < 0) return;
+
+
+  if( block_type != CU_INTRA && !type/* && pcCU->getTransformIdx( uiAbsPartIdx ) == 0*/ ) {
+    best_cost  = block_uncoded_cost +   g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&g_cu_qt_root_cbf_model,0);
+    base_cost +=   g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&g_cu_qt_root_cbf_model,1);
+  } else {
+    cabac_ctx* base_cbf_model = type?g_qt_cbf_model_chroma:g_qt_cbf_model_luma;
+    ctx_cbf   = ( type ? tr_depth : !tr_depth);
+    best_cost  = block_uncoded_cost +  g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0);
+    base_cost +=   g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1);
+  }
+
+  for (cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) {
+    uint32_t cg_blkpos = scan_cg[cg_scanpos];
+
+    base_cost -= cost_coeffgroup_sig[cg_scanpos];
+    if (sig_coeffgroup_flag[ cg_blkpos ]) {
+      for (scanpos_in_cg = cg_size-1; scanpos_in_cg >= 0; scanpos_in_cg--) {
+        uint32_t   blkpos;
+        scanpos = cg_scanpos*cg_size + scanpos_in_cg;
+        if (scanpos > last_scanpos) continue;
+        blkpos  = scan[scanpos];
+
+        if( dest_coeff[ blkpos ] ) {
+          uint32_t   pos_y       = blkpos >> log2_block_size;
+          uint32_t   pos_x       = blkpos - ( pos_y << log2_block_size );
+
+          double cost_last = 0.0;//scan_idx == SCAN_VER ? xGetRateLast( pos_y, pos_x ) : xGetRateLast( pos_x, pos_y );
+          double totalCost = base_cost + cost_last - cost_sig[ scanpos ];
+
+          if( totalCost < best_cost ) {
+            best_last_idx_p1  = scanpos + 1;
+            best_cost         = totalCost;
+          }
+          if( dest_coeff[ blkpos ] > 1 ) {
+            found_last = 1;
+            break;
+          }
+          base_cost  -= cost_coeff[ scanpos ];
+          base_cost  += cost_coeff0[ scanpos ];
+        } else {
+          base_cost  -= cost_sig[ scanpos ];
+        }
+      } //end for
+      if (found_last) break;
+    } // end if (sig_coeffgroup_flag[ cg_blkpos ])
+  } // end for
+
+  for ( scanpos = 0; scanpos < best_last_idx_p1; scanpos++ ) {
+    int32_t blkPos = scan[ scanpos ];
+    int32_t level  = dest_coeff[ blkPos ];
+    *abs_sum += level;
+    dest_coeff[ blkPos ] = ( coef[ blkPos ] < 0 ) ? -level : level;
+  }
+
+  //===== clean uncoded coefficients =====
+  for ( scanpos = best_last_idx_p1; scanpos <= last_scanpos; scanpos++ ) {
+    dest_coeff[ scan[ scanpos ] ] = 0;
+  }
+#if ENABLE_SIGN_HIDING == 1
+  if(*abs_sum >= 2) {
+    int64_t rd_factor = (int64_t) (
+                     g_inv_quant_scales[qp_scaled%6] * g_inv_quant_scales[qp_scaled%6] * (1<<(2*(qp_scaled/6)))
+                   /  g_lambda_cost[encoder->QP] / 16 / (1<<(2*(g_bitdepth-8)))
+                   + 0.5);
+    int32_t lastCG = -1;
+    int32_t absSum = 0;
+    int32_t n,subset;
+
+    for (subset = (width*height-1) >> LOG2_SCAN_SET_SIZE; subset >= 0; subset--) {
+      int32_t  subPos     = subset << LOG2_SCAN_SET_SIZE;
+      int32_t  firstNZPosInCG=SCAN_SET_SIZE, lastNZPosInCG = -1;
+      absSum = 0;
+
+      for(n = SCAN_SET_SIZE-1; n >= 0; --n ) {
+        if( dest_coeff[ scan[ n + subPos ]] ) {
+          lastNZPosInCG = n;
+          break;
+        }
+      }
+
+      for(n = 0; n <SCAN_SET_SIZE; n++ ) {
+        if( dest_coeff[ scan[ n + subPos ]] ) {
+          firstNZPosInCG = n;
+          break;
+        }
+      }
+
+      for(n = firstNZPosInCG; n <=lastNZPosInCG; n++ ) {
+        absSum += dest_coeff[ scan[ n + subPos ]];
+      }
+
+      if(lastNZPosInCG>=0 && lastCG==-1) lastCG = 1;
+
+      if (lastNZPosInCG-firstNZPosInCG >= SBH_THRESHOLD ) {
+        uint32_t signbit = (dest_coeff[scan[subPos+firstNZPosInCG]]>0?0:1);
+        if( signbit!=(absSum&0x1) ) {  // hide but need tune
+          // calculate the cost
+          int64_t minCostInc = MAX_INT64, curCost=MAX_INT64;
+          int32_t minPos =-1, finalChange=0, curChange=0;
+
+          for( n = (lastCG==1?lastNZPosInCG:SCAN_SET_SIZE-1) ; n >= 0; --n ) {
+            uint32_t blkpos   = scan[ n + subPos ];
+            if(dest_coeff[ blkpos ] != 0 ) {
+              int64_t costUp   = rd_factor * (-delta_u[blkpos]) + rate_inc_up[blkpos];
+              int64_t costDown = rd_factor * ( delta_u[blkpos]) + rate_inc_down[blkpos]
+                                 - ( abs(dest_coeff[blkpos])==1?((1<<15)+sig_rate_delta[blkpos]):0 );
+
+              if(lastCG==1 && lastNZPosInCG==n && abs(dest_coeff[blkpos])==1) {
+                costDown -= (4<<15);
+              }
+
+              if(costUp<costDown) {
+                curCost = costUp;
+                curChange =  1;
+              } else {
+                curChange = -1;
+                if(n==firstNZPosInCG && abs(dest_coeff[blkpos])==1) {
+                  curCost = MAX_INT64;
+                } else {
+                  curCost = costDown;
+                }
+              }
+            } else {
+              curCost = rd_factor * ( - (abs(delta_u[blkpos])) ) + (1<<15) + rate_inc_up[blkpos] + sig_rate_delta[blkpos];
+              curChange = 1;
+
+              if(n<firstNZPosInCG) {
+                if( ((coef[blkpos] >= 0) ? 0 : 1) != signbit ) curCost = MAX_INT64;
+              }
+            }
+
+            if( curCost<minCostInc) {
+              minCostInc  = curCost;
+              finalChange = curChange;
+              minPos      = blkpos;
+            }
+          }
+
+          if(dest_coeff[minPos] == 32767 || dest_coeff[minPos] == -32768) {
+            finalChange = -1;
+          }
+
+          if(coef[minPos]>=0) {
+            dest_coeff[minPos] += finalChange;
+          } else {
+            dest_coeff[minPos] -= finalChange;
+          }
+        }
+      }
+      if(lastCG==1) lastCG = 0;
+    }
+  }
+#endif
+  }
+}
diff --git a/src/rdo.h b/src/rdo.h
new file mode 100644
index 00000000..f8ac4c87
--- /dev/null
+++ b/src/rdo.h
@@ -0,0 +1,47 @@
+#ifndef RDO_H_
+#define RDO_H_
+/**
+ * \file
+ * \brief Handling Rate-Distortion Optimization related functionality
+ * 
+ * \author Marko Viitanen ( fador@iki.fi ), 
+ *         Tampere University of Technology,
+ *         Department of Pervasive Computing.
+ * \author Ari Koivula ( ari@koivu.la ), 
+ *         Tampere University of Technology,
+ *         Department of Pervasive Computing.
+ */
+
+#include "global.h"
+
+#include "encoder.h"
+
+
+typedef struct
+{
+  double coded_level_and_dist; /* sum of (cost_coeff - cost_sig) over the group's nonzero coefficients */
+  double uncoded_dist;         /* sum of cost_coeff0 over the group's nonzero coefficients */
+  double sig_cost;             /* sum of cost_sig over every coefficient of the group */
+  double sig_cost_0;           /* cost_sig of the coefficient at in-group scan position 0 */
+  int32_t nnz_before_pos0;     /* nonzero coefficients at in-group scan positions other than 0 */
+} coeffgroup_rd_stats;
+
+static const uint32_t g_go_rice_range[5] = { 7, 14, 26, 46, 78 }; /* per Rice parameter: largest remainder before the Exp-Golomb escape in get_ic_rate() */
+static const uint32_t g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 }; /* per Rice parameter: cap on the prefix length in get_ic_rate() */
+
+
+void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, int32_t width,
+           int32_t height, uint32_t *abs_sum, int8_t type, int8_t scan_idx, int8_t block_type, int8_t scan_mode, int8_t tr_depth);
+
+
+int32_t get_ic_rate( uint32_t abs_level, uint16_t ctx_num_one,uint16_t ctx_num_abs,
+                     uint16_t abs_go_rice, uint32_t c1_idx, uint32_t c2_idx, int8_t type);
+uint32_t get_coded_level ( encoder_control* encoder, double* coded_cost, double* coded_cost0, double* coded_cost_sig,
+                           int32_t level_double, uint32_t max_abs_level,
+                           uint16_t ctx_num_sig, uint16_t ctx_num_one, uint16_t ctx_num_abs,
+                           uint16_t abs_go_rice,
+                           uint32_t c1_idx, uint32_t c2_idx,
+                           int32_t q_bits,double temp, int8_t last, int8_t type);
+
+
+#endif
diff --git a/src/transform.h b/src/transform.h
index 1767da55..960f410b 100644
--- a/src/transform.h
+++ b/src/transform.h
@@ -32,6 +32,7 @@
 extern int32_t* g_quant_coeff[4][6][6];
 extern const int32_t g_quant_intra_default_8x8[64];
 extern const uint8_t g_chroma_scale[58];
+extern const int16_t g_inv_quant_scales[6];
 
 
 void quant(encoder_control *encoder, int16_t *coef, int16_t *q_coef, int32_t width,

From f447b92755739a7dd876f053cf5ba91bb26ae3ca Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Wed, 22 Jan 2014 14:11:55 +0200
Subject: [PATCH 3/9] Added error scaling list calculation from HM 12.0

---
 src/transform.c | 33 +++++++++++++++++++++++++++++++--
 src/transform.h |  4 ++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/src/transform.c b/src/transform.c
index 5d7b5c59..06e89831 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -151,6 +151,7 @@ const uint8_t g_chroma_scale[58]=
 
 int32_t *g_quant_coeff[4][6][6];
 int32_t *g_de_quant_coeff[4][6][6];
+double *g_error_scale[4][6][6];
 
 const uint8_t g_scaling_list_num[4]    = { 6, 6, 6, 2};
 const uint16_t g_scaling_list_size[4]  = {   16,  64, 256,1024}; 
@@ -177,6 +178,7 @@ void scalinglist_init()
         if (!(sizeId == 3 && listId == 3)) {
           g_quant_coeff[sizeId][listId][qp]    = (int32_t*)calloc(g_scaling_list_size[sizeId], sizeof(int32_t));
           g_de_quant_coeff[sizeId][listId][qp] = (int32_t*)calloc(g_scaling_list_size[sizeId], sizeof(int32_t));
+          g_error_scale[sizeId][listId][qp]    = (double*)calloc(g_scaling_list_size[sizeId], sizeof(double));
         }
       }
     }
@@ -185,6 +187,7 @@ void scalinglist_init()
   for (qp = 0; qp < 6; qp++) {
     g_quant_coeff[3][3][qp]    = g_quant_coeff[3][1][qp];
     g_de_quant_coeff[3][3][qp] = g_de_quant_coeff[3][1][qp];
+    g_error_scale[3][3][qp]    = g_error_scale[3][1][qp];
   }
 }
 
@@ -202,6 +205,7 @@ void scalinglist_destroy()
         if (!(sizeId == 3 && listId == 3)) {
           free(   g_quant_coeff[sizeId][listId][qp]);
           free(g_de_quant_coeff[sizeId][listId][qp]);
+          free(   g_error_scale[sizeId][listId][qp]);
         }
       }
     }
@@ -238,6 +242,7 @@ void scalinglist_process()
 
       for (qp = 0; qp < SCALING_LIST_REM_NUM; qp++) {
         scalinglist_set(list_ptr, list, size, qp);
+        scalinglist_set_err_scale(list, size, qp);
       }
     }
   }
@@ -246,9 +251,32 @@ void scalinglist_process()
 }
 
 
+/** set error scale coefficients
+ * \param list List ID
+ * \param size Size ID (index into g_scaling_list_size / g_scaling_list_size_x)
+ * \param qp Quantization parameter index (0 .. SCALING_LIST_REM_NUM-1)
+ */
+#define MAX_TR_DYNAMIC_RANGE 15
+void scalinglist_set_err_scale(uint32_t list,uint32_t size, uint32_t qp)
+{
+  uint32_t log2_tr_size   = g_convert_to_bit[ g_scaling_list_size_x[size] ] + 2;
+  int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - g_bitdepth - log2_tr_size;  // Represents scaling through forward transform
+
+  uint32_t i,max_num_coeff = g_scaling_list_size[size];
+  int32_t *quantcoeff      = g_quant_coeff[size][list][qp];
+  double *err_scale        = g_error_scale[size][list][qp];
+
+  // Compensate for scaling of bitcount in Lagrange cost function
+  double scale = (double)(1<<15);
+  // Compensate for scaling through forward transform
+  scale = scale*pow(2.0,-2.0*transform_shift);
+  for(i=0;i<max_num_coeff;i++) {
+    err_scale[i] = scale / quantcoeff[i] / quantcoeff[i] / (1<<(2*(g_bitdepth-8)));
+  }
+}
 
 /**
- * \brief get staling list for encoder
+ * \brief get scaling list for encoder
  * 
  */
 void scalinglist_process_enc( int32_t *coeff, int32_t *quantcoeff, int32_t quant_scales, uint32_t height,uint32_t width, uint32_t ratio, int32_t size_num, uint32_t dc, uint8_t flat)
@@ -276,7 +304,7 @@ void scalinglist_process_enc( int32_t *coeff, int32_t *quantcoeff, int32_t quant
 }
 
 /**
- * \brief get staling list for decoder
+ * \brief get scaling list for decoder
  * 
  */
 void scalinglist_process_dec( int32_t *coeff, int32_t *dequantcoeff, int32_t inv_quant_scales, uint32_t height,uint32_t width, uint32_t ratio, int32_t size_num, uint32_t dc, uint8_t flat)
@@ -320,6 +348,7 @@ void scalinglist_set(int32_t *coeff, uint32_t listId, uint32_t sizeId, uint32_t
   scalinglist_process_dec(coeff, dequantcoeff, g_inv_quant_scales[qp], height, width, ratio,
                           MIN(8, g_scaling_list_size_x[sizeId]), SCALING_LIST_DC, ENABLE_SCALING_LIST ? 0 : 1);
 
+
   // TODO: support NSQT
   // if(sizeId == /*SCALING_LIST_32x32*/3 || sizeId == /*SCALING_LIST_16x16*/2) { //for NSQT
   //   quantcoeff   = g_quant_coeff[listId][qp][sizeId-1][/*SCALING_LIST_VER*/1];
diff --git a/src/transform.h b/src/transform.h
index 960f410b..6bbf00a3 100644
--- a/src/transform.h
+++ b/src/transform.h
@@ -28,8 +28,11 @@
 
 #include "encoder.h"
 
+#include <math.h>
+
 
 extern int32_t* g_quant_coeff[4][6][6];
+extern double* g_error_scale[4][6][6];
 extern const int32_t g_quant_intra_default_8x8[64];
 extern const uint8_t g_chroma_scale[58];
 extern const int16_t g_inv_quant_scales[6];
@@ -47,6 +50,7 @@ void scalinglist_process_enc( int32_t *coeff, int32_t *quant_coeff, int32_t quan
                              uint32_t height,uint32_t width, uint32_t ratio, int32_t size_num, uint32_t dc, uint8_t flat);
 void scalinglist_process();
 void scalinglist_set(int32_t *coeff, uint32_t list_id, uint32_t size_id, uint32_t qp);
+void scalinglist_set_err_scale(uint32_t list,uint32_t size, uint32_t qp);
 void scalinglist_destroy();
 
 #endif

From 144d5293b1f97a0cb1a3f9f7997a0f3d1917b014 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Wed, 22 Jan 2014 14:12:46 +0200
Subject: [PATCH 4/9] Implemented RDOQ function get_rate_last()

---
 src/rdo.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 8 deletions(-)

diff --git a/src/rdo.c b/src/rdo.c
index b8c1be4e..07f98e39 100644
--- a/src/rdo.c
+++ b/src/rdo.c
@@ -114,21 +114,73 @@ uint32_t get_coded_level ( encoder_control* encoder, double *coded_cost, double
   }
 
   min_abs_level    = ( max_abs_level > 1 ? max_abs_level - 1 : 1 );
-  for (abs_level  = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) {
-    double err         = (double)(level_double  - ( abs_level << q_bits ) );
-    double cur_cost    = err * err * temp + get_ic_rate( abs_level, ctx_num_one, ctx_num_abs, abs_go_rice, c1_idx, c2_idx, type);
-    cur_cost          += cur_cost_sig;
+  for (abs_level = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) {
+    double err       = (double)(level_double  - ( abs_level << q_bits ) );
+    double cur_cost  = err * err * temp + get_ic_rate( abs_level, ctx_num_one, ctx_num_abs, abs_go_rice, c1_idx, c2_idx, type);
+    cur_cost        += cur_cost_sig;
 
     if( cur_cost < *coded_cost ) {
-      best_abs_level    = abs_level;
+      best_abs_level  = abs_level;
       *coded_cost     = cur_cost;
-      *coded_cost_sig  = cur_cost_sig;
+      *coded_cost_sig = cur_cost_sig;
     }
   }
 
   return best_abs_level;
 }
 
+
+/** Calculates the cost of signaling the last significant coefficient in the block
+ * \param pos_x X coordinate of the last significant coefficient
+ * \param pos_y Y coordinate of the last significant coefficient
+ * \param last_x_bits,last_y_bits bit-cost tables indexed by g_group_idx[] of the position
+ * \returns cost of last significant coefficient, weighted by g_lambda_cost[encoder->QP]
+ *
+ * From HM 12.0
+*/
+double get_rate_last(encoder_control* encoder, const uint32_t  pos_x, const uint32_t pos_y, int32_t* last_x_bits, int32_t* last_y_bits)
+{
+  uint32_t ctx_x   = g_group_idx[pos_x];
+  uint32_t ctx_y   = g_group_idx[pos_y];
+  double uiCost = last_x_bits[ ctx_x ] + last_y_bits[ ctx_y ];
+  if( ctx_x > 3 ) {
+    uiCost += 32768.0 * ((ctx_x-2)>>1);
+  }
+  if( ctx_y > 3 ) {
+    uiCost += 32768.0 * ((ctx_y-2)>>1);
+  }
+  return g_lambda_cost[encoder->QP]*uiCost;
+}
+
+void calc_last_bits(int32_t width, int32_t height, int8_t type, int32_t* last_x_bits, int32_t* last_y_bits)
+{  
+  int32_t bits_x = 0, bits_y = 0;
+  int32_t blk_size_offset_x, blk_size_offset_y, shiftX, shiftY;
+  int32_t ctx;
+  
+  cabac_ctx *base_ctx_x = (type ? g_cu_ctx_last_x_chroma : g_cu_ctx_last_x_luma);
+  cabac_ctx *base_ctx_y = (type ? g_cu_ctx_last_y_chroma : g_cu_ctx_last_y_luma);
+
+  blk_size_offset_x = type ? 0: (g_convert_to_bit[ width ] *3 + ((g_convert_to_bit[ width ] +1)>>2));
+  blk_size_offset_y = type ? 0: (g_convert_to_bit[ height ]*3 + ((g_convert_to_bit[ height ]+1)>>2));
+  shiftX = type ? g_convert_to_bit[ width  ] :((g_convert_to_bit[ width  ]+3)>>2);
+  shiftY = type ? g_convert_to_bit[ height ] :((g_convert_to_bit[ height ]+3)>>2);
+
+
+  for (ctx = 0; ctx < g_group_idx[ width - 1 ]; ctx++) {
+    int32_t ctx_offset = blk_size_offset_x + (ctx >>shiftX);
+    last_x_bits[ ctx ] = bits_x + CTX_ENTROPY_BITS(&base_ctx_x[ ctx_offset ],0);
+    bits_x += CTX_ENTROPY_BITS(&base_ctx_x[ ctx_offset ],1);
+  }
+  last_x_bits[ctx] = bits_x;
+  for (ctx = 0; ctx < g_group_idx[ height - 1 ]; ctx++) {
+    int32_t ctx_offset = blk_size_offset_y + (ctx >>shiftY);
+    last_y_bits[ ctx ] = bits_y + CTX_ENTROPY_BITS(&base_ctx_y[ ctx_offset ],0);
+    bits_y +=  CTX_ENTROPY_BITS(&base_ctx_y[ ctx_offset ],1);
+  }
+  last_y_bits[ctx] = bits_y;
+}
+
 /** RDOQ with CABAC
  * \returns void
  * Rate distortion optimized quantization for entropy
@@ -167,7 +219,7 @@ void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff,
   int32_t *quant_coeff_org = g_quant_coeff[log2_tr_size-2][scalinglist_type][qp_scaled%6];
   int32_t *quant_coeff     = quant_coeff_org;
 
-  double *err_scale_org = NULL;//getErrScaleCoeff(scalingListType,uiLog2TrSize-2,m_cQP.m_iRem);
+  double *err_scale_org = g_error_scale[scalinglist_type][log2_tr_size-2][qp_scaled%6];
   double *err_scale     = err_scale_org;
 
   double block_uncoded_cost = 0;
@@ -220,6 +272,12 @@ void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff,
 
   coeffgroup_rd_stats rd_stats;     
 
+  int32_t last_x_bits[32],last_y_bits[32];
+
+  calc_last_bits(width, height, type,last_x_bits, last_y_bits);
+
+
+
   memset( cost_coeff, 0, sizeof(double) *  max_num_coeff );
   memset( cost_sig,   0, sizeof(double) *  max_num_coeff );
   memset( rate_inc_up,    0, sizeof(int32_t) *  max_num_coeff );
@@ -444,7 +502,7 @@ void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff,
           uint32_t   pos_y       = blkpos >> log2_block_size;
           uint32_t   pos_x       = blkpos - ( pos_y << log2_block_size );
           
-          double cost_last = 0.0;//scan_idx == SCAN_VER ? xGetRateLast( pos_y, pos_x ) : xGetRateLast( pos_x, pos_y );
+          double cost_last = scan_idx == SCAN_VER ? get_rate_last(encoder, pos_y, pos_x,last_x_bits,last_y_bits) : get_rate_last(encoder, pos_x, pos_y, last_x_bits,last_y_bits );
           double totalCost = base_cost + cost_last - cost_sig[ scanpos ];
           
           if( totalCost < best_cost ) {

From 80b3b4a6e0febd752061bab324226ca467fe4bac Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Wed, 22 Jan 2014 17:50:51 +0200
Subject: [PATCH 5/9] Added missing lambda parameter to some RDOQ costs and
 moved go_rice arrays from header file

---
 src/rdo.c | 42 +++++++++++++++++-------------------------
 src/rdo.h |  6 +++---
 2 files changed, 20 insertions(+), 28 deletions(-)

diff --git a/src/rdo.c b/src/rdo.c
index 07f98e39..55049db2 100644
--- a/src/rdo.c
+++ b/src/rdo.c
@@ -25,7 +25,8 @@
 #define SBH_THRESHOLD         4
 
 
-
+const uint32_t g_go_rice_range[5] = { 7, 14, 26, 46, 78 };
+const uint32_t g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 };
 
 int32_t get_ic_rate( uint32_t abs_level, uint16_t ctx_num_one,uint16_t ctx_num_abs,
                      uint16_t abs_go_rice, uint32_t c1_idx, uint32_t c2_idx, int8_t type)
@@ -115,7 +116,7 @@ uint32_t get_coded_level ( encoder_control* encoder, double *coded_cost, double
 
   min_abs_level    = ( max_abs_level > 1 ? max_abs_level - 1 : 1 );
   for (abs_level = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) {
-    double err       = (double)(level_double  - ( abs_level << q_bits ) );
+    double err       = (double)(level_double - ( abs_level << q_bits ) );
     double cur_cost  = err * err * temp + get_ic_rate( abs_level, ctx_num_one, ctx_num_abs, abs_go_rice, c1_idx, c2_idx, type);
     cur_cost        += cur_cost_sig;
 
@@ -188,7 +189,7 @@ void calc_last_bits(int32_t width, int32_t height, int8_t type, int32_t* last_x_
  * From HM 12.0
  */
 void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, int32_t width,
-           int32_t height, uint32_t *abs_sum, int8_t type, int8_t scan_idx, int8_t block_type, int8_t scan_mode, int8_t tr_depth)
+           int32_t height, uint32_t *abs_sum, int8_t type, int8_t block_type, int8_t scan_mode, int8_t tr_depth)
 {
   uint32_t log2_tr_size    = g_convert_to_bit[ width ] + 2;  
   int32_t  transform_shift = MAX_TR_DYNAMIC_RANGE - g_bitdepth - log2_tr_size;  // Represents scaling through forward transform
@@ -214,13 +215,9 @@ void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff,
 
   {
   int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift;
-  int32_t add    = ((encoder->in.cur_pic->slicetype == SLICE_I) ? 171 : 85) << (q_bits - 9);
-
-  int32_t *quant_coeff_org = g_quant_coeff[log2_tr_size-2][scalinglist_type][qp_scaled%6];
-  int32_t *quant_coeff     = quant_coeff_org;
-
-  double *err_scale_org = g_error_scale[scalinglist_type][log2_tr_size-2][qp_scaled%6];
-  double *err_scale     = err_scale_org;
+  
+  int32_t *quant_coeff  = g_quant_coeff[log2_tr_size-2][scalinglist_type][qp_scaled%6];
+  double *err_scale     = g_error_scale[log2_tr_size-2][scalinglist_type][qp_scaled%6];
 
   double block_uncoded_cost = 0;
 
@@ -253,7 +250,7 @@ void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff,
   uint32_t    c2_idx     = 0;
   int32_t     base_level;
   
-  uint32_t *scan = g_sig_last_scan[ scan_idx ][ log2_block_size - 1 ];
+  uint32_t *scan = g_sig_last_scan[ scan_mode ][ log2_block_size - 1 ];
 
   
   uint32_t cg_num = width * height >> 4;
@@ -263,28 +260,23 @@ void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff,
   cabac_ctx *baseCtx              = (type == 0) ? &g_cu_sig_model_luma[0] : &g_cu_sig_model_chroma[0];
   cabac_ctx *base_one_ctx = (type == 0) ? &g_cu_one_model_luma[0] : &g_cu_one_model_chroma[0];
 
-
   double  best_cost        = 0;
   int32_t ctx_cbf          = 0;
   int32_t best_last_idx_p1 = 0;
   int8_t found_last        = 0;
   int32_t cg_scanpos, scanpos_in_cg;
 
-  coeffgroup_rd_stats rd_stats;     
+  coeffgroup_rd_stats rd_stats;
 
   int32_t last_x_bits[32],last_y_bits[32];
-
   calc_last_bits(width, height, type,last_x_bits, last_y_bits);
-
-
-
-  memset( cost_coeff, 0, sizeof(double) *  max_num_coeff );
-  memset( cost_sig,   0, sizeof(double) *  max_num_coeff );
+  
+  memset( cost_coeff,     0, sizeof(double) *  max_num_coeff );
+  memset( cost_sig,       0, sizeof(double) *  max_num_coeff );
   memset( rate_inc_up,    0, sizeof(int32_t) *  max_num_coeff );
   memset( rate_inc_down,  0, sizeof(int32_t) *  max_num_coeff );
   memset( sig_rate_delta, 0, sizeof(int32_t) *  max_num_coeff );
   memset( delta_u,        0, sizeof(int32_t) *  max_num_coeff );
-
     
   memset( cost_coeffgroup_sig,   0, sizeof(double)   * 64 );
   memset( sig_coeffgroup_flag,   0, sizeof(uint32_t) * 64 );
@@ -416,7 +408,7 @@ void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff,
         if (sig_coeffgroup_flag[ cg_blkpos ] == 0) {          
           uint32_t ctx_sig  = context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
                                                           cg_pos_y, width);
-          cost_coeffgroup_sig[ cg_scanpos ] = CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
+          cost_coeffgroup_sig[ cg_scanpos ] = g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
           base_cost += cost_coeffgroup_sig[ cg_scanpos ]  - rd_stats.sig_cost;
           
         } else {
@@ -434,9 +426,9 @@ void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff,
             ctx_sig  = context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
                                                             cg_pos_y, width);
             if (cg_scanpos < cg_last_scanpos) {
-              cost_coeffgroup_sig[cg_scanpos] = CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],1);
+              cost_coeffgroup_sig[cg_scanpos] = g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],1);
               base_cost    += cost_coeffgroup_sig[cg_scanpos];
-              cost_zero_cg += CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
+              cost_zero_cg += g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
             }
             
             // try to convert the current coeff group from non-zero to all-zero
@@ -450,7 +442,7 @@ void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff,
               sig_coeffgroup_flag[ cg_blkpos ] = 0;
               base_cost = cost_zero_cg;
               if (cg_scanpos < cg_last_scanpos) {
-                cost_coeffgroup_sig[ cg_scanpos ] = CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
+                cost_coeffgroup_sig[ cg_scanpos ] = g_lambda_cost[encoder->QP]*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
               }
               // reset coeffs to 0 in this block
               for (scanpos_in_cg = cg_size-1; scanpos_in_cg >= 0; scanpos_in_cg--) {
@@ -502,7 +494,7 @@ void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff,
           uint32_t   pos_y       = blkpos >> log2_block_size;
           uint32_t   pos_x       = blkpos - ( pos_y << log2_block_size );
           
-          double cost_last = scan_idx == SCAN_VER ? get_rate_last(encoder, pos_y, pos_x,last_x_bits,last_y_bits) : get_rate_last(encoder, pos_x, pos_y, last_x_bits,last_y_bits );
+          double cost_last = (scan_mode == SCAN_VER) ? get_rate_last(encoder, pos_y, pos_x,last_x_bits,last_y_bits) : get_rate_last(encoder, pos_x, pos_y, last_x_bits,last_y_bits );
           double totalCost = base_cost + cost_last - cost_sig[ scanpos ];
           
           if( totalCost < best_cost ) {
diff --git a/src/rdo.h b/src/rdo.h
index f8ac4c87..8354332c 100644
--- a/src/rdo.h
+++ b/src/rdo.h
@@ -26,12 +26,12 @@ typedef struct
   int32_t nnz_before_pos0;
 } coeffgroup_rd_stats;
 
-const uint32_t g_go_rice_range[5] = { 7, 14, 26, 46, 78 };
-const uint32_t g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 };
+extern const uint32_t g_go_rice_range[5];
+extern const uint32_t g_go_rice_prefix_len[5];
 
 
 void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, int32_t width,
-           int32_t height, uint32_t *abs_sum, int8_t type, int8_t scan_idx, int8_t block_type, int8_t scan_mode, int8_t tr_depth);
+           int32_t height, uint32_t *abs_sum, int8_t type, int8_t block_type, int8_t scan_mode, int8_t tr_depth);
 
 
 int32_t get_ic_rate( uint32_t abs_level, uint16_t ctx_num_one,uint16_t ctx_num_abs,

From 83a1e9a555218143ef512a4cb2d1bbc6a03788f5 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Mon, 27 Jan 2014 14:36:10 +0200
Subject: [PATCH 6/9] Added rdo to Makefile

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index eb4489a8..43dcba6e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -28,7 +28,7 @@ LDFLAGS = -lm
 LD = gcc
 YASM = yasm
 ASMOBJS = test64.o
-OBJS = interface_main.o encmain.o bitstream.o cabac.o config.o context.o debug.o encoder.o filter.o inter.o intra.o nal.o picture.o sao.o search.o transform.o
+OBJS = interface_main.o encmain.o bitstream.o cabac.o config.o context.o debug.o encoder.o filter.o inter.o intra.o nal.o picture.o rdo.o sao.o search.o transform.o
 PROG  = ./kvazaar
 PROGS = $(PROG)
 

From 0cdd9d032f740321dd49cef85668a9ae526c2cf0 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Mon, 27 Jan 2014 14:39:56 +0200
Subject: [PATCH 7/9] Added GPLv2 headers to rdo.c/.h

---
 src/rdo.c | 28 ++++++++++++++++++++--------
 src/rdo.h | 28 ++++++++++++++++++++--------
 2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/src/rdo.c b/src/rdo.c
index 55049db2..2c97d5a7 100644
--- a/src/rdo.c
+++ b/src/rdo.c
@@ -1,12 +1,24 @@
-/**
- * \file
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
  * 
- * \author Marko Viitanen ( fador@iki.fi ), 
- *         Tampere University of Technology,
- *         Department of Pervasive Computing.
- * \author Ari Koivula ( ari@koivu.la ), 
- *         Tampere University of Technology,
- *         Department of Pervasive Computing.
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see 
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/*
+ * \file
  */
 
 #include <stdio.h>
diff --git a/src/rdo.h b/src/rdo.h
index 8354332c..88589202 100644
--- a/src/rdo.h
+++ b/src/rdo.h
@@ -1,15 +1,27 @@
 #ifndef RDO_H_
 #define RDO_H_
-/**
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ * 
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see 
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/*
  * \file
  * \brief Handling Rate-Distortion Optimization related functionality
- * 
- * \author Marko Viitanen ( fador@iki.fi ), 
- *         Tampere University of Technology,
- *         Department of Pervasive Computing.
- * \author Ari Koivula ( ari@koivu.la ), 
- *         Tampere University of Technology,
- *         Department of Pervasive Computing.
  */
 
 #include "global.h"

From 5e759b8e1dae2ffb00aae634fd0538ded39ba058 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Tue, 28 Jan 2014 11:00:17 +0200
Subject: [PATCH 8/9] Fix for RDOQ, added missing cost function

---
 src/rdo.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++---------
 src/rdo.h |  4 ++-
 2 files changed, 76 insertions(+), 15 deletions(-)

diff --git a/src/rdo.c b/src/rdo.c
index 2c97d5a7..47b32dc5 100644
--- a/src/rdo.c
+++ b/src/rdo.c
@@ -40,18 +40,75 @@
 const uint32_t g_go_rice_range[5] = { 7, 14, 26, 46, 78 };
 const uint32_t g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 };
 
+
+#define COEF_REMAIN_BIN_REDUCTION 3
+/** Calculates the cost for specific absolute transform level
+ * \param abs_level scaled quantized level
+ * \param ctx_num_one current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
+ * \param ctx_num_abs current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
+ * \param abs_go_rice Rice parameter for coeff_abs_level_minus3
+ * \returns cost of given absolute transform level
+ * From HM 12.0
+ */
+double get_ic_rate_cost  (uint32_t abs_level,
+                          uint16_t ctx_num_one,
+                          uint16_t ctx_num_abs,
+                          uint16_t abs_go_rice,
+                          uint32_t c1_idx,
+                          uint32_t c2_idx,
+                          int8_t type
+                          )
+{
+  double rate = 32768.0;
+  uint32_t base_level  =  (c1_idx < C1FLAG_NUMBER)? (2 + (c2_idx < C2FLAG_NUMBER)) : 1;
+  cabac_ctx *base_one_ctx = (type == 0) ? &g_cu_one_model_luma[0] : &g_cu_one_model_chroma[0];
+  cabac_ctx *base_abs_ctx = (type == 0) ? &g_cu_abs_model_luma[0] : &g_cu_abs_model_chroma[0];
+
+  if ( abs_level >= base_level ) {
+    uint32_t symbol     = abs_level - base_level;
+    uint32_t length;
+    if (symbol < (COEF_REMAIN_BIN_REDUCTION << abs_go_rice)) {
+      length = symbol>>abs_go_rice;
+      rate += (length+1+abs_go_rice)<< 15;
+    } else {
+      length = abs_go_rice;
+      symbol  = symbol - ( COEF_REMAIN_BIN_REDUCTION << abs_go_rice);
+      while (symbol >= (1<<length)) {
+        symbol -=  (1<<(length++));
+      }
+      rate += (COEF_REMAIN_BIN_REDUCTION+length+1-abs_go_rice+length)<< 15;
+    }
+    if (c1_idx < C1FLAG_NUMBER) {
+      rate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1);
+
+      if (c2_idx < C2FLAG_NUMBER) {
+        rate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],1);
+      }
+    }
+  }
+  else if( abs_level == 1 ) {
+    rate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],0);
+  } else if( abs_level == 2 ) {
+    rate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1);
+    rate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],0);
+  }
+
+  return rate;
+}
+
+
 int32_t get_ic_rate( uint32_t abs_level, uint16_t ctx_num_one,uint16_t ctx_num_abs,
                      uint16_t abs_go_rice, uint32_t c1_idx, uint32_t c2_idx, int8_t type)
 {
-  int32_t iRate = 0;
-  uint32_t baseLevel  =  (c1_idx < C1FLAG_NUMBER)? (2 + (c2_idx < C2FLAG_NUMBER)) : 1;
+  int32_t rate = 0;
+  uint32_t base_level  =  (c1_idx < C1FLAG_NUMBER)? (2 + (c2_idx < C2FLAG_NUMBER)) : 1;
   cabac_ctx *base_one_ctx = (type == 0) ? &g_cu_one_model_luma[0] : &g_cu_one_model_chroma[0];
   cabac_ctx *base_abs_ctx = (type == 0) ? &g_cu_abs_model_luma[0] : &g_cu_abs_model_chroma[0];
   
   if(!abs_level) return 0;
 
-  if (abs_level >= baseLevel) {
-    uint32_t symbol     = abs_level - baseLevel;
+  if (abs_level >= base_level) {
+    uint32_t symbol     = abs_level - base_level;
     uint32_t max_vlc     = g_go_rice_range[ abs_go_rice ];
     uint16_t pref_len,num_bins;
 
@@ -60,28 +117,28 @@ int32_t get_ic_rate( uint32_t abs_level, uint16_t ctx_num_one,uint16_t ctx_num_a
       uint32_t uiMax = 2;
       abs_level  = symbol - max_vlc;
       for(; abs_level >= uiMax; uiMax <<= 1, iEGS += 2 );
-      iRate      += iEGS << 15;
+      rate      += iEGS << 15;
       symbol    = MIN( symbol, ( max_vlc + 1 ) );
     }
 
     pref_len = (uint16_t)(symbol >> abs_go_rice) + 1;
     num_bins = MIN( pref_len, g_go_rice_prefix_len[ abs_go_rice ] ) + abs_go_rice;
 
-    iRate += num_bins << 15;
+    rate += num_bins << 15;
 
     if (c1_idx < C1FLAG_NUMBER) {      
-      iRate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1);
+      rate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1);
       if (c2_idx < C2FLAG_NUMBER) {        
-        iRate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],1);
+        rate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],1);
       }
     }
   } else if( abs_level == 1 ) {
-    iRate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],0);
+    rate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],0);
   } else if( abs_level == 2 ) {
-    iRate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1);
-    iRate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],0);
+    rate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1);
+    rate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],0);
   }
-  return iRate;
+  return rate;
 }
 
 /** Get the best level in RD sense
@@ -129,7 +186,9 @@ uint32_t get_coded_level ( encoder_control* encoder, double *coded_cost, double
   min_abs_level    = ( max_abs_level > 1 ? max_abs_level - 1 : 1 );
   for (abs_level = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) {
     double err       = (double)(level_double - ( abs_level << q_bits ) );
-    double cur_cost  = err * err * temp + get_ic_rate( abs_level, ctx_num_one, ctx_num_abs, abs_go_rice, c1_idx, c2_idx, type);
+    double cur_cost  = err * err * temp + g_lambda_cost[encoder->QP] *
+                       get_ic_rate_cost( abs_level, ctx_num_one, ctx_num_abs, 
+                                         abs_go_rice, c1_idx, c2_idx, type);
     cur_cost        += cur_cost_sig;
 
     if( cur_cost < *coded_cost ) {
@@ -201,7 +260,7 @@ void calc_last_bits(int32_t width, int32_t height, int8_t type, int32_t* last_x_
  * From HM 12.0
  */
 void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, int32_t width,
-           int32_t height, uint32_t *abs_sum, int8_t type, int8_t block_type, int8_t scan_mode, int8_t tr_depth)
+           int32_t height, uint32_t *abs_sum, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth)
 {
   uint32_t log2_tr_size    = g_convert_to_bit[ width ] + 2;  
   int32_t  transform_shift = MAX_TR_DYNAMIC_RANGE - g_bitdepth - log2_tr_size;  // Represents scaling through forward transform
diff --git a/src/rdo.h b/src/rdo.h
index 88589202..7c6c46c4 100644
--- a/src/rdo.h
+++ b/src/rdo.h
@@ -43,11 +43,13 @@ extern const uint32_t g_go_rice_prefix_len[5];
 
 
 void  rdoq(encoder_control *encoder, coefficient *coef, coefficient *dest_coeff, int32_t width,
-           int32_t height, uint32_t *abs_sum, int8_t type, int8_t block_type, int8_t scan_mode, int8_t tr_depth);
+           int32_t height, uint32_t *abs_sum, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth);
 
 
 int32_t get_ic_rate( uint32_t abs_level, uint16_t ctx_num_one,uint16_t ctx_num_abs,
                      uint16_t abs_go_rice, uint32_t c1_idx, uint32_t c2_idx, int8_t type);
+double get_ic_rate_cost  (uint32_t abs_level, uint16_t ctx_num_one, uint16_t ctx_num_abs,
+                          uint16_t abs_go_rice, uint32_t c1_idx, uint32_t c2_idx, int8_t type);                          
 uint32_t get_coded_level ( encoder_control* encoder, double* coded_cost, double* coded_cost0, double* coded_cost_sig,
                            int32_t level_double, uint32_t max_abs_level,
                            uint16_t ctx_num_sig, uint16_t ctx_num_one, uint16_t ctx_num_abs,

From 7a21b9b769aa26d4652b89dd80e1e42286dcdc3a Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Tue, 28 Jan 2014 12:14:43 +0200
Subject: [PATCH 9/9] Changed lambda calculation to fix RDOQ issues, RDOQ is
 now in use by default.

---
 src/encoder.c | 27 ++++++++++++++++++++++++---
 src/global.h  |  2 ++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index d872b9e7..f396042f 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -40,6 +40,7 @@
 #include "filter.h"
 #include "search.h"
 #include "sao.h"
+#include "rdo.h"
 
 int16_t g_lambda_cost[55];
 uint32_t* g_sig_last_scan[3][7];
@@ -200,7 +201,6 @@ void init_tables(void)
 
   // Lambda cost
   // TODO: cleanup
-  //g_lambda_cost = (int16_t*)malloc(sizeof(int16_t)*55);
   for (i = 0; i < 55; i++) {
     if (i < 12) {
       g_lambda_cost[i] = 0;
@@ -208,7 +208,14 @@ void init_tables(void)
       g_lambda_cost[i] = (int16_t)sqrt(0.57 * pow(2.0, (i - 12) / 3));
     }
 
-    //g_lambda_cost[i] = g_lambda_cost[i]*g_lambda_cost[i];
+    /**
+     * While working on RDOQ it became clear that the current lambda cost is wrong (compared to HM),
+     * so the cost is now squared (lambda*lambda) to fix some of those issues.
+     * This is not the final solution; lambda should instead be calculated the same way HM does it.
+     * TODO: fix lambda cost calculation
+     * - Marko Viitanen (Fador)
+     **/
+    g_lambda_cost[i] = g_lambda_cost[i]*g_lambda_cost[i];
   }
 
 }
@@ -317,7 +324,7 @@ void encode_one_frame(encoder_control* encoder)
     bitstream_clear_buffer(encoder->stream);
   } else {
     cabac_start(&cabac);
-    encoder->in.cur_pic->slicetype = SLICE_P;
+    encoder->in.cur_pic->slicetype = SLICE_I;
     encoder->in.cur_pic->type = NAL_TRAIL_R;
     scalinglist_process();
     search_slice_data(encoder);
@@ -1488,7 +1495,11 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
 
     // Transform and quant residual to coeffs
     transform2d(block,pre_quant_coeff,width,0);
+    #if RDOQ == 1
+    rdoq(encoder, pre_quant_coeff, coeff_y, width, width, &ac_sum, 0, scan_idx_luma, cur_cu->type,cur_cu->tr_depth-cur_cu->depth);    
+    #else
     quant(encoder, pre_quant_coeff, coeff_y, width, width, &ac_sum, 0, scan_idx_luma, cur_cu->type);
+    #endif
 
     // Check for non-zero coeffs
     for (i = 0; i < width * width; i++) {
@@ -1547,8 +1558,13 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
       }
 
       transform2d(block,pre_quant_coeff,LCU_WIDTH>>(depth+1),65535);
+      #if RDOQ == 1
+      rdoq(encoder, pre_quant_coeff, coeff_u, width >> 1, width >> 1, &ac_sum, 2,
+            scan_idx_chroma, cur_cu->type, cur_cu->tr_depth-cur_cu->depth);
+      #else      
       quant(encoder, pre_quant_coeff, coeff_u, width >> 1, width >> 1, &ac_sum, 2,
             scan_idx_chroma, cur_cu->type);
+      #endif
 
       for (i = 0; i < width *width >> 2; i++) {
         if (coeff_u[i] != 0) {
@@ -1571,8 +1587,13 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
       }
 
       transform2d(block,pre_quant_coeff,LCU_WIDTH>>(depth+1),65535);
+      #if RDOQ == 1
+      rdoq(encoder, pre_quant_coeff, coeff_v, width >> 1, width >> 1, &ac_sum, 3,
+           scan_idx_chroma, cur_cu->type, cur_cu->tr_depth-cur_cu->depth);
+      #else
       quant(encoder, pre_quant_coeff, coeff_v, width >> 1, width >> 1, &ac_sum, 3,
             scan_idx_chroma, cur_cu->type);
+      #endif
 
       for (i = 0; i < width *width >> 2; i++) {
         if (coeff_v[i] != 0) {
diff --git a/src/global.h b/src/global.h
index 6b3868d6..a079ebe3 100644
--- a/src/global.h
+++ b/src/global.h
@@ -77,6 +77,8 @@ typedef int16_t coefficient;
 
 #define OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD 0 /*!< skip residual coding when it's under _some_ threshold */
 
+#define RDOQ 1 /*!< Rate-Distortion Optimized Quantization */
+
 /* END OF CONFIG VARIABLES */
 
 #define LCU_LUMA_SIZE (LCU_WIDTH * LCU_WIDTH)