From 06fd1d6fa9a1504c046ffa058722f2b2abd514be Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Wed, 12 Jun 2013 15:41:57 +0300
Subject: [PATCH] Fix for sign bit hiding, not working yet

---
 build/VS2010/HEVC_encoder.vcxproj |   2 +-
 src/encmain.c                     |  18 ++--
 src/encoder.c                     |  38 ++++++--
 src/encoder.h                     |  19 ++--
 src/global.h                      |   4 +-
 src/inter.c                       |   2 +-
 src/intra.c                       |  12 +--
 src/picture.c                     |  70 +++++++++------
 src/picture.h                     |   6 +-
 src/search.c                      |   4 +-
 src/transform.c                   | 140 +++++++++++++++++++++++++++---
 11 files changed, 241 insertions(+), 74 deletions(-)
diff --git a/build/VS2010/HEVC_encoder.vcxproj b/build/VS2010/HEVC_encoder.vcxproj
index b51c3dbb..11d8e4b1 100644
--- a/build/VS2010/HEVC_encoder.vcxproj
+++ b/build/VS2010/HEVC_encoder.vcxproj
@@ -100,7 +100,7 @@
       </PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;X64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;WIN64;X64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <CompileAs>CompileAsC</CompileAs>
       <AssemblerOutput>AssemblyAndSourceCode</AssemblerOutput>
     </ClCompile>
diff --git a/src/encmain.c b/src/encmain.c
index 7208d407..3eddee1b 100644
--- a/src/encmain.c
+++ b/src/encmain.c
@@ -68,6 +68,8 @@
     FILE *input  = NULL;
     FILE *output = NULL;
     double PSNR[3] = { 0.0, 0.0, 0.0 };
+    fpos_t curpos = 0;
+    fpos_t lastpos = 0;
     #ifdef _DEBUG
     FILE *recout = fopen("encrec.yuv","wb");
     #endif
@@ -149,7 +151,7 @@
     encoder->stream->buffer_pos = 0;
     encoder->stream->output = 0;
     /* Alloc 1MB */
-    bitstream_alloc(encoder->stream, 1024*1024);
+    bitstream_alloc(encoder->stream, 1024*2*cfg->width);
 
     /* Config pointer to encoder struct */
     encoder->cfg = cfg;
@@ -161,10 +163,10 @@
     /* input init (ToDo: read from commandline / config) */
     encoder->bitdepth = 8;
     encoder->frame    = 0;
-    encoder->QP       = 36;
+    encoder->QP       = 32;
     encoder->in.video_format = FORMAT_420;
     /* deblocking */
-    encoder->deblock_enable = 1;
+    encoder->deblock_enable  = 1;
     encoder->betaOffsetdiv2  = 0;
     encoder->tcOffsetdiv2    = 0;
     /* SAO */
@@ -197,12 +199,17 @@
       fwrite(encoder->in.cur_pic.vRecData,cfg->width*cfg->height>>2,1,recout);
       #endif
       {
+        int32_t diff;
         double temp_PSNR[3];
+        fgetpos(output,&curpos);
+        diff = (int32_t)(curpos-lastpos);
+        lastpos = curpos;
+
         temp_PSNR[0] = imagePSNR(encoder->in.cur_pic.yData,encoder->in.cur_pic.yRecData,cfg->width,cfg->height);
         temp_PSNR[1] = imagePSNR(encoder->in.cur_pic.uData,encoder->in.cur_pic.uRecData,cfg->width>>1,cfg->height>>1);
         temp_PSNR[2] = imagePSNR(encoder->in.cur_pic.vData,encoder->in.cur_pic.vRecData,cfg->width>>1,cfg->height>>1);
         
-        printf("[%d] %c-frame PSNR: %2.4f %2.4f %2.4f\n", encoder->frame, "BPI"[encoder->in.cur_pic.slicetype%3],
+        printf("POC %4d (%c-frame) %10d bits PSNR: %2.4f %2.4f %2.4f\n", encoder->frame, "BPI"[encoder->in.cur_pic.slicetype%3],diff<<3,
                                                         temp_PSNR[0],temp_PSNR[1],temp_PSNR[2]);
         PSNR[0]+=temp_PSNR[0];
         PSNR[1]+=temp_PSNR[1];
@@ -211,8 +218,9 @@
       encoder->frame++;
     }
     /* Coding finished */
+    fgetpos(output,&curpos);
 
-    printf(" Processed %d frames, AVG PSNR: %2.4f %2.4f %2.4f\n", encoder->frame,PSNR[0]/encoder->frame,PSNR[1]/encoder->frame,PSNR[2]/encoder->frame);
+    printf(" Processed %d frames, %10d bits AVG PSNR: %2.4f %2.4f %2.4f\n", encoder->frame,((int32_t)curpos)<<3,PSNR[0]/encoder->frame,PSNR[1]/encoder->frame,PSNR[2]/encoder->frame);
 
     fclose(input);
     fclose(output);
diff --git a/src/encoder.c b/src/encoder.c
index bb89656f..8a843e8c 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -32,6 +32,7 @@
 #include "search.h"
 
 int16_t g_lambda_cost[55];
+uint32_t* g_auiSigLastScan[3][7];
 
 void initSigLastScan(uint32_t* pBuffD, uint32_t* pBuffH, uint32_t* pBuffV, int32_t iWidth, int32_t iHeight)
 {
@@ -229,6 +230,8 @@ void init_encoder_input(encoder_input* input,FILE* inputfile, int32_t width, int
 
   input->cur_pic.width  = width;
   input->cur_pic.height = height;
+  input->cur_pic.width_in_LCU  = input->width_in_LCU;
+  input->cur_pic.height_in_LCU = input->height_in_LCU;
   input->cur_pic.referenced = 0;
   /* Allocate buffers */
   input->cur_pic.yData = (uint8_t *)malloc(width*height);
@@ -859,7 +862,7 @@ void encode_coding_tree(encoder_control* encoder,uint16_t xCtb,uint16_t yCtb, ui
       uint32_t width = LCU_WIDTH>>depth;
 
       /* INTRAPREDICTION VARIABLES */
-      int16_t pred[LCU_WIDTH*LCU_WIDTH];
+      int16_t pred[LCU_WIDTH*LCU_WIDTH+1];
       int16_t predU[LCU_WIDTH*LCU_WIDTH>>2];
       int16_t predV[LCU_WIDTH*LCU_WIDTH>>2];
 
@@ -867,11 +870,22 @@ void encode_coding_tree(encoder_control* encoder,uint16_t xCtb,uint16_t yCtb, ui
       uint8_t *recbaseU  = &encoder->in.cur_pic.uRecData[xCtb*(LCU_WIDTH>>(MAX_DEPTH+1)) + (yCtb*(LCU_WIDTH>>(MAX_DEPTH+1)))*(encoder->in.width>>1)];
       uint8_t *recbaseV  = &encoder->in.cur_pic.vRecData[xCtb*(LCU_WIDTH>>(MAX_DEPTH+1)) + (yCtb*(LCU_WIDTH>>(MAX_DEPTH+1)))*(encoder->in.width>>1)];
 
+
+      /* SEARCH BEST INTRA MODE (AGAIN) */  
+      
+      int16_t rec[(LCU_WIDTH*2+8)*(LCU_WIDTH*2+8)];
+      int16_t *recShift = &rec[(LCU_WIDTH>>(depth))*2+8+1];      
+      intra_buildReferenceBorder(&encoder->in.cur_pic, xCtb, yCtb,(LCU_WIDTH>>(depth))*2+8, rec, (LCU_WIDTH>>(depth))*2+8, 0);
+      cur_CU->intra.mode = (int8_t)intra_prediction(encoder->in.cur_pic.yData,encoder->in.width,recShift,(LCU_WIDTH>>(depth))*2+8,xCtb*(LCU_WIDTH>>(MAX_DEPTH)),yCtb*(LCU_WIDTH>>(MAX_DEPTH)),width,pred,width,&cur_CU->intra.cost);
+      intraPredMode = cur_CU->intra.mode;
+      intra_setBlockMode(&encoder->in.cur_pic,xCtb, yCtb, depth, intraPredMode);
+      
       #if ENABLE_PCM == 1
       /* Code must start after variable initialization */
       cabac_encodeBinTrm(&cabac, 0); /* IPCMFlag == 0 */
       #endif
-
+      
+      
       /*
         PREDINFO CODING
         If intra prediction mode is found from the predictors,
@@ -1142,6 +1156,8 @@ void encode_transform_tree(encoder_control* encoder,transform_info* ti,uint8_t d
     int16_t rec[(LCU_WIDTH*2+8)*(LCU_WIDTH*2+8)];
     int16_t *recShift  = &rec[(LCU_WIDTH>>(depth))*2+8+1];
     int16_t *recShiftU = &rec[(LCU_WIDTH>>(depth+1))*2+8+1];
+
+    uint32_t ac_sum = 0;
     
     /* Build reconstructed block to use in prediction with extrapolated borders */
     intra_buildReferenceBorder(&encoder->in.cur_pic, ti->xCtb, ti->yCtb,(LCU_WIDTH>>(depth))*2+8, rec, (LCU_WIDTH>>(depth))*2+8, 0);
@@ -1168,17 +1184,19 @@ void encode_transform_tree(encoder_control* encoder,transform_info* ti,uint8_t d
 
     /* Get residual by subtracting prediction */
     i = 0;
+    ac_sum = 0;
     for(y = 0; y < LCU_WIDTH>>depth; y++)
     {
       for(x = 0; x < LCU_WIDTH>>depth; x++)
       {
-        block[i++]=((int16_t)base[x+y*base_stride])-pred[x+y*pred_stride];
+        block[i]=((int16_t)base[x+y*base_stride])-pred[x+y*pred_stride];
+        i++;
       }
     }
 
     /* Transform and quant residual to coeffs */
     transform2d(block,pre_quant_coeff,width,0);
-    quant(encoder,pre_quant_coeff,coeff,width, width,0, 0, SCAN_DIAG);
+    quant(encoder,pre_quant_coeff,coeff,width, width,&ac_sum, 0, SCAN_DIAG);
 
     /* Check for non-zero coeffs */
     for(i = 0; i < width*width; i++)
@@ -1226,15 +1244,17 @@ void encode_transform_tree(encoder_control* encoder,transform_info* ti,uint8_t d
     {
       /* U */
       i = 0;
+      ac_sum = 0;
       for(y = 0; y < LCU_WIDTH>>(depth+1); y++)
       {
         for(x = 0; x < LCU_WIDTH>>(depth+1); x++)
         {
-          block[i++]=((int16_t)baseU[x+y*(base_stride>>1)])-predU[x+y*(pred_stride>>1)];
+          block[i]=((int16_t)baseU[x+y*(base_stride>>1)])-predU[x+y*(pred_stride>>1)];
+          i++;
         }
       }
       transform2d(block,pre_quant_coeff,LCU_WIDTH>>(depth+1),65535);
-      quant(encoder,pre_quant_coeff,coeffU, width>>1, width>>1, 0,2,SCAN_DIAG);
+      quant(encoder,pre_quant_coeff,coeffU, width>>1, width>>1, &ac_sum,2,SCAN_DIAG);
       for(i = 0; i < width*width>>2; i++)
       {
         if(coeffU[i] != 0)
@@ -1247,15 +1267,17 @@ void encode_transform_tree(encoder_control* encoder,transform_info* ti,uint8_t d
 
       /* V */
       i = 0;
+      ac_sum = 0;
       for(y = 0; y < LCU_WIDTH>>(depth+1); y++)
       {
         for(x = 0; x < LCU_WIDTH>>(depth+1); x++)
         {
-          block[i++]=((int16_t)baseV[x+y*(base_stride>>1)])-predV[x+y*(pred_stride>>1)];
+          block[i]=((int16_t)baseV[x+y*(base_stride>>1)])-predV[x+y*(pred_stride>>1)];
+          i++;
         }
       }
       transform2d(block,pre_quant_coeff,LCU_WIDTH>>(depth+1),65535);
-      quant(encoder,pre_quant_coeff,coeffV, width>>1, width>>1, 0,3,SCAN_DIAG);
+      quant(encoder,pre_quant_coeff,coeffV, width>>1, width>>1, &ac_sum,3,SCAN_DIAG);
       for(i = 0; i < width*width>>2; i++)
       {
         if(coeffV[i] != 0)
diff --git a/src/encoder.h b/src/encoder.h
index b979c84a..10add525 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -33,12 +33,13 @@ enum { FORMAT_400 = 0, FORMAT_420, FORMAT_422, FORMAT_444 };
 typedef struct
 {
   FILE* file;
-  int32_t width;
-  int32_t height;
-  int32_t height_in_LCU;
-  int32_t width_in_LCU;
+  int32_t width;  /*!< \brief input picture width */
+  int32_t height; /*!< \brief input picture height */
+  int32_t height_in_LCU; /*!< \brief input picture height in LCU */
+  int32_t width_in_LCU;  /*!< \brief input picture width in LCU */
   picture cur_pic;
   int8_t video_format;
+  int8_t bitdepth;  /*!< \brief input bit depth (8,10) */
 } encoder_input;
 
 /* Encoder control options, the main struct */
@@ -55,10 +56,10 @@ typedef struct
   int8_t bitdepth;
 
   /* Filtering */
-  int8_t deblock_enable;
-  int8_t sao_enable;
-  int8_t betaOffsetdiv2;
-  int8_t tcOffsetdiv2;
+  int8_t deblock_enable; /*!< \brief Flag to enable deblocking filter */
+  int8_t sao_enable;     /*!< \brief Flag to enable sample adaptive offset filter */
+  int8_t betaOffsetdiv2; /*!< \brief (deblocking) beta offset (div 2), range -6...6 */
+  int8_t tcOffsetdiv2;   /*!< \brief (deblocking) tc offset (div 2), range -6...6 */
 } encoder_control;
 
 typedef struct
@@ -110,7 +111,7 @@ void encode_transform_tree(encoder_control* encoder,transform_info* ti,uint8_t d
 void encode_transform_coeff(encoder_control* encoder,transform_info* ti,int8_t depth, int8_t trDepth);
 
 extern int16_t g_lambda_cost[55];
-static uint32_t* g_auiSigLastScan[3][7];
+extern uint32_t* g_auiSigLastScan[3][7];
 int8_t g_aucConvertToBit[LCU_WIDTH+1];
 static int8_t g_bitDepth     = 8;
 static int8_t g_uiBitIncrement = 0;
diff --git a/src/global.h b/src/global.h
index 8c097278..f05ee8da 100644
--- a/src/global.h
+++ b/src/global.h
@@ -17,13 +17,13 @@
 #define LCU_WIDTH 64 /*!< Largest Coding Unit */
 
 #define MAX_SEARCH_DEPTH 3
-#define MIN_SEARCH_DEPTH 2
+#define MIN_SEARCH_DEPTH 1
 
 #define MAX_DEPTH 3  /*!< smallest CU is LCU_WIDTH>>MAX_DEPTH */
 #define MIN_SIZE 3   /*!< log2_min_coding_block_size */
 
 #define ENABLE_PCM 0 /*!< Setting to 1 will enable using PCM blocks (current intra-search does not consider PCM) */
-#define ENABLE_SIGN_HIDING 0 /*!< NEED QUANT CHANGES! */
+#define ENABLE_SIGN_HIDING 1 /*!< NEED QUANT CHANGES! */
 
 /* END OF CONFIG VARIABLES */
 
diff --git a/src/inter.c b/src/inter.c
index d61eccdc..8999262b 100644
--- a/src/inter.c
+++ b/src/inter.c
@@ -34,7 +34,7 @@ void inter_setBlockMode(picture* pic,uint32_t xCtb, uint32_t yCtb, uint8_t depth
 {
   uint32_t x,y,d;
   /* Width in smallest CU */
-  int width_in_SCU = pic->width/(LCU_WIDTH>>MAX_DEPTH);
+  int width_in_SCU = pic->width_in_LCU<<MAX_DEPTH;
   int block_SCU_width = (LCU_WIDTH>>depth)/(LCU_WIDTH>>MAX_DEPTH);
   for(y = yCtb; y < yCtb+block_SCU_width; y++)
   {
diff --git a/src/intra.c b/src/intra.c
index 46aa014b..2d07c464 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -37,7 +37,7 @@ void intra_setBlockMode(picture* pic,uint32_t xCtb, uint32_t yCtb, uint8_t depth
 {
   uint32_t x,y,d;
   /* Width in smallest CU */
-  int width_in_SCU = pic->width/(LCU_WIDTH>>MAX_DEPTH);
+  int width_in_SCU = pic->width_in_LCU<<MAX_DEPTH;
   int block_SCU_width = (LCU_WIDTH>>depth)/(LCU_WIDTH>>MAX_DEPTH);
   for(y = yCtb; y < yCtb+block_SCU_width; y++)
   {
@@ -65,7 +65,7 @@ void intra_setBlockMode(picture* pic,uint32_t xCtb, uint32_t yCtb, uint8_t depth
 int8_t intra_getBlockMode(picture* pic,uint32_t xCtb, uint32_t yCtb, uint8_t depth)
 {
   //Width in smallest CU
-  int width_in_SCU = pic->width/(LCU_WIDTH>>MAX_DEPTH);
+  int width_in_SCU = pic->width_in_LCU<<MAX_DEPTH;
   int CUpos = yCtb*width_in_SCU+xCtb;
   if(pic->CU[depth][CUpos].type == CU_INTRA)
   {
@@ -115,7 +115,7 @@ int8_t intra_getDirLumaPredictor(picture* pic,uint32_t xCtb, uint32_t yCtb, uint
 {
   int32_t iLeftIntraDir  = 1; //DC_IDX
   int32_t iAboveIntraDir = 1; //DC_IDX
-  int32_t width_in_SCU = pic->width/(LCU_WIDTH>>MAX_DEPTH);
+  int width_in_SCU = pic->width_in_LCU<<MAX_DEPTH;
   int32_t CUpos = yCtb*width_in_SCU+xCtb;
   
   // Left PU predictor
@@ -297,7 +297,7 @@ int16_t intra_prediction(uint8_t* orig,int32_t origstride,int16_t* rec,int32_t r
     if(distance <= threshold)
     {
       intra_getAngularPred(rec,recstride,pred, width,width,width,i, xpos?1:0, ypos?1:0, filter);
-      CHECK_FOR_BEST(i,distance*width); /* Favor modes closer to 26 and 10 */
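+      CHECK_FOR_BEST(i,0); /* NOTE(review): second argument is now 0 (was distance*width), so the bias favoring modes closer to 26 and 10 appears disabled - confirm */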
     }
   } 
 
@@ -319,7 +319,7 @@ int16_t intra_prediction(uint8_t* orig,int32_t origstride,int16_t* rec,int32_t r
     if(distance > threshold)
     {
       intra_getAngularPred(recFiltered,recstride,pred, width,width,width,i, xpos?1:0, ypos?1:0, filter);
-      CHECK_FOR_BEST(i,distance*width); /* Favor modes closer to 26 and 10 */
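+      CHECK_FOR_BEST(i,0); /* NOTE(review): second argument is now 0 (was distance*width), so the bias favoring modes closer to 26 and 10 appears disabled - confirm */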
     }
   }
 
@@ -399,7 +399,7 @@ void intra_buildReferenceBorder(picture* pic, int32_t xCtb, int32_t yCtb,int16_t
   uint8_t* srcPic      = (!chroma)?pic->yRecData: ((chroma==1)?pic->uRecData: pic->vRecData); /*!< input picture pointer */  
   int16_t SCU_width    = LCU_WIDTH>>(MAX_DEPTH+(chroma?1:0)); /*!< Smallest Coding Unit width */
   uint8_t* srcShifted  = &srcPic[xCtb*SCU_width+(yCtb*SCU_width)*srcWidth];  /*!< input picture pointer shifted to start from the left-top corner of the current block */
-  int32_t width_in_SCU = srcWidth/SCU_width;     /*!< picture width in SCU */
+  int width_in_SCU = pic->width_in_LCU<<MAX_DEPTH;     /*!< picture width in SCU */
 
   /* Fill left column */
   if(xCtb)
diff --git a/src/picture.c b/src/picture.c
index 0007923d..810eeae3 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -6,7 +6,7 @@
 /*! \file picture.c
     \brief Functions to handle pictures
     \author Marko Viitanen
-    \date 2013-04
+    \date 2013-06
     
   This file contains all the needed functions to handle pictures
 
@@ -20,6 +20,12 @@
 #include "picture.h"
 
 
+/** \defgroup picture_group Picture handler group
+ *  This group contains all picture related stuff
+ *  @{
+ */
+
+
 
 /*!
  \brief Set block splitflag
@@ -34,7 +40,7 @@ void picture_setBlockSplit(picture* pic,uint32_t xCtb, uint32_t yCtb, uint8_t de
 {
   uint32_t x,y;//,d;
   //Width in smallest CU
-  int width_in_SCU = pic->width/(LCU_WIDTH>>MAX_DEPTH);
+  int width_in_SCU = pic->width_in_LCU<<MAX_DEPTH;
   int block_SCU_width = (LCU_WIDTH>>depth)/(LCU_WIDTH>>MAX_DEPTH);
   for(y = yCtb; y < yCtb+block_SCU_width; y++)
   {
@@ -46,7 +52,6 @@ void picture_setBlockSplit(picture* pic,uint32_t xCtb, uint32_t yCtb, uint8_t de
   }
 }
 
-
 /*!
  \brief Set block coded status
  \param pic picture to use
@@ -60,7 +65,7 @@ void picture_setBlockCoded(picture* pic,uint32_t xCtb, uint32_t yCtb, uint8_t de
 {
   uint32_t x,y,d;
   //Width in smallest CU
-  int width_in_SCU = pic->width/(LCU_WIDTH>>MAX_DEPTH);
+  int width_in_SCU = pic->width_in_LCU<<MAX_DEPTH;
   int block_SCU_width = (LCU_WIDTH>>depth)/(LCU_WIDTH>>MAX_DEPTH);
   for(y = yCtb; y < yCtb+block_SCU_width; y++)
   {
@@ -76,11 +81,6 @@ void picture_setBlockCoded(picture* pic,uint32_t xCtb, uint32_t yCtb, uint8_t de
 }
 
 
-/** \defgroup picture_group Picture handler group
- *  This group contains all picture related stuff
- *  @{
- */
-
 
 /*!
     \brief Allocate memory for picture_list
@@ -188,10 +188,10 @@ void picture_setBlockCoded(picture* pic,uint32_t xCtb, uint32_t yCtb, uint8_t de
 //Calculates image PSNR value
 double imagePSNR(uint8_t *frame1, uint8_t *frame2, int32_t x, int32_t y)
 {   
-  int64_t MSE=0;
-  int64_t MSEtemp=0;
+  uint64_t MSE=0;
+  int32_t MSEtemp=0;
   double psnr=0.0;
-  double pixels = x*y;
+  int32_t pixels = x*y;
   int32_t index;
 
   //Calculate MSE
@@ -205,7 +205,7 @@ double imagePSNR(uint8_t *frame1, uint8_t *frame2, int32_t x, int32_t y)
   if(MSE==0) return 99.0;
 
   //The PSNR
-  psnr=10*log10(PSNRMAX/((double)MSE/pixels));
+  psnr=10*log10((pixels*PSNRMAX)/((double)MSE));
 
   //Thats it.
   return psnr;
@@ -327,6 +327,8 @@ uint32_t SAD64x64(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stri
 {
   int32_t i,ii,y,x;
   uint32_t sum=0;
+  /*
+
   for(y=0;y<64;y++)
   {
     i = y*stride1; 
@@ -336,6 +338,17 @@ uint32_t SAD64x64(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stri
       sum+=abs((int16_t)block[i+x]-(int16_t)block2[ii+x]);
     }
 
+  }*/
+  int32_t  iOffsetOrg = stride1<<3;
+  int32_t  iOffsetCur = stride2<<3;
+  for ( y=0; y<64; y+= 8 )
+  {
+    for ( x=0; x<64; x+= 8 )
+    {
+      sum += Hadamard8x8( &block[x], stride1,&block2[x],  stride2 );
+    }
+    block += iOffsetOrg;
+    block2 += iOffsetCur;
   }
 
   return sum;    
@@ -344,7 +357,7 @@ uint32_t SAD64x64(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stri
 uint32_t SAD32x32(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stride2)
 {
   int32_t y;
-  /*
+  
   int32_t x,sum = 0;
   int32_t  iOffsetOrg = stride1<<3;
   int32_t  iOffsetCur = stride2<<3;
@@ -352,13 +365,13 @@ uint32_t SAD32x32(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stri
   {
     for ( x=0; x<32; x+= 8 )
     {
-      sum += Hadamard8x8( &block[x], &block2[x], stride1, stride2 );
+      sum += Hadamard8x8( &block[x], stride1,&block2[x],  stride2 );
     }
     block += iOffsetOrg;
     block2 += iOffsetCur;
   }
 
-  */
+  /*
   uint32_t sum=0;
   int32_t i,ii;
   for(y=0;y<32;y++)
@@ -398,7 +411,7 @@ uint32_t SAD32x32(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stri
     sum+=abs((int32_t)block[i+30]-(int32_t)block2[ii+30]);
     sum+=abs((int32_t)block[i+31]-(int32_t)block2[ii+31]);
   }
-  
+  */
   return sum;    
 }
 
@@ -406,22 +419,22 @@ uint32_t SAD32x32(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stri
 uint32_t SAD16x16(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stride2)
 {
   int32_t y;
-    /*
+    
   int32_t x,sum = 0;
   int32_t  iOffsetOrg = stride1<<3;
   int32_t  iOffsetCur = stride2<<3;
-    for ( y=0; y<16; y+= 8 )
+  for ( y=0; y<16; y+= 8 )
+  {
+    for ( x=0; x<16; x+= 8 )
     {
-      for ( x=0; x<16; x+= 8 )
-      {
-        sum += Hadamard8x8( &block[x], &block2[x], stride1, stride2 );
-      }
-      block += iOffsetOrg;
-      block2 += iOffsetCur;
+      sum += Hadamard8x8( &block[x], stride1,&block2[x],  stride2 );
     }
+    block += iOffsetOrg;
+    block2 += iOffsetCur;
+  }
   
   
-  */
+  /*
   uint32_t sum=0;
   int32_t i,ii;
   for(y=0;y<16;y++)
@@ -445,6 +458,7 @@ uint32_t SAD16x16(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stri
     sum+=abs((int32_t)block[i+14]-(int32_t)block2[ii+14]);
     sum+=abs((int32_t)block[i+15]-(int32_t)block2[ii+15]);
   }  
+  */
   return sum;    
 }
 
@@ -453,6 +467,9 @@ uint32_t SAD8x8(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stride
 {
   int32_t i,ii,y;
   uint32_t sum=0;
+  sum = Hadamard8x8( block, stride1,block2,  stride2 );
+  /*
+  
   for(y=0;y<8;y++)
   {
     i = y*stride1; 
@@ -466,6 +483,7 @@ uint32_t SAD8x8(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stride
     sum+=abs((int32_t)block[i+6]-(int32_t)block2[ii+6]);
     sum+=abs((int32_t)block[i+7]-(int32_t)block2[ii+7]);
   }
+  */
 
   return sum;    
 }
diff --git a/src/picture.h b/src/picture.h
index e8637952..9d4c46aa 100644
--- a/src/picture.h
+++ b/src/picture.h
@@ -79,8 +79,10 @@ typedef struct
   uint8_t* uRecData;     /*!< \brief Pointer to reconstructed U-data  */
   uint8_t* vRecData;     /*!< \brief Pointer to reconstructed V-data  */
 
-  int width;          /*!< \brief Picture width */
-  int height;         /*!< \brief Picture height  */
+  int32_t width;          /*!< \brief Picture width */
+  int32_t height;         /*!< \brief Picture height  */
+  int32_t height_in_LCU;  /*!< \brief Picture height in LCU */
+  int32_t width_in_LCU;   /*!< \brief Picture width in LCU */
   uint8_t referenced; /*!< \brief Is this picture referenced */
   CU_info** CU;     /*!< \brief info for each CU at each depth */
   uint8_t type;
diff --git a/src/search.c b/src/search.c
index 111b64e3..7b015cd0 100644
--- a/src/search.c
+++ b/src/search.c
@@ -36,7 +36,7 @@ void search_buildReferenceBorder(picture* pic, int32_t xCtb, int32_t yCtb,int16_
   uint8_t* srcPic      = (!chroma)?pic->yData: ((chroma==1)?pic->uData: pic->vData); /*!< input picture pointer */  
   int16_t SCU_width    = LCU_WIDTH>>(MAX_DEPTH+(chroma?1:0)); /*!< Smallest Coding Unit width */
   uint8_t* srcShifted  = &srcPic[xCtb*SCU_width+(yCtb*SCU_width)*srcWidth];  /*!< input picture pointer shifted to start from the left-top corner of the current block */
-  int32_t width_in_SCU = srcWidth/SCU_width;     /*!< picture width in SCU */
+  int32_t width_in_SCU = pic->width_in_LCU<<MAX_DEPTH;     /*!< picture width in SCU */
 
   /* Fill left column */
   if(xCtb)
@@ -205,7 +205,7 @@ uint32_t search_best_mode(encoder_control* encoder,uint16_t xCtb,uint16_t yCtb,
   CU_info *cur_CU = &encoder->in.cur_pic.CU[depth][xCtb+yCtb*(encoder->in.width_in_LCU<<MAX_DEPTH)];
   uint32_t bestCost = cur_CU->intra.cost;
   uint32_t cost = 0;
-  uint32_t lambdaCost = 4*g_lambda_cost[encoder->QP]<<5;
+  uint32_t lambdaCost = 4*g_lambda_cost[encoder->QP]<<4;//<<5;
 
   /* Split and search to max_depth */
   if(depth != MAX_SEARCH_DEPTH)
diff --git a/src/transform.c b/src/transform.c
index 79452309..be9cc620 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -781,16 +781,14 @@ void quant(encoder_control* encoder, int16_t* pSrc, int16_t* pDes, int32_t iWidt
 {
   int16_t*   piCoef    = pSrc;
   int16_t*   piQCoef   = pDes;
-  uint32_t*  scan;
+  
  
   int8_t useRDOQForTransformSkip = 0;
   uint32_t log2BlockSize = g_aucConvertToBit[ iWidth ] + 2;
-
+  uint32_t* scan = g_auiSigLastScan[ scanIdx ][ log2BlockSize - 1 ];
   //uint32_t scanIdx = SCAN_DIAG;
 
-  scan = g_auiSigLastScan[ scanIdx ][ log2BlockSize - 1 ];
-  {
-  int32_t deltaU[LCU_WIDTH*LCU_WIDTH] ;
+  int32_t deltaU[LCU_WIDTH*LCU_WIDTH>>2];
   int32_t iQpBase = encoder->QP;
 
   int32_t qpScaled;
@@ -802,7 +800,7 @@ void quant(encoder_control* encoder, int16_t* pSrc, int16_t* pDes, int32_t iWidt
   }
   else
   {
-    qpScaled = MAX( -qpBDOffset, MIN(57, iQpBase));
+    qpScaled = CLIP(-qpBDOffset, 57, iQpBase);
     if(qpScaled < 0)
     {
       qpScaled = qpScaled +  qpBDOffset;
@@ -830,22 +828,140 @@ void quant(encoder_control* encoder, int16_t* pSrc, int16_t* pDes, int32_t iWidt
   int32_t iAdd = ((encoder->in.cur_pic.slicetype == SLICE_I) ? 171 : 85) << (iQBits-9);
 
   int32_t qBits8 = iQBits-8;
-  for( n = 0; n < iWidth*iHeight; n++ )
+  for(n = 0; n < iWidth*iHeight; n++)
   {
     int32_t iLevel;
     int32_t  iSign;
-    int64_t tmpLevel;
+    //int64_t tmpLevel;
     iLevel  = piCoef[n];
     iSign   = (iLevel < 0 ? -1: 1);
 
-    tmpLevel  = (int64_t)abs(iLevel) * piQuantCoeff[n];
-    iLevel    = (int32_t)((tmpLevel + iAdd ) >> iQBits);
-    deltaU[n] = (int32_t)((tmpLevel - (iLevel<<iQBits) )>> qBits8);
+    iLevel = ((int64_t)abs(iLevel) * piQuantCoeff[n] + iAdd ) >> iQBits;
+    deltaU[n] = (int32_t)( ((int64_t)abs(piCoef[n]) * piQuantCoeff[n] - (iLevel<<iQBits) )>> qBits8 );
+    
+    #if ENABLE_SIGN_HIDING == 1
+    *uiAcSum += iLevel;
+    #endif
 
-    iLevel *= iSign;        
+    iLevel *= iSign;
     piQCoef[n] = CLIP( -32768, 32767, iLevel);
   } // for n
+
+  #if ENABLE_SIGN_HIDING == 1
+  if(*uiAcSum >= 2)
+  {
+    #define SCAN_SET_SIZE 16
+    #define LOG2_SCAN_SET_SIZE 4
+    int32_t n,lastCG = -1, abssum = 0, subset, subpos;    
+    uint32_t* scan_subpos;
+    for(subset = (iWidth*iHeight-1)>>LOG2_SCAN_SET_SIZE; subset >= 0; subset--)
+    {
+      int32_t firstNZPosInCG=SCAN_SET_SIZE , lastNZPosInCG=-1;
+      subpos = subset<<LOG2_SCAN_SET_SIZE;
+      //scan_subpos = &scan[subpos];
+      abssum = 0;
+
+      /* Find last coeff pos */
+      for(n = SCAN_SET_SIZE-1; n>=0; n--)
+      {
+        if(piQCoef[scan[n + subpos]])
+        {
+          lastNZPosInCG = n;
+          break;
+        }
+      }
+
+      /* First coeff pos */
+      for(n = 0; n <SCAN_SET_SIZE; n++)
+      {
+        if(piQCoef[scan[n + subpos]])
+        {
+          firstNZPosInCG = n;
+          break;
+        }
+      }
+
+      /* Sum all quant coeffs between first and last */
+      for(n = firstNZPosInCG; n <= lastNZPosInCG; n++)
+      {
+        abssum += piQCoef[scan[n + subpos]];
+      }
+
+      if(lastNZPosInCG>=0 && lastCG==-1) 
+      {
+        lastCG = 1; 
+      }
+
+      if(lastNZPosInCG-firstNZPosInCG >= /*SBH_THRESHOLD*/4)
+      {
+        uint32_t signbit = (piQCoef[scan[subpos+firstNZPosInCG]]>0?0:1) ;
+        if(signbit != (abssum&0x1))  /* compare signbit with sum_parity */
+        {
+          int32_t minCostInc = 0x7fffffff,  minPos =-1, finalChange=0, curCost=0x7fffffff, curChange=0;
+        
+          for(n = (lastCG==1?lastNZPosInCG:SCAN_SET_SIZE-1) ; n >= 0; n--)
+          {
+            uint32_t blkPos  = scan[n+subpos];
+            if(piQCoef[blkPos] != 0)
+            {
+              if(deltaU[blkPos] > 0)
+              {
+                curCost = -deltaU[blkPos]; 
+                curChange=1;
+              }
+              else if(n == firstNZPosInCG && abs(piQCoef[blkPos]) == 1)
+              {
+                curCost=0x7fffffff;
+              }
+              else
+              {
+                curCost = deltaU[blkPos]; 
+                curChange =-1;
+              }
+            }
+            else if(n < firstNZPosInCG && ((piCoef[blkPos] >= 0)?0:1) != signbit)
+            {
+              curCost = 0x7fffffff;
+            }
+            else
+            {
+              curCost = -deltaU[blkPos];
+              curChange = 1;
+            }
+
+            if(curCost < minCostInc)
+            {
+              minCostInc = curCost;
+              finalChange = curChange;
+              minPos = blkPos;
+            }
+          } //CG loop
+
+          if(piQCoef[minPos] == 32767 || piQCoef[minPos] == -32768)
+          {
+            finalChange = -1;
+          }
+
+          if(piCoef[minPos] >= 0)
+          {
+            piQCoef[minPos] += finalChange; 
+          }
+          else 
+          {
+            piQCoef[minPos] -= finalChange;
+          }  
+        } // Hide
+      }
+      if(lastCG == 1) 
+      {
+        lastCG=0;
+      }
+    }
+
+    #undef SCAN_SET_SIZE
+    #undef LOG2_SCAN_SET_SIZE
   }
+  #endif
   }
 
 }