Code refactoring to allow transform split (and 64x64 prediction with 32x32 transform)

This commit is contained in:
Marko Viitanen 2013-03-25 17:17:24 +02:00
parent 43122a1f0a
commit 55cc82925d
6 changed files with 154 additions and 83 deletions

View file

@ -630,7 +630,7 @@ void encode_slice_data(encoder_control* encoder)
void encode_coding_tree(encoder_control* encoder,uint16_t xCtb,uint16_t yCtb, uint8_t depth)
{
uint8_t split_flag = (depth<2)?1:0; /* ToDo: get from CU data */
uint8_t split_flag = 0;//(depth<1)?1:0; /* ToDo: get from CU data */
uint8_t split_model = 0;
/* Check for slice border */
@ -699,7 +699,7 @@ void encode_coding_tree(encoder_control* encoder,uint16_t xCtb,uint16_t yCtb, ui
uint8_t intraPredModeChroma = 36; /* 36 = Chroma derived from luma */
int8_t intraPreds[3] = {-1, -1, -1};
int8_t mpmPred = -1;
int i,x,y;
int i;
uint32_t flag;
uint32_t bestSAD;
uint8_t *base = &encoder->in.cur_pic.yData[xCtb*(LCU_WIDTH>>(MAX_DEPTH)) + (yCtb*(LCU_WIDTH>>(MAX_DEPTH))) *encoder->in.width];
@ -713,7 +713,7 @@ void encode_coding_tree(encoder_control* encoder,uint16_t xCtb,uint16_t yCtb, ui
int16_t predU[LCU_WIDTH*LCU_WIDTH>>2];
int16_t predV[LCU_WIDTH*LCU_WIDTH>>2];
int16_t rec[(LCU_WIDTH+8)*(LCU_WIDTH+8)];
int16_t rec[(LCU_WIDTH*2+8)*(LCU_WIDTH*2+8)];
int16_t *recShift = &rec[(LCU_WIDTH>>(depth))*2+8+1];
int16_t *recShiftU = &rec[(LCU_WIDTH>>(depth+1))*2+8+1];
uint8_t *recbase = &encoder->in.cur_pic.yRecData[xCtb*(LCU_WIDTH>>(MAX_DEPTH)) + (yCtb*(LCU_WIDTH>>(MAX_DEPTH))) *encoder->in.width];
@ -845,10 +845,35 @@ void encode_coding_tree(encoder_control* encoder,uint16_t xCtb,uint16_t yCtb, ui
/* Coeff */
/* Transform tree */
encode_transform_tree(encoder,base, baseU, baseV, encoder->in.width,
{
int16_t coeff[LCU_WIDTH*LCU_WIDTH];
int16_t coeffU[LCU_WIDTH*LCU_WIDTH>>2];
int16_t coeffV[LCU_WIDTH*LCU_WIDTH>>2];
int8_t split = 0;
int32_t cb = encode_transform_tree(encoder,base, baseU, baseV, encoder->in.width,
recbase,recbaseU, recbaseV, encoder->in.width,
pred,predU,predV,LCU_WIDTH,
depth, intraPredMode, intraPredModeChroma);
pred,predU,predV,(LCU_WIDTH>>depth),
coeff,coeffU,coeffV,
depth, &split);
if(split)
{
encode_transform_coeff(encoder, coeff, coeffU, coeffV, cb&0x1, cb&0x2, cb&0x4,width>>1, intraPredMode, intraPredModeChroma, 1);
cb >>= 3;
encode_transform_coeff(encoder, &coeff[LCU_WIDTH*LCU_WIDTH>>2], &coeffU[LCU_WIDTH*LCU_WIDTH>>4], &coeffV[LCU_WIDTH*LCU_WIDTH>>4],
(cb&0x1)?1:0, (cb&0x2)?1:0, (cb&0x4)?1:0,width>>1, intraPredMode, intraPredModeChroma, 0);
cb >>= 3;
encode_transform_coeff(encoder, &coeff[LCU_WIDTH*LCU_WIDTH>>1], &coeffU[LCU_WIDTH*LCU_WIDTH>>3], &coeffV[LCU_WIDTH*LCU_WIDTH>>3],
(cb&0x1)?1:0, (cb&0x2)?1:0, (cb&0x4)?1:0,width>>1, intraPredMode, intraPredModeChroma, 0);
cb >>= 3;
encode_transform_coeff(encoder, &coeff[3*LCU_WIDTH*LCU_WIDTH>>2], &coeffU[3*LCU_WIDTH*LCU_WIDTH>>4], &coeffV[3*LCU_WIDTH*LCU_WIDTH>>4],
(cb&0x1)?1:0, (cb&0x2)?1:0, (cb&0x4)?1:0,width>>1, intraPredMode, intraPredModeChroma, 0);
}
else
{
encode_transform_coeff(encoder, coeff, coeffU, coeffV, (cb&0x1)?1:0, (cb&0x2)?1:0, (cb&0x4)?1:0,width, intraPredMode, intraPredModeChroma, 1);
}
}
/* end Transform tree */
/* end Coeff */
@ -908,52 +933,74 @@ void encode_coding_tree(encoder_control* encoder,uint16_t xCtb,uint16_t yCtb, ui
}
void encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *baseU, uint8_t *baseV,int32_t base_stride,
int32_t encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *baseU, uint8_t *baseV,int32_t base_stride,
uint8_t *recbase,uint8_t *recbaseU, uint8_t *recbaseV,int32_t recbase_stride,
int16_t *pred, int16_t *predU, int16_t *predV,int32_t pred_stride,
uint8_t depth, int8_t intraPredMode, int8_t intraPredModeChroma)
int16_t *coeff, int16_t *coeffU, int16_t *coeffV,
uint8_t depth, int8_t* split)
{
/* we have 64>>depth transform size */
int x,y,i;
int32_t width = LCU_WIDTH>>depth;
int8_t split = 0;
if(depth < MAX_DEPTH)
int32_t half_width = width >> 1;
uint8_t CbY = 0,CbU = 0,CbV = 0;
if(depth == 0)
{
*split |= 1<<depth;
}
else if(depth < MAX_DEPTH)
{
cabac.ctx = &g_TransSubdivSCModel[5-(g_aucConvertToBit[LCU_WIDTH]+2-depth)];
CABAC_BIN(&cabac,split,"TransformSubdivFlag");
}
if(split)
{
encode_transform_tree(encoder,base, baseU, baseV, base_stride,
recbase,recbaseU, recbaseV, recbase_stride,
pred,predU,predV,pred_stride,
depth+1, intraPredMode, intraPredModeChroma);
encode_transform_tree(encoder,base, baseU, baseV, base_stride,
recbase,recbaseU, recbaseV, recbase_stride,
pred,predU,predV,pred_stride,
depth+1, intraPredMode, intraPredModeChroma);
encode_transform_tree(encoder,base, baseU, baseV, base_stride,
recbase,recbaseU, recbaseV, recbase_stride,
pred,predU,predV,pred_stride,
depth+1, intraPredMode, intraPredModeChroma);
encode_transform_tree(encoder,base, baseU, baseV, base_stride,
recbase,recbaseU, recbaseV, recbase_stride,
pred,predU,predV,pred_stride,
depth+1, intraPredMode, intraPredModeChroma);
CABAC_BIN(&cabac,(*split)&(1<<depth),"TransformSubdivFlag");
}
/* We don't subdiv and we have 64>>depth transform size */
/* ToDo: allow other sizes */
if((*split)&(1<<depth))
{
uint8_t CbY = 0,CbU = 0,CbV = 0;
int32_t recbase_offset_y = recbase_stride*(half_width);
int32_t base_offset_y = base_stride*(half_width);
int32_t pred_offset_y = pred_stride*(half_width);
int32_t recbase_offset_c_y = (recbase_stride>>1)*(half_width>>1);
int32_t base_offset_c_y = (base_stride>>1)*(half_width>>1);
int32_t pred_offset_c_y = (pred_stride>>1)*(half_width>>1);
int32_t coeff_fourth = (LCU_WIDTH*LCU_WIDTH>>4);
int32_t output = 0;
int32_t outhelper = 0;
output = encode_transform_tree(encoder,base, baseU, baseV, base_stride,
recbase,recbaseU, recbaseV, recbase_stride,
pred,predU,predV,pred_stride,
coeff,coeffU,coeffV,
depth+1, split);
outhelper = encode_transform_tree(encoder,&base[half_width], &baseU[half_width>>1], &baseV[half_width>>1], base_stride,
&recbase[half_width],&recbaseU[half_width>>1], &recbaseV[half_width>>1], recbase_stride,
&pred[half_width],&predU[half_width>>1],&predV[half_width>>1],pred_stride,
&coeff[coeff_fourth],&coeffU[coeff_fourth>>1],&coeffV[coeff_fourth>>1],
depth+1, split);
output |= outhelper<<3;
outhelper = encode_transform_tree(encoder,&base[base_offset_y], &baseU[base_offset_c_y], &baseV[base_offset_c_y], base_stride,
&recbase[recbase_offset_y],&recbaseU[recbase_offset_c_y], &recbaseV[recbase_offset_c_y], recbase_stride,
&pred[pred_offset_y],&predU[pred_offset_c_y>>1],&predV[pred_offset_c_y>>1],pred_stride,
&coeff[coeff_fourth<<1],&coeffU[coeff_fourth],&coeffV[coeff_fourth],
depth+1, split);
output |= outhelper<<6;
outhelper = encode_transform_tree(encoder,&base[base_offset_y+half_width], &baseU[base_offset_c_y+(half_width>>1)], &baseV[base_offset_c_y+(half_width>>1)], base_stride,
&recbase[recbase_offset_y+half_width],&recbaseU[recbase_offset_c_y+(half_width>>1)], &recbaseV[recbase_offset_c_y+(half_width>>1)], recbase_stride,
&pred[pred_offset_y+half_width],&predU[pred_offset_c_y+(half_width>>1)],&predV[pred_offset_c_y+(half_width>>1)],pred_stride,
&coeff[3*coeff_fourth],&coeffU[(3*coeff_fourth)>>1],&coeffV[(3*coeff_fourth)>>1],
depth+1, split);
output |= outhelper<<9;
return output;
}
{
/*
Quant and transform here...
*/
int16_t block[LCU_WIDTH*LCU_WIDTH];
int16_t pre_quant_coeff[LCU_WIDTH*LCU_WIDTH];
int16_t coeff[LCU_WIDTH*LCU_WIDTH];
int16_t coeffU[LCU_WIDTH*LCU_WIDTH>>2];
int16_t coeffV[LCU_WIDTH*LCU_WIDTH>>2];
int16_t block[LCU_WIDTH*LCU_WIDTH>>2];
int16_t pre_quant_coeff[LCU_WIDTH*LCU_WIDTH>>2];
/* Get residual by subtracting prediction */
i = 0;
@ -961,7 +1008,7 @@ void encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *base
{
for(x = 0; x < LCU_WIDTH>>depth; x++)
{
block[i++]=((int16_t)base[x+y*encoder->in.width])-pred[x+y*(LCU_WIDTH>>depth)];
block[i++]=((int16_t)base[x+y*encoder->in.width])-pred[x+y*pred_stride];
}
}
@ -970,7 +1017,7 @@ void encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *base
quant(encoder,pre_quant_coeff,coeff,width, width,0, 0, SCAN_DIAG);
/* Check for non-zero coeffs */
for(i = 0; (uint32_t)i < width*width; i++)
for(i = 0; i < width*width; i++)
{
if(coeff[i] != 0)
{
@ -980,7 +1027,6 @@ void encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *base
}
}
/* if non-zero coeffs */
if(CbY)
{
@ -993,7 +1039,7 @@ void encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *base
{
for(x = 0; x < LCU_WIDTH>>depth; x++)
{
int16_t val = block[i++]+pred[x+y*(LCU_WIDTH>>depth)];
int16_t val = block[i++]+pred[x+y*pred_stride];
//ToDo: support 10+bits
recbase[x+y*encoder->in.width] = (uint8_t)CLIP(0,255,val);
}
@ -1007,7 +1053,7 @@ void encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *base
{
for(x = 0; x < LCU_WIDTH>>depth; x++)
{
recbase[x+y*encoder->in.width] = (uint8_t)CLIP(0,255,pred[x+y*(LCU_WIDTH>>depth)]);
recbase[x+y*encoder->in.width] = (uint8_t)CLIP(0,255,pred[x+y*pred_stride]);
}
}
}
@ -1020,12 +1066,12 @@ void encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *base
{
for(x = 0; x < LCU_WIDTH>>(depth+1); x++)
{
block[i++]=((int16_t)baseU[x+y*(encoder->in.width>>1)])-predU[x+y*(LCU_WIDTH>>(depth+1))];
block[i++]=((int16_t)baseU[x+y*(encoder->in.width>>1)])-predU[x+y*(pred_stride>>1)];
}
}
transform2d(block,pre_quant_coeff,LCU_WIDTH>>(depth+1),0);
quant(encoder,pre_quant_coeff,coeffU, width>>1, width>>1, 0,2,SCAN_DIAG);
for(i = 0; (uint32_t)i < width*width>>2; i++)
for(i = 0; i < width*width>>2; i++)
{
if(coeffU[i] != 0)
{
@ -1041,12 +1087,12 @@ void encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *base
{
for(x = 0; x < LCU_WIDTH>>(depth+1); x++)
{
block[i++]=((int16_t)baseV[x+y*(encoder->in.width>>1)])-predV[x+y*(LCU_WIDTH>>(depth+1))];
block[i++]=((int16_t)baseV[x+y*(encoder->in.width>>1)])-predV[x+y*(pred_stride>>1)];
}
}
transform2d(block,pre_quant_coeff,LCU_WIDTH>>(depth+1),0);
quant(encoder,pre_quant_coeff,coeffV, width>>1, width>>1, 0,3,SCAN_DIAG);
for(i = 0; (uint32_t)i < width*width>>2; i++)
for(i = 0; i < width*width>>2; i++)
{
if(coeffV[i] != 0)
{
@ -1067,7 +1113,7 @@ void encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *base
{
for(x = 0; x < LCU_WIDTH>>(depth+1); x++)
{
int16_t val = block[i++]+predU[x+y*(LCU_WIDTH>>(depth+1))];
int16_t val = block[i++]+predU[x+y*(pred_stride>>1)];
//ToDo: support 10+bits
recbaseU[x+y*(encoder->in.width>>1)] = (uint8_t)CLIP(0,255,val);
}
@ -1081,7 +1127,7 @@ void encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *base
{
for(x = 0; x < LCU_WIDTH>>(depth+1); x++)
{
recbaseU[x+y*(encoder->in.width>>1)] = (uint8_t)CLIP(0,255,predU[x+y*(LCU_WIDTH>>(depth+1))]);
recbaseU[x+y*(encoder->in.width>>1)] = (uint8_t)CLIP(0,255,predU[x+y*(pred_stride>>1)]);
}
}
}
@ -1097,7 +1143,7 @@ void encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *base
{
for(x = 0; x < LCU_WIDTH>>(depth+1); x++)
{
int16_t val = block[i++]+predV[x+y*(LCU_WIDTH>>(depth+1))];
int16_t val = block[i++]+predV[x+y*(pred_stride>>1)];
//ToDo: support 10+bits
recbaseV[x+y*(encoder->in.width>>1)] = (uint8_t)CLIP(0,255,val);
}
@ -1111,17 +1157,25 @@ void encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *base
{
for(x = 0; x < LCU_WIDTH>>(depth+1); x++)
{
recbaseV[x+y*(encoder->in.width>>1)] = (uint8_t)CLIP(0,255,predV[x+y*(LCU_WIDTH>>(depth+1))]);
recbaseV[x+y*(encoder->in.width>>1)] = (uint8_t)CLIP(0,255,predV[x+y*(pred_stride>>1)]);
}
}
}
}
/* END INTRAPREDICTION */
}
/* end Residual Coding */
return CbY | (CbU<<1) | (CbV<<2);
}
void encode_transform_coeff(encoder_control* encoder, int16_t *coeff, int16_t *coeffU, int16_t *coeffV,
int8_t CbY, int8_t CbU, int8_t CbV,int8_t width, int8_t intraPredMode, int8_t intraPredModeChroma, int8_t toplevel)
{
/* Signal if chroma data is present */
if(encoder->in.video_format != FORMAT_400)
if(toplevel && encoder->in.video_format != FORMAT_400)
{
/* Non-zero chroma U Tcoeffs */
cabac.ctx = &g_QtCbfSCModelU[0]; /*<- */
@ -1132,6 +1186,7 @@ void encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *base
CABAC_BIN(&cabac,CbV,"cbf_chroma_v");
}
/* Non-zero luma Tcoeffs */
cabac.ctx = &g_QtCbfSCModelY[1];
CABAC_BIN(&cabac,CbY,"cbf_luma");
@ -1188,13 +1243,8 @@ void encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *base
}
}
}
}
/* end Residual Coding */
}
void encode_CoeffNxN(encoder_control* encoder,int16_t* coeff, uint8_t width, uint8_t type, int8_t scanMode)
{
int c1 = 1;//,c1_num;

View file

@ -68,10 +68,13 @@ void encode_slice_header(encoder_control* encoder);
void encode_coding_tree(encoder_control* encoder,uint16_t xCtb,uint16_t yCtb, uint8_t depth);
void encode_lastSignificantXY(encoder_control* encoder,uint8_t lastpos_x, uint8_t lastpos_y, uint8_t width, uint8_t height, uint8_t type, uint8_t scan);
void encode_CoeffNxN(encoder_control* encoder,int16_t* coeff, uint8_t width, uint8_t type, int8_t scanMode);
void encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *baseU, uint8_t *baseV,int32_t base_stride,
int32_t encode_transform_tree(encoder_control* encoder,uint8_t *base, uint8_t *baseU, uint8_t *baseV,int32_t base_stride,
uint8_t *recbase,uint8_t *recbaseU, uint8_t *recbaseV,int32_t recbase_stride,
int16_t *pred, int16_t *predU, int16_t *predV,int32_t pred_stride,
uint8_t depth, int8_t intraPredMode, int8_t intraPredModeChroma);
int16_t *coeff, int16_t *coeffU, int16_t *coeffV,
uint8_t depth, int8_t* split);
void encode_transform_coeff(encoder_control* encoder, int16_t *coeff, int16_t *coeffU, int16_t *coeffV,
int8_t CbY, int8_t CbU, int8_t CbV,int8_t width, int8_t intraPredMode, int8_t intraPredModeChroma, int8_t toplevel);
void init_tables(void);
static uint32_t* g_auiSigLastScan[3][7];

View file

@ -164,7 +164,7 @@ int8_t intra_getDirLumaPredictor(picture* pic,uint32_t xCtb, uint32_t yCtb, uint
void intra_filter(int16_t* ref, uint32_t stride,uint32_t width, int8_t mode)
{
#define FWIDTH (LCU_WIDTH+1)
#define FWIDTH (LCU_WIDTH*2+1)
int16_t filtered[FWIDTH*FWIDTH];
int16_t* filteredShift = &filtered[FWIDTH+1];
int x,y;
@ -233,11 +233,11 @@ int16_t intra_prediction(uint8_t* orig,uint32_t origstride,int16_t* rec,uint32_t
int16_t bestMode = 1;
int32_t x,y,i;
uint32_t (*calcSAD)(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stride2);
int16_t pred[LCU_WIDTH*LCU_WIDTH>>2];
int16_t origBlock[LCU_WIDTH*LCU_WIDTH>>2];
int16_t pred[LCU_WIDTH*LCU_WIDTH];
int16_t origBlock[LCU_WIDTH*LCU_WIDTH];
uint8_t *origShift = &orig[xpos+ypos*origstride];
int8_t filter = (width<32); //ToDo: chroma support
SADfunction SADarray[4] = {&SAD4x4,&SAD8x8,&SAD16x16,&SAD32x32};
SADfunction SADarray[5] = {&SAD4x4,&SAD8x8,&SAD16x16,&SAD32x32,&SAD64x64};
uint8_t threshold = intraHorVerDistThres[g_toBits[width]]; /*!< Intra filtering threshold */
#define COPY_PRED_TO_DST() for(y = 0; y < (int32_t)width; y++) { for(x = 0; x < (int32_t)width; x++) { dst[x+y*dststride] = pred[x+y*width]; } }
#define CHECK_FOR_BEST(mode) SAD = calcSAD(pred,width,origBlock,width); \
@ -362,7 +362,7 @@ void intra_recon(int16_t* rec,uint32_t recstride, uint32_t xpos, uint32_t ypos,u
*/
void intra_buildReferenceBorder(picture* pic, int32_t xCtb, int32_t yCtb,int8_t outwidth, int16_t* dst, int32_t dststride, int8_t chroma)
void intra_buildReferenceBorder(picture* pic, int32_t xCtb, int32_t yCtb,int16_t outwidth, int16_t* dst, int32_t dststride, int8_t chroma)
{
int32_t leftColumn; /*!< left column iterator */
int16_t val; /*!< variable to store extrapolated value */
@ -639,7 +639,7 @@ void intra_getPlanarPred(int16_t* src,int32_t srcstride, uint32_t xpos, uint32_t
int16_t pDcVal = 1<<(g_uiBitDepth-1);
int32_t k, l, bottomLeft, topRight;
int32_t horPred;
int32_t leftColumn[LCU_WIDTH], topRow[LCU_WIDTH], bottomRow[LCU_WIDTH], rightColumn[LCU_WIDTH];
int32_t leftColumn[LCU_WIDTH+1], topRow[LCU_WIDTH+1], bottomRow[LCU_WIDTH+1], rightColumn[LCU_WIDTH+1];
uint32_t blkSize = width;
uint32_t offset2D = width;
uint32_t shift1D = g_aucConvertToBit[ width ] + 2;

View file

@ -19,7 +19,7 @@ int8_t intra_getBlockMode(picture* pic,uint32_t xCtb, uint32_t yCtb, uint8_t dep
int8_t intra_getDirLumaPredictor(picture* pic,uint32_t xCtb, uint32_t yCtb, uint8_t depth, int8_t* preds);
void intra_DCPredFiltering(uint8_t* pSrc, int32_t iSrcStride, uint8_t* rpDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight );
void intra_buildReferenceBorder(picture* pic, int32_t xCtb, int32_t yCtb,int8_t outwidth, int16_t* dst, int32_t dststride, int8_t chroma);
void intra_buildReferenceBorder(picture* pic, int32_t xCtb, int32_t yCtb,int16_t outwidth, int16_t* dst, int32_t dststride, int8_t chroma);
void intra_filter(int16_t* ref, uint32_t stride,uint32_t width, int8_t mode);
/* Predictions */

View file

@ -170,6 +170,24 @@ uint32_t SAD(uint8_t *block,uint8_t* block2, uint32_t x, uint32_t y)
return sum;
}
/*!
 \brief Sum of absolute differences between two 64x64 blocks of 16-bit samples
 \param block   first block
 \param stride1 distance, in elements, between consecutive rows of block
 \param block2  second block
 \param stride2 distance, in elements, between consecutive rows of block2
 \returns accumulated |block - block2| over all 64*64 sample positions
*/
uint32_t SAD64x64(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stride2)
{
  uint32_t total = 0;
  int32_t row;

  for(row = 0; row < 64; row++)
  {
    /* Offsets of the first sample of this row in each block */
    int32_t offsetA = row*stride1;
    int32_t offsetB = row*stride2;
    int32_t col = 0;

    while(col < 64)
    {
      /* Widen to 32 bits before subtracting so the difference cannot wrap */
      int32_t diff = (int32_t)block[offsetA+col]-(int32_t)block2[offsetB+col];
      total += abs(diff);
      col++;
    }
  }

  return total;
}
uint32_t SAD32x32(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stride2)
{
int32_t i,ii,y;

View file

@ -16,7 +16,7 @@
/* Functions */
uint32_t SAD64x64(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stride2);
uint32_t SAD32x32(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stride2);
uint32_t SAD16x16(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stride2);
uint32_t SAD8x8(int16_t *block,uint32_t stride1,int16_t* block2, uint32_t stride2);