mirror of https://github.com/ultravideo/uvg266.git
synced 2024-11-27 11:24:05 +00:00

[mtt] fix cost calculation

parent 657254d38a
commit 8fbefc0de3

src/search.c | 24
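In short: the SSD cost functions previously took only a width and assumed square blocks; this commit passes the height explicitly so non-square MTT blocks are costed correctly. Schematically (both typedefs copied from the typedef hunk further down):

    // before: height implicitly equal to width
    typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec,
                                            const int ref_stride, const int rec_stride,
                                            const int width);

    // after: height passed explicitly
    typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec,
                                            const int ref_stride, const int rec_stride,
                                            const int width, const int height);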
@@ -298,16 +298,16 @@ static double cu_zero_coeff_cost(
   double ssd = 0.0;
   ssd += UVG_LUMA_MULT * uvg_pixels_calc_ssd(
     &lcu->ref.y[luma_index], &lcu->rec.y[luma_index],
-    LCU_WIDTH, LCU_WIDTH, cu_loc->width
+    LCU_WIDTH, LCU_WIDTH, cu_loc->width, cu_loc->height
   );
   if (y_local % 8 == 0 && x_local % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) {
     ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd(
       &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index],
-      LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width
+      LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width, cu_loc->chroma_height
     );
     ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd(
       &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index],
-      LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width
+      LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width, cu_loc->chroma_height
     );
   }
   // Save the pixels at a lower level of the working tree.

@@ -445,7 +445,7 @@ double uvg_cu_rd_cost_luma(
     int index = cu_loc->local_y * LCU_WIDTH + cu_loc->local_x;
     ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
                               LCU_WIDTH, LCU_WIDTH,
-                              cu_loc->width);
+                              cu_loc->width, cu_loc->height);
   }
 
 
@@ -550,10 +550,10 @@ double uvg_cu_rd_cost_chroma(
     int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
     int ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
                                     LCU_WIDTH_C, LCU_WIDTH_C,
-                                    cu_loc->chroma_width);
+                                    cu_loc->chroma_width, cu_loc->chroma_height);
     int ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
                                     LCU_WIDTH_C, LCU_WIDTH_C,
-                                    cu_loc->chroma_width);
+                                    cu_loc->chroma_width, cu_loc->chroma_height);
     ssd = ssd_u + ssd_v;
   }
 
@@ -684,7 +684,7 @@ static double cu_rd_cost_tr_split_accurate(
     int index = cu_loc->local_x + LCU_WIDTH * cu_loc->local_y;
     luma_ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
                                    LCU_WIDTH, LCU_WIDTH,
-                                   width);
+                                   width, height);
   }
   // Chroma transform skip enable/disable is non-normative, so we need to count the chroma
   // tr-skip bits even when we are never using it.

@@ -762,10 +762,10 @@ static double cu_rd_cost_tr_split_accurate(
     int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
     unsigned ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
                                          LCU_WIDTH_C, LCU_WIDTH_C,
-                                         chroma_width);
+                                         chroma_width, chroma_height);
     unsigned ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
                                          LCU_WIDTH_C, LCU_WIDTH_C,
-                                         chroma_width);
+                                         chroma_width, chroma_height);
     chroma_ssd = ssd_u + ssd_v;
   }
   if(chroma_can_use_tr_skip && cb_flag_u) {

@@ -783,10 +783,10 @@ static double cu_rd_cost_tr_split_accurate(
     int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
     int ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
                                           LCU_WIDTH_C, LCU_WIDTH_C,
-                                          chroma_width);
+                                          chroma_width, chroma_height);
     int ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
                                           LCU_WIDTH_C, LCU_WIDTH_C,
-                                          chroma_width);
+                                          chroma_width, chroma_height);
     chroma_ssd = ssd_u_joint + ssd_v_joint;
   }
   if (chroma_can_use_tr_skip) {

@@ -1360,7 +1360,7 @@ static double search_cu(
   cabac_data_t best_split_cabac;
   memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac));
   for (int split_type = QT_SPLIT; split_type <= TT_VER_SPLIT; ++split_type) {
-    if (!can_split[split_type] || split_type != QT_SPLIT) continue;
+    if (!can_split[split_type] || (split_type != QT_SPLIT && depth == 0) || (split_type == QT_SPLIT && depth == 1)) continue;
     split_tree_t new_split = {
       split_tree.split_tree | split_type << (split_tree.current_depth * 3),
       split_tree.current_depth + 1,

@@ -2144,15 +2144,15 @@ void uvg_cu_cost_inter_rd2(
   int index = y_px * LCU_WIDTH + x_px;
   double ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
                                    LCU_WIDTH, LCU_WIDTH,
-                                   width) * UVG_LUMA_MULT;
+                                   width, height) * UVG_LUMA_MULT;
   if (reconstruct_chroma) {
     int index = y_px / 2 * LCU_WIDTH_C + x_px / 2;
     double ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
                                        LCU_WIDTH_C, LCU_WIDTH_C,
-                                       cu_loc->chroma_width);
+                                       cu_loc->chroma_width, cu_loc->chroma_height);
     double ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
                                        LCU_WIDTH_C, LCU_WIDTH_C,
-                                       cu_loc->chroma_width);
+                                       cu_loc->chroma_width, cu_loc->chroma_height);
     ssd += (ssd_u + ssd_v) * UVG_CHROMA_MULT;
   }
   double no_cbf_bits;

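The distortion these call sites accumulate is the same weighted sum everywhere; a compact restatement (a sketch; UVG_LUMA_MULT and UVG_CHROMA_MULT are from the hunks above, the per-plane SSDs are assumed already computed):

    // Weighted SSD distortion over one CU; each plane SSD now spans
    // width x height (or the corresponding chroma dimensions).
    double distortion = UVG_LUMA_MULT * ssd_y
                      + UVG_CHROMA_MULT * (ssd_u + ssd_v);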
@@ -145,15 +145,15 @@ static void get_cost_dual(
   if (satd_twin_func != NULL) {
     satd_twin_func(preds, orig_block, PARALLEL_BLKS, satd_costs);
   } else {
-    satd_costs[0] = uvg_satd_any_size(width, height, preds[0], width, orig_block, LCU_WIDTH);
-    satd_costs[1] = uvg_satd_any_size(width, height, preds[1], width, orig_block, LCU_WIDTH);
+    satd_costs[0] = uvg_satd_any_size_vtm(width, height, orig_block, width, preds[0], width);
+    satd_costs[1] = uvg_satd_any_size_vtm(width, height, orig_block, width, preds[1], width);
   }
   unsigned unsigned_sad_costs[PARALLEL_BLKS] = { 0 };
   if (sad_twin_func != NULL) {
     sad_twin_func(preds, orig_block, PARALLEL_BLKS, unsigned_sad_costs);
   } else {
-    unsigned_sad_costs[0] = uvg_reg_sad(preds[0], orig_block, width, height, width, LCU_WIDTH);
-    unsigned_sad_costs[1] = uvg_reg_sad(preds[1], orig_block, width, height, width, LCU_WIDTH);
+    unsigned_sad_costs[0] = uvg_reg_sad(preds[0], orig_block, width, height, width, width);
+    unsigned_sad_costs[1] = uvg_reg_sad(preds[1], orig_block, width, height, width, width);
   }
   costs_out[0] = (double)MIN(satd_costs[0], unsigned_sad_costs[0] * 2);
   costs_out[1] = (double)MIN(satd_costs[1], unsigned_sad_costs[1] * 2);

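Two separate things change in this fallback: the SATD switches to the VTM-style any-size function with (orig, pred) operand order, and the last uvg_reg_sad argument (the stride of orig_block) becomes width instead of LCU_WIDTH, presumably because orig_block is packed at width stride here. A condensed sketch of the corrected path for one prediction:

    unsigned satd = uvg_satd_any_size_vtm(width, height, orig_block, width, preds[0], width);
    unsigned sad  = uvg_reg_sad(preds[0], orig_block, width, height, width, width);
    double cost   = (double)MIN(satd, sad * 2);  // keep the cheaper metric, as in get_cost_dual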
@@ -716,8 +716,9 @@ SATD_ANY_SIZE_MULTI_AVX2(quad_avx2, 4)
 
 static unsigned pixels_calc_ssd_avx2(const uint8_t *const ref, const uint8_t *const rec,
                                      const int ref_stride, const int rec_stride,
-                                     const int width)
+                                     const int width, const int height)
 {
+  assert(width == height && "Non square not yet implemented");
   __m256i ssd_part;
   __m256i diff = _mm256_setzero_si256();
   __m128i sum;

@@ -32,6 +32,7 @@
 
 #include "strategies/generic/picture-generic.h"
 
+#include <math.h>
 #include <stdlib.h>
 
 #include "strategies/strategies-picture.h"

@@ -474,6 +475,577 @@ SATD_DUAL_NXN(64, uvg_pixel)
 
 SATD_ANY_SIZE_MULTI_GENERIC(quad_generic, 4)
 
+uint64_t xCalcHADs2x2(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
+{
+  uint64_t satd = 0;
+  coeff_t diff[4], m[4];
+
+  diff[0] = piOrg[0] - piCur[0];
+  diff[1] = piOrg[1] - piCur[1];
+  diff[2] = piOrg[iStrideOrg] - piCur[0 + iStrideCur];
+  diff[3] = piOrg[iStrideOrg + 1] - piCur[1 + iStrideCur];
+  m[0] = diff[0] + diff[2];
+  m[1] = diff[1] + diff[3];
+  m[2] = diff[0] - diff[2];
+  m[3] = diff[1] - diff[3];
+
+  satd += abs(m[0] + m[1]) >> 2;
+  satd += abs(m[0] - m[1]);
+  satd += abs(m[2] + m[3]);
+  satd += abs(m[2] - m[3]);
+
+  return satd;
+}
+
+
+static uint64_t xCalcHADs16x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
+{ //need to add SIMD implementation ,JCA
+  int k, i, j, jj, sad = 0;
+  int diff[128], m1[8][16], m2[8][16];
+  for (k = 0; k < 128; k += 16)
+  {
+    diff[k + 0] = piOrg[0] - piCur[0];
+    diff[k + 1] = piOrg[1] - piCur[1];
+    diff[k + 2] = piOrg[2] - piCur[2];
+    diff[k + 3] = piOrg[3] - piCur[3];
+    diff[k + 4] = piOrg[4] - piCur[4];
+    diff[k + 5] = piOrg[5] - piCur[5];
+    diff[k + 6] = piOrg[6] - piCur[6];
+    diff[k + 7] = piOrg[7] - piCur[7];
+
+    diff[k + 8] = piOrg[8] - piCur[8];
+    diff[k + 9] = piOrg[9] - piCur[9];
+    diff[k + 10] = piOrg[10] - piCur[10];
+    diff[k + 11] = piOrg[11] - piCur[11];
+    diff[k + 12] = piOrg[12] - piCur[12];
+    diff[k + 13] = piOrg[13] - piCur[13];
+    diff[k + 14] = piOrg[14] - piCur[14];
+    diff[k + 15] = piOrg[15] - piCur[15];
+
+    piCur += iStrideCur;
+    piOrg += iStrideOrg;
+  }
+
+  //horizontal
+  for (j = 0; j < 8; j++)
+  {
+    jj = j << 4;
+
+    m2[j][0] = diff[jj] + diff[jj + 8];
+    m2[j][1] = diff[jj + 1] + diff[jj + 9];
+    m2[j][2] = diff[jj + 2] + diff[jj + 10];
+    m2[j][3] = diff[jj + 3] + diff[jj + 11];
+    m2[j][4] = diff[jj + 4] + diff[jj + 12];
+    m2[j][5] = diff[jj + 5] + diff[jj + 13];
+    m2[j][6] = diff[jj + 6] + diff[jj + 14];
+    m2[j][7] = diff[jj + 7] + diff[jj + 15];
+    m2[j][8] = diff[jj] - diff[jj + 8];
+    m2[j][9] = diff[jj + 1] - diff[jj + 9];
+    m2[j][10] = diff[jj + 2] - diff[jj + 10];
+    m2[j][11] = diff[jj + 3] - diff[jj + 11];
+    m2[j][12] = diff[jj + 4] - diff[jj + 12];
+    m2[j][13] = diff[jj + 5] - diff[jj + 13];
+    m2[j][14] = diff[jj + 6] - diff[jj + 14];
+    m2[j][15] = diff[jj + 7] - diff[jj + 15];
+
+    m1[j][0] = m2[j][0] + m2[j][4];
+    m1[j][1] = m2[j][1] + m2[j][5];
+    m1[j][2] = m2[j][2] + m2[j][6];
+    m1[j][3] = m2[j][3] + m2[j][7];
+    m1[j][4] = m2[j][0] - m2[j][4];
+    m1[j][5] = m2[j][1] - m2[j][5];
+    m1[j][6] = m2[j][2] - m2[j][6];
+    m1[j][7] = m2[j][3] - m2[j][7];
+    m1[j][8] = m2[j][8] + m2[j][12];
+    m1[j][9] = m2[j][9] + m2[j][13];
+    m1[j][10] = m2[j][10] + m2[j][14];
+    m1[j][11] = m2[j][11] + m2[j][15];
+    m1[j][12] = m2[j][8] - m2[j][12];
+    m1[j][13] = m2[j][9] - m2[j][13];
+    m1[j][14] = m2[j][10] - m2[j][14];
+    m1[j][15] = m2[j][11] - m2[j][15];
+
+    m2[j][0] = m1[j][0] + m1[j][2];
+    m2[j][1] = m1[j][1] + m1[j][3];
+    m2[j][2] = m1[j][0] - m1[j][2];
+    m2[j][3] = m1[j][1] - m1[j][3];
+    m2[j][4] = m1[j][4] + m1[j][6];
+    m2[j][5] = m1[j][5] + m1[j][7];
+    m2[j][6] = m1[j][4] - m1[j][6];
+    m2[j][7] = m1[j][5] - m1[j][7];
+    m2[j][8] = m1[j][8] + m1[j][10];
+    m2[j][9] = m1[j][9] + m1[j][11];
+    m2[j][10] = m1[j][8] - m1[j][10];
+    m2[j][11] = m1[j][9] - m1[j][11];
+    m2[j][12] = m1[j][12] + m1[j][14];
+    m2[j][13] = m1[j][13] + m1[j][15];
+    m2[j][14] = m1[j][12] - m1[j][14];
+    m2[j][15] = m1[j][13] - m1[j][15];
+
+    m1[j][0] = m2[j][0] + m2[j][1];
+    m1[j][1] = m2[j][0] - m2[j][1];
+    m1[j][2] = m2[j][2] + m2[j][3];
+    m1[j][3] = m2[j][2] - m2[j][3];
+    m1[j][4] = m2[j][4] + m2[j][5];
+    m1[j][5] = m2[j][4] - m2[j][5];
+    m1[j][6] = m2[j][6] + m2[j][7];
+    m1[j][7] = m2[j][6] - m2[j][7];
+    m1[j][8] = m2[j][8] + m2[j][9];
+    m1[j][9] = m2[j][8] - m2[j][9];
+    m1[j][10] = m2[j][10] + m2[j][11];
+    m1[j][11] = m2[j][10] - m2[j][11];
+    m1[j][12] = m2[j][12] + m2[j][13];
+    m1[j][13] = m2[j][12] - m2[j][13];
+    m1[j][14] = m2[j][14] + m2[j][15];
+    m1[j][15] = m2[j][14] - m2[j][15];
+  }
+
+  //vertical
+  for (i = 0; i < 16; i++)
+  {
+    m2[0][i] = m1[0][i] + m1[4][i];
+    m2[1][i] = m1[1][i] + m1[5][i];
+    m2[2][i] = m1[2][i] + m1[6][i];
+    m2[3][i] = m1[3][i] + m1[7][i];
+    m2[4][i] = m1[0][i] - m1[4][i];
+    m2[5][i] = m1[1][i] - m1[5][i];
+    m2[6][i] = m1[2][i] - m1[6][i];
+    m2[7][i] = m1[3][i] - m1[7][i];
+
+    m1[0][i] = m2[0][i] + m2[2][i];
+    m1[1][i] = m2[1][i] + m2[3][i];
+    m1[2][i] = m2[0][i] - m2[2][i];
+    m1[3][i] = m2[1][i] - m2[3][i];
+    m1[4][i] = m2[4][i] + m2[6][i];
+    m1[5][i] = m2[5][i] + m2[7][i];
+    m1[6][i] = m2[4][i] - m2[6][i];
+    m1[7][i] = m2[5][i] - m2[7][i];
+
+    m2[0][i] = m1[0][i] + m1[1][i];
+    m2[1][i] = m1[0][i] - m1[1][i];
+    m2[2][i] = m1[2][i] + m1[3][i];
+    m2[3][i] = m1[2][i] - m1[3][i];
+    m2[4][i] = m1[4][i] + m1[5][i];
+    m2[5][i] = m1[4][i] - m1[5][i];
+    m2[6][i] = m1[6][i] + m1[7][i];
+    m2[7][i] = m1[6][i] - m1[7][i];
+  }
+
+  for (i = 0; i < 8; i++)
+  {
+    for (j = 0; j < 16; j++)
+    {
+      sad += abs(m2[i][j]);
+    }
+  }
+
+  sad -= abs(m2[0][0]);
+  sad += abs(m2[0][0]) >> 2;
+  sad = (int)(sad / sqrt(16.0 * 8) * 2);
+
+  return sad;
+}
+
+static uint64_t xCalcHADs8x16(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
+{
+  int k, i, j, jj, sad = 0;
+  int diff[128], m1[16][8], m2[16][8];
+  for (k = 0; k < 128; k += 8)
+  {
+    diff[k + 0] = piOrg[0] - piCur[0];
+    diff[k + 1] = piOrg[1] - piCur[1];
+    diff[k + 2] = piOrg[2] - piCur[2];
+    diff[k + 3] = piOrg[3] - piCur[3];
+    diff[k + 4] = piOrg[4] - piCur[4];
+    diff[k + 5] = piOrg[5] - piCur[5];
+    diff[k + 6] = piOrg[6] - piCur[6];
+    diff[k + 7] = piOrg[7] - piCur[7];
+
+    piCur += iStrideCur;
+    piOrg += iStrideOrg;
+  }
+
+  //horizontal
+  for (j = 0; j < 16; j++)
+  {
+    jj = j << 3;
+
+    m2[j][0] = diff[jj] + diff[jj + 4];
+    m2[j][1] = diff[jj + 1] + diff[jj + 5];
+    m2[j][2] = diff[jj + 2] + diff[jj + 6];
+    m2[j][3] = diff[jj + 3] + diff[jj + 7];
+    m2[j][4] = diff[jj] - diff[jj + 4];
+    m2[j][5] = diff[jj + 1] - diff[jj + 5];
+    m2[j][6] = diff[jj + 2] - diff[jj + 6];
+    m2[j][7] = diff[jj + 3] - diff[jj + 7];
+
+    m1[j][0] = m2[j][0] + m2[j][2];
+    m1[j][1] = m2[j][1] + m2[j][3];
+    m1[j][2] = m2[j][0] - m2[j][2];
+    m1[j][3] = m2[j][1] - m2[j][3];
+    m1[j][4] = m2[j][4] + m2[j][6];
+    m1[j][5] = m2[j][5] + m2[j][7];
+    m1[j][6] = m2[j][4] - m2[j][6];
+    m1[j][7] = m2[j][5] - m2[j][7];
+
+    m2[j][0] = m1[j][0] + m1[j][1];
+    m2[j][1] = m1[j][0] - m1[j][1];
+    m2[j][2] = m1[j][2] + m1[j][3];
+    m2[j][3] = m1[j][2] - m1[j][3];
+    m2[j][4] = m1[j][4] + m1[j][5];
+    m2[j][5] = m1[j][4] - m1[j][5];
+    m2[j][6] = m1[j][6] + m1[j][7];
+    m2[j][7] = m1[j][6] - m1[j][7];
+  }
+
+  //vertical
+  for (i = 0; i < 8; i++)
+  {
+    m1[0][i] = m2[0][i] + m2[8][i];
+    m1[1][i] = m2[1][i] + m2[9][i];
+    m1[2][i] = m2[2][i] + m2[10][i];
+    m1[3][i] = m2[3][i] + m2[11][i];
+    m1[4][i] = m2[4][i] + m2[12][i];
+    m1[5][i] = m2[5][i] + m2[13][i];
+    m1[6][i] = m2[6][i] + m2[14][i];
+    m1[7][i] = m2[7][i] + m2[15][i];
+    m1[8][i] = m2[0][i] - m2[8][i];
+    m1[9][i] = m2[1][i] - m2[9][i];
+    m1[10][i] = m2[2][i] - m2[10][i];
+    m1[11][i] = m2[3][i] - m2[11][i];
+    m1[12][i] = m2[4][i] - m2[12][i];
+    m1[13][i] = m2[5][i] - m2[13][i];
+    m1[14][i] = m2[6][i] - m2[14][i];
+    m1[15][i] = m2[7][i] - m2[15][i];
+
+    m2[0][i] = m1[0][i] + m1[4][i];
+    m2[1][i] = m1[1][i] + m1[5][i];
+    m2[2][i] = m1[2][i] + m1[6][i];
+    m2[3][i] = m1[3][i] + m1[7][i];
+    m2[4][i] = m1[0][i] - m1[4][i];
+    m2[5][i] = m1[1][i] - m1[5][i];
+    m2[6][i] = m1[2][i] - m1[6][i];
+    m2[7][i] = m1[3][i] - m1[7][i];
+    m2[8][i] = m1[8][i] + m1[12][i];
+    m2[9][i] = m1[9][i] + m1[13][i];
+    m2[10][i] = m1[10][i] + m1[14][i];
+    m2[11][i] = m1[11][i] + m1[15][i];
+    m2[12][i] = m1[8][i] - m1[12][i];
+    m2[13][i] = m1[9][i] - m1[13][i];
+    m2[14][i] = m1[10][i] - m1[14][i];
+    m2[15][i] = m1[11][i] - m1[15][i];
+
+    m1[0][i] = m2[0][i] + m2[2][i];
+    m1[1][i] = m2[1][i] + m2[3][i];
+    m1[2][i] = m2[0][i] - m2[2][i];
+    m1[3][i] = m2[1][i] - m2[3][i];
+    m1[4][i] = m2[4][i] + m2[6][i];
+    m1[5][i] = m2[5][i] + m2[7][i];
+    m1[6][i] = m2[4][i] - m2[6][i];
+    m1[7][i] = m2[5][i] - m2[7][i];
+    m1[8][i] = m2[8][i] + m2[10][i];
+    m1[9][i] = m2[9][i] + m2[11][i];
+    m1[10][i] = m2[8][i] - m2[10][i];
+    m1[11][i] = m2[9][i] - m2[11][i];
+    m1[12][i] = m2[12][i] + m2[14][i];
+    m1[13][i] = m2[13][i] + m2[15][i];
+    m1[14][i] = m2[12][i] - m2[14][i];
+    m1[15][i] = m2[13][i] - m2[15][i];
+
+    m2[0][i] = m1[0][i] + m1[1][i];
+    m2[1][i] = m1[0][i] - m1[1][i];
+    m2[2][i] = m1[2][i] + m1[3][i];
+    m2[3][i] = m1[2][i] - m1[3][i];
+    m2[4][i] = m1[4][i] + m1[5][i];
+    m2[5][i] = m1[4][i] - m1[5][i];
+    m2[6][i] = m1[6][i] + m1[7][i];
+    m2[7][i] = m1[6][i] - m1[7][i];
+    m2[8][i] = m1[8][i] + m1[9][i];
+    m2[9][i] = m1[8][i] - m1[9][i];
+    m2[10][i] = m1[10][i] + m1[11][i];
+    m2[11][i] = m1[10][i] - m1[11][i];
+    m2[12][i] = m1[12][i] + m1[13][i];
+    m2[13][i] = m1[12][i] - m1[13][i];
+    m2[14][i] = m1[14][i] + m1[15][i];
+    m2[15][i] = m1[14][i] - m1[15][i];
+  }
+
+  for (i = 0; i < 16; i++)
+  {
+    for (j = 0; j < 8; j++)
+    {
+      sad += abs(m2[i][j]);
+    }
+  }
+
+  sad -= abs(m2[0][0]);
+  sad += abs(m2[0][0]) >> 2;
+  sad = (int)(sad / sqrt(16.0 * 8) * 2);
+
+  return sad;
+}
+
+static uint64_t xCalcHADs4x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
+{
+  int k, i, j, jj, sad = 0;
+  int diff[32], m1[8][4], m2[8][4];
+  for (k = 0; k < 32; k += 4)
+  {
+    diff[k + 0] = piOrg[0] - piCur[0];
+    diff[k + 1] = piOrg[1] - piCur[1];
+    diff[k + 2] = piOrg[2] - piCur[2];
+    diff[k + 3] = piOrg[3] - piCur[3];
+
+    piCur += iStrideCur;
+    piOrg += iStrideOrg;
+  }
+
+  //horizontal
+  for (j = 0; j < 8; j++)
+  {
+    jj = j << 2;
+    m2[j][0] = diff[jj] + diff[jj + 2];
+    m2[j][1] = diff[jj + 1] + diff[jj + 3];
+    m2[j][2] = diff[jj] - diff[jj + 2];
+    m2[j][3] = diff[jj + 1] - diff[jj + 3];
+
+    m1[j][0] = m2[j][0] + m2[j][1];
+    m1[j][1] = m2[j][0] - m2[j][1];
+    m1[j][2] = m2[j][2] + m2[j][3];
+    m1[j][3] = m2[j][2] - m2[j][3];
+  }
+
+  //vertical
+  for (i = 0; i < 4; i++)
+  {
+    m2[0][i] = m1[0][i] + m1[4][i];
+    m2[1][i] = m1[1][i] + m1[5][i];
+    m2[2][i] = m1[2][i] + m1[6][i];
+    m2[3][i] = m1[3][i] + m1[7][i];
+    m2[4][i] = m1[0][i] - m1[4][i];
+    m2[5][i] = m1[1][i] - m1[5][i];
+    m2[6][i] = m1[2][i] - m1[6][i];
+    m2[7][i] = m1[3][i] - m1[7][i];
+
+    m1[0][i] = m2[0][i] + m2[2][i];
+    m1[1][i] = m2[1][i] + m2[3][i];
+    m1[2][i] = m2[0][i] - m2[2][i];
+    m1[3][i] = m2[1][i] - m2[3][i];
+    m1[4][i] = m2[4][i] + m2[6][i];
+    m1[5][i] = m2[5][i] + m2[7][i];
+    m1[6][i] = m2[4][i] - m2[6][i];
+    m1[7][i] = m2[5][i] - m2[7][i];
+
+    m2[0][i] = m1[0][i] + m1[1][i];
+    m2[1][i] = m1[0][i] - m1[1][i];
+    m2[2][i] = m1[2][i] + m1[3][i];
+    m2[3][i] = m1[2][i] - m1[3][i];
+    m2[4][i] = m1[4][i] + m1[5][i];
+    m2[5][i] = m1[4][i] - m1[5][i];
+    m2[6][i] = m1[6][i] + m1[7][i];
+    m2[7][i] = m1[6][i] - m1[7][i];
+  }
+
+  for (i = 0; i < 8; i++)
+  {
+    for (j = 0; j < 4; j++)
+    {
+      sad += abs(m2[i][j]);
+    }
+  }
+
+  sad -= abs(m2[0][0]);
+  sad += abs(m2[0][0]) >> 2;
+  sad = (int)(sad / sqrt(4.0 * 8) * 2);
+
+  return sad;
+}
+
+static uint64_t xCalcHADs8x4(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
+{
+  int k, i, j, jj, sad = 0;
+  int diff[32], m1[4][8], m2[4][8];
+  for (k = 0; k < 32; k += 8)
+  {
+    diff[k + 0] = piOrg[0] - piCur[0];
+    diff[k + 1] = piOrg[1] - piCur[1];
+    diff[k + 2] = piOrg[2] - piCur[2];
+    diff[k + 3] = piOrg[3] - piCur[3];
+    diff[k + 4] = piOrg[4] - piCur[4];
+    diff[k + 5] = piOrg[5] - piCur[5];
+    diff[k + 6] = piOrg[6] - piCur[6];
+    diff[k + 7] = piOrg[7] - piCur[7];
+
+    piCur += iStrideCur;
+    piOrg += iStrideOrg;
+  }
+
+  //horizontal
+  for (j = 0; j < 4; j++)
+  {
+    jj = j << 3;
+
+    m2[j][0] = diff[jj] + diff[jj + 4];
+    m2[j][1] = diff[jj + 1] + diff[jj + 5];
+    m2[j][2] = diff[jj + 2] + diff[jj + 6];
+    m2[j][3] = diff[jj + 3] + diff[jj + 7];
+    m2[j][4] = diff[jj] - diff[jj + 4];
+    m2[j][5] = diff[jj + 1] - diff[jj + 5];
+    m2[j][6] = diff[jj + 2] - diff[jj + 6];
+    m2[j][7] = diff[jj + 3] - diff[jj + 7];
+
+    m1[j][0] = m2[j][0] + m2[j][2];
+    m1[j][1] = m2[j][1] + m2[j][3];
+    m1[j][2] = m2[j][0] - m2[j][2];
+    m1[j][3] = m2[j][1] - m2[j][3];
+    m1[j][4] = m2[j][4] + m2[j][6];
+    m1[j][5] = m2[j][5] + m2[j][7];
+    m1[j][6] = m2[j][4] - m2[j][6];
+    m1[j][7] = m2[j][5] - m2[j][7];
+
+    m2[j][0] = m1[j][0] + m1[j][1];
+    m2[j][1] = m1[j][0] - m1[j][1];
+    m2[j][2] = m1[j][2] + m1[j][3];
+    m2[j][3] = m1[j][2] - m1[j][3];
+    m2[j][4] = m1[j][4] + m1[j][5];
+    m2[j][5] = m1[j][4] - m1[j][5];
+    m2[j][6] = m1[j][6] + m1[j][7];
+    m2[j][7] = m1[j][6] - m1[j][7];
+  }
+
+  //vertical
+  for (i = 0; i < 8; i++)
+  {
+    m1[0][i] = m2[0][i] + m2[2][i];
+    m1[1][i] = m2[1][i] + m2[3][i];
+    m1[2][i] = m2[0][i] - m2[2][i];
+    m1[3][i] = m2[1][i] - m2[3][i];
+
+    m2[0][i] = m1[0][i] + m1[1][i];
+    m2[1][i] = m1[0][i] - m1[1][i];
+    m2[2][i] = m1[2][i] + m1[3][i];
+    m2[3][i] = m1[2][i] - m1[3][i];
+  }
+
+  for (i = 0; i < 4; i++)
+  {
+    for (j = 0; j < 8; j++)
+    {
+      sad += abs(m2[i][j]);
+    }
+  }
+
+  sad -= abs(m2[0][0]);
+  sad += abs(m2[0][0]) >> 2;
+  sad = (int)(sad / sqrt(4.0 * 8) * 2);
+
+  return sad;
+}
+
+
+uint64_t xGetHADs(int width, int height, const uvg_pixel* ref_in, int ref_stride, const uvg_pixel* pred_in, int pred_stride)
+{
+  const uvg_pixel* piOrg = ref_in;
+  const uvg_pixel* piCur = pred_in;
+  const int iRows = height;
+  const int iCols = width;
+  const int iStrideOrg = ref_stride;
+  const int iStrideCur = pred_stride;
+
+  int x = 0, y = 0;
+
+  uint64_t uiSum = 0;
+
+  if (iCols > iRows && (iRows & 7) == 0 && (iCols & 15) == 0)
+  {
+    for (y = 0; y < iRows; y += 8)
+    {
+      for (x = 0; x < iCols; x += 16)
+      {
+        uiSum += xCalcHADs16x8(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur);
+      }
+      piOrg += iStrideOrg * 8;
+      piCur += iStrideCur * 8;
+    }
+  }
+  else if (iCols < iRows && (iCols & 7) == 0 && (iRows & 15) == 0)
+  {
+    for (y = 0; y < iRows; y += 16)
+    {
+      for (x = 0; x < iCols; x += 8)
+      {
+        uiSum += xCalcHADs8x16(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur);
+      }
+      piOrg += iStrideOrg * 16;
+      piCur += iStrideCur * 16;
+    }
+  }
+  else if (iCols > iRows && (iRows & 3) == 0 && (iCols & 7) == 0)
+  {
+    for (y = 0; y < iRows; y += 4)
+    {
+      for (x = 0; x < iCols; x += 8)
+      {
+        uiSum += xCalcHADs8x4(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur);
+      }
+      piOrg += iStrideOrg * 4;
+      piCur += iStrideCur * 4;
+    }
+  }
+  else if (iCols < iRows && (iCols & 3) == 0 && (iRows & 7) == 0)
+  {
+    for (y = 0; y < iRows; y += 8)
+    {
+      for (x = 0; x < iCols; x += 4)
+      {
+        uiSum += xCalcHADs4x8(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur);
+      }
+      piOrg += iStrideOrg * 8;
+      piCur += iStrideCur * 8;
+    }
+  }
+  else if ((iRows % 8 == 0) && (iCols % 8 == 0))
+  {
+    for (y = 0; y < iRows; y += 8)
+    {
+      for (x = 0; x < iCols; x += 8)
+      {
+        uiSum += satd_8x8_subblock_generic(&piOrg[x], iStrideOrg, &piCur[x], iStrideCur);
+      }
+      piOrg += 8 * iStrideOrg;
+      piCur += 8 * iStrideCur;
+    }
+  }
+  else if ((iRows % 4 == 0) && (iCols % 4 == 0))
+  {
+    for (y = 0; y < iRows; y += 4)
+    {
+      for (x = 0; x < iCols; x += 4)
+      {
+        uiSum += uvg_satd_4x4_subblock_generic(&piOrg[x], iStrideOrg, &piCur[x], iStrideCur);
+      }
+      piOrg += 4 * iStrideOrg;
+      piCur += 4 * iStrideCur;
+    }
+  }
+  else if ((iRows % 2 == 0) && (iCols % 2 == 0))
+  {
+    for (y = 0; y < iRows; y += 2)
+    {
+      for (x = 0; x < iCols; x += 2)
+      {
+        uiSum += xCalcHADs2x2(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur);
+      }
+      piOrg += 2 * iStrideOrg;
+      piCur += 2 * iStrideCur;
+    }
+  }
+
+  // TODO: 10 bit
+  return (uiSum >> 0);
+}
+
 
 // Function macro for defining SAD calculating functions
 // for fixed size blocks.
 #define SAD_NXN(n, pixel_type) \

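Once registered as the "satd_any_size_vtm" strategy (see the registration hunk below), xGetHADs is reached through the uvg_satd_any_size_vtm pointer; a hypothetical call for a wide non-square block:

    // 16x8 block: iCols > iRows, rows % 8 == 0, cols % 16 == 0,
    // so xGetHADs dispatches to a single xCalcHADs16x8 call.
    unsigned satd = uvg_satd_any_size_vtm(16, 8, orig, orig_stride, pred, pred_stride);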
@@ -539,12 +1111,12 @@ SAD_DUAL_NXN(64, uvg_pixel)
 
 static unsigned pixels_calc_ssd_generic(const uvg_pixel *const ref, const uvg_pixel *const rec,
                                         const int ref_stride, const int rec_stride,
-                                        const int width)
+                                        const int width, const int height)
 {
   int ssd = 0;
   int y, x;
 
-  for (y = 0; y < width; ++y) {
+  for (y = 0; y < height; ++y) {
     for (x = 0; x < width; ++x) {
       int diff = ref[x + y * ref_stride] - rec[x + y * rec_stride];
       ssd += diff * diff;

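The hunk above is cut off mid-loop; for reference, the corrected function in full. The key fix is the outer loop bound, which previously iterated y < width and so walked the wrong number of rows for non-square blocks; the closing lines are assumed, not shown in the diff:

    static unsigned pixels_calc_ssd_generic(const uvg_pixel *const ref, const uvg_pixel *const rec,
                                            const int ref_stride, const int rec_stride,
                                            const int width, const int height)
    {
      int ssd = 0;
      int y, x;

      for (y = 0; y < height; ++y) {      // was: y < width
        for (x = 0; x < width; ++x) {
          int diff = ref[x + y * ref_stride] - rec[x + y * rec_stride];
          ssd += diff * diff;
        }
      }
      return ssd;                         // assumed tail, not shown in the hunk
    }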
@@ -897,6 +1469,7 @@ int uvg_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
   success &= uvg_strategyselector_register(opaque, "satd_32x32_dual", "generic", 0, &satd_32x32_dual_generic);
   success &= uvg_strategyselector_register(opaque, "satd_64x64_dual", "generic", 0, &satd_64x64_dual_generic);
   success &= uvg_strategyselector_register(opaque, "satd_any_size", "generic", 0, &satd_any_size_generic);
+  success &= uvg_strategyselector_register(opaque, "satd_any_size_vtm", "generic", 0, &xGetHADs);
   success &= uvg_strategyselector_register(opaque, "satd_any_size_quad", "generic", 0, &satd_any_size_quad_generic);
 
   success &= uvg_strategyselector_register(opaque, "pixels_calc_ssd", "generic", 0, &pixels_calc_ssd_generic);

@@ -70,6 +70,7 @@ cost_pixel_nxn_multi_func * uvg_satd_32x32_dual = 0;
 cost_pixel_nxn_multi_func * uvg_satd_64x64_dual = 0;
 
 cost_pixel_any_size_func * uvg_satd_any_size = 0;
+cost_pixel_any_size_func * uvg_satd_any_size_vtm = 0;
 cost_pixel_any_size_multi_func * uvg_satd_any_size_quad = 0;
 
 pixels_calc_ssd_func * uvg_pixels_calc_ssd = 0;

@@ -124,7 +124,7 @@ typedef unsigned (cost_pixel_any_size_func)(
 typedef void (cost_pixel_nxn_multi_func)(const pred_buffer preds, const uvg_pixel *orig, unsigned num_modes, unsigned *costs_out);
 typedef void (cost_pixel_any_size_multi_func)(int width, int height, const uvg_pixel **preds, const int stride, const uvg_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid);
 
-typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width);
+typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width, const int height);
 typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t);
 typedef uint32_t (ver_sad_func)(const uvg_pixel *pic_data, const uvg_pixel *ref_data,
                                 int32_t block_width, int32_t block_height,

@@ -175,6 +175,7 @@ extern cost_pixel_nxn_func * uvg_satd_16x16;
 extern cost_pixel_nxn_func * uvg_satd_32x32;
 extern cost_pixel_nxn_func * uvg_satd_64x64;
 extern cost_pixel_any_size_func *uvg_satd_any_size;
+extern cost_pixel_any_size_func *uvg_satd_any_size_vtm;
 
 extern cost_pixel_nxn_multi_func * uvg_sad_4x4_dual;
 extern cost_pixel_nxn_multi_func * uvg_sad_8x8_dual;

@@ -221,6 +222,7 @@ cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned width, unsigne
   {"satd_32x32", (void**) &uvg_satd_32x32}, \
   {"satd_64x64", (void**) &uvg_satd_64x64}, \
   {"satd_any_size", (void**) &uvg_satd_any_size}, \
+  {"satd_any_size_vtm", (void**) &uvg_satd_any_size_vtm}, \
   {"sad_4x4_dual", (void**) &uvg_sad_4x4_dual}, \
   {"sad_8x8_dual", (void**) &uvg_sad_8x8_dual}, \
   {"sad_16x16_dual", (void**) &uvg_sad_16x16_dual}, \

@@ -617,7 +617,7 @@ void uvg_chroma_transform_search(
 
 
   if (v_has_coeffs && !is_jccr) {
-    uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, width, COLOR_V,
+    uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, height, COLOR_V,
                 pred_cu->type, transforms[i] == CHROMA_TS);
 
     if (transforms[i] != CHROMA_TS) {

@@ -661,10 +661,10 @@ void uvg_chroma_transform_search(
   if (!state->encoder_control->cfg.lossless) {
     ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[offset], &u_recon[trans_offset * i],
                                 LCU_WIDTH_C, width,
-                                width);
+                                width, height);
     ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[offset], &v_recon[trans_offset * i],
                                 LCU_WIDTH_C, width,
-                                width);
+                                width, height);
   }
 
   double u_bits = 0;