[mtt] fix cost calculation

This commit is contained in:
Joose Sainio 2022-11-29 07:47:05 +02:00 committed by Marko Viitanen
parent 657254d38a
commit 8fbefc0de3
8 changed files with 603 additions and 26 deletions

View file

@ -298,16 +298,16 @@ static double cu_zero_coeff_cost(
double ssd = 0.0; double ssd = 0.0;
ssd += UVG_LUMA_MULT * uvg_pixels_calc_ssd( ssd += UVG_LUMA_MULT * uvg_pixels_calc_ssd(
&lcu->ref.y[luma_index], &lcu->rec.y[luma_index], &lcu->ref.y[luma_index], &lcu->rec.y[luma_index],
LCU_WIDTH, LCU_WIDTH, cu_loc->width LCU_WIDTH, LCU_WIDTH, cu_loc->width, cu_loc->height
); );
if (y_local % 8 == 0 && x_local % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) { if (y_local % 8 == 0 && x_local % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) {
ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd( ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd(
&lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index], &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index],
LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width, cu_loc->chroma_height
); );
ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd( ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd(
&lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index], &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index],
LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width, cu_loc->chroma_height
); );
} }
// Save the pixels at a lower level of the working tree. // Save the pixels at a lower level of the working tree.
@ -445,7 +445,7 @@ double uvg_cu_rd_cost_luma(
int index = cu_loc->local_y * LCU_WIDTH + cu_loc->local_x; int index = cu_loc->local_y * LCU_WIDTH + cu_loc->local_x;
ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
LCU_WIDTH, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH,
cu_loc->width); cu_loc->width, cu_loc->height);
} }
@ -550,10 +550,10 @@ double uvg_cu_rd_cost_chroma(
int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
int ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], int ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C,
cu_loc->chroma_width); cu_loc->chroma_width, cu_loc->chroma_height);
int ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], int ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C,
cu_loc->chroma_width); cu_loc->chroma_width, cu_loc->chroma_height);
ssd = ssd_u + ssd_v; ssd = ssd_u + ssd_v;
} }
@ -684,7 +684,7 @@ static double cu_rd_cost_tr_split_accurate(
int index = cu_loc->local_x + LCU_WIDTH * cu_loc->local_y; int index = cu_loc->local_x + LCU_WIDTH * cu_loc->local_y;
luma_ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], luma_ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
LCU_WIDTH, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH,
width); width, height);
} }
// Chroma transform skip enable/disable is non-normative, so we need to count the chroma // Chroma transform skip enable/disable is non-normative, so we need to count the chroma
// tr-skip bits even when we are never using it. // tr-skip bits even when we are never using it.
@ -762,10 +762,10 @@ static double cu_rd_cost_tr_split_accurate(
int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
unsigned ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], unsigned ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C,
chroma_width); chroma_width, chroma_height);
unsigned ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], unsigned ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C,
chroma_width); chroma_width, chroma_height);
chroma_ssd = ssd_u + ssd_v; chroma_ssd = ssd_u + ssd_v;
} }
if(chroma_can_use_tr_skip && cb_flag_u) { if(chroma_can_use_tr_skip && cb_flag_u) {
@ -783,10 +783,10 @@ static double cu_rd_cost_tr_split_accurate(
int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
int ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], int ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C,
chroma_width); chroma_width, chroma_height);
int ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], int ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C,
chroma_width); chroma_width, chroma_height);
chroma_ssd = ssd_u_joint + ssd_v_joint; chroma_ssd = ssd_u_joint + ssd_v_joint;
} }
if (chroma_can_use_tr_skip) { if (chroma_can_use_tr_skip) {
@ -1360,7 +1360,7 @@ static double search_cu(
cabac_data_t best_split_cabac; cabac_data_t best_split_cabac;
memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac));
for (int split_type = QT_SPLIT; split_type <= TT_VER_SPLIT; ++split_type) { for (int split_type = QT_SPLIT; split_type <= TT_VER_SPLIT; ++split_type) {
if (!can_split[split_type] || split_type != QT_SPLIT) continue; if (!can_split[split_type] || (split_type != QT_SPLIT && depth == 0) || (split_type == QT_SPLIT && depth == 1)) continue;
split_tree_t new_split = { split_tree_t new_split = {
split_tree.split_tree | split_type << (split_tree.current_depth * 3), split_tree.split_tree | split_type << (split_tree.current_depth * 3),
split_tree.current_depth + 1, split_tree.current_depth + 1,

View file

@ -2144,15 +2144,15 @@ void uvg_cu_cost_inter_rd2(
int index = y_px * LCU_WIDTH + x_px; int index = y_px * LCU_WIDTH + x_px;
double ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], double ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
LCU_WIDTH, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH,
width) * UVG_LUMA_MULT; width, height) * UVG_LUMA_MULT;
if (reconstruct_chroma) { if (reconstruct_chroma) {
int index = y_px / 2 * LCU_WIDTH_C + x_px / 2; int index = y_px / 2 * LCU_WIDTH_C + x_px / 2;
double ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], double ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C,
cu_loc->chroma_width); cu_loc->chroma_width, cu_loc->chroma_height);
double ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], double ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C,
cu_loc->chroma_width); cu_loc->chroma_width, cu_loc->chroma_height);
ssd += (ssd_u + ssd_v) * UVG_CHROMA_MULT; ssd += (ssd_u + ssd_v) * UVG_CHROMA_MULT;
} }
double no_cbf_bits; double no_cbf_bits;

View file

@ -145,15 +145,15 @@ static void get_cost_dual(
if (satd_twin_func != NULL) { if (satd_twin_func != NULL) {
satd_twin_func(preds, orig_block, PARALLEL_BLKS, satd_costs); satd_twin_func(preds, orig_block, PARALLEL_BLKS, satd_costs);
} else { } else {
satd_costs[0] = uvg_satd_any_size(width, height, preds[0], width, orig_block, LCU_WIDTH); satd_costs[0] = uvg_satd_any_size_vtm(width, height, orig_block, width, preds[0], width);
satd_costs[1] = uvg_satd_any_size(width, height, preds[1], width, orig_block, LCU_WIDTH); satd_costs[1] = uvg_satd_any_size_vtm(width, height, orig_block, width, preds[1], width);
} }
unsigned unsigned_sad_costs[PARALLEL_BLKS] = { 0 }; unsigned unsigned_sad_costs[PARALLEL_BLKS] = { 0 };
if (sad_twin_func != NULL) { if (sad_twin_func != NULL) {
sad_twin_func(preds, orig_block, PARALLEL_BLKS, unsigned_sad_costs); sad_twin_func(preds, orig_block, PARALLEL_BLKS, unsigned_sad_costs);
} else { } else {
unsigned_sad_costs[0] = uvg_reg_sad(preds[0], orig_block, width, height, width, LCU_WIDTH); unsigned_sad_costs[0] = uvg_reg_sad(preds[0], orig_block, width, height, width, width);
unsigned_sad_costs[1] = uvg_reg_sad(preds[1], orig_block, width, height, width, LCU_WIDTH); unsigned_sad_costs[1] = uvg_reg_sad(preds[1], orig_block, width, height, width, width);
} }
costs_out[0] = (double)MIN(satd_costs[0], unsigned_sad_costs[0] * 2); costs_out[0] = (double)MIN(satd_costs[0], unsigned_sad_costs[0] * 2);
costs_out[1] = (double)MIN(satd_costs[1], unsigned_sad_costs[1] * 2); costs_out[1] = (double)MIN(satd_costs[1], unsigned_sad_costs[1] * 2);

View file

@ -716,8 +716,9 @@ SATD_ANY_SIZE_MULTI_AVX2(quad_avx2, 4)
static unsigned pixels_calc_ssd_avx2(const uint8_t *const ref, const uint8_t *const rec, static unsigned pixels_calc_ssd_avx2(const uint8_t *const ref, const uint8_t *const rec,
const int ref_stride, const int rec_stride, const int ref_stride, const int rec_stride,
const int width) const int width, const int height)
{ {
assert(width == height && "Non square not yet implemented");
__m256i ssd_part; __m256i ssd_part;
__m256i diff = _mm256_setzero_si256(); __m256i diff = _mm256_setzero_si256();
__m128i sum; __m128i sum;

View file

@ -32,6 +32,7 @@
#include "strategies/generic/picture-generic.h" #include "strategies/generic/picture-generic.h"
#include <math.h>
#include <stdlib.h> #include <stdlib.h>
#include "strategies/strategies-picture.h" #include "strategies/strategies-picture.h"
@ -474,6 +475,577 @@ SATD_DUAL_NXN(64, uvg_pixel)
SATD_ANY_SIZE_MULTI_GENERIC(quad_generic, 4) SATD_ANY_SIZE_MULTI_GENERIC(quad_generic, 4)
/**
 * Transformed-difference cost of a single 2x2 block.
 *
 * The four residuals (piOrg - piCur) go through one vertical and one
 * horizontal add/subtract butterfly. The absolute values of the four
 * results are summed, with the all-sum (DC) term shifted right by 2 first.
 *
 * \param piOrg       top-left sample of the original block
 * \param piCur       top-left sample of the block being compared
 * \param iStrideOrg  distance in samples between rows of piOrg
 * \param iStrideCur  distance in samples between rows of piCur
 * \return            accumulated cost for the 2x2 block
 */
uint64_t xCalcHADs2x2(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
{
  // Residuals, top row first.
  const coeff_t res_tl = piOrg[0] - piCur[0];
  const coeff_t res_tr = piOrg[1] - piCur[1];
  const coeff_t res_bl = piOrg[iStrideOrg] - piCur[0 + iStrideCur];
  const coeff_t res_br = piOrg[iStrideOrg + 1] - piCur[1 + iStrideCur];

  // Vertical butterfly: combine the two rows of each column.
  const coeff_t left_sum   = res_tl + res_bl;
  const coeff_t right_sum  = res_tr + res_br;
  const coeff_t left_diff  = res_tl - res_bl;
  const coeff_t right_diff = res_tr - res_br;

  // Horizontal butterfly folded into the accumulation; only the DC term
  // (sum of all four residuals) is scaled down.
  uint64_t cost = 0;
  cost += abs(left_sum + right_sum) >> 2;
  cost += abs(left_sum - right_sum);
  cost += abs(left_diff + right_diff);
  cost += abs(left_diff - right_diff);

  return cost;
}
/**
 * Transformed-difference cost of one block that is 16 samples wide and
 * 8 rows tall.
 *
 * Steps:
 *   1. residual = piOrg - piCur for every sample,
 *   2. in-place add/subtract butterflies along each row (pair distances
 *      8, 4, 2, 1), then along each column (pair distances 4, 2, 1),
 *   3. sum of absolute values of all 128 results, with the DC term at
 *      position [0][0] contributing only abs(dc) >> 2,
 *   4. normalisation: divide by sqrt(16 * 8), multiply by 2, truncate to int.
 *
 * No SIMD version of this routine exists yet.
 *
 * \param piOrg       top-left sample of the original block
 * \param piCur       top-left sample of the block being compared
 * \param iStrideOrg  distance in samples between rows of piOrg
 * \param iStrideCur  distance in samples between rows of piCur
 * \return            normalised cost of the block
 */
static uint64_t xCalcHADs16x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
{
  enum { BLK_W = 16, BLK_H = 8 };
  int had[BLK_H][BLK_W];

  // Residual block.
  for (int row = 0; row < BLK_H; row++) {
    const uvg_pixel *org_row = piOrg + row * iStrideOrg;
    const uvg_pixel *cur_row = piCur + row * iStrideCur;
    for (int col = 0; col < BLK_W; col++) {
      had[row][col] = org_row[col] - cur_row[col];
    }
  }

  // Horizontal pass: each stage pairs elements `half` apart inside
  // groups of 2 * half and replaces them with their sum and difference.
  for (int row = 0; row < BLK_H; row++) {
    for (int half = BLK_W / 2; half > 0; half >>= 1) {
      for (int base = 0; base < BLK_W; base += 2 * half) {
        for (int k = base; k < base + half; k++) {
          const int a = had[row][k];
          const int b = had[row][k + half];
          had[row][k]        = a + b;
          had[row][k + half] = a - b;
        }
      }
    }
  }

  // Vertical pass: same butterfly structure applied down every column.
  for (int col = 0; col < BLK_W; col++) {
    for (int half = BLK_H / 2; half > 0; half >>= 1) {
      for (int base = 0; base < BLK_H; base += 2 * half) {
        for (int k = base; k < base + half; k++) {
          const int a = had[k][col];
          const int b = had[k + half][col];
          had[k][col]        = a + b;
          had[k + half][col] = a - b;
        }
      }
    }
  }

  // Accumulate magnitudes.
  int sad = 0;
  for (int row = 0; row < BLK_H; row++) {
    for (int col = 0; col < BLK_W; col++) {
      sad += abs(had[row][col]);
    }
  }

  // The DC coefficient is counted at a quarter of its magnitude.
  const int dc = abs(had[0][0]);
  sad = sad - dc + (dc >> 2);

  const int normalised = (int)(sad / sqrt(16.0 * 8) * 2);
  return normalised;
}
/**
 * Transformed-difference cost of one block that is 8 samples wide and
 * 16 rows tall.
 *
 * Steps:
 *   1. residual = piOrg - piCur for every sample,
 *   2. in-place add/subtract butterflies along each row (pair distances
 *      4, 2, 1), then along each column (pair distances 8, 4, 2, 1),
 *   3. sum of absolute values of all 128 results, with the DC term at
 *      position [0][0] contributing only abs(dc) >> 2,
 *   4. normalisation: divide by sqrt(16 * 8), multiply by 2, truncate to int.
 *
 * \param piOrg       top-left sample of the original block
 * \param piCur       top-left sample of the block being compared
 * \param iStrideOrg  distance in samples between rows of piOrg
 * \param iStrideCur  distance in samples between rows of piCur
 * \return            normalised cost of the block
 */
static uint64_t xCalcHADs8x16(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
{
  enum { BLK_W = 8, BLK_H = 16 };
  int had[BLK_H][BLK_W];

  // Residual block.
  for (int row = 0; row < BLK_H; row++) {
    const uvg_pixel *org_row = piOrg + row * iStrideOrg;
    const uvg_pixel *cur_row = piCur + row * iStrideCur;
    for (int col = 0; col < BLK_W; col++) {
      had[row][col] = org_row[col] - cur_row[col];
    }
  }

  // Horizontal pass: each stage pairs elements `half` apart inside
  // groups of 2 * half and replaces them with their sum and difference.
  for (int row = 0; row < BLK_H; row++) {
    for (int half = BLK_W / 2; half > 0; half >>= 1) {
      for (int base = 0; base < BLK_W; base += 2 * half) {
        for (int k = base; k < base + half; k++) {
          const int a = had[row][k];
          const int b = had[row][k + half];
          had[row][k]        = a + b;
          had[row][k + half] = a - b;
        }
      }
    }
  }

  // Vertical pass: same butterfly structure applied down every column.
  for (int col = 0; col < BLK_W; col++) {
    for (int half = BLK_H / 2; half > 0; half >>= 1) {
      for (int base = 0; base < BLK_H; base += 2 * half) {
        for (int k = base; k < base + half; k++) {
          const int a = had[k][col];
          const int b = had[k + half][col];
          had[k][col]        = a + b;
          had[k + half][col] = a - b;
        }
      }
    }
  }

  // Accumulate magnitudes.
  int sad = 0;
  for (int row = 0; row < BLK_H; row++) {
    for (int col = 0; col < BLK_W; col++) {
      sad += abs(had[row][col]);
    }
  }

  // The DC coefficient is counted at a quarter of its magnitude.
  const int dc = abs(had[0][0]);
  sad = sad - dc + (dc >> 2);

  const int normalised = (int)(sad / sqrt(16.0 * 8) * 2);
  return normalised;
}
/**
 * Transformed-difference cost of one block that is 4 samples wide and
 * 8 rows tall.
 *
 * Steps:
 *   1. residual = piOrg - piCur for every sample,
 *   2. in-place add/subtract butterflies along each row (pair distances
 *      2, 1), then along each column (pair distances 4, 2, 1),
 *   3. sum of absolute values of all 32 results, with the DC term at
 *      position [0][0] contributing only abs(dc) >> 2,
 *   4. normalisation: divide by sqrt(4 * 8), multiply by 2, truncate to int.
 *
 * \param piOrg       top-left sample of the original block
 * \param piCur       top-left sample of the block being compared
 * \param iStrideOrg  distance in samples between rows of piOrg
 * \param iStrideCur  distance in samples between rows of piCur
 * \return            normalised cost of the block
 */
static uint64_t xCalcHADs4x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
{
  enum { BLK_W = 4, BLK_H = 8 };
  int had[BLK_H][BLK_W];

  // Residual block.
  for (int row = 0; row < BLK_H; row++) {
    const uvg_pixel *org_row = piOrg + row * iStrideOrg;
    const uvg_pixel *cur_row = piCur + row * iStrideCur;
    for (int col = 0; col < BLK_W; col++) {
      had[row][col] = org_row[col] - cur_row[col];
    }
  }

  // Horizontal pass: each stage pairs elements `half` apart inside
  // groups of 2 * half and replaces them with their sum and difference.
  for (int row = 0; row < BLK_H; row++) {
    for (int half = BLK_W / 2; half > 0; half >>= 1) {
      for (int base = 0; base < BLK_W; base += 2 * half) {
        for (int k = base; k < base + half; k++) {
          const int a = had[row][k];
          const int b = had[row][k + half];
          had[row][k]        = a + b;
          had[row][k + half] = a - b;
        }
      }
    }
  }

  // Vertical pass: same butterfly structure applied down every column.
  for (int col = 0; col < BLK_W; col++) {
    for (int half = BLK_H / 2; half > 0; half >>= 1) {
      for (int base = 0; base < BLK_H; base += 2 * half) {
        for (int k = base; k < base + half; k++) {
          const int a = had[k][col];
          const int b = had[k + half][col];
          had[k][col]        = a + b;
          had[k + half][col] = a - b;
        }
      }
    }
  }

  // Accumulate magnitudes.
  int sad = 0;
  for (int row = 0; row < BLK_H; row++) {
    for (int col = 0; col < BLK_W; col++) {
      sad += abs(had[row][col]);
    }
  }

  // The DC coefficient is counted at a quarter of its magnitude.
  const int dc = abs(had[0][0]);
  sad = sad - dc + (dc >> 2);

  const int normalised = (int)(sad / sqrt(4.0 * 8) * 2);
  return normalised;
}
/**
 * Transformed-difference cost of one block that is 8 samples wide and
 * 4 rows tall.
 *
 * Steps:
 *   1. residual = piOrg - piCur for every sample,
 *   2. in-place add/subtract butterflies along each row (pair distances
 *      4, 2, 1), then along each column (pair distances 2, 1),
 *   3. sum of absolute values of all 32 results, with the DC term at
 *      position [0][0] contributing only abs(dc) >> 2,
 *   4. normalisation: divide by sqrt(4 * 8), multiply by 2, truncate to int.
 *
 * \param piOrg       top-left sample of the original block
 * \param piCur       top-left sample of the block being compared
 * \param iStrideOrg  distance in samples between rows of piOrg
 * \param iStrideCur  distance in samples between rows of piCur
 * \return            normalised cost of the block
 */
static uint64_t xCalcHADs8x4(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
{
  enum { BLK_W = 8, BLK_H = 4 };
  int had[BLK_H][BLK_W];

  // Residual block.
  for (int row = 0; row < BLK_H; row++) {
    const uvg_pixel *org_row = piOrg + row * iStrideOrg;
    const uvg_pixel *cur_row = piCur + row * iStrideCur;
    for (int col = 0; col < BLK_W; col++) {
      had[row][col] = org_row[col] - cur_row[col];
    }
  }

  // Horizontal pass: each stage pairs elements `half` apart inside
  // groups of 2 * half and replaces them with their sum and difference.
  for (int row = 0; row < BLK_H; row++) {
    for (int half = BLK_W / 2; half > 0; half >>= 1) {
      for (int base = 0; base < BLK_W; base += 2 * half) {
        for (int k = base; k < base + half; k++) {
          const int a = had[row][k];
          const int b = had[row][k + half];
          had[row][k]        = a + b;
          had[row][k + half] = a - b;
        }
      }
    }
  }

  // Vertical pass: same butterfly structure applied down every column.
  for (int col = 0; col < BLK_W; col++) {
    for (int half = BLK_H / 2; half > 0; half >>= 1) {
      for (int base = 0; base < BLK_H; base += 2 * half) {
        for (int k = base; k < base + half; k++) {
          const int a = had[k][col];
          const int b = had[k + half][col];
          had[k][col]        = a + b;
          had[k + half][col] = a - b;
        }
      }
    }
  }

  // Accumulate magnitudes.
  int sad = 0;
  for (int row = 0; row < BLK_H; row++) {
    for (int col = 0; col < BLK_W; col++) {
      sad += abs(had[row][col]);
    }
  }

  // The DC coefficient is counted at a quarter of its magnitude.
  const int dc = abs(had[0][0]);
  sad = sad - dc + (dc >> 2);

  const int normalised = (int)(sad / sqrt(4.0 * 8) * 2);
  return normalised;
}
/**
 * Transformed-difference cost of a width x height block, computed by tiling
 * the block with one fixed-size kernel and summing the per-tile costs.
 *
 * The tile shape is chosen by the first matching rule:
 *   - wider than tall,  height % 8 == 0,  width % 16 == 0  -> 16x8 tiles
 *   - taller than wide, width  % 8 == 0,  height % 16 == 0 -> 8x16 tiles
 *   - wider than tall,  height % 4 == 0,  width % 8 == 0   -> 8x4 tiles
 *   - taller than wide, width  % 4 == 0,  height % 8 == 0  -> 4x8 tiles
 *   - both dimensions multiples of 8                       -> 8x8 tiles
 *   - both dimensions multiples of 4                       -> 4x4 tiles
 *   - both dimensions multiples of 2                       -> 2x2 tiles
 *
 * If no rule matches (a dimension is odd), nothing is accumulated and the
 * function returns 0.
 *
 * NOTE(review): this function returns uint64_t but appears to be registered
 * as the "satd_any_size_vtm" strategy, whose function type seems to return
 * unsigned -- confirm the two signatures are compatible.
 *
 * TODO: 10 bit
 *
 * \param width        block width in samples
 * \param height       block height in samples
 * \param ref_in       top-left sample of the original block
 * \param ref_stride   distance in samples between rows of ref_in
 * \param pred_in      top-left sample of the block being compared
 * \param pred_stride  distance in samples between rows of pred_in
 * \return             sum of the per-tile costs
 */
uint64_t xGetHADs(int width, int height, const uvg_pixel* ref_in, int ref_stride, const uvg_pixel* pred_in, int pred_stride)
{
  uint64_t total = 0;

  if (width > height && (height % 8) == 0 && (width % 16) == 0) {
    for (int ty = 0; ty < height; ty += 8) {
      const uvg_pixel *org_row = ref_in + ty * ref_stride;
      const uvg_pixel *cur_row = pred_in + ty * pred_stride;
      for (int tx = 0; tx < width; tx += 16) {
        total += xCalcHADs16x8(&org_row[tx], &cur_row[tx], ref_stride, pred_stride);
      }
    }
    return total;
  }

  if (width < height && (width % 8) == 0 && (height % 16) == 0) {
    for (int ty = 0; ty < height; ty += 16) {
      const uvg_pixel *org_row = ref_in + ty * ref_stride;
      const uvg_pixel *cur_row = pred_in + ty * pred_stride;
      for (int tx = 0; tx < width; tx += 8) {
        total += xCalcHADs8x16(&org_row[tx], &cur_row[tx], ref_stride, pred_stride);
      }
    }
    return total;
  }

  if (width > height && (height % 4) == 0 && (width % 8) == 0) {
    for (int ty = 0; ty < height; ty += 4) {
      const uvg_pixel *org_row = ref_in + ty * ref_stride;
      const uvg_pixel *cur_row = pred_in + ty * pred_stride;
      for (int tx = 0; tx < width; tx += 8) {
        total += xCalcHADs8x4(&org_row[tx], &cur_row[tx], ref_stride, pred_stride);
      }
    }
    return total;
  }

  if (width < height && (width % 4) == 0 && (height % 8) == 0) {
    for (int ty = 0; ty < height; ty += 8) {
      const uvg_pixel *org_row = ref_in + ty * ref_stride;
      const uvg_pixel *cur_row = pred_in + ty * pred_stride;
      for (int tx = 0; tx < width; tx += 4) {
        total += xCalcHADs4x8(&org_row[tx], &cur_row[tx], ref_stride, pred_stride);
      }
    }
    return total;
  }

  if ((height % 8) == 0 && (width % 8) == 0) {
    for (int ty = 0; ty < height; ty += 8) {
      const uvg_pixel *org_row = ref_in + ty * ref_stride;
      const uvg_pixel *cur_row = pred_in + ty * pred_stride;
      for (int tx = 0; tx < width; tx += 8) {
        total += satd_8x8_subblock_generic(&org_row[tx], ref_stride, &cur_row[tx], pred_stride);
      }
    }
    return total;
  }

  if ((height % 4) == 0 && (width % 4) == 0) {
    for (int ty = 0; ty < height; ty += 4) {
      const uvg_pixel *org_row = ref_in + ty * ref_stride;
      const uvg_pixel *cur_row = pred_in + ty * pred_stride;
      for (int tx = 0; tx < width; tx += 4) {
        total += uvg_satd_4x4_subblock_generic(&org_row[tx], ref_stride, &cur_row[tx], pred_stride);
      }
    }
    return total;
  }

  if ((height % 2) == 0 && (width % 2) == 0) {
    for (int ty = 0; ty < height; ty += 2) {
      const uvg_pixel *org_row = ref_in + ty * ref_stride;
      const uvg_pixel *cur_row = pred_in + ty * pred_stride;
      for (int tx = 0; tx < width; tx += 2) {
        total += xCalcHADs2x2(&org_row[tx], &cur_row[tx], ref_stride, pred_stride);
      }
    }
    return total;
  }

  // Unsupported size: no tiles were processed, so the cost stays 0.
  return total;
}
// Function macro for defining SAD calculating functions // Function macro for defining SAD calculating functions
// for fixed size blocks. // for fixed size blocks.
#define SAD_NXN(n, pixel_type) \ #define SAD_NXN(n, pixel_type) \
@ -539,12 +1111,12 @@ SAD_DUAL_NXN(64, uvg_pixel)
static unsigned pixels_calc_ssd_generic(const uvg_pixel *const ref, const uvg_pixel *const rec, static unsigned pixels_calc_ssd_generic(const uvg_pixel *const ref, const uvg_pixel *const rec,
const int ref_stride, const int rec_stride, const int ref_stride, const int rec_stride,
const int width) const int width, const int height)
{ {
int ssd = 0; int ssd = 0;
int y, x; int y, x;
for (y = 0; y < width; ++y) { for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) { for (x = 0; x < width; ++x) {
int diff = ref[x + y * ref_stride] - rec[x + y * rec_stride]; int diff = ref[x + y * ref_stride] - rec[x + y * rec_stride];
ssd += diff * diff; ssd += diff * diff;
@ -897,6 +1469,7 @@ int uvg_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
success &= uvg_strategyselector_register(opaque, "satd_32x32_dual", "generic", 0, &satd_32x32_dual_generic); success &= uvg_strategyselector_register(opaque, "satd_32x32_dual", "generic", 0, &satd_32x32_dual_generic);
success &= uvg_strategyselector_register(opaque, "satd_64x64_dual", "generic", 0, &satd_64x64_dual_generic); success &= uvg_strategyselector_register(opaque, "satd_64x64_dual", "generic", 0, &satd_64x64_dual_generic);
success &= uvg_strategyselector_register(opaque, "satd_any_size", "generic", 0, &satd_any_size_generic); success &= uvg_strategyselector_register(opaque, "satd_any_size", "generic", 0, &satd_any_size_generic);
success &= uvg_strategyselector_register(opaque, "satd_any_size_vtm", "generic", 0, &xGetHADs);
success &= uvg_strategyselector_register(opaque, "satd_any_size_quad", "generic", 0, &satd_any_size_quad_generic); success &= uvg_strategyselector_register(opaque, "satd_any_size_quad", "generic", 0, &satd_any_size_quad_generic);
success &= uvg_strategyselector_register(opaque, "pixels_calc_ssd", "generic", 0, &pixels_calc_ssd_generic); success &= uvg_strategyselector_register(opaque, "pixels_calc_ssd", "generic", 0, &pixels_calc_ssd_generic);

View file

@ -70,6 +70,7 @@ cost_pixel_nxn_multi_func * uvg_satd_32x32_dual = 0;
cost_pixel_nxn_multi_func * uvg_satd_64x64_dual = 0; cost_pixel_nxn_multi_func * uvg_satd_64x64_dual = 0;
cost_pixel_any_size_func * uvg_satd_any_size = 0; cost_pixel_any_size_func * uvg_satd_any_size = 0;
cost_pixel_any_size_func * uvg_satd_any_size_vtm = 0;
cost_pixel_any_size_multi_func * uvg_satd_any_size_quad = 0; cost_pixel_any_size_multi_func * uvg_satd_any_size_quad = 0;
pixels_calc_ssd_func * uvg_pixels_calc_ssd = 0; pixels_calc_ssd_func * uvg_pixels_calc_ssd = 0;

View file

@ -124,7 +124,7 @@ typedef unsigned (cost_pixel_any_size_func)(
typedef void (cost_pixel_nxn_multi_func)(const pred_buffer preds, const uvg_pixel *orig, unsigned num_modes, unsigned *costs_out); typedef void (cost_pixel_nxn_multi_func)(const pred_buffer preds, const uvg_pixel *orig, unsigned num_modes, unsigned *costs_out);
typedef void (cost_pixel_any_size_multi_func)(int width, int height, const uvg_pixel **preds, const int stride, const uvg_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid); typedef void (cost_pixel_any_size_multi_func)(int width, int height, const uvg_pixel **preds, const int stride, const uvg_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid);
typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width); typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width, const int height);
typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t); typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t);
typedef uint32_t (ver_sad_func)(const uvg_pixel *pic_data, const uvg_pixel *ref_data, typedef uint32_t (ver_sad_func)(const uvg_pixel *pic_data, const uvg_pixel *ref_data,
int32_t block_width, int32_t block_height, int32_t block_width, int32_t block_height,
@ -175,6 +175,7 @@ extern cost_pixel_nxn_func * uvg_satd_16x16;
extern cost_pixel_nxn_func * uvg_satd_32x32; extern cost_pixel_nxn_func * uvg_satd_32x32;
extern cost_pixel_nxn_func * uvg_satd_64x64; extern cost_pixel_nxn_func * uvg_satd_64x64;
extern cost_pixel_any_size_func *uvg_satd_any_size; extern cost_pixel_any_size_func *uvg_satd_any_size;
extern cost_pixel_any_size_func *uvg_satd_any_size_vtm;
extern cost_pixel_nxn_multi_func * uvg_sad_4x4_dual; extern cost_pixel_nxn_multi_func * uvg_sad_4x4_dual;
extern cost_pixel_nxn_multi_func * uvg_sad_8x8_dual; extern cost_pixel_nxn_multi_func * uvg_sad_8x8_dual;
@ -221,6 +222,7 @@ cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned width, unsigne
{"satd_32x32", (void**) &uvg_satd_32x32}, \ {"satd_32x32", (void**) &uvg_satd_32x32}, \
{"satd_64x64", (void**) &uvg_satd_64x64}, \ {"satd_64x64", (void**) &uvg_satd_64x64}, \
{"satd_any_size", (void**) &uvg_satd_any_size}, \ {"satd_any_size", (void**) &uvg_satd_any_size}, \
{"satd_any_size_vtm", (void**) &uvg_satd_any_size_vtm}, \
{"sad_4x4_dual", (void**) &uvg_sad_4x4_dual}, \ {"sad_4x4_dual", (void**) &uvg_sad_4x4_dual}, \
{"sad_8x8_dual", (void**) &uvg_sad_8x8_dual}, \ {"sad_8x8_dual", (void**) &uvg_sad_8x8_dual}, \
{"sad_16x16_dual", (void**) &uvg_sad_16x16_dual}, \ {"sad_16x16_dual", (void**) &uvg_sad_16x16_dual}, \

View file

@ -617,7 +617,7 @@ void uvg_chroma_transform_search(
if (v_has_coeffs && !is_jccr) { if (v_has_coeffs && !is_jccr) {
uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, width, COLOR_V, uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, height, COLOR_V,
pred_cu->type, transforms[i] == CHROMA_TS); pred_cu->type, transforms[i] == CHROMA_TS);
if (transforms[i] != CHROMA_TS) { if (transforms[i] != CHROMA_TS) {
@ -661,10 +661,10 @@ void uvg_chroma_transform_search(
if (!state->encoder_control->cfg.lossless) { if (!state->encoder_control->cfg.lossless) {
ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[offset], &u_recon[trans_offset * i], ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[offset], &u_recon[trans_offset * i],
LCU_WIDTH_C, width, LCU_WIDTH_C, width,
width); width, height);
ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[offset], &v_recon[trans_offset * i], ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[offset], &v_recon[trans_offset * i],
LCU_WIDTH_C, width, LCU_WIDTH_C, width,
width); width, height);
} }
double u_bits = 0; double u_bits = 0;