From 8fbefc0de311df7fc2d2fdfe15114b8115295eb2 Mon Sep 17 00:00:00 2001
From: Joose Sainio
Date: Tue, 29 Nov 2022 07:47:05 +0200
Subject: [PATCH] [mtt] fix cost calculation

---
 src/search.c                              |  24 +-
 src/search_inter.c                        |   6 +-
 src/search_intra.c                        |   8 +-
 src/strategies/avx2/picture-avx2.c        |   3 +-
 src/strategies/generic/picture-generic.c  | 577 ++++++++++++++++++++++-
 src/strategies/strategies-picture.c       |   1 +
 src/strategies/strategies-picture.h       |   4 +-
 src/transform.c                           |   6 +-
 8 files changed, 603 insertions(+), 26 deletions(-)

diff --git a/src/search.c b/src/search.c
index bc55a805..3c76dc93 100644
--- a/src/search.c
+++ b/src/search.c
@@ -298,16 +298,16 @@ static double cu_zero_coeff_cost(
   double ssd = 0.0;
   ssd += UVG_LUMA_MULT * uvg_pixels_calc_ssd(
     &lcu->ref.y[luma_index], &lcu->rec.y[luma_index],
-    LCU_WIDTH, LCU_WIDTH, cu_loc->width
+    LCU_WIDTH, LCU_WIDTH, cu_loc->width, cu_loc->height
     );
   if (y_local % 8 == 0 && x_local % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) {
     ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd(
       &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index],
-      LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width
+      LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width, cu_loc->chroma_height
       );
     ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd(
       &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index],
-      LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width
+      LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width, cu_loc->chroma_height
       );
   }
   // Save the pixels at a lower level of the working tree.
@@ -445,7 +445,7 @@ double uvg_cu_rd_cost_luma(
     int index = cu_loc->local_y * LCU_WIDTH + cu_loc->local_x;
     ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
                               LCU_WIDTH, LCU_WIDTH,
-                              cu_loc->width);
+                              cu_loc->width, cu_loc->height);
   }
 
 
@@ -550,10 +550,10 @@ double uvg_cu_rd_cost_chroma(
     int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
     int ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
                                     LCU_WIDTH_C, LCU_WIDTH_C,
-                                    cu_loc->chroma_width);
+                                    cu_loc->chroma_width, cu_loc->chroma_height);
     int ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
                                     LCU_WIDTH_C, LCU_WIDTH_C,
-                                    cu_loc->chroma_width);
+                                    cu_loc->chroma_width, cu_loc->chroma_height);
     ssd = ssd_u + ssd_v;
   }
@@ -684,7 +684,7 @@ static double cu_rd_cost_tr_split_accurate(
     int index = cu_loc->local_x + LCU_WIDTH * cu_loc->local_y;
     luma_ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
                                    LCU_WIDTH, LCU_WIDTH,
-                                   width);
+                                   width, height);
   }
   // Chroma transform skip enable/disable is non-normative, so we need to count the chroma
   // tr-skip bits even when we are never using it.
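The thread running through these hunks is that uvg_pixels_calc_ssd now takes the block height in addition to the width. The old generic implementation looped over width rows, which is correct only for square blocks; with MTT enabled, binary and ternary splits produce rectangular CUs, so the SSD (and therefore the RD cost) came out wrong. Below is a minimal sketch of the corrected loop shape, using the same row/column roles as the patched pixels_calc_ssd_generic() further down in this patch; the buffers, sizes and the ssd_rect name are made-up examples, not part of the patch:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint8_t uvg_pixel;

/* Same loop shape as the fixed pixels_calc_ssd_generic(): the outer loop
 * runs over height, the inner over width, so a 16x8 block sums 128 terms
 * instead of the 16x16 = 256 (reading past the block) the old code did. */
static unsigned ssd_rect(const uvg_pixel *ref, const uvg_pixel *rec,
                         int ref_stride, int rec_stride,
                         int width, int height)
{
  unsigned ssd = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int diff = ref[x + y * ref_stride] - rec[x + y * rec_stride];
      ssd += (unsigned)(diff * diff);
    }
  }
  return ssd;
}

int main(void)
{
  uvg_pixel ref[8][16], rec[8][16];
  memset(ref, 100, sizeof(ref));
  memset(rec, 98, sizeof(rec));  /* every per-pixel diff is 2 */
  /* 16x8 block: 128 samples * 2^2 = 512 */
  printf("%u\n", ssd_rect(&ref[0][0], &rec[0][0], 16, 16, 16, 8));
  return 0;
}

With the old width-bounded outer loop, the same call would have summed 16 rows of a block that is only 8 tall, silently mixing in whatever pixels follow in the LCU-sized buffers.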
@@ -762,10 +762,10 @@ static double cu_rd_cost_tr_split_accurate(
     int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
     unsigned ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
                                          LCU_WIDTH_C, LCU_WIDTH_C,
-                                         chroma_width);
+                                         chroma_width, chroma_height);
     unsigned ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
                                          LCU_WIDTH_C, LCU_WIDTH_C,
-                                         chroma_width);
+                                         chroma_width, chroma_height);
     chroma_ssd = ssd_u + ssd_v;
   }
   if(chroma_can_use_tr_skip && cb_flag_u) {
@@ -783,10 +783,10 @@ static double cu_rd_cost_tr_split_accurate(
     int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
     int ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
                                           LCU_WIDTH_C, LCU_WIDTH_C,
-                                          chroma_width);
+                                          chroma_width, chroma_height);
     int ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
                                           LCU_WIDTH_C, LCU_WIDTH_C,
-                                          chroma_width);
+                                          chroma_width, chroma_height);
     chroma_ssd = ssd_u_joint + ssd_v_joint;
   }
   if (chroma_can_use_tr_skip) {
@@ -1360,7 +1360,7 @@ static double search_cu(
     cabac_data_t best_split_cabac;
     memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac));
     for (int split_type = QT_SPLIT; split_type <= TT_VER_SPLIT; ++split_type) {
-      if (!can_split[split_type] || split_type != QT_SPLIT) continue;
+      if (!can_split[split_type] || (split_type != QT_SPLIT && depth == 0) || (split_type == QT_SPLIT && depth == 1)) continue;
       split_tree_t new_split = {
         split_tree.split_tree | split_type << (split_tree.current_depth * 3),
         split_tree.current_depth + 1,
diff --git a/src/search_inter.c b/src/search_inter.c
index 92a62795..76c7fc36 100644
--- a/src/search_inter.c
+++ b/src/search_inter.c
@@ -2144,15 +2144,15 @@ void uvg_cu_cost_inter_rd2(
   int index = y_px * LCU_WIDTH + x_px;
   double ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
                                    LCU_WIDTH, LCU_WIDTH,
-                                   width) * UVG_LUMA_MULT;
+                                   width, height) * UVG_LUMA_MULT;
   if (reconstruct_chroma) {
     int index = y_px / 2 * LCU_WIDTH_C + x_px / 2;
     double ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
                                        LCU_WIDTH_C, LCU_WIDTH_C,
-                                       cu_loc->chroma_width);
+                                       cu_loc->chroma_width, cu_loc->chroma_height);
     double ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
                                        LCU_WIDTH_C, LCU_WIDTH_C,
-                                       cu_loc->chroma_width);
+                                       cu_loc->chroma_width, cu_loc->chroma_height);
     ssd += (ssd_u + ssd_v) * UVG_CHROMA_MULT;
   }
   double no_cbf_bits;
diff --git a/src/search_intra.c b/src/search_intra.c
index 1ed00943..2a406076 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -145,15 +145,15 @@ static void get_cost_dual(
   if (satd_twin_func != NULL) {
     satd_twin_func(preds, orig_block, PARALLEL_BLKS, satd_costs);
   } else {
-    satd_costs[0] = uvg_satd_any_size(width, height, preds[0], width, orig_block, LCU_WIDTH);
-    satd_costs[1] = uvg_satd_any_size(width, height, preds[1], width, orig_block, LCU_WIDTH);
+    satd_costs[0] = uvg_satd_any_size_vtm(width, height, orig_block, width, preds[0], width);
+    satd_costs[1] = uvg_satd_any_size_vtm(width, height, orig_block, width, preds[1], width);
   }
   unsigned unsigned_sad_costs[PARALLEL_BLKS] = { 0 };
   if (sad_twin_func != NULL) {
     sad_twin_func(preds, orig_block, PARALLEL_BLKS, unsigned_sad_costs);
   } else {
-    unsigned_sad_costs[0] = uvg_reg_sad(preds[0], orig_block, width, height, width, LCU_WIDTH);
-    unsigned_sad_costs[1] = uvg_reg_sad(preds[1], orig_block, width, height, width, LCU_WIDTH);
+    unsigned_sad_costs[0] = uvg_reg_sad(preds[0], orig_block, width, height, width, width);
+    unsigned_sad_costs[1] = uvg_reg_sad(preds[1], orig_block, width, height, width, width);
   }
   costs_out[0] = (double)MIN(satd_costs[0], unsigned_sad_costs[0] * 2);
   costs_out[1] = (double)MIN(satd_costs[1], unsigned_sad_costs[1] * 2);
diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c
index a911928d..5d0b203c 100644
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@@ -716,8 +716,9 @@ SATD_ANY_SIZE_MULTI_AVX2(quad_avx2, 4)
 static unsigned pixels_calc_ssd_avx2(const uint8_t *const ref, const uint8_t *const rec,
                                      const int ref_stride, const int rec_stride,
-                                     const int width)
+                                     const int width, const int height)
 {
+  assert(width == height && "Non square not yet implemented");
   __m256i ssd_part;
   __m256i diff = _mm256_setzero_si256();
   __m128i sum;
diff --git a/src/strategies/generic/picture-generic.c b/src/strategies/generic/picture-generic.c
index 6797a669..d6e3c81c 100644
--- a/src/strategies/generic/picture-generic.c
+++ b/src/strategies/generic/picture-generic.c
@@ -32,6 +32,7 @@
 #include "strategies/generic/picture-generic.h"
 
+#include <math.h>
 #include <stdlib.h>
 
 #include "strategies/strategies-picture.h"
@@ -474,6 +475,577 @@ SATD_DUAL_NXN(64, uvg_pixel)
 SATD_ANY_SIZE_MULTI_GENERIC(quad_generic, 4)
 
+uint64_t xCalcHADs2x2(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
+{
+  uint64_t satd = 0;
+  coeff_t diff[4], m[4];
+
+  diff[0] = piOrg[0] - piCur[0];
+  diff[1] = piOrg[1] - piCur[1];
+  diff[2] = piOrg[iStrideOrg] - piCur[0 + iStrideCur];
+  diff[3] = piOrg[iStrideOrg + 1] - piCur[1 + iStrideCur];
+  m[0] = diff[0] + diff[2];
+  m[1] = diff[1] + diff[3];
+  m[2] = diff[0] - diff[2];
+  m[3] = diff[1] - diff[3];
+
+  satd += abs(m[0] + m[1]) >> 2;
+  satd += abs(m[0] - m[1]);
+  satd += abs(m[2] + m[3]);
+  satd += abs(m[2] - m[3]);
+
+  return satd;
+}
+
+
+static uint64_t xCalcHADs16x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
+{ // need to add SIMD implementation, JCA
+  int k, i, j, jj, sad = 0;
+  int diff[128], m1[8][16], m2[8][16];
+  for (k = 0; k < 128; k += 16)
+  {
+    diff[k + 0] = piOrg[0] - piCur[0];
+    diff[k + 1] = piOrg[1] - piCur[1];
+    diff[k + 2] = piOrg[2] - piCur[2];
+    diff[k + 3] = piOrg[3] - piCur[3];
+    diff[k + 4] = piOrg[4] - piCur[4];
+    diff[k + 5] = piOrg[5] - piCur[5];
+    diff[k + 6] = piOrg[6] - piCur[6];
+    diff[k + 7] = piOrg[7] - piCur[7];
+
+    diff[k + 8] = piOrg[8] - piCur[8];
+    diff[k + 9] = piOrg[9] - piCur[9];
+    diff[k + 10] = piOrg[10] - piCur[10];
+    diff[k + 11] = piOrg[11] - piCur[11];
+    diff[k + 12] = piOrg[12] - piCur[12];
+    diff[k + 13] = piOrg[13] - piCur[13];
+    diff[k + 14] = piOrg[14] - piCur[14];
+    diff[k + 15] = piOrg[15] - piCur[15];
+
+    piCur += iStrideCur;
+    piOrg += iStrideOrg;
+  }
+
+  //horizontal
+  for (j = 0; j < 8; j++)
+  {
+    jj = j << 4;
+
+    m2[j][0] = diff[jj] + diff[jj + 8];
+    m2[j][1] = diff[jj + 1] + diff[jj + 9];
+    m2[j][2] = diff[jj + 2] + diff[jj + 10];
+    m2[j][3] = diff[jj + 3] + diff[jj + 11];
+    m2[j][4] = diff[jj + 4] + diff[jj + 12];
+    m2[j][5] = diff[jj + 5] + diff[jj + 13];
+    m2[j][6] = diff[jj + 6] + diff[jj + 14];
+    m2[j][7] = diff[jj + 7] + diff[jj + 15];
+    m2[j][8] = diff[jj] - diff[jj + 8];
+    m2[j][9] = diff[jj + 1] - diff[jj + 9];
+    m2[j][10] = diff[jj + 2] - diff[jj + 10];
+    m2[j][11] = diff[jj + 3] - diff[jj + 11];
+    m2[j][12] = diff[jj + 4] - diff[jj + 12];
+    m2[j][13] = diff[jj + 5] - diff[jj + 13];
+    m2[j][14] = diff[jj + 6] - diff[jj + 14];
+    m2[j][15] = diff[jj + 7] - diff[jj + 15];
+
+    m1[j][0] = m2[j][0] + m2[j][4];
+    m1[j][1] = m2[j][1] + m2[j][5];
+    m1[j][2] = m2[j][2] + m2[j][6];
+    m1[j][3] = m2[j][3] + m2[j][7];
+    m1[j][4] = m2[j][0] - m2[j][4];
+    m1[j][5] = m2[j][1] - m2[j][5];
+    m1[j][6] = m2[j][2] - m2[j][6];
+    m1[j][7] = m2[j][3] - m2[j][7];
+    m1[j][8] = m2[j][8] + m2[j][12];
+    m1[j][9] = m2[j][9] + m2[j][13];
+    m1[j][10] = m2[j][10] + m2[j][14];
+    m1[j][11] = m2[j][11] + m2[j][15];
+    m1[j][12] = m2[j][8] - m2[j][12];
+    m1[j][13] = m2[j][9] - m2[j][13];
+    m1[j][14] = m2[j][10] - m2[j][14];
+    m1[j][15] = m2[j][11] - m2[j][15];
+
+    m2[j][0] = m1[j][0] + m1[j][2];
+    m2[j][1] = m1[j][1] + m1[j][3];
+    m2[j][2] = m1[j][0] - m1[j][2];
+    m2[j][3] = m1[j][1] - m1[j][3];
+    m2[j][4] = m1[j][4] + m1[j][6];
+    m2[j][5] = m1[j][5] + m1[j][7];
+    m2[j][6] = m1[j][4] - m1[j][6];
+    m2[j][7] = m1[j][5] - m1[j][7];
+    m2[j][8] = m1[j][8] + m1[j][10];
+    m2[j][9] = m1[j][9] + m1[j][11];
+    m2[j][10] = m1[j][8] - m1[j][10];
+    m2[j][11] = m1[j][9] - m1[j][11];
+    m2[j][12] = m1[j][12] + m1[j][14];
+    m2[j][13] = m1[j][13] + m1[j][15];
+    m2[j][14] = m1[j][12] - m1[j][14];
+    m2[j][15] = m1[j][13] - m1[j][15];
+
+    m1[j][0] = m2[j][0] + m2[j][1];
+    m1[j][1] = m2[j][0] - m2[j][1];
+    m1[j][2] = m2[j][2] + m2[j][3];
+    m1[j][3] = m2[j][2] - m2[j][3];
+    m1[j][4] = m2[j][4] + m2[j][5];
+    m1[j][5] = m2[j][4] - m2[j][5];
+    m1[j][6] = m2[j][6] + m2[j][7];
+    m1[j][7] = m2[j][6] - m2[j][7];
+    m1[j][8] = m2[j][8] + m2[j][9];
+    m1[j][9] = m2[j][8] - m2[j][9];
+    m1[j][10] = m2[j][10] + m2[j][11];
+    m1[j][11] = m2[j][10] - m2[j][11];
+    m1[j][12] = m2[j][12] + m2[j][13];
+    m1[j][13] = m2[j][12] - m2[j][13];
+    m1[j][14] = m2[j][14] + m2[j][15];
+    m1[j][15] = m2[j][14] - m2[j][15];
+  }
+
+  //vertical
+  for (i = 0; i < 16; i++)
+  {
+    m2[0][i] = m1[0][i] + m1[4][i];
+    m2[1][i] = m1[1][i] + m1[5][i];
+    m2[2][i] = m1[2][i] + m1[6][i];
+    m2[3][i] = m1[3][i] + m1[7][i];
+    m2[4][i] = m1[0][i] - m1[4][i];
+    m2[5][i] = m1[1][i] - m1[5][i];
+    m2[6][i] = m1[2][i] - m1[6][i];
+    m2[7][i] = m1[3][i] - m1[7][i];
+
+    m1[0][i] = m2[0][i] + m2[2][i];
+    m1[1][i] = m2[1][i] + m2[3][i];
+    m1[2][i] = m2[0][i] - m2[2][i];
+    m1[3][i] = m2[1][i] - m2[3][i];
+    m1[4][i] = m2[4][i] + m2[6][i];
+    m1[5][i] = m2[5][i] + m2[7][i];
+    m1[6][i] = m2[4][i] - m2[6][i];
+    m1[7][i] = m2[5][i] - m2[7][i];
+
+    m2[0][i] = m1[0][i] + m1[1][i];
+    m2[1][i] = m1[0][i] - m1[1][i];
+    m2[2][i] = m1[2][i] + m1[3][i];
+    m2[3][i] = m1[2][i] - m1[3][i];
+    m2[4][i] = m1[4][i] + m1[5][i];
+    m2[5][i] = m1[4][i] - m1[5][i];
+    m2[6][i] = m1[6][i] + m1[7][i];
+    m2[7][i] = m1[6][i] - m1[7][i];
+  }
+
+  for (i = 0; i < 8; i++)
+  {
+    for (j = 0; j < 16; j++)
+    {
+      sad += abs(m2[i][j]);
+    }
+  }
+
+  sad -= abs(m2[0][0]);
+  sad += abs(m2[0][0]) >> 2;
+  sad = (int)(sad / sqrt(16.0 * 8) * 2);
+
+  return sad;
+}
+
+static uint64_t xCalcHADs8x16(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
+{
+  int k, i, j, jj, sad = 0;
+  int diff[128], m1[16][8], m2[16][8];
+  for (k = 0; k < 128; k += 8)
+  {
+    diff[k + 0] = piOrg[0] - piCur[0];
+    diff[k + 1] = piOrg[1] - piCur[1];
+    diff[k + 2] = piOrg[2] - piCur[2];
+    diff[k + 3] = piOrg[3] - piCur[3];
+    diff[k + 4] = piOrg[4] - piCur[4];
+    diff[k + 5] = piOrg[5] - piCur[5];
+    diff[k + 6] = piOrg[6] - piCur[6];
+    diff[k + 7] = piOrg[7] - piCur[7];
+
+    piCur += iStrideCur;
+    piOrg += iStrideOrg;
+  }
+
+  //horizontal
+  for (j = 0; j < 16; j++)
+  {
+    jj = j << 3;
+
+    m2[j][0] = diff[jj] + diff[jj + 4];
+    m2[j][1] = diff[jj + 1] + diff[jj + 5];
+    m2[j][2] = diff[jj + 2] + diff[jj + 6];
+    m2[j][3] = diff[jj + 3] + diff[jj + 7];
+    m2[j][4] = diff[jj] - diff[jj + 4];
+    m2[j][5] = diff[jj + 1] - diff[jj + 5];
+    m2[j][6] = diff[jj + 2] - diff[jj + 6];
+    m2[j][7] = diff[jj + 3] - diff[jj + 7];
+
+    m1[j][0] = m2[j][0] + m2[j][2];
+    m1[j][1] = m2[j][1] + m2[j][3];
+    m1[j][2] = m2[j][0] - m2[j][2];
+    m1[j][3] = m2[j][1] - m2[j][3];
+    m1[j][4] = m2[j][4] + m2[j][6];
+    m1[j][5] = m2[j][5] + m2[j][7];
+    m1[j][6] = m2[j][4] - m2[j][6];
+    m1[j][7] = m2[j][5] - m2[j][7];
+
+    m2[j][0] = m1[j][0] + m1[j][1];
+    m2[j][1] = m1[j][0] - m1[j][1];
+    m2[j][2] = m1[j][2] + m1[j][3];
+    m2[j][3] = m1[j][2] - m1[j][3];
+    m2[j][4] = m1[j][4] + m1[j][5];
+    m2[j][5] = m1[j][4] - m1[j][5];
+    m2[j][6] = m1[j][6] + m1[j][7];
+    m2[j][7] = m1[j][6] - m1[j][7];
+  }
+
+  //vertical
+  for (i = 0; i < 8; i++)
+  {
+    m1[0][i] = m2[0][i] + m2[8][i];
+    m1[1][i] = m2[1][i] + m2[9][i];
+    m1[2][i] = m2[2][i] + m2[10][i];
+    m1[3][i] = m2[3][i] + m2[11][i];
+    m1[4][i] = m2[4][i] + m2[12][i];
+    m1[5][i] = m2[5][i] + m2[13][i];
+    m1[6][i] = m2[6][i] + m2[14][i];
+    m1[7][i] = m2[7][i] + m2[15][i];
+    m1[8][i] = m2[0][i] - m2[8][i];
+    m1[9][i] = m2[1][i] - m2[9][i];
+    m1[10][i] = m2[2][i] - m2[10][i];
+    m1[11][i] = m2[3][i] - m2[11][i];
+    m1[12][i] = m2[4][i] - m2[12][i];
+    m1[13][i] = m2[5][i] - m2[13][i];
+    m1[14][i] = m2[6][i] - m2[14][i];
+    m1[15][i] = m2[7][i] - m2[15][i];
+
+    m2[0][i] = m1[0][i] + m1[4][i];
+    m2[1][i] = m1[1][i] + m1[5][i];
+    m2[2][i] = m1[2][i] + m1[6][i];
+    m2[3][i] = m1[3][i] + m1[7][i];
+    m2[4][i] = m1[0][i] - m1[4][i];
+    m2[5][i] = m1[1][i] - m1[5][i];
+    m2[6][i] = m1[2][i] - m1[6][i];
+    m2[7][i] = m1[3][i] - m1[7][i];
+    m2[8][i] = m1[8][i] + m1[12][i];
+    m2[9][i] = m1[9][i] + m1[13][i];
+    m2[10][i] = m1[10][i] + m1[14][i];
+    m2[11][i] = m1[11][i] + m1[15][i];
+    m2[12][i] = m1[8][i] - m1[12][i];
+    m2[13][i] = m1[9][i] - m1[13][i];
+    m2[14][i] = m1[10][i] - m1[14][i];
+    m2[15][i] = m1[11][i] - m1[15][i];
+
+    m1[0][i] = m2[0][i] + m2[2][i];
+    m1[1][i] = m2[1][i] + m2[3][i];
+    m1[2][i] = m2[0][i] - m2[2][i];
+    m1[3][i] = m2[1][i] - m2[3][i];
+    m1[4][i] = m2[4][i] + m2[6][i];
+    m1[5][i] = m2[5][i] + m2[7][i];
+    m1[6][i] = m2[4][i] - m2[6][i];
+    m1[7][i] = m2[5][i] - m2[7][i];
+    m1[8][i] = m2[8][i] + m2[10][i];
+    m1[9][i] = m2[9][i] + m2[11][i];
+    m1[10][i] = m2[8][i] - m2[10][i];
+    m1[11][i] = m2[9][i] - m2[11][i];
+    m1[12][i] = m2[12][i] + m2[14][i];
+    m1[13][i] = m2[13][i] + m2[15][i];
+    m1[14][i] = m2[12][i] - m2[14][i];
+    m1[15][i] = m2[13][i] - m2[15][i];
+
+    m2[0][i] = m1[0][i] + m1[1][i];
+    m2[1][i] = m1[0][i] - m1[1][i];
+    m2[2][i] = m1[2][i] + m1[3][i];
+    m2[3][i] = m1[2][i] - m1[3][i];
+    m2[4][i] = m1[4][i] + m1[5][i];
+    m2[5][i] = m1[4][i] - m1[5][i];
+    m2[6][i] = m1[6][i] + m1[7][i];
+    m2[7][i] = m1[6][i] - m1[7][i];
+    m2[8][i] = m1[8][i] + m1[9][i];
+    m2[9][i] = m1[8][i] - m1[9][i];
+    m2[10][i] = m1[10][i] + m1[11][i];
+    m2[11][i] = m1[10][i] - m1[11][i];
+    m2[12][i] = m1[12][i] + m1[13][i];
+    m2[13][i] = m1[12][i] - m1[13][i];
+    m2[14][i] = m1[14][i] + m1[15][i];
+    m2[15][i] = m1[14][i] - m1[15][i];
+  }
+
+  for (i = 0; i < 16; i++)
+  {
+    for (j = 0; j < 8; j++)
+    {
+      sad += abs(m2[i][j]);
+    }
+  }
+
+  sad -= abs(m2[0][0]);
+  sad += abs(m2[0][0]) >> 2;
+  sad = (int)(sad / sqrt(16.0 * 8) * 2);
+
+  return sad;
+}
+
+static uint64_t xCalcHADs4x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
+{
+  int k, i, j, jj, sad = 0;
+  int diff[32], m1[8][4], m2[8][4];
+  for (k = 0; k < 32; k += 4)
+  {
+    diff[k + 0] = piOrg[0] - piCur[0];
+    diff[k + 1] = piOrg[1] - piCur[1];
+    diff[k + 2] = piOrg[2] - piCur[2];
+    diff[k + 3] = piOrg[3] - piCur[3];
+
+    piCur += iStrideCur;
+    piOrg += iStrideOrg;
+  }
+
+  //horizontal
+  for (j = 0; j < 8; j++)
+  {
+    jj = j << 2;
+    m2[j][0] = diff[jj] + diff[jj + 2];
+    m2[j][1] = diff[jj + 1] + diff[jj + 3];
+    m2[j][2] = diff[jj] - diff[jj + 2];
+    m2[j][3] = diff[jj + 1] - diff[jj + 3];
+
+    m1[j][0] = m2[j][0] + m2[j][1];
+    m1[j][1] = m2[j][0] - m2[j][1];
+    m1[j][2] = m2[j][2] + m2[j][3];
+    m1[j][3] = m2[j][2] - m2[j][3];
+  }
+
+  //vertical
+  for (i = 0; i < 4; i++)
+  {
+    m2[0][i] = m1[0][i] + m1[4][i];
+    m2[1][i] = m1[1][i] + m1[5][i];
+    m2[2][i] = m1[2][i] + m1[6][i];
+    m2[3][i] = m1[3][i] + m1[7][i];
+    m2[4][i] = m1[0][i] - m1[4][i];
+    m2[5][i] = m1[1][i] - m1[5][i];
+    m2[6][i] = m1[2][i] - m1[6][i];
+    m2[7][i] = m1[3][i] - m1[7][i];
+
+    m1[0][i] = m2[0][i] + m2[2][i];
+    m1[1][i] = m2[1][i] + m2[3][i];
+    m1[2][i] = m2[0][i] - m2[2][i];
+    m1[3][i] = m2[1][i] - m2[3][i];
+    m1[4][i] = m2[4][i] + m2[6][i];
+    m1[5][i] = m2[5][i] + m2[7][i];
+    m1[6][i] = m2[4][i] - m2[6][i];
+    m1[7][i] = m2[5][i] - m2[7][i];
+
+    m2[0][i] = m1[0][i] + m1[1][i];
+    m2[1][i] = m1[0][i] - m1[1][i];
+    m2[2][i] = m1[2][i] + m1[3][i];
+    m2[3][i] = m1[2][i] - m1[3][i];
+    m2[4][i] = m1[4][i] + m1[5][i];
+    m2[5][i] = m1[4][i] - m1[5][i];
+    m2[6][i] = m1[6][i] + m1[7][i];
+    m2[7][i] = m1[6][i] - m1[7][i];
+  }
+
+  for (i = 0; i < 8; i++)
+  {
+    for (j = 0; j < 4; j++)
+    {
+      sad += abs(m2[i][j]);
+    }
+  }
+
+  sad -= abs(m2[0][0]);
+  sad += abs(m2[0][0]) >> 2;
+  sad = (int)(sad / sqrt(4.0 * 8) * 2);
+
+  return sad;
+}
+
+static uint64_t xCalcHADs8x4(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur)
+{
+  int k, i, j, jj, sad = 0;
+  int diff[32], m1[4][8], m2[4][8];
+  for (k = 0; k < 32; k += 8)
+  {
+    diff[k + 0] = piOrg[0] - piCur[0];
+    diff[k + 1] = piOrg[1] - piCur[1];
+    diff[k + 2] = piOrg[2] - piCur[2];
+    diff[k + 3] = piOrg[3] - piCur[3];
+    diff[k + 4] = piOrg[4] - piCur[4];
+    diff[k + 5] = piOrg[5] - piCur[5];
+    diff[k + 6] = piOrg[6] - piCur[6];
+    diff[k + 7] = piOrg[7] - piCur[7];
+
+    piCur += iStrideCur;
+    piOrg += iStrideOrg;
+  }
+
+  //horizontal
+  for (j = 0; j < 4; j++)
+  {
+    jj = j << 3;
+
+    m2[j][0] = diff[jj] + diff[jj + 4];
+    m2[j][1] = diff[jj + 1] + diff[jj + 5];
+    m2[j][2] = diff[jj + 2] + diff[jj + 6];
+    m2[j][3] = diff[jj + 3] + diff[jj + 7];
+    m2[j][4] = diff[jj] - diff[jj + 4];
+    m2[j][5] = diff[jj + 1] - diff[jj + 5];
+    m2[j][6] = diff[jj + 2] - diff[jj + 6];
+    m2[j][7] = diff[jj + 3] - diff[jj + 7];
+
+    m1[j][0] = m2[j][0] + m2[j][2];
+    m1[j][1] = m2[j][1] + m2[j][3];
+    m1[j][2] = m2[j][0] - m2[j][2];
+    m1[j][3] = m2[j][1] - m2[j][3];
+    m1[j][4] = m2[j][4] + m2[j][6];
+    m1[j][5] = m2[j][5] + m2[j][7];
+    m1[j][6] = m2[j][4] - m2[j][6];
+    m1[j][7] = m2[j][5] - m2[j][7];
+
+    m2[j][0] = m1[j][0] + m1[j][1];
+    m2[j][1] = m1[j][0] - m1[j][1];
+    m2[j][2] = m1[j][2] + m1[j][3];
+    m2[j][3] = m1[j][2] - m1[j][3];
+    m2[j][4] = m1[j][4] + m1[j][5];
+    m2[j][5] = m1[j][4] - m1[j][5];
+    m2[j][6] = m1[j][6] + m1[j][7];
+    m2[j][7] = m1[j][6] - m1[j][7];
+  }
+
+  //vertical
+  for (i = 0; i < 8; i++)
+  {
+    m1[0][i] = m2[0][i] + m2[2][i];
+    m1[1][i] = m2[1][i] + m2[3][i];
+    m1[2][i] = m2[0][i] - m2[2][i];
+    m1[3][i] = m2[1][i] - m2[3][i];
+
+    m2[0][i] = m1[0][i] + m1[1][i];
+    m2[1][i] = m1[0][i] - m1[1][i];
+    m2[2][i] = m1[2][i] + m1[3][i];
+    m2[3][i] = m1[2][i] - m1[3][i];
+  }
+
+  for (i = 0; i < 4; i++)
+  {
+    for (j = 0; j < 8; j++)
+    {
+      sad += abs(m2[i][j]);
+    }
+  }
+
+  sad -= abs(m2[0][0]);
+  sad += abs(m2[0][0]) >> 2;
+  sad = (int)(sad / sqrt(4.0 * 8) * 2);
+
+  return sad;
+}
+
+
+uint64_t xGetHADs(int width, int height, const uvg_pixel* ref_in, int ref_stride, const uvg_pixel* pred_in, int pred_stride)
+{
+  const uvg_pixel* piOrg = ref_in;
+  const uvg_pixel* piCur = pred_in;
+  const int iRows = height;
+  const int iCols = width;
+  const int iStrideOrg = ref_stride;
+  const int iStrideCur = pred_stride;
+
+  int x = 0, y = 0;
+
+  uint64_t uiSum = 0;
+
+  if (iCols > iRows && (iRows & 7) == 0 && (iCols & 15) == 0)
+  {
+    for (y = 0; y < iRows; y += 8)
+    {
+      for (x = 0; x < iCols; x += 16)
+      {
+        uiSum += xCalcHADs16x8(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur);
+      }
+      piOrg += iStrideOrg * 8;
+      piCur += iStrideCur * 8;
+    }
+  }
+  else if (iCols < iRows && (iCols & 7) == 0 && (iRows & 15) == 0)
+  {
+    for (y = 0; y < iRows; y += 16)
+    {
+      for (x = 0; x < iCols; x += 8)
+      {
+        uiSum += xCalcHADs8x16(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur);
+      }
+      piOrg += iStrideOrg * 16;
+      piCur += iStrideCur * 16;
+    }
+  }
+  else if (iCols > iRows && (iRows & 3) == 0 && (iCols & 7) == 0)
+  {
+    for (y = 0; y < iRows; y += 4)
+    {
+      for (x = 0; x < iCols; x += 8)
+      {
+        uiSum += xCalcHADs8x4(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur);
+      }
+      piOrg += iStrideOrg * 4;
+      piCur += iStrideCur * 4;
+    }
+  }
+  else if (iCols < iRows && (iCols & 3) == 0 && (iRows & 7) == 0)
+  {
+    for (y = 0; y < iRows; y += 8)
+    {
+      for (x = 0; x < iCols; x += 4)
+      {
+        uiSum += xCalcHADs4x8(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur);
+      }
+      piOrg += iStrideOrg * 8;
+      piCur += iStrideCur * 8;
+    }
+  }
+  else if ((iRows % 8 == 0) && (iCols % 8 == 0))
+  {
+    for (y = 0; y < iRows; y += 8)
+    {
+      for (x = 0; x < iCols; x += 8)
+      {
+        uiSum += satd_8x8_subblock_generic(&piOrg[x], iStrideOrg, &piCur[x], iStrideCur);
+      }
+      piOrg += 8 * iStrideOrg;
+      piCur += 8 * iStrideCur;
+    }
+  }
+  else if ((iRows % 4 == 0) && (iCols % 4 == 0))
+  {
+    for (y = 0; y < iRows; y += 4)
+    {
+      for (x = 0; x < iCols; x += 4)
+      {
+        uiSum += uvg_satd_4x4_subblock_generic(&piOrg[x], iStrideOrg, &piCur[x], iStrideCur);
+      }
+      piOrg += 4 * iStrideOrg;
+      piCur += 4 * iStrideCur;
+    }
+  }
+  else if ((iRows % 2 == 0) && (iCols % 2 == 0))
+  {
+    for (y = 0; y < iRows; y += 2)
+    {
+      for (x = 0; x < iCols; x += 2)
+      {
+        uiSum += xCalcHADs2x2(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur);
+      }
+      piOrg += 2 * iStrideOrg;
+      piCur += 2 * iStrideCur;
+    }
+  }
+
+  // TODO: 10 bit
+  return (uiSum >> 0);
+}
+
+
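xGetHADs above is a port of VTM's rectangular SATD: it tiles the block with the largest Hadamard kernel the dimensions admit (16x8 or 8x16 for 2:1 shapes, 8x4/4x8 for smaller ones, then the square 8x8/4x4 kernels, with 2x2 as the last resort), and each rectangular kernel rescales its magnitude sum by 2/sqrt(width*height) so its output stays on the same scale as the square transforms. A small self-contained illustration of that normalisation; the raw sum value is invented:

#include <math.h>
#include <stdio.h>

int main(void)
{
  /* Hypothetical raw Hadamard magnitude sum for one 16x8 sub-block. */
  int sad = 5000;
  /* Same scaling as the xCalcHADs16x8/8x16 kernels above: divide by
   * sqrt(16 * 8) and multiply by 2, i.e. scale by 2/sqrt(128) ~= 0.177. */
  int satd = (int)(sad / sqrt(16.0 * 8) * 2);
  printf("%d\n", satd); /* prints 883 */
  return 0;
}

The factor generalises what the square kernels already do: for an 8x8 block, 2/sqrt(8*8) = 1/4 matches the conventional >>2 of an 8x8 SATD, and for 4x4, 2/sqrt(4*4) = 1/2 matches its >>1, so rectangular and square costs remain comparable.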
 // Function macro for defining SAD calculating functions
 // for fixed size blocks.
 #define SAD_NXN(n, pixel_type) \
@@ -539,12 +1111,12 @@ SAD_DUAL_NXN(64, uvg_pixel)
 static unsigned pixels_calc_ssd_generic(const uvg_pixel *const ref, const uvg_pixel *const rec,
                                         const int ref_stride, const int rec_stride,
-                                        const int width)
+                                        const int width, const int height)
 {
   int ssd = 0;
   int y, x;
 
-  for (y = 0; y < width; ++y) {
+  for (y = 0; y < height; ++y) {
     for (x = 0; x < width; ++x) {
       int diff = ref[x + y * ref_stride] - rec[x + y * rec_stride];
       ssd += diff * diff;
@@ -897,6 +1469,7 @@ int uvg_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
   success &= uvg_strategyselector_register(opaque, "satd_32x32_dual", "generic", 0, &satd_32x32_dual_generic);
   success &= uvg_strategyselector_register(opaque, "satd_64x64_dual", "generic", 0, &satd_64x64_dual_generic);
   success &= uvg_strategyselector_register(opaque, "satd_any_size", "generic", 0, &satd_any_size_generic);
+  success &= uvg_strategyselector_register(opaque, "satd_any_size_vtm", "generic", 0, &xGetHADs);
   success &= uvg_strategyselector_register(opaque, "satd_any_size_quad", "generic", 0, &satd_any_size_quad_generic);
 
   success &= uvg_strategyselector_register(opaque, "pixels_calc_ssd", "generic", 0, &pixels_calc_ssd_generic);
diff --git a/src/strategies/strategies-picture.c b/src/strategies/strategies-picture.c
index 37d3cb75..643d2f8f 100644
--- a/src/strategies/strategies-picture.c
+++ b/src/strategies/strategies-picture.c
@@ -70,6 +70,7 @@ cost_pixel_nxn_multi_func * uvg_satd_32x32_dual = 0;
 cost_pixel_nxn_multi_func * uvg_satd_64x64_dual = 0;
 cost_pixel_any_size_func * uvg_satd_any_size = 0;
+cost_pixel_any_size_func * uvg_satd_any_size_vtm = 0;
 cost_pixel_any_size_multi_func * uvg_satd_any_size_quad = 0;
 
 pixels_calc_ssd_func * uvg_pixels_calc_ssd = 0;
diff --git a/src/strategies/strategies-picture.h b/src/strategies/strategies-picture.h
index 286a0735..cd4e2ec5 100644
--- a/src/strategies/strategies-picture.h
+++ b/src/strategies/strategies-picture.h
@@ -124,7 +124,7 @@ typedef unsigned (cost_pixel_any_size_func)(
 typedef void (cost_pixel_nxn_multi_func)(const pred_buffer preds, const uvg_pixel *orig, unsigned num_modes, unsigned *costs_out);
 typedef void (cost_pixel_any_size_multi_func)(int width, int height, const uvg_pixel **preds, const int stride, const uvg_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid);
 
-typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width);
+typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width, const int height);
 
 typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t);
 typedef uint32_t (ver_sad_func)(const uvg_pixel *pic_data, const uvg_pixel *ref_data,
                                 int32_t block_width, int32_t block_height,
@@ -175,6 +175,7 @@ extern cost_pixel_nxn_func * uvg_satd_16x16;
 extern cost_pixel_nxn_func * uvg_satd_32x32;
 extern cost_pixel_nxn_func * uvg_satd_64x64;
 extern cost_pixel_any_size_func *uvg_satd_any_size;
+extern cost_pixel_any_size_func *uvg_satd_any_size_vtm;
 
 extern cost_pixel_nxn_multi_func * uvg_sad_4x4_dual;
 extern cost_pixel_nxn_multi_func * uvg_sad_8x8_dual;
@@ -221,6 +222,7 @@ cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned width, unsigne
   {"satd_32x32", (void**) &uvg_satd_32x32}, \
   {"satd_64x64", (void**) &uvg_satd_64x64}, \
   {"satd_any_size", (void**) &uvg_satd_any_size}, \
+  {"satd_any_size_vtm", (void**) &uvg_satd_any_size_vtm}, \
   {"sad_4x4_dual", (void**) &uvg_sad_4x4_dual}, \
   {"sad_8x8_dual", (void**) &uvg_sad_8x8_dual}, \
   {"sad_16x16_dual", (void**) &uvg_sad_16x16_dual}, \
diff --git a/src/transform.c b/src/transform.c
index 26851b8d..7e2b64ee 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -617,7 +617,7 @@ void uvg_chroma_transform_search(
 
     if (v_has_coeffs && !is_jccr) {
-      uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, width, COLOR_V,
+      uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, height, COLOR_V,
         pred_cu->type, transforms[i] == CHROMA_TS);
       if (transforms[i] != CHROMA_TS) {
@@ -661,10 +661,10 @@ void uvg_chroma_transform_search(
     if (!state->encoder_control->cfg.lossless) {
       ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[offset], &u_recon[trans_offset * i],
                                   LCU_WIDTH_C, width,
-                                  width);
+                                  width, height);
       ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[offset], &v_recon[trans_offset * i],
                                   LCU_WIDTH_C, width,
-                                  width);
+                                  width, height);
     }
     double u_bits = 0;
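Apart from the cost interface, the search.c hunk at @@ -1360 also changes the split gating: at depth 0 only the quad split is tried, at depth 1 the quad split is excluded, and deeper levels try whatever can_split allows. Restated as a standalone predicate; the enum values and the try_split helper are hypothetical stand-ins, only the loop bounds QT_SPLIT..TT_VER_SPLIT come from the patch:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for uvg266's split-type enum; only the contiguous
 * QT_SPLIT..TT_VER_SPLIT range is implied by the loop in the patch. */
enum { NO_SPLIT, QT_SPLIT, BT_HOR_SPLIT, BT_VER_SPLIT, TT_HOR_SPLIT, TT_VER_SPLIT };

/* Mirrors the continue-condition in the search_cu() hunk, inverted into
 * an "is this split tried?" predicate. */
static bool try_split(int split_type, int depth, const bool can_split[6])
{
  if (!can_split[split_type]) return false;
  if (depth == 0) return split_type == QT_SPLIT;  /* root: quad split only */
  if (depth == 1) return split_type != QT_SPLIT;  /* depth 1: BT/TT only   */
  return true;                                    /* deeper: anything legal */
}

int main(void)
{
  const bool can_split[6] = { true, true, true, true, true, true };
  for (int d = 0; d < 3; d++)
    for (int s = QT_SPLIT; s <= TT_VER_SPLIT; s++)
      printf("depth %d, split %d: tried=%d\n", d, s, try_split(s, d, can_split));
  return 0;
}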