mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-24 02:24:07 +00:00
[alf] Change the order of the alf_covariance.ee array dimensions to get a better-optimized solution for alf_get_blk_stats_avx2()
This commit is contained in:
parent
f4de5cfd0f
commit
be9527cf1d
42
src/alf.c
42
src/alf.c
|
@ -119,7 +119,7 @@ static void get_clip_max(const alf_covariance *cov, int *clip_max)
|
||||||
{
|
{
|
||||||
for (int l = 0; inc && l < num_coeff; ++l)
|
for (int l = 0; inc && l < num_coeff; ++l)
|
||||||
{
|
{
|
||||||
if (cov->ee[clip_max[k]][0][k][l] != cov->ee[clip_max[k] + 1][0][k][l])
|
if (cov->ee[k][l][clip_max[k]][0] != cov->ee[k][l][clip_max[k] + 1][0])
|
||||||
{
|
{
|
||||||
inc = false;
|
inc = false;
|
||||||
}
|
}
|
||||||
|
@ -142,7 +142,7 @@ static void reduce_clip_cost(const alf_covariance *cov, int *clip)
|
||||||
{
|
{
|
||||||
for (int l = 0; dec && l < cov->num_coeff; ++l)
|
for (int l = 0; dec && l < cov->num_coeff; ++l)
|
||||||
{
|
{
|
||||||
if (cov->ee[clip[k]][clip[l]][k][l] != cov->ee[clip[k] - 1][clip[l]][k][l])
|
if (cov->ee[k][l][clip[k]][clip[l]] != cov->ee[k][l][clip[k] - 1][clip[l]])
|
||||||
{
|
{
|
||||||
dec = false;
|
dec = false;
|
||||||
}
|
}
|
||||||
|
@ -162,7 +162,7 @@ static void set_ey_from_clip(const alf_covariance *cov, const int* clip, double
|
||||||
y[k] = cov->y[clip[k]][k];
|
y[k] = cov->y[clip[k]][k];
|
||||||
for (int l = 0; l < size; l++)
|
for (int l = 0; l < size; l++)
|
||||||
{
|
{
|
||||||
ee[k][l] = cov->ee[clip[k]][clip[l]][k][l];
|
ee[k][l] = cov->ee[k][l][clip[k]][clip[l]];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -352,8 +352,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b
|
||||||
ky[k] = cov->y[clip[k]][k];
|
ky[k] = cov->y[clip[k]][k];
|
||||||
for (int l = 0; l < size; l++)
|
for (int l = 0; l < size; l++)
|
||||||
{
|
{
|
||||||
ke[k][l] = cov->ee[clip[k]][clip[l]][k][l];
|
ke[k][l] = cov->ee[k][l][clip[k]][clip[l]];
|
||||||
ke[l][k] = cov->ee[clip[l]][clip[k]][l][k];
|
ke[l][k] = cov->ee[l][k][clip[l]][clip[k]];
|
||||||
}
|
}
|
||||||
|
|
||||||
gns_solve_by_chol(ke, ky, f, size);
|
gns_solve_by_chol(ke, ky, f, size);
|
||||||
|
@ -373,8 +373,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b
|
||||||
ky[k] = cov->y[clip[k]][k];
|
ky[k] = cov->y[clip[k]][k];
|
||||||
for (int l = 0; l < size; l++)
|
for (int l = 0; l < size; l++)
|
||||||
{
|
{
|
||||||
ke[k][l] = cov->ee[clip[k]][clip[l]][k][l];
|
ke[k][l] = cov->ee[k][l][clip[k]][clip[l]];
|
||||||
ke[l][k] = cov->ee[clip[l]][clip[k]][l][k];
|
ke[l][k] = cov->ee[l][k][clip[l]][clip[k]];
|
||||||
}
|
}
|
||||||
|
|
||||||
gns_solve_by_chol(ke, ky, f, size);
|
gns_solve_by_chol(ke, ky, f, size);
|
||||||
|
@ -392,8 +392,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b
|
||||||
ky[k] = cov->y[clip[k]][k];
|
ky[k] = cov->y[clip[k]][k];
|
||||||
for (int l = 0; l < size; l++)
|
for (int l = 0; l < size; l++)
|
||||||
{
|
{
|
||||||
ke[k][l] = cov->ee[clip[k]][clip[l]][k][l];
|
ke[k][l] = cov->ee[k][l][clip[k]][clip[l]];
|
||||||
ke[l][k] = cov->ee[clip[l]][clip[k]][l][k];
|
ke[l][k] = cov->ee[l][k][clip[l]][clip[k]];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -404,8 +404,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b
|
||||||
ky[idx_min] = cov->y[clip[idx_min]][idx_min];
|
ky[idx_min] = cov->y[clip[idx_min]][idx_min];
|
||||||
for (int l = 0; l < size; l++)
|
for (int l = 0; l < size; l++)
|
||||||
{
|
{
|
||||||
ke[idx_min][l] = cov->ee[clip[idx_min]][clip[l]][idx_min][l];
|
ke[idx_min][l] = cov->ee[idx_min][l][clip[idx_min]][clip[l]];
|
||||||
ke[l][idx_min] = cov->ee[clip[l]][clip[idx_min]][l][idx_min];
|
ke[l][idx_min] = cov->ee[l][idx_min][clip[l]][clip[idx_min]];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -469,9 +469,9 @@ static double calc_error_for_coeffs(const alf_covariance *cov, const int *clip,
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
for (int j = i + 1; j < num_coeff; j++)
|
for (int j = i + 1; j < num_coeff; j++)
|
||||||
{
|
{
|
||||||
sum += cov->ee[clip[i]][clip[j]][i][j] * coeff[j];
|
sum += cov->ee[i][j][clip[i]][clip[j]] * coeff[j];
|
||||||
}
|
}
|
||||||
error += ((cov->ee[clip[i]][clip[i]][i][i] * coeff[i] + sum * 2) / factor - 2 * cov->y[clip[i]][i]) * coeff[i];
|
error += ((cov->ee[i][i][clip[i]][clip[i]] * coeff[i] + sum * 2) / factor - 2 * cov->y[clip[i]][i]) * coeff[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
return error / factor;
|
return error / factor;
|
||||||
|
@ -488,9 +488,9 @@ static double calc_error_for_cc_alf_coeffs(const alf_covariance *cov, const int1
|
||||||
for (int j = i + 1; j < num_coeff; j++)
|
for (int j = i + 1; j < num_coeff; j++)
|
||||||
{
|
{
|
||||||
// E[j][i] = E[i][j], sum will be multiplied by 2 later
|
// E[j][i] = E[i][j], sum will be multiplied by 2 later
|
||||||
sum += cov->ee[0][0][i][j] * coeff[j];
|
sum += cov->ee[i][j][0][0] * coeff[j];
|
||||||
}
|
}
|
||||||
error += ((cov->ee[0][0][i][i] * coeff[i] + sum * 2) / factor - 2 * cov->y[0][i]) * coeff[i];
|
error += ((cov->ee[i][i][0][0] * coeff[i] + sum * 2) / factor - 2 * cov->y[0][i]) * coeff[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
return error / factor;
|
return error / factor;
|
||||||
|
@ -753,7 +753,7 @@ static void add_alf_cov(alf_covariance *dst, alf_covariance *src)
|
||||||
{
|
{
|
||||||
for (int i = 0; i < num_coeff; i++)
|
for (int i = 0; i < num_coeff; i++)
|
||||||
{
|
{
|
||||||
dst->ee[b0][b1][j][i] += src->ee[b0][b1][j][i];
|
dst->ee[j][i][b0][b1] += src->ee[j][i][b0][b1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -780,7 +780,7 @@ static void add_alf_cov_lhs_rhs(alf_covariance *dst, alf_covariance *lhs, alf_co
|
||||||
{
|
{
|
||||||
for (int i = 0; i < num_coeff; i++)
|
for (int i = 0; i < num_coeff; i++)
|
||||||
{
|
{
|
||||||
dst->ee[b0][b1][j][i] = lhs->ee[b0][b1][j][i] + rhs->ee[b0][b1][j][i];
|
dst->ee[j][i][b0][b1] = lhs->ee[j][i][b0][b1] + rhs->ee[j][i][b0][b1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1972,7 +1972,7 @@ static void derive_cc_alf_filter_coeff(alf_covariance *alf_covariance_frame_cc_a
|
||||||
ky[k] = alf_covariance_frame_cc_alf[filter_idx].y[0][k];
|
ky[k] = alf_covariance_frame_cc_alf[filter_idx].y[0][k];
|
||||||
for (int l = 0; l < size; l++)
|
for (int l = 0; l < size; l++)
|
||||||
{
|
{
|
||||||
k_e[k][l] = alf_covariance_frame_cc_alf[filter_idx].ee[0][0][k][l];
|
k_e[k][l] = alf_covariance_frame_cc_alf[filter_idx].ee[k][l][0][0];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2766,11 +2766,11 @@ static void get_blk_stats_cc_alf(encoder_state_t * const state,
|
||||||
{
|
{
|
||||||
if (0 /*g_alf_wssd*/)
|
if (0 /*g_alf_wssd*/)
|
||||||
{
|
{
|
||||||
alf_covariance->ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]);
|
alf_covariance->ee[k][l][b0][b1] += weight * (e_local[k][b0] * (double)e_local[l][b1]);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
alf_covariance->ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1];
|
alf_covariance->ee[k][l][b0][b1] += e_local[k][b0] * (double)e_local[l][b1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2826,7 +2826,7 @@ static void get_blk_stats_cc_alf(encoder_state_t * const state,
|
||||||
{
|
{
|
||||||
for (int b1 = 0; b1 < num_bins; b1++)
|
for (int b1 = 0; b1 < num_bins; b1++)
|
||||||
{
|
{
|
||||||
alf_covariance->ee[b0][b1][k][l] = alf_covariance->ee[b1][b0][l][k];
|
alf_covariance->ee[k][l][b0][b1] = alf_covariance->ee[l][k][b1][b0];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -176,7 +176,7 @@ typedef enum {
|
||||||
PACK(
|
PACK(
|
||||||
typedef struct alf_covariance {
|
typedef struct alf_covariance {
|
||||||
double pix_acc;
|
double pix_acc;
|
||||||
int64_t ee[MAX_ALF_NUM_CLIPPING_VALUES][MAX_ALF_NUM_CLIPPING_VALUES][MAX_NUM_ALF_LUMA_COEFF][MAX_NUM_ALF_LUMA_COEFF];
|
int64_t ee[MAX_NUM_ALF_LUMA_COEFF][MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES][MAX_ALF_NUM_CLIPPING_VALUES];
|
||||||
int32_t y[MAX_ALF_NUM_CLIPPING_VALUES][MAX_NUM_ALF_LUMA_COEFF];
|
int32_t y[MAX_ALF_NUM_CLIPPING_VALUES][MAX_NUM_ALF_LUMA_COEFF];
|
||||||
int num_coeff;
|
int num_coeff;
|
||||||
int num_bins;
|
int num_bins;
|
||||||
|
|
|
@ -234,22 +234,15 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state,
|
||||||
__m256i e_local_b0_d = _mm256_set1_epi32((int32_t)e_local[k][b0]);
|
__m256i e_local_b0_d = _mm256_set1_epi32((int32_t)e_local[k][b0]);
|
||||||
/*for (int b1 = 0; b1 < 4; b1++)
|
/*for (int b1 = 0; b1 < 4; b1++)
|
||||||
{
|
{
|
||||||
alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1];
|
alf_covariance[class_idx].ee[k][l][b0][b1] += e_local[k][b0] * (double)e_local[l][b1];
|
||||||
}*/
|
}*/
|
||||||
|
|
||||||
//__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c)
|
|
||||||
__m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]);
|
__m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]);
|
||||||
__m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1);
|
__m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1);
|
||||||
__m256i multiplied = _mm256_mul_epi32(e_local_b0_d, e_local_32);
|
__m256i multiplied = _mm256_mul_epi32(e_local_b0_d, e_local_32);
|
||||||
int64_t data[4];
|
__m256i orig = _mm256_lddqu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0]);
|
||||||
_mm256_storeu_si256((__m256i*)data, multiplied);
|
_mm256_storeu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0], _mm256_add_epi64(multiplied, orig));
|
||||||
|
|
||||||
|
|
||||||
alf_covariance[class_idx].ee[b0][0][k][l] += data[0];
|
|
||||||
alf_covariance[class_idx].ee[b0][1][k][l] += data[1];
|
|
||||||
alf_covariance[class_idx].ee[b0][2][k][l] += data[2];
|
|
||||||
alf_covariance[class_idx].ee[b0][3][k][l] += data[3];
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
|
@ -288,7 +281,7 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state,
|
||||||
{
|
{
|
||||||
for (int b1 = 0; b1 < 4; b1++)
|
for (int b1 = 0; b1 < 4; b1++)
|
||||||
{
|
{
|
||||||
alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k];
|
alf_covariance[class_idx].ee[k][l][b0][b1] = alf_covariance[class_idx].ee[l][k][b1][b0];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -933,11 +933,11 @@ static void alf_get_blk_stats_generic(encoder_state_t* const state,
|
||||||
{
|
{
|
||||||
if (0/*m_alfWSSD*/)
|
if (0/*m_alfWSSD*/)
|
||||||
{
|
{
|
||||||
alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]);
|
alf_covariance[class_idx].ee[k][l][b0][b1] += weight * (e_local[k][b0] * (double)e_local[l][b1]);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1];
|
alf_covariance[class_idx].ee[k][l][b0][b1] += e_local[k][b0] * (double)e_local[l][b1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -978,7 +978,7 @@ static void alf_get_blk_stats_generic(encoder_state_t* const state,
|
||||||
{
|
{
|
||||||
for (int b1 = 0; b1 < num_bins; b1++)
|
for (int b1 = 0; b1 < num_bins; b1++)
|
||||||
{
|
{
|
||||||
alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k];
|
alf_covariance[class_idx].ee[k][l][b0][b1] = alf_covariance[class_idx].ee[l][k][b1][b0];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue