mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-23 18:14:06 +00:00
[alf] Add strategy for alf_get_blk_stats() and an initial AVX2 version
This commit is contained in:
parent
f61b9138cd
commit
8ef3e6a126
|
@ -170,6 +170,12 @@
|
|||
<ClCompile Include="..\..\src\search.c" />
|
||||
<ClCompile Include="..\..\src\search_inter.c" />
|
||||
<ClCompile Include="..\..\src\search_intra.c" />
|
||||
<ClCompile Include="..\..\src\strategies\avx2\alf-avx2.c">
|
||||
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
|
||||
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
|
||||
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
|
||||
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\src\strategies\avx2\intra-avx2.c">
|
||||
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
|
||||
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
|
||||
|
@ -264,6 +270,7 @@
|
|||
<ClInclude Include="..\..\src\ml_intra_cu_depth_pred.h" />
|
||||
<ClInclude Include="..\..\src\search_inter.h" />
|
||||
<ClInclude Include="..\..\src\search_intra.h" />
|
||||
<ClInclude Include="..\..\src\strategies\avx2\alf-avx2.h" />
|
||||
<ClInclude Include="..\..\src\strategies\avx2\intra-avx2.h" />
|
||||
<ClInclude Include="..\..\src\strategies\avx2\sao-avx2.h" />
|
||||
<ClInclude Include="..\..\src\strategies\generic\alf-generic.h" />
|
||||
|
|
|
@ -263,6 +263,9 @@
|
|||
<ClCompile Include="..\..\src\strategies\sse41\alf-sse41.c">
|
||||
<Filter>Optimization\strategies\sse41</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\src\strategies\avx2\alf-avx2.c">
|
||||
<Filter>Optimization\strategies\avx2</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="..\..\src\bitstream.h">
|
||||
|
@ -491,6 +494,9 @@
|
|||
<ClInclude Include="..\..\src\strategies\sse41\alf-sse41.h">
|
||||
<Filter>Optimization\strategies\sse41</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\..\src\strategies\avx2\alf-avx2.h">
|
||||
<Filter>Optimization\strategies\avx2</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<YASM Include="..\..\src\extras\x86inc.asm">
|
||||
|
|
|
@ -196,7 +196,9 @@ libavx2_la_SOURCES = \
|
|||
strategies/avx2/quant-avx2.h \
|
||||
strategies/avx2/reg_sad_pow2_widths-avx2.h \
|
||||
strategies/avx2/sao-avx2.c \
|
||||
strategies/avx2/sao-avx2.h
|
||||
strategies/avx2/sao-avx2.h \
|
||||
strategies/avx2/alf-avx2.c \
|
||||
strategies/avx2/alf-avx2.h
|
||||
# strategies/avx2/encode_coding_tree-avx2.c \
|
||||
# strategies/avx2/encode_coding_tree-avx2.h
|
||||
|
||||
|
|
260
src/alf.c
260
src/alf.c
|
@ -4276,264 +4276,6 @@ static void alf_get_avai_aps_ids_luma(encoder_state_t * const state,
|
|||
assert(*new_aps_id < (int)ALF_CTB_MAX_NUM_APS); //Wrong APS index assignment in getAvaiApsIdsLuma
|
||||
}
|
||||
|
||||
static void alf_calc_covariance(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES],
|
||||
const kvz_pixel *rec,
|
||||
const int stride,
|
||||
const channel_type channel,
|
||||
const int transpose_idx,
|
||||
int vb_distance,
|
||||
short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
|
||||
{
|
||||
static const int alf_pattern_5[13] = {
|
||||
0,
|
||||
1, 2, 3,
|
||||
4, 5, 6, 5, 4,
|
||||
3, 2, 1,
|
||||
0
|
||||
};
|
||||
|
||||
static const int alf_pattern_7[25] = {
|
||||
0,
|
||||
1, 2, 3,
|
||||
4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 11, 10, 9,
|
||||
8, 7, 6, 5, 4,
|
||||
3, 2, 1,
|
||||
0
|
||||
};
|
||||
|
||||
int clip_top_row = -4;
|
||||
int clip_bot_row = 4;
|
||||
if (vb_distance >= -3 && vb_distance < 0)
|
||||
{
|
||||
clip_bot_row = -vb_distance - 1;
|
||||
clip_top_row = -clip_bot_row; // symmetric
|
||||
}
|
||||
else if (vb_distance >= 0 && vb_distance < 3)
|
||||
{
|
||||
clip_top_row = -vb_distance;
|
||||
clip_bot_row = -clip_top_row; // symmetric
|
||||
}
|
||||
|
||||
const bool is_luma = channel == CHANNEL_TYPE_LUMA;
|
||||
const int *filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5;
|
||||
const int half_filter_length = (is_luma ? 7 : 5) >> 1;
|
||||
const short* clip = alf_clipping_values[channel];
|
||||
const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
|
||||
|
||||
int k = 0;
|
||||
|
||||
const int16_t curr = rec[0];
|
||||
|
||||
if (transpose_idx == 0)
|
||||
{
|
||||
for (int i = -half_filter_length; i < 0; i++)
|
||||
{
|
||||
const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
|
||||
const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
|
||||
for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++)
|
||||
{
|
||||
for (int b = 0; b < num_bins; b++)
|
||||
{
|
||||
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j = -half_filter_length; j < 0; j++, k++)
|
||||
{
|
||||
for (int b = 0; b < num_bins; b++)
|
||||
{
|
||||
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (transpose_idx == 1)
|
||||
{
|
||||
for (int j = -half_filter_length; j < 0; j++)
|
||||
{
|
||||
const kvz_pixel* rec0 = rec + j;
|
||||
const kvz_pixel* rec1 = rec - j;
|
||||
|
||||
for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++)
|
||||
{
|
||||
for (int b = 0; b < num_bins; b++)
|
||||
{
|
||||
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = -half_filter_length; i < 0; i++, k++)
|
||||
{
|
||||
for (int b = 0; b < num_bins; b++)
|
||||
{
|
||||
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (transpose_idx == 2)
|
||||
{
|
||||
for (int i = -half_filter_length; i < 0; i++)
|
||||
{
|
||||
const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
|
||||
const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
|
||||
|
||||
for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++)
|
||||
{
|
||||
for (int b = 0; b < num_bins; b++)
|
||||
{
|
||||
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j = -half_filter_length; j < 0; j++, k++)
|
||||
{
|
||||
for (int b = 0; b < num_bins; b++)
|
||||
{
|
||||
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int j = -half_filter_length; j < 0; j++)
|
||||
{
|
||||
const kvz_pixel* rec0 = rec + j;
|
||||
const kvz_pixel* rec1 = rec - j;
|
||||
|
||||
for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++)
|
||||
{
|
||||
for (int b = 0; b < num_bins; b++)
|
||||
{
|
||||
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = -half_filter_length; i < 0; i++, k++)
|
||||
{
|
||||
for (int b = 0; b < num_bins; b++)
|
||||
{
|
||||
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int b = 0; b < num_bins; b++)
|
||||
{
|
||||
e_local[filter_pattern[k]][b] += curr;
|
||||
}
|
||||
}
|
||||
|
||||
static void alf_get_blk_stats(encoder_state_t * const state,
|
||||
channel_type channel,
|
||||
alf_covariance *alf_covariance,
|
||||
alf_classifier **g_classifier,
|
||||
kvz_pixel *org,
|
||||
int32_t org_stride,
|
||||
kvz_pixel *rec,
|
||||
int32_t rec_stride,
|
||||
const int x_pos,
|
||||
const int y_pos,
|
||||
const int x_dst,
|
||||
const int y_dst,
|
||||
const int width,
|
||||
const int height,
|
||||
int vb_ctu_height,
|
||||
int vb_pos,
|
||||
short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
|
||||
{
|
||||
int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES];
|
||||
|
||||
const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
|
||||
|
||||
int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7;
|
||||
int transpose_idx = 0;
|
||||
int class_idx = 0;
|
||||
|
||||
for (int i = 0; i < height; i++)
|
||||
{
|
||||
int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos;
|
||||
for (int j = 0; j < width; j++)
|
||||
{
|
||||
if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
memset(e_local, 0, sizeof(e_local));
|
||||
if (g_classifier)
|
||||
{
|
||||
alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j];
|
||||
transpose_idx = cl->transpose_idx;
|
||||
class_idx = cl->class_idx;
|
||||
}
|
||||
|
||||
double weight = 1.0;
|
||||
if (0/*m_alfWSSD*/)
|
||||
{
|
||||
//weight = g_luma_level_to_weight_plut[org[j]];
|
||||
}
|
||||
int16_t y_local = org[j] - rec[j];
|
||||
alf_calc_covariance(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values);
|
||||
for (int k = 0; k < num_coeff; k++)
|
||||
{
|
||||
for (int l = k; l < num_coeff; l++)
|
||||
{
|
||||
for (int b0 = 0; b0 < num_bins; b0++)
|
||||
{
|
||||
for (int b1 = 0; b1 < num_bins; b1++)
|
||||
{
|
||||
if (0/*m_alfWSSD*/)
|
||||
{
|
||||
alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]);
|
||||
}
|
||||
else
|
||||
{
|
||||
alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int b = 0; b < num_bins; b++)
|
||||
{
|
||||
if (0/*m_alfWSSD*/)
|
||||
{
|
||||
alf_covariance[class_idx].y[b][k] += weight * (e_local[k][b] * (double)y_local);
|
||||
}
|
||||
else
|
||||
{
|
||||
alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (0/*m_alfWSSD*/)
|
||||
{
|
||||
alf_covariance[class_idx].pix_acc += weight * (y_local * (double)y_local);
|
||||
}
|
||||
else
|
||||
{
|
||||
alf_covariance[class_idx].pix_acc += y_local * (double)y_local;
|
||||
}
|
||||
}
|
||||
org += org_stride;
|
||||
rec += rec_stride;
|
||||
}
|
||||
|
||||
int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1;
|
||||
for (class_idx = 0; class_idx < num_classes; class_idx++)
|
||||
{
|
||||
for (int k = 1; k < num_coeff; k++)
|
||||
{
|
||||
for (int l = 0; l < k; l++)
|
||||
{
|
||||
for (int b0 = 0; b0 < num_bins; b0++)
|
||||
{
|
||||
for (int b1 = 0; b1 < num_bins; b1++)
|
||||
{
|
||||
alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void alf_derive_stats_for_filtering(encoder_state_t * const state,
|
||||
short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
|
||||
|
@ -4619,7 +4361,7 @@ static void alf_derive_stats_for_filtering(encoder_state_t * const state,
|
|||
|
||||
const int num_classes = is_luma ? MAX_NUM_ALF_CLASSES : 1;
|
||||
const int cov_index = ctu_rs_addr * num_classes;
|
||||
alf_get_blk_stats(state, ch_type,
|
||||
kvz_alf_get_blk_stats(state, ch_type,
|
||||
&alf_cov[cov_index],
|
||||
comp_idx ? NULL : alf_info->classifier,
|
||||
org, org_stride, rec, rec_stride, pos_x, pos_y, pos_x, pos_y, blk_w, blk_h,
|
||||
|
|
393
src/strategies/avx2/alf-avx2.c
Normal file
393
src/strategies/avx2/alf-avx2.c
Normal file
|
@ -0,0 +1,393 @@
|
|||
/*****************************************************************************
|
||||
* This file is part of Kvazaar HEVC encoder.
|
||||
*
|
||||
* Copyright (C) 2013-2021 Tampere University of Technology and others (see
|
||||
* COPYING file).
|
||||
*
|
||||
* Kvazaar is free software: you can redistribute it and/or modify it under
|
||||
* the terms of the GNU Lesser General Public License as published by the
|
||||
* Free Software Foundation; either version 2.1 of the License, or (at your
|
||||
* option) any later version.
|
||||
*
|
||||
* Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
|
||||
* more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
|
||||
****************************************************************************/
|
||||
|
||||
#include "global.h"
|
||||
|
||||
#include "strategies/avx2/alf-avx2.h"
|
||||
|
||||
#if COMPILE_INTEL_AVX2
|
||||
#include "kvazaar.h"
|
||||
#if KVZ_BIT_DEPTH == 8
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "strategyselector.h"
|
||||
|
||||
static int16_t clip_alf(const int16_t clip, const int16_t ref, const int16_t val0, const int16_t val1)
|
||||
{
|
||||
return CLIP(-clip, +clip, val0 - ref) + CLIP(-clip, +clip, val1 - ref);
|
||||
}
|
||||
|
||||
|
||||
static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES],
|
||||
const kvz_pixel* rec,
|
||||
const int stride,
|
||||
const channel_type channel,
|
||||
const int transpose_idx,
|
||||
int vb_distance,
|
||||
short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
|
||||
{
|
||||
static const int alf_pattern_5[13] = {
|
||||
0,
|
||||
1, 2, 3,
|
||||
4, 5, 6, 5, 4,
|
||||
3, 2, 1,
|
||||
0
|
||||
};
|
||||
|
||||
static const int alf_pattern_7[25] = {
|
||||
0,
|
||||
1, 2, 3,
|
||||
4, 5, 6, 7, 8,
|
||||
9, 10, 11, 12, 11, 10, 9,
|
||||
8, 7, 6, 5, 4,
|
||||
3, 2, 1,
|
||||
0
|
||||
};
|
||||
|
||||
int clip_top_row = -4;
|
||||
int clip_bot_row = 4;
|
||||
if (vb_distance >= -3 && vb_distance < 0)
|
||||
{
|
||||
clip_bot_row = -vb_distance - 1;
|
||||
clip_top_row = -clip_bot_row; // symmetric
|
||||
}
|
||||
else if (vb_distance >= 0 && vb_distance < 3)
|
||||
{
|
||||
clip_top_row = -vb_distance;
|
||||
clip_bot_row = -clip_top_row; // symmetric
|
||||
}
|
||||
|
||||
const bool is_luma = channel == CHANNEL_TYPE_LUMA;
|
||||
const int* filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5;
|
||||
const int half_filter_length = (is_luma ? 7 : 5) >> 1;
|
||||
const short* clip = alf_clipping_values[channel];
|
||||
const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
|
||||
|
||||
int k = 0;
|
||||
|
||||
const int16_t curr = rec[0];
|
||||
|
||||
const __m128i negate = _mm_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1);
|
||||
|
||||
if (transpose_idx == 0)
|
||||
{
|
||||
for (int i = -half_filter_length; i < 0; i++)
|
||||
{
|
||||
const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
|
||||
const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
|
||||
for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++)
|
||||
{
|
||||
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
|
||||
__m128i neg_clips = _mm_sign_epi16(clips, negate);
|
||||
__m128i val0 = _mm_set1_epi16((rec0[j] - curr));
|
||||
__m128i val1 = _mm_set1_epi16((rec1[-j] - curr));
|
||||
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
|
||||
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
|
||||
|
||||
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
|
||||
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
|
||||
|
||||
__m128i e_local_original = _mm_loadl_epi64((__m128i*) &e_local[filter_pattern[k]][0]);
|
||||
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
|
||||
_mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result);
|
||||
}
|
||||
}
|
||||
for (int j = -half_filter_length; j < 0; j++, k++)
|
||||
{
|
||||
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
|
||||
__m128i neg_clips = _mm_sign_epi16(clips, negate);
|
||||
__m128i val0 = _mm_set1_epi16((rec[j] - curr));
|
||||
__m128i val1 = _mm_set1_epi16((rec[-j] - curr));
|
||||
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
|
||||
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
|
||||
|
||||
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
|
||||
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
|
||||
|
||||
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
|
||||
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
|
||||
_mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result);
|
||||
}
|
||||
}
|
||||
else if (transpose_idx == 1)
|
||||
{
|
||||
for (int j = -half_filter_length; j < 0; j++)
|
||||
{
|
||||
const kvz_pixel* rec0 = rec + j;
|
||||
const kvz_pixel* rec1 = rec - j;
|
||||
|
||||
for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++)
|
||||
{
|
||||
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
|
||||
__m128i neg_clips = _mm_sign_epi16(clips, negate);
|
||||
__m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr));
|
||||
__m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr));
|
||||
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
|
||||
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
|
||||
|
||||
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
|
||||
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
|
||||
|
||||
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
|
||||
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
|
||||
_mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result);
|
||||
}
|
||||
}
|
||||
for (int i = -half_filter_length; i < 0; i++, k++)
|
||||
{
|
||||
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
|
||||
__m128i neg_clips = _mm_sign_epi16(clips, negate);
|
||||
__m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr));
|
||||
__m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr));
|
||||
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
|
||||
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
|
||||
|
||||
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
|
||||
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
|
||||
|
||||
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
|
||||
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
|
||||
_mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);
|
||||
}
|
||||
}
|
||||
else if (transpose_idx == 2)
|
||||
{
|
||||
for (int i = -half_filter_length; i < 0; i++)
|
||||
{
|
||||
const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
|
||||
const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
|
||||
|
||||
for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++)
|
||||
{
|
||||
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
|
||||
__m128i neg_clips = _mm_sign_epi16(clips, negate);
|
||||
__m128i val0 = _mm_set1_epi16((rec0[j] - curr));
|
||||
__m128i val1 = _mm_set1_epi16((rec1[-j] - curr));
|
||||
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
|
||||
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
|
||||
|
||||
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
|
||||
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
|
||||
|
||||
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
|
||||
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
|
||||
_mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result);
|
||||
}
|
||||
}
|
||||
for (int j = -half_filter_length; j < 0; j++, k++)
|
||||
{
|
||||
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
|
||||
__m128i neg_clips = _mm_sign_epi16(clips, negate);
|
||||
__m128i val0 = _mm_set1_epi16((rec[j] - curr));
|
||||
__m128i val1 = _mm_set1_epi16((rec[-j] - curr));
|
||||
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
|
||||
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
|
||||
|
||||
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
|
||||
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
|
||||
|
||||
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
|
||||
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
|
||||
_mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int j = -half_filter_length; j < 0; j++)
|
||||
{
|
||||
const kvz_pixel* rec0 = rec + j;
|
||||
const kvz_pixel* rec1 = rec - j;
|
||||
|
||||
for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++)
|
||||
{
|
||||
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
|
||||
__m128i neg_clips = _mm_sign_epi16(clips, negate);
|
||||
__m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr));
|
||||
__m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr));
|
||||
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
|
||||
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
|
||||
|
||||
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
|
||||
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
|
||||
|
||||
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
|
||||
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
|
||||
_mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);
|
||||
}
|
||||
}
|
||||
for (int i = -half_filter_length; i < 0; i++, k++)
|
||||
{
|
||||
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
|
||||
__m128i neg_clips = _mm_sign_epi16(clips, negate);
|
||||
__m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr));
|
||||
__m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr));
|
||||
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
|
||||
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
|
||||
|
||||
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
|
||||
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
|
||||
|
||||
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
|
||||
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
|
||||
_mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);
|
||||
}
|
||||
}
|
||||
|
||||
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
|
||||
__m128i result = _mm_add_epi16(e_local_original, _mm_set1_epi16(curr));
|
||||
_mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);
|
||||
|
||||
}
|
||||
|
||||
static void alf_get_blk_stats_avx2(encoder_state_t* const state,
|
||||
channel_type channel,
|
||||
alf_covariance* alf_covariance,
|
||||
alf_classifier** g_classifier,
|
||||
kvz_pixel* org,
|
||||
int32_t org_stride,
|
||||
kvz_pixel* rec,
|
||||
int32_t rec_stride,
|
||||
const int x_pos,
|
||||
const int y_pos,
|
||||
const int x_dst,
|
||||
const int y_dst,
|
||||
const int width,
|
||||
const int height,
|
||||
int vb_ctu_height,
|
||||
int vb_pos,
|
||||
short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
|
||||
{
|
||||
int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES];
|
||||
|
||||
const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
|
||||
|
||||
int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7;
|
||||
int transpose_idx = 0;
|
||||
int class_idx = 0;
|
||||
|
||||
for (int i = 0; i < height; i++)
|
||||
{
|
||||
int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos;
|
||||
for (int j = 0; j < width; j++)
|
||||
{
|
||||
if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
memset(e_local, 0, sizeof(e_local));
|
||||
if (g_classifier)
|
||||
{
|
||||
alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j];
|
||||
transpose_idx = cl->transpose_idx;
|
||||
class_idx = cl->class_idx;
|
||||
}
|
||||
|
||||
int16_t y_local = org[j] - rec[j];
|
||||
|
||||
__m256d y_local_d = _mm256_set1_pd((double)y_local);
|
||||
alf_calc_covariance_avx2(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values);
|
||||
for (int k = 0; k < num_coeff; k++)
|
||||
{
|
||||
for (int l = k; l < num_coeff; l++)
|
||||
{
|
||||
for (int b0 = 0; b0 < 4; b0++)
|
||||
{
|
||||
__m256d e_local_b0_d = _mm256_set1_pd((double)e_local[k][b0]);
|
||||
//for (int b1 = 0; b1 < 4; b1++)
|
||||
{
|
||||
|
||||
//__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c)
|
||||
__m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]);
|
||||
__m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1);
|
||||
__m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32);
|
||||
|
||||
__m256d multiplied = _mm256_mul_pd(e_local_b0_d, e_local_b1_d);
|
||||
double data[4];
|
||||
_mm256_store_pd(data, multiplied);
|
||||
|
||||
//alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1];
|
||||
alf_covariance[class_idx].ee[b0][0][k][l] += data[0];
|
||||
alf_covariance[class_idx].ee[b0][1][k][l] += data[1];
|
||||
alf_covariance[class_idx].ee[b0][2][k][l] += data[2];
|
||||
alf_covariance[class_idx].ee[b0][3][k][l] += data[3];
|
||||
}
|
||||
}
|
||||
}
|
||||
//for (int b = 0; b < 4; b++)
|
||||
{
|
||||
__m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]);
|
||||
__m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1);
|
||||
__m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32);
|
||||
|
||||
__m256d multiplied = _mm256_mul_pd(y_local_d, e_local_b1_d);
|
||||
|
||||
__m128i output = _mm256_cvtpd_epi32(multiplied);
|
||||
|
||||
int32_t data[4];
|
||||
_mm_storeu_si128((__m128i*)data, output);
|
||||
//alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local;
|
||||
alf_covariance[class_idx].y[0][k] += data[0];
|
||||
alf_covariance[class_idx].y[1][k] += data[1];
|
||||
alf_covariance[class_idx].y[2][k] += data[2];
|
||||
alf_covariance[class_idx].y[3][k] += data[3];
|
||||
}
|
||||
}
|
||||
alf_covariance[class_idx].pix_acc += y_local * (double)y_local;
|
||||
}
|
||||
org += org_stride;
|
||||
rec += rec_stride;
|
||||
}
|
||||
|
||||
int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1;
|
||||
for (class_idx = 0; class_idx < num_classes; class_idx++)
|
||||
{
|
||||
for (int k = 1; k < num_coeff; k++)
|
||||
{
|
||||
for (int l = 0; l < k; l++)
|
||||
{
|
||||
for (int b0 = 0; b0 < 4; b0++)
|
||||
{
|
||||
for (int b1 = 0; b1 < 4; b1++)
|
||||
{
|
||||
alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif // KVZ_BIT_DEPTH == 8
|
||||
#endif //COMPILE_INTEL_AVX2
|
||||
|
||||
|
||||
int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth) {
|
||||
bool success = true;
|
||||
#if COMPILE_INTEL_AVX2
|
||||
#if KVZ_BIT_DEPTH == 8
|
||||
if (bitdepth == 8){
|
||||
success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "generic", 0, &alf_get_blk_stats_avx2);
|
||||
}
|
||||
#endif // KVZ_BIT_DEPTH == 8
|
||||
#endif
|
||||
return success;
|
||||
}
|
32
src/strategies/avx2/alf-avx2.h
Normal file
32
src/strategies/avx2/alf-avx2.h
Normal file
|
@ -0,0 +1,32 @@
|
|||
#pragma once
|
||||
/*****************************************************************************
|
||||
* This file is part of Kvazaar HEVC encoder.
|
||||
*
|
||||
* Copyright (C) 2013-2021 Tampere University of Technology and others (see
|
||||
* COPYING file).
|
||||
*
|
||||
* Kvazaar is free software: you can redistribute it and/or modify it under
|
||||
* the terms of the GNU Lesser General Public License as published by the
|
||||
* Free Software Foundation; either version 2.1 of the License, or (at your
|
||||
* option) any later version.
|
||||
*
|
||||
* Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
|
||||
* more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
|
||||
****************************************************************************/
|
||||
|
||||
/**
|
||||
* \ingroup Optimization
|
||||
* \file
|
||||
* Optimizations for AVX2.
|
||||
*/
|
||||
|
||||
#include "global.h" // IWYU pragma: keep
|
||||
#include "kvazaar.h"
|
||||
|
||||
int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth);
|
||||
|
|
@ -724,6 +724,269 @@ static void alf_filter_7x7_block_generic(encoder_state_t* const state,
|
|||
alf_filter_block_generic(state, src_pixels, dst_pixels, src_stride, dst_stride, filter_set, fClipSet, clp_rng, COMPONENT_Y, width, height, x_pos, y_pos, blk_dst_x, blk_dst_y, vb_pos, vb_ctu_height);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
 * \brief Accumulate one pixel's clipped neighbour differences into the local
 *        ALF statistics buffer.
 *
 * For every tap of the point-symmetric diamond filter (7x7 for luma, 5x5 for
 * chroma) and every clipping bin b, adds
 * clip_alf(clip[b], curr, neighbour0, neighbour1) — the clipped differences of
 * the two mirrored neighbours against the centre sample — into
 * e_local[filter_pattern[k]][b]. The four transpose_idx values scan the
 * neighbourhood in the four geometric transposes of the filter shape so the
 * same coefficient pattern can be reused for all orientations.
 *
 * \param e_local       per-tap, per-clipping-bin accumulator (written to)
 * \param rec           pointer to the centre reconstructed sample
 * \param stride        stride of the reconstructed picture
 * \param channel       CHANNEL_TYPE_LUMA selects the 7x7 pattern, otherwise 5x5
 * \param transpose_idx geometric transpose (0..3) from the block classifier
 * \param vb_distance   vertical distance of this row to the ALF virtual
 *                      boundary; rows reaching across it are clamped
 * \param alf_clipping_values per-channel-type clipping thresholds
 */
static void alf_calc_covariance_generic(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES],
  const kvz_pixel* rec,
  const int stride,
  const channel_type channel,
  const int transpose_idx,
  int vb_distance,
  short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
{
  // Tap-index pattern of the 5x5 diamond: entry k maps the k:th scanned
  // neighbour to its coefficient index (note the mirror symmetry).
  static const int alf_pattern_5[13] = {
    0,
    1, 2, 3,
    4, 5, 6, 5, 4,
    3, 2, 1,
    0
  };

  // Tap-index pattern of the 7x7 diamond, symmetric in the same way.
  static const int alf_pattern_7[25] = {
    0,
    1, 2, 3,
    4, 5, 6, 7, 8,
    9, 10, 11, 12, 11, 10, 9,
    8, 7, 6, 5, 4,
    3, 2, 1,
    0
  };

  // Clamp the vertical filter reach near the virtual boundary so that no
  // sample from the other side of the boundary is read.
  int clip_top_row = -4;
  int clip_bot_row = 4;
  if (vb_distance >= -3 && vb_distance < 0)
  {
    clip_bot_row = -vb_distance - 1;
    clip_top_row = -clip_bot_row; // symmetric
  }
  else if (vb_distance >= 0 && vb_distance < 3)
  {
    clip_top_row = -vb_distance;
    clip_bot_row = -clip_top_row; // symmetric
  }

  const bool is_luma = channel == CHANNEL_TYPE_LUMA;
  const int* filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5;
  const int half_filter_length = (is_luma ? 7 : 5) >> 1;
  const short* clip = alf_clipping_values[channel];
  const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;

  // k walks filter_pattern in scan order across all branches below.
  int k = 0;

  const int16_t curr = rec[0];

  if (transpose_idx == 0)
  {
    // No transpose: rows above the centre paired with mirrored rows below.
    for (int i = -half_filter_length; i < 0; i++)
    {
      const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
      const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
      for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++)
      {
        for (int b = 0; b < num_bins; b++)
        {
          e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]);
        }
      }
    }
    // Centre row: left samples paired with mirrored right samples.
    for (int j = -half_filter_length; j < 0; j++, k++)
    {
      for (int b = 0; b < num_bins; b++)
      {
        e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]);
      }
    }
  }
  else if (transpose_idx == 1)
  {
    // Transposed scan: columns take the role of rows; the vertical offset i
    // is still the one clamped by the virtual-boundary limits.
    for (int j = -half_filter_length; j < 0; j++)
    {
      const kvz_pixel* rec0 = rec + j;
      const kvz_pixel* rec1 = rec - j;

      for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++)
      {
        for (int b = 0; b < num_bins; b++)
        {
          e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]);
        }
      }
    }
    // Centre column.
    for (int i = -half_filter_length; i < 0; i++, k++)
    {
      for (int b = 0; b < num_bins; b++)
      {
        e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]);
      }
    }
  }
  else if (transpose_idx == 2)
  {
    // Horizontal flip: same row pairing as transpose 0 but each row is
    // scanned right-to-left.
    for (int i = -half_filter_length; i < 0; i++)
    {
      const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
      const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;

      for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++)
      {
        for (int b = 0; b < num_bins; b++)
        {
          e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]);
        }
      }
    }
    // Centre row.
    for (int j = -half_filter_length; j < 0; j++, k++)
    {
      for (int b = 0; b < num_bins; b++)
      {
        e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]);
      }
    }
  }
  else
  {
    // Remaining transpose: column scan like transpose 1, but each column is
    // walked bottom-to-top.
    for (int j = -half_filter_length; j < 0; j++)
    {
      const kvz_pixel* rec0 = rec + j;
      const kvz_pixel* rec1 = rec - j;

      for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++)
      {
        for (int b = 0; b < num_bins; b++)
        {
          e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]);
        }
      }
    }
    // Centre column.
    for (int i = -half_filter_length; i < 0; i++, k++)
    {
      for (int b = 0; b < num_bins; b++)
      {
        e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]);
      }
    }
  }
  // Centre tap: accumulate the centre sample itself for every bin.
  for (int b = 0; b < num_bins; b++)
  {
    e_local[filter_pattern[k]][b] += curr;
  }
}
|
||||
|
||||
static void alf_get_blk_stats_generic(encoder_state_t* const state,
|
||||
channel_type channel,
|
||||
alf_covariance* alf_covariance,
|
||||
alf_classifier** g_classifier,
|
||||
kvz_pixel* org,
|
||||
int32_t org_stride,
|
||||
kvz_pixel* rec,
|
||||
int32_t rec_stride,
|
||||
const int x_pos,
|
||||
const int y_pos,
|
||||
const int x_dst,
|
||||
const int y_dst,
|
||||
const int width,
|
||||
const int height,
|
||||
int vb_ctu_height,
|
||||
int vb_pos,
|
||||
short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
|
||||
{
|
||||
int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES];
|
||||
|
||||
const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
|
||||
|
||||
int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7;
|
||||
int transpose_idx = 0;
|
||||
int class_idx = 0;
|
||||
|
||||
for (int i = 0; i < height; i++)
|
||||
{
|
||||
int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos;
|
||||
for (int j = 0; j < width; j++)
|
||||
{
|
||||
if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
memset(e_local, 0, sizeof(e_local));
|
||||
if (g_classifier)
|
||||
{
|
||||
alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j];
|
||||
transpose_idx = cl->transpose_idx;
|
||||
class_idx = cl->class_idx;
|
||||
}
|
||||
|
||||
double weight = 1.0;
|
||||
if (0/*m_alfWSSD*/)
|
||||
{
|
||||
//weight = g_luma_level_to_weight_plut[org[j]];
|
||||
}
|
||||
int16_t y_local = org[j] - rec[j];
|
||||
alf_calc_covariance_generic(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values);
|
||||
for (int k = 0; k < num_coeff; k++)
|
||||
{
|
||||
for (int l = k; l < num_coeff; l++)
|
||||
{
|
||||
for (int b0 = 0; b0 < num_bins; b0++)
|
||||
{
|
||||
for (int b1 = 0; b1 < num_bins; b1++)
|
||||
{
|
||||
if (0/*m_alfWSSD*/)
|
||||
{
|
||||
alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]);
|
||||
}
|
||||
else
|
||||
{
|
||||
alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int b = 0; b < num_bins; b++)
|
||||
{
|
||||
if (0/*m_alfWSSD*/)
|
||||
{
|
||||
alf_covariance[class_idx].y[b][k] += weight * (e_local[k][b] * (double)y_local);
|
||||
}
|
||||
else
|
||||
{
|
||||
alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (0/*m_alfWSSD*/)
|
||||
{
|
||||
alf_covariance[class_idx].pix_acc += weight * (y_local * (double)y_local);
|
||||
}
|
||||
else
|
||||
{
|
||||
alf_covariance[class_idx].pix_acc += y_local * (double)y_local;
|
||||
}
|
||||
}
|
||||
org += org_stride;
|
||||
rec += rec_stride;
|
||||
}
|
||||
|
||||
int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1;
|
||||
for (class_idx = 0; class_idx < num_classes; class_idx++)
|
||||
{
|
||||
for (int k = 1; k < num_coeff; k++)
|
||||
{
|
||||
for (int l = 0; l < k; l++)
|
||||
{
|
||||
for (int b0 = 0; b0 < num_bins; b0++)
|
||||
{
|
||||
for (int b1 = 0; b1 < num_bins; b1++)
|
||||
{
|
||||
alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth)
|
||||
{
|
||||
bool success = true;
|
||||
|
@ -731,6 +994,8 @@ int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth)
|
|||
success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "generic", 0, &alf_derive_classification_blk_generic);
|
||||
success &= kvz_strategyselector_register(opaque, "alf_filter_5x5_blk", "generic", 0, &alf_filter_5x5_block_generic);
|
||||
success &= kvz_strategyselector_register(opaque, "alf_filter_7x7_blk", "generic", 0, &alf_filter_7x7_block_generic);
|
||||
success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "generic", 0, &alf_get_blk_stats_generic);
|
||||
|
||||
|
||||
return success;
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
|
||||
#include "strategies/strategies-alf.h"
|
||||
#include "strategies/sse41/alf-sse41.h"
|
||||
#include "strategies/avx2/alf-avx2.h"
|
||||
#include "strategies/generic/alf-generic.h"
|
||||
#include "strategyselector.h"
|
||||
|
||||
|
@ -28,6 +29,7 @@
|
|||
alf_derive_classification_blk_func* kvz_alf_derive_classification_blk;
|
||||
alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk;
|
||||
alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk;
|
||||
alf_get_blk_stats_func* kvz_alf_get_blk_stats;
|
||||
|
||||
int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) {
|
||||
bool success = true;
|
||||
|
@ -37,6 +39,9 @@ int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) {
|
|||
if (kvz_g_hardware_flags.intel_flags.sse41) {
|
||||
success &= kvz_strategy_register_alf_sse41(opaque, bitdepth);
|
||||
}
|
||||
if (kvz_g_hardware_flags.intel_flags.avx2) {
|
||||
success &= kvz_strategy_register_alf_avx2(opaque, bitdepth);
|
||||
}
|
||||
|
||||
return success;
|
||||
}
|
|
@ -78,10 +78,29 @@ typedef void (alf_filter_7x7_blk_func)(encoder_state_t* const state,
|
|||
int vb_pos,
|
||||
const int vb_ctu_height);
|
||||
|
||||
// Function-pointer type for the "alf_get_blk_stats" strategy: accumulates
// per-class ALF covariance statistics for one block of the picture.
// Implementations are selected via kvz_strategy_register_alf.
// NOTE(review): parameter semantics are assumed to match the generic
// implementation in strategies/generic/alf-generic.c — confirm when adding
// new SIMD variants.
typedef void (alf_get_blk_stats_func)(encoder_state_t* const state,
  channel_type channel,
  alf_covariance* alf_covariance,
  alf_classifier** g_classifier,
  kvz_pixel* org,
  int32_t org_stride,
  kvz_pixel* rec,
  int32_t rec_stride,
  const int x_pos,
  const int y_pos,
  const int x_dst,
  const int y_dst,
  const int width,
  const int height,
  int vb_ctu_height,
  int vb_pos,
  short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]);
|
||||
|
||||
// Declare function pointers.
|
||||
extern alf_derive_classification_blk_func * kvz_alf_derive_classification_blk;
|
||||
extern alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk;
|
||||
extern alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk;
|
||||
extern alf_get_blk_stats_func* kvz_alf_get_blk_stats;
|
||||
|
||||
int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth);
|
||||
|
||||
|
@ -90,5 +109,6 @@ int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth);
|
|||
{"alf_derive_classification_blk", (void**) &kvz_alf_derive_classification_blk}, \
|
||||
{"alf_filter_5x5_blk", (void**) &kvz_alf_filter_5x5_blk}, \
|
||||
{"alf_filter_7x7_blk", (void**) &kvz_alf_filter_7x7_blk}, \
|
||||
{"alf_get_blk_stats", (void**) &kvz_alf_get_blk_stats}, \
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue