[alf] Add strategy for alf_get_blk_stats() and an initial AVX2 version

This commit is contained in:
Marko Viitanen 2021-08-25 20:22:24 +03:00
parent f61b9138cd
commit 8ef3e6a126
9 changed files with 732 additions and 260 deletions

View file

@ -170,6 +170,12 @@
<ClCompile Include="..\..\src\search.c" />
<ClCompile Include="..\..\src\search_inter.c" />
<ClCompile Include="..\..\src\search_intra.c" />
<ClCompile Include="..\..\src\strategies\avx2\alf-avx2.c">
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
</ClCompile>
<ClCompile Include="..\..\src\strategies\avx2\intra-avx2.c">
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
<EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
@ -264,6 +270,7 @@
<ClInclude Include="..\..\src\ml_intra_cu_depth_pred.h" />
<ClInclude Include="..\..\src\search_inter.h" />
<ClInclude Include="..\..\src\search_intra.h" />
<ClInclude Include="..\..\src\strategies\avx2\alf-avx2.h" />
<ClInclude Include="..\..\src\strategies\avx2\intra-avx2.h" />
<ClInclude Include="..\..\src\strategies\avx2\sao-avx2.h" />
<ClInclude Include="..\..\src\strategies\generic\alf-generic.h" />

View file

@ -263,6 +263,9 @@
<ClCompile Include="..\..\src\strategies\sse41\alf-sse41.c">
<Filter>Optimization\strategies\sse41</Filter>
</ClCompile>
<ClCompile Include="..\..\src\strategies\avx2\alf-avx2.c">
<Filter>Optimization\strategies\avx2</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\src\bitstream.h">
@ -491,6 +494,9 @@
<ClInclude Include="..\..\src\strategies\sse41\alf-sse41.h">
<Filter>Optimization\strategies\sse41</Filter>
</ClInclude>
<ClInclude Include="..\..\src\strategies\avx2\alf-avx2.h">
<Filter>Optimization\strategies\avx2</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<YASM Include="..\..\src\extras\x86inc.asm">

View file

@ -196,7 +196,9 @@ libavx2_la_SOURCES = \
strategies/avx2/quant-avx2.h \
strategies/avx2/reg_sad_pow2_widths-avx2.h \
strategies/avx2/sao-avx2.c \
strategies/avx2/sao-avx2.h
strategies/avx2/sao-avx2.h \
strategies/avx2/alf-avx2.c \
strategies/avx2/alf-avx2.h
# strategies/avx2/encode_coding_tree-avx2.c \
# strategies/avx2/encode_coding_tree-avx2.h

260
src/alf.c
View file

@ -4276,264 +4276,6 @@ static void alf_get_avai_aps_ids_luma(encoder_state_t * const state,
assert(*new_aps_id < (int)ALF_CTB_MAX_NUM_APS); //Wrong APS index assignment in getAvaiApsIdsLuma
}
/**
 * \brief Accumulate clipped neighbour differences for one sample into e_local.
 *
 * For each tap of the diamond ALF filter (7x7 for luma, 5x5 for chroma), the
 * differences of a pair of mirrored neighbour samples against the centre
 * sample are clipped with every clipping value of the channel (see clip_alf)
 * and added to e_local[tap][bin].  transpose_idx (0..3) selects one of four
 * geometric transposes of the tap pattern; vb_distance restricts the vertical
 * filter reach near the ALF virtual boundary.
 *
 * \param e_local              per-tap, per-clipping-bin accumulator (in/out)
 * \param rec                  pointer to the centre reconstructed sample
 * \param stride               stride of the reconstruction buffer
 * \param channel              luma or chroma channel type
 * \param transpose_idx        geometric transpose of the filter pattern, 0..3
 * \param vb_distance          vertical distance to the ALF virtual boundary
 * \param alf_clipping_values  clipping values per channel type
 */
static void alf_calc_covariance(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES],
const kvz_pixel *rec,
const int stride,
const channel_type channel,
const int transpose_idx,
int vb_distance,
short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
{
// Tap-index patterns of the diamond filters; mirrored positions share the
// same tap index (5x5 diamond: 13 positions / 7 taps, 7x7: 25 / 13).
static const int alf_pattern_5[13] = {
0,
1, 2, 3,
4, 5, 6, 5, 4,
3, 2, 1,
0
};
static const int alf_pattern_7[25] = {
0,
1, 2, 3,
4, 5, 6, 7, 8,
9, 10, 11, 12, 11, 10, 9,
8, 7, 6, 5, 4,
3, 2, 1,
0
};
// Limit how many rows above/below the centre may be read when the sample
// lies close to the virtual boundary.
int clip_top_row = -4;
int clip_bot_row = 4;
if (vb_distance >= -3 && vb_distance < 0)
{
clip_bot_row = -vb_distance - 1;
clip_top_row = -clip_bot_row; // symmetric
}
else if (vb_distance >= 0 && vb_distance < 3)
{
clip_top_row = -vb_distance;
clip_bot_row = -clip_top_row; // symmetric
}
const bool is_luma = channel == CHANNEL_TYPE_LUMA;
const int *filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5;
const int half_filter_length = (is_luma ? 7 : 5) >> 1;
const short* clip = alf_clipping_values[channel];
const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
int k = 0;
const int16_t curr = rec[0];
// Each branch walks the filter support in the scan order implied by the
// transpose so that k advances through filter_pattern consistently.
if (transpose_idx == 0)
{
for (int i = -half_filter_length; i < 0; i++)
{
const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]);
}
}
}
// Horizontal taps on the centre row.
for (int j = -half_filter_length; j < 0; j++, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]);
}
}
}
else if (transpose_idx == 1)
{
for (int j = -half_filter_length; j < 0; j++)
{
const kvz_pixel* rec0 = rec + j;
const kvz_pixel* rec1 = rec - j;
for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]);
}
}
}
for (int i = -half_filter_length; i < 0; i++, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]);
}
}
}
else if (transpose_idx == 2)
{
for (int i = -half_filter_length; i < 0; i++)
{
const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]);
}
}
}
for (int j = -half_filter_length; j < 0; j++, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]);
}
}
}
else
{
for (int j = -half_filter_length; j < 0; j++)
{
const kvz_pixel* rec0 = rec + j;
const kvz_pixel* rec1 = rec - j;
for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]);
}
}
}
for (int i = -half_filter_length; i < 0; i++, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]);
}
}
}
// Centre tap: accumulate the centre sample itself into every bin.
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += curr;
}
}
/**
 * \brief Gather ALF covariance statistics for one block.
 *
 * For every sample of the width x height block, clipped neighbour
 * differences are collected into e_local (alf_calc_covariance) and folded
 * into the per-class covariance: the auto-correlation ee[b0][b1][k][l],
 * the cross-correlation y[b][k] against the org-rec error, and pix_acc,
 * the accumulated squared error.  When g_classifier is non-NULL the
 * statistics are split per classifier class/transpose; otherwise a single
 * class (index 0) is used.  Only the upper triangle (l >= k) of ee is
 * accumulated during the scan; the lower triangle is filled by symmetry
 * at the end.
 */
static void alf_get_blk_stats(encoder_state_t * const state,
channel_type channel,
alf_covariance *alf_covariance,
alf_classifier **g_classifier,
kvz_pixel *org,
int32_t org_stride,
kvz_pixel *rec,
int32_t rec_stride,
const int x_pos,
const int y_pos,
const int x_dst,
const int y_dst,
const int width,
const int height,
int vb_ctu_height,
int vb_pos,
short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
{
int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES];
const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
// 13 taps for the 7x7 luma diamond, 7 for the 5x5 chroma diamond.
int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7;
int transpose_idx = 0;
int class_idx = 0;
for (int i = 0; i < height; i++)
{
int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos;
for (int j = 0; j < width; j++)
{
// Skip samples whose classifier entry is marked unused.
if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX)
{
continue;
}
memset(e_local, 0, sizeof(e_local));
if (g_classifier)
{
alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j];
transpose_idx = cl->transpose_idx;
class_idx = cl->class_idx;
}
// Weighted-SSD (m_alfWSSD) mode is disabled; branches kept for reference.
double weight = 1.0;
if (0/*m_alfWSSD*/)
{
//weight = g_luma_level_to_weight_plut[org[j]];
}
int16_t y_local = org[j] - rec[j];
alf_calc_covariance(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values);
for (int k = 0; k < num_coeff; k++)
{
for (int l = k; l < num_coeff; l++)
{
for (int b0 = 0; b0 < num_bins; b0++)
{
for (int b1 = 0; b1 < num_bins; b1++)
{
if (0/*m_alfWSSD*/)
{
alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]);
}
else
{
alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1];
}
}
}
}
for (int b = 0; b < num_bins; b++)
{
if (0/*m_alfWSSD*/)
{
alf_covariance[class_idx].y[b][k] += weight * (e_local[k][b] * (double)y_local);
}
else
{
alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local;
}
}
}
if (0/*m_alfWSSD*/)
{
alf_covariance[class_idx].pix_acc += weight * (y_local * (double)y_local);
}
else
{
alf_covariance[class_idx].pix_acc += y_local * (double)y_local;
}
}
org += org_stride;
rec += rec_stride;
}
// Mirror the accumulated upper triangle into the lower triangle; the bin
// indices swap together with the coefficient indices.
int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1;
for (class_idx = 0; class_idx < num_classes; class_idx++)
{
for (int k = 1; k < num_coeff; k++)
{
for (int l = 0; l < k; l++)
{
for (int b0 = 0; b0 < num_bins; b0++)
{
for (int b1 = 0; b1 < num_bins; b1++)
{
alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k];
}
}
}
}
}
}
static void alf_derive_stats_for_filtering(encoder_state_t * const state,
short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
@ -4619,7 +4361,7 @@ static void alf_derive_stats_for_filtering(encoder_state_t * const state,
const int num_classes = is_luma ? MAX_NUM_ALF_CLASSES : 1;
const int cov_index = ctu_rs_addr * num_classes;
alf_get_blk_stats(state, ch_type,
kvz_alf_get_blk_stats(state, ch_type,
&alf_cov[cov_index],
comp_idx ? NULL : alf_info->classifier,
org, org_stride, rec, rec_stride, pos_x, pos_y, pos_x, pos_y, blk_w, blk_h,

View file

@ -0,0 +1,393 @@
/*****************************************************************************
* This file is part of Kvazaar HEVC encoder.
*
* Copyright (C) 2013-2021 Tampere University of Technology and others (see
* COPYING file).
*
* Kvazaar is free software: you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the
* Free Software Foundation; either version 2.1 of the License, or (at your
* option) any later version.
*
* Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
* more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
****************************************************************************/
#include "global.h"
#include "strategies/avx2/alf-avx2.h"
#if COMPILE_INTEL_AVX2
#include "kvazaar.h"
#if KVZ_BIT_DEPTH == 8
#include <immintrin.h>
#include <stdlib.h>
#include "strategyselector.h"
/* Clamp each neighbour's difference from the reference sample to
 * [-clip, +clip] and return the sum of the two clipped differences. */
static int16_t clip_alf(const int16_t clip, const int16_t ref, const int16_t val0, const int16_t val1)
{
  const int16_t d0 = CLIP(-clip, +clip, val0 - ref);
  const int16_t d1 = CLIP(-clip, +clip, val1 - ref);
  return d0 + d1;
}
/**
 * \brief SIMD variant of alf_calc_covariance: accumulate clipped neighbour
 * differences for one sample into e_local.
 *
 * The per-bin clipping of the scalar version (clip_alf over num_bins
 * clipping values) is vectorized: the channel's clipping values are loaded
 * as int16 lanes, both mirrored neighbour differences are clamped to
 * [-clip, +clip] with min/max, and the sums are added to the tap's e_local
 * row with a single 64-bit load/add/store.
 * NOTE(review): the 64-bit loads/stores cover four int16 bins — this
 * assumes MAX_ALF_NUM_CLIPPING_VALUES == 4; confirm before changing that
 * constant.
 */
static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES],
const kvz_pixel* rec,
const int stride,
const channel_type channel,
const int transpose_idx,
int vb_distance,
short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
{
// Tap-index patterns of the diamond filters; mirrored positions share the
// same tap index (5x5 diamond: 13 positions / 7 taps, 7x7: 25 / 13).
static const int alf_pattern_5[13] = {
0,
1, 2, 3,
4, 5, 6, 5, 4,
3, 2, 1,
0
};
static const int alf_pattern_7[25] = {
0,
1, 2, 3,
4, 5, 6, 7, 8,
9, 10, 11, 12, 11, 10, 9,
8, 7, 6, 5, 4,
3, 2, 1,
0
};
// Limit how many rows above/below the centre may be read when the sample
// lies close to the virtual boundary.
int clip_top_row = -4;
int clip_bot_row = 4;
if (vb_distance >= -3 && vb_distance < 0)
{
clip_bot_row = -vb_distance - 1;
clip_top_row = -clip_bot_row; // symmetric
}
else if (vb_distance >= 0 && vb_distance < 3)
{
clip_top_row = -vb_distance;
clip_bot_row = -clip_top_row; // symmetric
}
const bool is_luma = channel == CHANNEL_TYPE_LUMA;
const int* filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5;
const int half_filter_length = (is_luma ? 7 : 5) >> 1;
const short* clip = alf_clipping_values[channel];
const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
int k = 0;
const int16_t curr = rec[0];
// All-negative-ones vector: _mm_sign_epi16 against it negates the clips.
const __m128i negate = _mm_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1);
if (transpose_idx == 0)
{
for (int i = -half_filter_length; i < 0; i++)
{
const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++)
{
// Clamp both mirrored-tap differences to [-clip, +clip] per bin and
// accumulate their sum into the tap's e_local row.
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
__m128i neg_clips = _mm_sign_epi16(clips, negate);
__m128i val0 = _mm_set1_epi16((rec0[j] - curr));
__m128i val1 = _mm_set1_epi16((rec1[-j] - curr));
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
__m128i e_local_original = _mm_loadl_epi64((__m128i*) &e_local[filter_pattern[k]][0]);
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
_mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result);
}
}
for (int j = -half_filter_length; j < 0; j++, k++)
{
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
__m128i neg_clips = _mm_sign_epi16(clips, negate);
__m128i val0 = _mm_set1_epi16((rec[j] - curr));
__m128i val1 = _mm_set1_epi16((rec[-j] - curr));
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
_mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result);
}
}
else if (transpose_idx == 1)
{
for (int j = -half_filter_length; j < 0; j++)
{
const kvz_pixel* rec0 = rec + j;
const kvz_pixel* rec1 = rec - j;
for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++)
{
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
__m128i neg_clips = _mm_sign_epi16(clips, negate);
__m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr));
__m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr));
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
_mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result);
}
}
for (int i = -half_filter_length; i < 0; i++, k++)
{
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
__m128i neg_clips = _mm_sign_epi16(clips, negate);
__m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr));
__m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr));
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
_mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);
}
}
else if (transpose_idx == 2)
{
for (int i = -half_filter_length; i < 0; i++)
{
const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++)
{
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
__m128i neg_clips = _mm_sign_epi16(clips, negate);
__m128i val0 = _mm_set1_epi16((rec0[j] - curr));
__m128i val1 = _mm_set1_epi16((rec1[-j] - curr));
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
_mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result);
}
}
for (int j = -half_filter_length; j < 0; j++, k++)
{
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
__m128i neg_clips = _mm_sign_epi16(clips, negate);
__m128i val0 = _mm_set1_epi16((rec[j] - curr));
__m128i val1 = _mm_set1_epi16((rec[-j] - curr));
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
_mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);
}
}
else
{
for (int j = -half_filter_length; j < 0; j++)
{
const kvz_pixel* rec0 = rec + j;
const kvz_pixel* rec1 = rec - j;
for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++)
{
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
__m128i neg_clips = _mm_sign_epi16(clips, negate);
__m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr));
__m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr));
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
_mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);
}
}
for (int i = -half_filter_length; i < 0; i++, k++)
{
__m128i clips = _mm_loadl_epi64((__m128i*) clip);
__m128i neg_clips = _mm_sign_epi16(clips, negate);
__m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr));
__m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr));
__m128i min_clips_val0 = _mm_min_epi16(clips, val0);
__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
__m128i min_clips_val1 = _mm_min_epi16(clips, val1);
__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
_mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);
}
}
// Centre tap: add the centre sample to every bin of its e_local row.
__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
__m128i result = _mm_add_epi16(e_local_original, _mm_set1_epi16(curr));
_mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);
}
/**
 * \brief AVX2 variant of alf_get_blk_stats: gather ALF covariance statistics
 * for one block.
 *
 * Per sample, e_local is filled by alf_calc_covariance_avx2() and folded
 * into the per-class covariance (ee auto-correlation, y cross-correlation
 * against the org-rec error, and pix_acc).  The inner clipping-bin loop is
 * vectorized: the four int16 bins of an e_local row are widened to double
 * and multiplied in one 256-bit operation.  Only the upper triangle
 * (l >= k) of ee is accumulated; the lower triangle is filled by symmetry
 * at the end.
 * NOTE(review): the y accumulation rounds each product to int32 via
 * _mm256_cvtpd_epi32 before adding, whereas the generic version accumulates
 * the double products directly — confirm the precision loss is acceptable.
 * NOTE(review): the bin loops hard-code 4; assumes
 * MAX_ALF_NUM_CLIPPING_VALUES == 4.
 */
static void alf_get_blk_stats_avx2(encoder_state_t* const state,
channel_type channel,
alf_covariance* alf_covariance,
alf_classifier** g_classifier,
kvz_pixel* org,
int32_t org_stride,
kvz_pixel* rec,
int32_t rec_stride,
const int x_pos,
const int y_pos,
const int x_dst,
const int y_dst,
const int width,
const int height,
int vb_ctu_height,
int vb_pos,
short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
{
int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES];
const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
// 13 taps for the 7x7 luma diamond, 7 for the 5x5 chroma diamond.
int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7;
int transpose_idx = 0;
int class_idx = 0;
for (int i = 0; i < height; i++)
{
int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos;
for (int j = 0; j < width; j++)
{
// Skip samples whose classifier entry is marked unused.
if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX)
{
continue;
}
memset(e_local, 0, sizeof(e_local));
if (g_classifier)
{
alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j];
transpose_idx = cl->transpose_idx;
class_idx = cl->class_idx;
}
int16_t y_local = org[j] - rec[j];
__m256d y_local_d = _mm256_set1_pd((double)y_local);
alf_calc_covariance_avx2(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values);
for (int k = 0; k < num_coeff; k++)
{
for (int l = k; l < num_coeff; l++)
{
for (int b0 = 0; b0 < 4; b0++)
{
__m256d e_local_b0_d = _mm256_set1_pd((double)e_local[k][b0]);
//for (int b1 = 0; b1 < 4; b1++)
{
//__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c)
// Widen the four b1 bins of row l to double and multiply by the
// broadcast b0 bin in one shot.
__m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]);
__m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1);
__m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32);
__m256d multiplied = _mm256_mul_pd(e_local_b0_d, e_local_b1_d);
double data[4];
// Unaligned store: data[] is a plain stack array and is not
// guaranteed the 32-byte alignment _mm256_store_pd requires.
_mm256_storeu_pd(data, multiplied);
//alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1];
alf_covariance[class_idx].ee[b0][0][k][l] += data[0];
alf_covariance[class_idx].ee[b0][1][k][l] += data[1];
alf_covariance[class_idx].ee[b0][2][k][l] += data[2];
alf_covariance[class_idx].ee[b0][3][k][l] += data[3];
}
}
}
//for (int b = 0; b < 4; b++)
{
// Cross-correlation of tap k's bins with the org-rec error.
__m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]);
__m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1);
__m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32);
__m256d multiplied = _mm256_mul_pd(y_local_d, e_local_b1_d);
__m128i output = _mm256_cvtpd_epi32(multiplied);
int32_t data[4];
_mm_storeu_si128((__m128i*)data, output);
//alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local;
alf_covariance[class_idx].y[0][k] += data[0];
alf_covariance[class_idx].y[1][k] += data[1];
alf_covariance[class_idx].y[2][k] += data[2];
alf_covariance[class_idx].y[3][k] += data[3];
}
}
alf_covariance[class_idx].pix_acc += y_local * (double)y_local;
}
org += org_stride;
rec += rec_stride;
}
// Mirror the accumulated upper triangle into the lower triangle; the bin
// indices swap together with the coefficient indices.
int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1;
for (class_idx = 0; class_idx < num_classes; class_idx++)
{
for (int k = 1; k < num_coeff; k++)
{
for (int l = 0; l < k; l++)
{
for (int b0 = 0; b0 < 4; b0++)
{
for (int b1 = 0; b1 < 4; b1++)
{
alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k];
}
}
}
}
}
}
#endif // KVZ_BIT_DEPTH == 8
#endif //COMPILE_INTEL_AVX2
/**
 * \brief Register the AVX2 ALF strategies with the strategy selector.
 *
 * \param opaque    strategy-selector context passed through to
 *                  kvz_strategyselector_register
 * \param bitdepth  encoder bit depth; the AVX2 path only supports 8 bpp
 * \return true if every registration succeeded
 */
int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth) {
  bool success = true;
#if COMPILE_INTEL_AVX2
#if KVZ_BIT_DEPTH == 8
  if (bitdepth == 8) {
    // Register under the "avx2" strategy name — not "generic", which would
    // collide with the reference implementation — and with a non-zero
    // priority so the selector prefers this version on AVX2-capable CPUs.
    success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "avx2", 40, &alf_get_blk_stats_avx2);
  }
#endif // KVZ_BIT_DEPTH == 8
#endif
  return success;
}

View file

@ -0,0 +1,32 @@
#pragma once
/*****************************************************************************
* This file is part of Kvazaar HEVC encoder.
*
* Copyright (C) 2013-2021 Tampere University of Technology and others (see
* COPYING file).
*
* Kvazaar is free software: you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the
* Free Software Foundation; either version 2.1 of the License, or (at your
* option) any later version.
*
* Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
* more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
****************************************************************************/
/**
* \ingroup Optimization
* \file
* Optimizations for AVX2.
*/
#include "global.h" // IWYU pragma: keep
#include "kvazaar.h"
int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth);

View file

@ -724,6 +724,269 @@ static void alf_filter_7x7_block_generic(encoder_state_t* const state,
alf_filter_block_generic(state, src_pixels, dst_pixels, src_stride, dst_stride, filter_set, fClipSet, clp_rng, COMPONENT_Y, width, height, x_pos, y_pos, blk_dst_x, blk_dst_y, vb_pos, vb_ctu_height);
}
/**
 * \brief Accumulate clipped neighbour differences for one sample into e_local.
 *
 * For each tap of the diamond ALF filter (7x7 for luma, 5x5 for chroma), the
 * differences of a pair of mirrored neighbour samples against the centre
 * sample are clipped with every clipping value of the channel (see clip_alf)
 * and added to e_local[tap][bin].  transpose_idx (0..3) selects one of four
 * geometric transposes of the tap pattern; vb_distance restricts the vertical
 * filter reach near the ALF virtual boundary.
 *
 * \param e_local              per-tap, per-clipping-bin accumulator (in/out)
 * \param rec                  pointer to the centre reconstructed sample
 * \param stride               stride of the reconstruction buffer
 * \param channel              luma or chroma channel type
 * \param transpose_idx        geometric transpose of the filter pattern, 0..3
 * \param vb_distance          vertical distance to the ALF virtual boundary
 * \param alf_clipping_values  clipping values per channel type
 */
static void alf_calc_covariance_generic(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES],
const kvz_pixel* rec,
const int stride,
const channel_type channel,
const int transpose_idx,
int vb_distance,
short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
{
// Tap-index patterns of the diamond filters; mirrored positions share the
// same tap index (5x5 diamond: 13 positions / 7 taps, 7x7: 25 / 13).
static const int alf_pattern_5[13] = {
0,
1, 2, 3,
4, 5, 6, 5, 4,
3, 2, 1,
0
};
static const int alf_pattern_7[25] = {
0,
1, 2, 3,
4, 5, 6, 7, 8,
9, 10, 11, 12, 11, 10, 9,
8, 7, 6, 5, 4,
3, 2, 1,
0
};
// Limit how many rows above/below the centre may be read when the sample
// lies close to the virtual boundary.
int clip_top_row = -4;
int clip_bot_row = 4;
if (vb_distance >= -3 && vb_distance < 0)
{
clip_bot_row = -vb_distance - 1;
clip_top_row = -clip_bot_row; // symmetric
}
else if (vb_distance >= 0 && vb_distance < 3)
{
clip_top_row = -vb_distance;
clip_bot_row = -clip_top_row; // symmetric
}
const bool is_luma = channel == CHANNEL_TYPE_LUMA;
const int* filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5;
const int half_filter_length = (is_luma ? 7 : 5) >> 1;
const short* clip = alf_clipping_values[channel];
const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
int k = 0;
const int16_t curr = rec[0];
// Each branch walks the filter support in the scan order implied by the
// transpose so that k advances through filter_pattern consistently.
if (transpose_idx == 0)
{
for (int i = -half_filter_length; i < 0; i++)
{
const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]);
}
}
}
// Horizontal taps on the centre row.
for (int j = -half_filter_length; j < 0; j++, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]);
}
}
}
else if (transpose_idx == 1)
{
for (int j = -half_filter_length; j < 0; j++)
{
const kvz_pixel* rec0 = rec + j;
const kvz_pixel* rec1 = rec - j;
for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]);
}
}
}
for (int i = -half_filter_length; i < 0; i++, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]);
}
}
}
else if (transpose_idx == 2)
{
for (int i = -half_filter_length; i < 0; i++)
{
const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]);
}
}
}
for (int j = -half_filter_length; j < 0; j++, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]);
}
}
}
else
{
for (int j = -half_filter_length; j < 0; j++)
{
const kvz_pixel* rec0 = rec + j;
const kvz_pixel* rec1 = rec - j;
for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]);
}
}
}
for (int i = -half_filter_length; i < 0; i++, k++)
{
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]);
}
}
}
// Centre tap: accumulate the centre sample itself into every bin.
for (int b = 0; b < num_bins; b++)
{
e_local[filter_pattern[k]][b] += curr;
}
}
/**
 * \brief Gather ALF covariance statistics for one block (reference C version).
 *
 * For every sample of the width x height block, clipped neighbour
 * differences are collected into e_local (alf_calc_covariance_generic) and
 * folded into the per-class covariance: the auto-correlation
 * ee[b0][b1][k][l], the cross-correlation y[b][k] against the org-rec
 * error, and pix_acc, the accumulated squared error.  When g_classifier is
 * non-NULL the statistics are split per classifier class/transpose;
 * otherwise a single class (index 0) is used.  Only the upper triangle
 * (l >= k) of ee is accumulated during the scan; the lower triangle is
 * filled by symmetry at the end.
 */
static void alf_get_blk_stats_generic(encoder_state_t* const state,
channel_type channel,
alf_covariance* alf_covariance,
alf_classifier** g_classifier,
kvz_pixel* org,
int32_t org_stride,
kvz_pixel* rec,
int32_t rec_stride,
const int x_pos,
const int y_pos,
const int x_dst,
const int y_dst,
const int width,
const int height,
int vb_ctu_height,
int vb_pos,
short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
{
int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES];
const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
// 13 taps for the 7x7 luma diamond, 7 for the 5x5 chroma diamond.
int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7;
int transpose_idx = 0;
int class_idx = 0;
for (int i = 0; i < height; i++)
{
int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos;
for (int j = 0; j < width; j++)
{
// Skip samples whose classifier entry is marked unused.
if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX)
{
continue;
}
memset(e_local, 0, sizeof(e_local));
if (g_classifier)
{
alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j];
transpose_idx = cl->transpose_idx;
class_idx = cl->class_idx;
}
// Weighted-SSD (m_alfWSSD) mode is disabled; branches kept for reference.
double weight = 1.0;
if (0/*m_alfWSSD*/)
{
//weight = g_luma_level_to_weight_plut[org[j]];
}
int16_t y_local = org[j] - rec[j];
alf_calc_covariance_generic(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values);
for (int k = 0; k < num_coeff; k++)
{
for (int l = k; l < num_coeff; l++)
{
for (int b0 = 0; b0 < num_bins; b0++)
{
for (int b1 = 0; b1 < num_bins; b1++)
{
if (0/*m_alfWSSD*/)
{
alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]);
}
else
{
alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1];
}
}
}
}
for (int b = 0; b < num_bins; b++)
{
if (0/*m_alfWSSD*/)
{
alf_covariance[class_idx].y[b][k] += weight * (e_local[k][b] * (double)y_local);
}
else
{
alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local;
}
}
}
if (0/*m_alfWSSD*/)
{
alf_covariance[class_idx].pix_acc += weight * (y_local * (double)y_local);
}
else
{
alf_covariance[class_idx].pix_acc += y_local * (double)y_local;
}
}
org += org_stride;
rec += rec_stride;
}
// Mirror the accumulated upper triangle into the lower triangle; the bin
// indices swap together with the coefficient indices.
int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1;
for (class_idx = 0; class_idx < num_classes; class_idx++)
{
for (int k = 1; k < num_coeff; k++)
{
for (int l = 0; l < k; l++)
{
for (int b0 = 0; b0 < num_bins; b0++)
{
for (int b1 = 0; b1 < num_bins; b1++)
{
alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k];
}
}
}
}
}
}
int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth)
{
bool success = true;
@ -731,6 +994,8 @@ int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth)
success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "generic", 0, &alf_derive_classification_blk_generic);
success &= kvz_strategyselector_register(opaque, "alf_filter_5x5_blk", "generic", 0, &alf_filter_5x5_block_generic);
success &= kvz_strategyselector_register(opaque, "alf_filter_7x7_blk", "generic", 0, &alf_filter_7x7_block_generic);
success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "generic", 0, &alf_get_blk_stats_generic);
return success;
}

View file

@ -20,6 +20,7 @@
#include "strategies/strategies-alf.h"
#include "strategies/sse41/alf-sse41.h"
#include "strategies/avx2/alf-avx2.h"
#include "strategies/generic/alf-generic.h"
#include "strategyselector.h"
@ -28,6 +29,7 @@
alf_derive_classification_blk_func* kvz_alf_derive_classification_blk;
alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk;
alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk;
alf_get_blk_stats_func* kvz_alf_get_blk_stats;
int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) {
bool success = true;
@ -37,6 +39,9 @@ int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) {
if (kvz_g_hardware_flags.intel_flags.sse41) {
success &= kvz_strategy_register_alf_sse41(opaque, bitdepth);
}
if (kvz_g_hardware_flags.intel_flags.avx2) {
success &= kvz_strategy_register_alf_avx2(opaque, bitdepth);
}
return success;
}

View file

@ -78,10 +78,29 @@ typedef void (alf_filter_7x7_blk_func)(encoder_state_t* const state,
int vb_pos,
const int vb_ctu_height);
typedef void (alf_get_blk_stats_func)(encoder_state_t* const state,
channel_type channel,
alf_covariance* alf_covariance,
alf_classifier** g_classifier,
kvz_pixel* org,
int32_t org_stride,
kvz_pixel* rec,
int32_t rec_stride,
const int x_pos,
const int y_pos,
const int x_dst,
const int y_dst,
const int width,
const int height,
int vb_ctu_height,
int vb_pos,
short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]);
// Declare function pointers.
extern alf_derive_classification_blk_func * kvz_alf_derive_classification_blk;
extern alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk;
extern alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk;
extern alf_get_blk_stats_func* kvz_alf_get_blk_stats;
int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth);
@ -90,5 +109,6 @@ int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth);
{"alf_derive_classification_blk", (void**) &kvz_alf_derive_classification_blk}, \
{"alf_filter_5x5_blk", (void**) &kvz_alf_filter_5x5_blk}, \
{"alf_filter_7x7_blk", (void**) &kvz_alf_filter_7x7_blk}, \
{"alf_get_blk_stats", (void**) &kvz_alf_get_blk_stats}, \