Mirror of https://github.com/ultravideo/uvg266.git (synced 2024-11-24 02:24:07 +00:00)

Merge branch 'alf_optimization'

Commit d4d0af4fa4
@@ -170,6 +170,12 @@
    <ClCompile Include="..\..\src\search.c" />
    <ClCompile Include="..\..\src\search_inter.c" />
    <ClCompile Include="..\..\src\search_intra.c" />
+   <ClCompile Include="..\..\src\strategies\avx2\alf-avx2.c">
+     <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+     <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+     <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+     <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+   </ClCompile>
    <ClCompile Include="..\..\src\strategies\avx2\intra-avx2.c">
      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>

@@ -188,10 +194,13 @@
      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
    </ClCompile>
+   <ClCompile Include="..\..\src\strategies\generic\alf-generic.c" />
    <ClCompile Include="..\..\src\strategies\generic\encode_coding_tree-generic.c" />
    <ClCompile Include="..\..\src\strategies\generic\intra-generic.c" />
    <ClCompile Include="..\..\src\strategies\generic\quant-generic.c" />
    <ClCompile Include="..\..\src\strategies\generic\sao-generic.c" />
+   <ClCompile Include="..\..\src\strategies\sse41\alf-sse41.c" />
+   <ClCompile Include="..\..\src\strategies\strategies-alf.c" />
    <ClCompile Include="..\..\src\strategies\strategies-encode.c" />
    <ClCompile Include="..\..\src\strategies\strategies-intra.c" />
    <ClCompile Include="..\..\src\strategies\strategies-quant.c" />

@@ -261,11 +270,15 @@
    <ClInclude Include="..\..\src\ml_intra_cu_depth_pred.h" />
    <ClInclude Include="..\..\src\search_inter.h" />
    <ClInclude Include="..\..\src\search_intra.h" />
+   <ClInclude Include="..\..\src\strategies\avx2\alf-avx2.h" />
    <ClInclude Include="..\..\src\strategies\avx2\intra-avx2.h" />
    <ClInclude Include="..\..\src\strategies\avx2\sao-avx2.h" />
+   <ClInclude Include="..\..\src\strategies\generic\alf-generic.h" />
    <ClInclude Include="..\..\src\strategies\generic\encode_coding_tree-generic.h" />
    <ClInclude Include="..\..\src\strategies\generic\intra-generic.h" />
    <ClInclude Include="..\..\src\strategies\generic\sao-generic.h" />
+   <ClInclude Include="..\..\src\strategies\sse41\alf-sse41.h" />
+   <ClInclude Include="..\..\src\strategies\strategies-alf.h" />
    <ClInclude Include="..\..\src\strategies\strategies-common.h" />
    <ClInclude Include="..\..\src\strategies\avx2\quant-avx2.h" />
    <ClInclude Include="..\..\src\strategies\generic\quant-generic.h" />

@@ -339,4 +352,4 @@
  <ImportGroup Label="ExtensionTargets">
    <Import Project="..\yasm\vsyasm.targets" />
  </ImportGroup>
-</Project>
+</Project>
@@ -254,6 +254,18 @@
      <Filter>Reconstruction</Filter>
    </ClCompile>
    <ClCompile Include="..\..\src\strategies\strategies-encode.c" />
+   <ClCompile Include="..\..\src\strategies\generic\alf-generic.c">
+     <Filter>Optimization\strategies\generic</Filter>
+   </ClCompile>
+   <ClCompile Include="..\..\src\strategies\strategies-alf.c">
+     <Filter>Optimization\strategies</Filter>
+   </ClCompile>
+   <ClCompile Include="..\..\src\strategies\sse41\alf-sse41.c">
+     <Filter>Optimization\strategies\sse41</Filter>
+   </ClCompile>
+   <ClCompile Include="..\..\src\strategies\avx2\alf-avx2.c">
+     <Filter>Optimization\strategies\avx2</Filter>
+   </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\..\src\bitstream.h">

@@ -473,6 +485,18 @@
    <ClInclude Include="..\..\src\alf.h">
      <Filter>Reconstruction</Filter>
    </ClInclude>
+   <ClInclude Include="..\..\src\strategies\generic\alf-generic.h">
+     <Filter>Optimization\strategies\generic</Filter>
+   </ClInclude>
+   <ClInclude Include="..\..\src\strategies\strategies-alf.h">
+     <Filter>Optimization\strategies</Filter>
+   </ClInclude>
+   <ClInclude Include="..\..\src\strategies\sse41\alf-sse41.h">
+     <Filter>Optimization\strategies\sse41</Filter>
+   </ClInclude>
+   <ClInclude Include="..\..\src\strategies\avx2\alf-avx2.h">
+     <Filter>Optimization\strategies\avx2</Filter>
+   </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <YASM Include="..\..\src\extras\x86inc.asm">
@@ -117,6 +117,8 @@ libkvazaar_la_SOURCES = \
  transform.h \
  videoframe.c \
  videoframe.h \
+ strategies/generic/alf-generic.c \
+ strategies/generic/alf-generic.h \
  strategies/generic/dct-generic.c \
  strategies/generic/dct-generic.h \
  strategies/generic/intra-generic.c \

@@ -137,6 +139,8 @@ libkvazaar_la_SOURCES = \
  strategies/optimized_sad_func_ptr_t.h \
  strategies/generic/sao_shared_generics.h \
  strategies/strategies-common.h \
+ strategies/strategies-alf.c \
+ strategies/strategies-alf.h \
  strategies/strategies-dct.c \
  strategies/strategies-dct.h \
  strategies/strategies-intra.c \

@@ -192,7 +196,9 @@ libavx2_la_SOURCES = \
  strategies/avx2/quant-avx2.h \
  strategies/avx2/reg_sad_pow2_widths-avx2.h \
  strategies/avx2/sao-avx2.c \
- strategies/avx2/sao-avx2.h
+ strategies/avx2/sao-avx2.h \
+ strategies/avx2/alf-avx2.c \
+ strategies/avx2/alf-avx2.h
# strategies/avx2/encode_coding_tree-avx2.c \
# strategies/avx2/encode_coding_tree-avx2.h

@@ -203,7 +209,9 @@ libsse2_la_SOURCES = \
libsse41_la_SOURCES = \
  strategies/sse41/picture-sse41.c \
  strategies/sse41/picture-sse41.h \
- strategies/sse41/reg_sad_pow2_widths-sse41.h
+ strategies/sse41/reg_sad_pow2_widths-sse41.h \
+ strategies/sse41/alf-sse41.c \
+ strategies/sse41/alf-sse41.h

if HAVE_PPC
@@ -176,8 +176,8 @@ typedef enum {
 PACK(
 typedef struct alf_covariance {
   double pix_acc;
-  int64_t ee[MAX_ALF_NUM_CLIPPING_VALUES][MAX_ALF_NUM_CLIPPING_VALUES][MAX_NUM_ALF_LUMA_COEFF][MAX_NUM_ALF_LUMA_COEFF];
-  int32_t y[MAX_ALF_NUM_CLIPPING_VALUES][MAX_NUM_ALF_LUMA_COEFF];
+  int64_t ee[MAX_NUM_ALF_LUMA_COEFF][MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES][MAX_ALF_NUM_CLIPPING_VALUES];
+  int32_t y[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES];
   int num_coeff;
   int num_bins;
 } alf_covariance;)
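The two members are re-declared with their array dimensions swapped: the clipping-bin indices move innermost, so the four bins of ee[k][l][b0][.] and of y[k][.] become contiguous in memory, and the new SIMD statistics code can update all four with a single vector load/store. A minimal sketch of the resulting access pattern (illustrative names only, not part of the commit):

#include <immintrin.h>
#include <stdint.h>

#define NUM_BINS 4 /* MAX_ALF_NUM_CLIPPING_VALUES */

/* Sketch: with the bins innermost, the four int64_t accumulators of
 * ee[k][l][b0][0..3] are adjacent, so one 256-bit load/add/store updates
 * all four at once.  e_bins holds the four bin values widened to int64
 * (low 32 bits significant), as _mm256_cvtepi16_epi64 produces in the
 * AVX2 code further below. */
static void add_products_to_bins(int64_t ee_row[NUM_BINS],
                                 int32_t e_b0,
                                 const int64_t e_bins[NUM_BINS])
{
  __m256i bins = _mm256_loadu_si256((const __m256i *)e_bins);
  __m256i prod = _mm256_mul_epi32(_mm256_set1_epi32(e_b0), bins);
  __m256i acc  = _mm256_loadu_si256((const __m256i *)ee_row);
  _mm256_storeu_si256((__m256i *)ee_row, _mm256_add_epi64(acc, prod));
}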
@@ -80,7 +80,6 @@ extern const uint32_t kvz_entropy_bits[512];

// Floating point fractional bits, derived from kvz_entropy_bits
extern const float kvz_f_entropy_bits[512];
// ToDo: generate a new table for VVC?
#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]

#endif
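CTX_ENTROPY_FBITS looks up the rate, in fractional bits, of coding one binary symbol val with CABAC context ctx, presumably feeding the ALF search's rate estimates on this branch. A hedged sketch of a call site (cabac_ctx_t and CTX_STATE are assumed to come from cabac.h):

#include "cabac.h"  /* assumed home of cabac_ctx_t and CTX_STATE */

/* Sketch: estimated rate in fractional bits of coding one bin. */
static float bin_cost(const cabac_ctx_t *ctx, uint32_t val)
{
  return CTX_ENTROPY_FBITS(ctx, val);
}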

src/strategies/avx2/alf-avx2.c (new file, 308 lines)

@@ -0,0 +1,308 @@
/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (C) 2013-2021 Tampere University of Technology and others (see
 * COPYING file).
 *
 * Kvazaar is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the
 * Free Software Foundation; either version 2.1 of the License, or (at your
 * option) any later version.
 *
 * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/

#include "global.h"

#include "strategies/avx2/alf-avx2.h"

#if COMPILE_INTEL_AVX2
#include "kvazaar.h"
#if KVZ_BIT_DEPTH == 8

#include <immintrin.h>
#include <stdlib.h>

#include "strategyselector.h"

static int16_t clip_alf(const int16_t clip, const int16_t ref, const int16_t val0, const int16_t val1)
{
  return CLIP(-clip, +clip, val0 - ref) + CLIP(-clip, +clip, val1 - ref);
}

#define ALF_CLIP_AND_ADD(VAL0,VAL1) __m128i clips = _mm_loadl_epi64((__m128i*) clip); \
  __m128i neg_clips = _mm_sign_epi16(clips, negate); \
  __m128i val0 = _mm_set1_epi16((VAL0 - curr));\
  __m128i val1 = _mm_set1_epi16((VAL1 - curr));\
  __m128i min_clips_val0 = _mm_min_epi16(clips, val0);\
  __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);\
  \
  __m128i min_clips_val1 = _mm_min_epi16(clips, val1);\
  __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);\
  \
  __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);\
  __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));\
  _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);

static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES],
  const kvz_pixel* rec,
  const int stride,
  const channel_type channel,
  const int transpose_idx,
  int vb_distance,
  short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
{
  static const int alf_pattern_5[13] = {
    0,
    1, 2, 3,
    4, 5, 6, 5, 4,
    3, 2, 1,
    0
  };

  static const int alf_pattern_7[25] = {
    0,
    1, 2, 3,
    4, 5, 6, 7, 8,
    9, 10, 11, 12, 11, 10, 9,
    8, 7, 6, 5, 4,
    3, 2, 1,
    0
  };

  int clip_top_row = -4;
  int clip_bot_row = 4;
  if (vb_distance >= -3 && vb_distance < 0)
  {
    clip_bot_row = -vb_distance - 1;
    clip_top_row = -clip_bot_row; // symmetric
  }
  else if (vb_distance >= 0 && vb_distance < 3)
  {
    clip_top_row = -vb_distance;
    clip_bot_row = -clip_top_row; // symmetric
  }

  const bool is_luma = channel == CHANNEL_TYPE_LUMA;
  const int* filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5;
  const int half_filter_length = (is_luma ? 7 : 5) >> 1;
  const short* clip = alf_clipping_values[channel];
  const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;

  int k = 0;

  const int16_t curr = rec[0];

  const __m128i negate = _mm_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1);

  if (transpose_idx == 0)
  {
    for (int i = -half_filter_length; i < 0; i++)
    {
      const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
      const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
      for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++)
      {
        ALF_CLIP_AND_ADD(rec0[j], rec1[-j]);
      }
    }
    for (int j = -half_filter_length; j < 0; j++, k++)
    {
      ALF_CLIP_AND_ADD(rec[j], rec[-j]);
    }
  }
  else if (transpose_idx == 1)
  {
    for (int j = -half_filter_length; j < 0; j++)
    {
      const kvz_pixel* rec0 = rec + j;
      const kvz_pixel* rec1 = rec - j;

      for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++)
      {
        ALF_CLIP_AND_ADD(rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]);
      }
    }
    for (int i = -half_filter_length; i < 0; i++, k++)
    {
      ALF_CLIP_AND_ADD(rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]);
    }
  }
  else if (transpose_idx == 2)
  {
    for (int i = -half_filter_length; i < 0; i++)
    {
      const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
      const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;

      for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++)
      {
        ALF_CLIP_AND_ADD(rec0[j], rec1[-j]);
      }
    }
    for (int j = -half_filter_length; j < 0; j++, k++)
    {
      ALF_CLIP_AND_ADD(rec[j], rec[-j]);
    }
  }
  else
  {
    for (int j = -half_filter_length; j < 0; j++)
    {
      const kvz_pixel* rec0 = rec + j;
      const kvz_pixel* rec1 = rec - j;

      for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++)
      {
        ALF_CLIP_AND_ADD(rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]);
      }
    }
    for (int i = -half_filter_length; i < 0; i++, k++)
    {
      ALF_CLIP_AND_ADD(rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]);
    }
  }

  __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
  __m128i result = _mm_add_epi16(e_local_original, _mm_set1_epi16(curr));
  _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);

}

static void alf_get_blk_stats_avx2(encoder_state_t* const state,
  channel_type channel,
  alf_covariance* alf_covariance,
  alf_classifier** g_classifier,
  kvz_pixel* org,
  int32_t org_stride,
  kvz_pixel* rec,
  int32_t rec_stride,
  const int x_pos,
  const int y_pos,
  const int x_dst,
  const int y_dst,
  const int width,
  const int height,
  int vb_ctu_height,
  int vb_pos,
  short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
{
  int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES];

  const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;

  int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7;
  int transpose_idx = 0;
  int class_idx = 0;

  for (int i = 0; i < height; i++)
  {
    int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos;
    for (int j = 0; j < width; j++)
    {
      if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX)
      {
        continue;
      }
      memset(e_local, 0, sizeof(e_local));
      if (g_classifier)
      {
        alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j];
        transpose_idx = cl->transpose_idx;
        class_idx = cl->class_idx;
      }

      int16_t y_local = org[j] - rec[j];

      //__m256i const perm_mask = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, 0);

      __m256i y_local_32 = _mm256_set1_epi32(y_local);
      alf_calc_covariance_avx2(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values);
      for (int k = 0; k < num_coeff; k++)
      {
        for (int l = k; l < num_coeff; l++)
        {
          for (int b0 = 0; b0 < 4; b0++)
          {
            if (!e_local[k][b0]) continue;
            __m256i e_local_b0_d = _mm256_set1_epi32((int32_t)e_local[k][b0]);
            /*for (int b1 = 0; b1 < 4; b1++)
            {
              alf_covariance[class_idx].ee[k][l][b0][b1] += e_local[k][b0] * (double)e_local[l][b1];
            }*/

            __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]);
            __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1);
            __m256i multiplied = _mm256_mul_epi32(e_local_b0_d, e_local_32);
            __m256i orig = _mm256_loadu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0]);
            _mm256_storeu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0], _mm256_add_epi64(multiplied, orig));

          }
        }
        /*
        for (int b = 0; b < 4; b++)
        {
          alf_covariance[class_idx].y[k][b] += e_local[k][b] * (double)y_local;
        }*/

      }
      for (int k = 0; k < num_coeff-1; k+=2)
      {
        __m128i e_local_1 = _mm_loadu_si128((__m128i*) & e_local[k][0]);
        __m256i e_local_32 = _mm256_cvtepi16_epi32(e_local_1);
        __m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32);
        __m256i orig = _mm256_loadu_si256((__m256i*) & alf_covariance[class_idx].y[k][0]);
        _mm256_storeu_si256((__m256i*)alf_covariance[class_idx].y[k], _mm256_add_epi32(multiplied, orig));
      }
      __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[num_coeff-1][0]);
      __m256i e_local_32 = _mm256_cvtepi16_epi32(e_local_1);
      __m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32);
      __m128i orig = _mm_loadu_si128((__m128i*) & alf_covariance[class_idx].y[num_coeff - 1][0]);
      _mm_storeu_si128((__m128i*)alf_covariance[class_idx].y[num_coeff - 1], _mm_add_epi32(_mm256_castsi256_si128(multiplied), orig));

      alf_covariance[class_idx].pix_acc += y_local * (double)y_local;
    }
    org += org_stride;
    rec += rec_stride;
  }

  int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1;
  for (class_idx = 0; class_idx < num_classes; class_idx++)
  {
    for (int k = 1; k < num_coeff; k++)
    {
      for (int l = 0; l < k; l++)
      {
        for (int b0 = 0; b0 < 4; b0++)
        {
          for (int b1 = 0; b1 < 4; b1++)
          {
            alf_covariance[class_idx].ee[k][l][b0][b1] = alf_covariance[class_idx].ee[l][k][b1][b0];
          }
        }
      }
    }
  }
}

#endif // KVZ_BIT_DEPTH == 8
#endif //COMPILE_INTEL_AVX2


int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth) {
  bool success = true;
#if COMPILE_INTEL_AVX2
#if KVZ_BIT_DEPTH == 8
  if (bitdepth == 8){
    success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "avx2", 40, &alf_get_blk_stats_avx2);
  }
#endif // KVZ_BIT_DEPTH == 8
#endif
  return success;
}
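The ALF_CLIP_AND_ADD macro above vectorizes, over the four candidate clipping values at once, the per-bin update that the generic path performs with clip_alf. A scalar sketch of one invocation, using the same names as the file above (illustrative only):

/* Scalar equivalent of one ALF_CLIP_AND_ADD(VAL0, VAL1) invocation:
 * accumulate the clipped sum of the two symmetric neighbour differences
 * into every clipping bin of the current filter tap. */
for (int b = 0; b < MAX_ALF_NUM_CLIPPING_VALUES; b++) {
  e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, VAL0, VAL1);
}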

src/strategies/avx2/alf-avx2.h (new file, 32 lines)

@@ -0,0 +1,32 @@
#pragma once
/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (C) 2013-2021 Tampere University of Technology and others (see
 * COPYING file).
 *
 * Kvazaar is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the
 * Free Software Foundation; either version 2.1 of the License, or (at your
 * option) any later version.
 *
 * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/

/**
 * \ingroup Optimization
 * \file
 * Optimizations for AVX2.
 */

#include "global.h" // IWYU pragma: keep
#include "kvazaar.h"

int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth);

src/strategies/generic/alf-generic.c (new file, 1001 lines)

File diff suppressed because it is too large.

src/strategies/generic/alf-generic.h (new file, 31 lines)

@@ -0,0 +1,31 @@
#pragma once
/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (C) 2013-2021 Tampere University of Technology and others (see
 * COPYING file).
 *
 * Kvazaar is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the
 * Free Software Foundation; either version 2.1 of the License, or (at your
 * option) any later version.
 *
 * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/

/**
 * \ingroup Optimization
 * \file
 * Generic C implementations of optimized functions.
 */

#include "global.h" // IWYU pragma: keep

int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth);

src/strategies/sse41/alf-sse41.c (new file, 762 lines)

@@ -0,0 +1,762 @@
/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (C) 2013-2021 Tampere University of Technology and others (see
 * COPYING file).
 *
 * Kvazaar is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the
 * Free Software Foundation; either version 2.1 of the License, or (at your
 * option) any later version.
 *
 * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/

#include "global.h"

#if COMPILE_INTEL_SSE41
#include "kvazaar.h"
#if KVZ_BIT_DEPTH == 8
#include "strategies/sse41/alf-sse41.h"

#include <immintrin.h>
#include <stdlib.h>

#include "strategyselector.h"

static void alf_derive_classification_blk_sse41(encoder_state_t * const state,
  const int shift,
  const int n_height,
  const int n_width,
  const int blk_pos_x,
  const int blk_pos_y,
  const int blk_dst_x,
  const int blk_dst_y,
  const int vb_ctu_height,
  int vb_pos)
{
  videoframe_t* const frame = state->tile->frame;
  const size_t imgStride = frame->rec->stride;
  const kvz_pixel * srcExt = state->tile->frame->rec->y;

  const int imgHExtended = n_height + 4;
  const int imgWExtended = n_width + 4;

  const int posX = blk_pos_x;
  const int posY = blk_pos_y;

  alf_classifier** classifier = state->tile->frame->alf_info->classifier;

  // 18x40 array
  uint16_t colSums[(CLASSIFICATION_BLK_SIZE + 4) >> 1]
                  [CLASSIFICATION_BLK_SIZE + 8];

  for (int i = 0; i < imgHExtended; i += 2)
  {
    const size_t offset = (i + posY - 3) * imgStride + posX - 3;

    const kvz_pixel* imgY0 = &srcExt[offset];
    const kvz_pixel* imgY1 = &srcExt[offset + imgStride];
    const kvz_pixel* imgY2 = &srcExt[offset + imgStride * 2];
    const kvz_pixel* imgY3 = &srcExt[offset + imgStride * 3];

    // pixel padding for gradient calculation
    int pos = blk_dst_y - 2 + i;
    int posInCTU = pos & (vb_ctu_height - 1);
    if (pos > 0 && posInCTU == vb_pos - 2)
    {
      imgY3 = imgY2;
    }
    else if (pos > 0 && posInCTU == vb_pos)
    {
      imgY0 = imgY1;
    }

    __m128i prev = _mm_setzero_si128();

    for (int j = 0; j < imgWExtended; j += 8)
    {
      const __m128i x00 = _mm_loadu_si128((const __m128i*) (imgY0 + j));
      const __m128i x01 = _mm_loadu_si128((const __m128i*) (imgY1 + j));
      const __m128i x02 = _mm_loadu_si128((const __m128i*) (imgY2 + j));
      const __m128i x03 = _mm_loadu_si128((const __m128i*) (imgY3 + j));

      const __m128i x04 = _mm_loadu_si128((const __m128i*) (imgY0 + j + 2));
      const __m128i x05 = _mm_loadu_si128((const __m128i*) (imgY1 + j + 2));
      const __m128i x06 = _mm_loadu_si128((const __m128i*) (imgY2 + j + 2));
      const __m128i x07 = _mm_loadu_si128((const __m128i*) (imgY3 + j + 2));

      const __m128i x0 = _mm_unpacklo_epi8(x00, _mm_setzero_si128());
      const __m128i x1 = _mm_unpacklo_epi8(x01, _mm_setzero_si128());
      const __m128i x2 = _mm_unpacklo_epi8(x02, _mm_setzero_si128());
      const __m128i x3 = _mm_unpacklo_epi8(x03, _mm_setzero_si128());

      const __m128i x4 = _mm_unpacklo_epi8(x04, _mm_setzero_si128());
      const __m128i x5 = _mm_unpacklo_epi8(x05, _mm_setzero_si128());
      const __m128i x6 = _mm_unpacklo_epi8(x06, _mm_setzero_si128());
      const __m128i x7 = _mm_unpacklo_epi8(x07, _mm_setzero_si128());

      const __m128i nw = _mm_blend_epi16(x0, x1, 0xaa);
      const __m128i n = _mm_blend_epi16(x0, x5, 0x55);
      const __m128i ne = _mm_blend_epi16(x4, x5, 0xaa);
      const __m128i w = _mm_blend_epi16(x1, x2, 0xaa);
      const __m128i e = _mm_blend_epi16(x5, x6, 0xaa);
      const __m128i sw = _mm_blend_epi16(x2, x3, 0xaa);
      const __m128i s = _mm_blend_epi16(x2, x7, 0x55);
      const __m128i se = _mm_blend_epi16(x6, x7, 0xaa);

      __m128i c = _mm_blend_epi16(x1, x6, 0x55);
      c = _mm_add_epi16(c, c);
      __m128i d = _mm_shuffle_epi8(c, _mm_setr_epi8(2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13));

      const __m128i ver = _mm_abs_epi16(_mm_sub_epi16(c, _mm_add_epi16(n, s)));
      const __m128i hor = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(w, e)));
      const __m128i di0 = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(nw, se)));
      const __m128i di1 = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(ne, sw)));

      const __m128i hv = _mm_hadd_epi16(ver, hor);
      const __m128i di = _mm_hadd_epi16(di0, di1);
      const __m128i all = _mm_hadd_epi16(hv, di);

      const __m128i t = _mm_blend_epi16(all, prev, 0xaa);
      _mm_storeu_si128((__m128i*) & colSums[i >> 1][j], _mm_hadd_epi16(t, all));
      prev = all;

      if (j + 8 < imgWExtended)
      {
        j += 8;

        const __m128i x0 = _mm_unpackhi_epi8(x00, _mm_setzero_si128());
        const __m128i x1 = _mm_unpackhi_epi8(x01, _mm_setzero_si128());
        const __m128i x2 = _mm_unpackhi_epi8(x02, _mm_setzero_si128());
        const __m128i x3 = _mm_unpackhi_epi8(x03, _mm_setzero_si128());

        const __m128i x4 = _mm_unpackhi_epi8(x04, _mm_setzero_si128());
        const __m128i x5 = _mm_unpackhi_epi8(x05, _mm_setzero_si128());
        const __m128i x6 = _mm_unpackhi_epi8(x06, _mm_setzero_si128());
        const __m128i x7 = _mm_unpackhi_epi8(x07, _mm_setzero_si128());

        const __m128i nw = _mm_blend_epi16(x0, x1, 0xaa);
        const __m128i n = _mm_blend_epi16(x0, x5, 0x55);
        const __m128i ne = _mm_blend_epi16(x4, x5, 0xaa);
        const __m128i w = _mm_blend_epi16(x1, x2, 0xaa);
        const __m128i e = _mm_blend_epi16(x5, x6, 0xaa);
        const __m128i sw = _mm_blend_epi16(x2, x3, 0xaa);
        const __m128i s = _mm_blend_epi16(x2, x7, 0x55);
        const __m128i se = _mm_blend_epi16(x6, x7, 0xaa);

        __m128i c = _mm_blend_epi16(x1, x6, 0x55);
        c = _mm_add_epi16(c, c);
        __m128i d = _mm_shuffle_epi8(c, _mm_setr_epi8(2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13));

        const __m128i ver = _mm_abs_epi16(_mm_sub_epi16(c, _mm_add_epi16(n, s)));
        const __m128i hor = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(w, e)));
        const __m128i di0 = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(nw, se)));
        const __m128i di1 = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(ne, sw)));

        const __m128i hv = _mm_hadd_epi16(ver, hor);
        const __m128i di = _mm_hadd_epi16(di0, di1);
        const __m128i all = _mm_hadd_epi16(hv, di);

        const __m128i t = _mm_blend_epi16(all, prev, 0xaa);
        _mm_storeu_si128((__m128i*) & colSums[i >> 1][j], _mm_hadd_epi16(t, all));
        prev = all;
      }
    }
  }

  for (int i = 0; i < (n_height >> 1); i += 4)
  {
    for (int j = 0; j < n_width; j += 8)
    {
      __m128i x0, x1, x2, x3, x4, x5, x6, x7;

      const uint32_t z = (2 * i + blk_pos_y) & (vb_ctu_height - 1);
      const uint32_t z2 = (2 * i + 4 + blk_pos_y) & (vb_ctu_height - 1);

      x0 = (z == vb_pos) ? _mm_setzero_si128() : _mm_loadu_si128((__m128i *) &colSums[i + 0][j + 4]);
      x1 = _mm_loadu_si128((__m128i *) &colSums[i + 1][j + 4]);
      x2 = _mm_loadu_si128((__m128i *) &colSums[i + 2][j + 4]);
      x3 = (z == vb_pos - 4) ? _mm_setzero_si128() : _mm_loadu_si128((__m128i *) &colSums[i + 3][j + 4]);

      x4 = (z2 == vb_pos) ? _mm_setzero_si128() : _mm_loadu_si128((__m128i *) &colSums[i + 2][j + 4]);
      x5 = _mm_loadu_si128((__m128i *) &colSums[i + 3][j + 4]);
      x6 = _mm_loadu_si128((__m128i *) &colSums[i + 4][j + 4]);
      x7 = (z2 == vb_pos - 4) ? _mm_setzero_si128() : _mm_loadu_si128((__m128i *) &colSums[i + 5][j + 4]);

      __m128i x0l = _mm_cvtepu16_epi32(x0);
      __m128i x0h = _mm_unpackhi_epi16(x0, _mm_setzero_si128());
      __m128i x1l = _mm_cvtepu16_epi32(x1);
      __m128i x1h = _mm_unpackhi_epi16(x1, _mm_setzero_si128());
      __m128i x2l = _mm_cvtepu16_epi32(x2);
      __m128i x2h = _mm_unpackhi_epi16(x2, _mm_setzero_si128());
      __m128i x3l = _mm_cvtepu16_epi32(x3);
      __m128i x3h = _mm_unpackhi_epi16(x3, _mm_setzero_si128());
      __m128i x4l = _mm_cvtepu16_epi32(x4);
      __m128i x4h = _mm_unpackhi_epi16(x4, _mm_setzero_si128());
      __m128i x5l = _mm_cvtepu16_epi32(x5);
      __m128i x5h = _mm_unpackhi_epi16(x5, _mm_setzero_si128());
      __m128i x6l = _mm_cvtepu16_epi32(x6);
      __m128i x6h = _mm_unpackhi_epi16(x6, _mm_setzero_si128());
      __m128i x7l = _mm_cvtepu16_epi32(x7);
      __m128i x7h = _mm_unpackhi_epi16(x7, _mm_setzero_si128());

      x0l = _mm_add_epi32(x0l, x1l);
      x2l = _mm_add_epi32(x2l, x3l);
      x4l = _mm_add_epi32(x4l, x5l);
      x6l = _mm_add_epi32(x6l, x7l);
      x0h = _mm_add_epi32(x0h, x1h);
      x2h = _mm_add_epi32(x2h, x3h);
      x4h = _mm_add_epi32(x4h, x5h);
      x6h = _mm_add_epi32(x6h, x7h);

      x0l = _mm_add_epi32(x0l, x2l);
      x4l = _mm_add_epi32(x4l, x6l);
      x0h = _mm_add_epi32(x0h, x2h);
      x4h = _mm_add_epi32(x4h, x6h);

      x2l = _mm_unpacklo_epi32(x0l, x4l);
      x2h = _mm_unpackhi_epi32(x0l, x4l);
      x6l = _mm_unpacklo_epi32(x0h, x4h);
      x6h = _mm_unpackhi_epi32(x0h, x4h);

      __m128i sumV = _mm_unpacklo_epi32(x2l, x6l);
      __m128i sumH = _mm_unpackhi_epi32(x2l, x6l);
      __m128i sumD0 = _mm_unpacklo_epi32(x2h, x6h);
      __m128i sumD1 = _mm_unpackhi_epi32(x2h, x6h);

      // uint32_t tempAct = sumV + sumH;
      __m128i tempAct = _mm_add_epi32(sumV, sumH);

      // const uint32_t activity = std::min<uint32_t>(15, tempAct * scale >> shift);
      // static const uint8_t th[16] = { 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 };
      // uint8_t class_idx = th[activity];
      const uint32_t scale = (z == vb_pos - 4 || z == vb_pos) ? 96 : 64;
      const uint32_t scale2 = (z2 == vb_pos - 4 || z2 == vb_pos) ? 96 : 64;
      __m128i activity = _mm_mullo_epi32(tempAct, _mm_unpacklo_epi64(_mm_set1_epi32(scale), _mm_set1_epi32(scale2)));
      activity = _mm_srl_epi32(activity, _mm_cvtsi32_si128(shift));
      activity = _mm_min_epi32(activity, _mm_set1_epi32(15));
      __m128i class_idx = _mm_shuffle_epi8(_mm_setr_epi8(0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4), activity);

      // if (sumV > sumH)
      // {
      //   hv1 = sumV;
      //   hv0 = sumH;
      //   dirTempHV = 0;
      // }
      // else
      // {
      //   hv1 = sumH;
      //   hv0 = sumV;
      //   dirTempHV = 1;
      // }
      __m128i dirTempHVMinus1 = _mm_cmpgt_epi32(sumV, sumH);
      __m128i hv1 = _mm_max_epi32(sumV, sumH);
      __m128i hv0 = _mm_min_epi32(sumV, sumH);

      // if (sumD0 > sumD1)
      // {
      //   d1 = sumD0;
      //   d0 = sumD1;
      //   dirTempD = 0;
      // }
      // else
      // {
      //   d1 = sumD1;
      //   d0 = sumD0;
      //   dirTempD = 1;
      // }
      __m128i dirTempDMinus1 = _mm_cmpgt_epi32(sumD0, sumD1);
      __m128i d1 = _mm_max_epi32(sumD0, sumD1);
      __m128i d0 = _mm_min_epi32(sumD0, sumD1);

      // int dirIdx;
      // if (d1 * hv0 > hv1 * d0)
      // {
      //   hvd1 = d1;
      //   hvd0 = d0;
      //   dirIdx = 0;
      // }
      // else
      // {
      //   hvd1 = hv1;
      //   hvd0 = hv0;
      //   dirIdx = 2;
      // }
      __m128i a = _mm_xor_si128(_mm_mullo_epi32(d1, hv0), _mm_set1_epi32(0x80000000));
      __m128i b = _mm_xor_si128(_mm_mullo_epi32(hv1, d0), _mm_set1_epi32(0x80000000));
      __m128i dirIdx = _mm_cmpgt_epi32(a, b);
      __m128i hvd1 = _mm_blendv_epi8(hv1, d1, dirIdx);
      __m128i hvd0 = _mm_blendv_epi8(hv0, d0, dirIdx);

      // if (hvd1 * 2 > 9 * hvd0)
      // {
      //   class_idx += (dirIdx + 2) * 5;
      // }
      // else if (hvd1 > 2 * hvd0)
      // {
      //   class_idx += (dirIdx + 1) * 5;
      // }
      __m128i strength1 = _mm_cmpgt_epi32(hvd1, _mm_add_epi32(hvd0, hvd0));
      __m128i strength2 = _mm_cmpgt_epi32(_mm_add_epi32(hvd1, hvd1), _mm_add_epi32(hvd0, _mm_slli_epi32(hvd0, 3)));
      __m128i offset = _mm_and_si128(strength1, _mm_set1_epi32(5));
      class_idx = _mm_add_epi32(class_idx, offset);
      class_idx = _mm_add_epi32(class_idx, _mm_and_si128(strength2, _mm_set1_epi32(5)));
      offset = _mm_andnot_si128(dirIdx, offset);
      offset = _mm_add_epi32(offset, offset);
      class_idx = _mm_add_epi32(class_idx, offset);

      // uint8_t transpose_idx = 2 * dirTempD + dirTempHV;
      __m128i transpose_idx = _mm_set1_epi32(3);
      transpose_idx = _mm_add_epi32(transpose_idx, dirTempHVMinus1);
      transpose_idx = _mm_add_epi32(transpose_idx, dirTempDMinus1);
      transpose_idx = _mm_add_epi32(transpose_idx, dirTempDMinus1);

      int yOffset = 2 * i + blk_pos_y;
      int xOffset = j + blk_pos_x;

      static_assert(sizeof(alf_classifier) == 2, "alf_classifier type must be 16 bits wide");
      __m128i v;
      v = _mm_unpacklo_epi8(class_idx, transpose_idx);
      v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9));
      _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset] + xOffset), v);
      _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 1] + xOffset), v);
      _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 2] + xOffset), v);
      _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 3] + xOffset), v);
      v = _mm_unpackhi_epi8(class_idx, transpose_idx);
      v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9));
      _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 4] + xOffset), v);
      _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 5] + xOffset), v);
      _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 6] + xOffset), v);
      _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 7] + xOffset), v);
    }
  }
}


INLINE static void process2coeffs_5x5(__m128i params[2][3], __m128i *cur, __m128i *accumA, __m128i *accumB, const int i, const kvz_pixel* ptr0, const kvz_pixel* ptr1, const kvz_pixel* ptr2, const kvz_pixel* ptr3) {
  const __m128i val00 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr0), _mm_setzero_si128()), *cur);
  const __m128i val10 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr2), _mm_setzero_si128()), *cur);
  const __m128i val01 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr1), _mm_setzero_si128()), *cur);
  const __m128i val11 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr3), _mm_setzero_si128()), *cur);
  __m128i val01A = _mm_unpacklo_epi16(val00, val10);
  __m128i val01B = _mm_unpackhi_epi16(val00, val10);
  __m128i val01C = _mm_unpacklo_epi16(val01, val11);
  __m128i val01D = _mm_unpackhi_epi16(val01, val11);

  __m128i limit01A = params[1][i];

  val01A = _mm_min_epi16(val01A, limit01A);
  val01B = _mm_min_epi16(val01B, limit01A);
  val01C = _mm_min_epi16(val01C, limit01A);
  val01D = _mm_min_epi16(val01D, limit01A);

  limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A);

  val01A = _mm_max_epi16(val01A, limit01A);
  val01B = _mm_max_epi16(val01B, limit01A);
  val01C = _mm_max_epi16(val01C, limit01A);
  val01D = _mm_max_epi16(val01D, limit01A);

  val01A = _mm_add_epi16(val01A, val01C);
  val01B = _mm_add_epi16(val01B, val01D);

  __m128i coeff01A = params[0][i];

  *accumA = _mm_add_epi32(*accumA, _mm_madd_epi16(val01A, coeff01A));
  *accumB = _mm_add_epi32(*accumB, _mm_madd_epi16(val01B, coeff01A));
};


static void alf_filter_5x5_block_sse41(encoder_state_t* const state,
  const kvz_pixel* src_pixels,
  kvz_pixel* dst_pixels,
  const int src_stride,
  const int dst_stride,
  const short* filter_set,
  const int16_t* fClipSet,
  clp_rng clp_rng,
  const int width,
  const int height,
  int x_pos,
  int y_pos,
  int blk_dst_x,
  int blk_dst_y,
  int vb_pos,
  const int vb_ctu_height)
{

  assert((vb_ctu_height & (vb_ctu_height - 1)) == 0 && "vb_ctu_height must be a power of 2");

  alf_component_id compId = COMPONENT_Cb;

  const size_t srcStride = src_stride;
  const size_t dstStride = dst_stride;

  const int SHIFT = state->encoder_control->bitdepth - 1;
  const int ROUND = 1 << (SHIFT - 1);
  const __m128i mmOffset1 = _mm_set1_epi32((1 << ((SHIFT + 3) - 1)) - ROUND);

  const size_t STEP_X = 8;
  const size_t STEP_Y = 4;

  assert(y_pos % STEP_Y == 0 && "Wrong startHeight in filtering");
  assert(x_pos % STEP_X == 0 && "Wrong startWidth in filtering");
  assert(height % STEP_Y == 0 && "Wrong endHeight in filtering");
  assert(width % 4 == 0 && "Wrong endWidth in filtering");

  const kvz_pixel* src = src_pixels + y_pos * srcStride + x_pos;
  kvz_pixel* dst = dst_pixels + blk_dst_y * dstStride + blk_dst_x;

  const __m128i mmOffset = _mm_set1_epi32(ROUND);
  const __m128i mmMin = _mm_set1_epi16(clp_rng.min);
  const __m128i mmMax = _mm_set1_epi16(clp_rng.max);

  __m128i params[2][3];
  __m128i fs = _mm_loadu_si128((__m128i*) filter_set);
  params[0][0] = _mm_shuffle_epi32(fs, 0x00);
  params[0][1] = _mm_shuffle_epi32(fs, 0x55);
  params[0][2] = _mm_shuffle_epi32(fs, 0xaa);
  __m128i fc = _mm_loadu_si128((__m128i*) fClipSet);
  params[1][0] = _mm_shuffle_epi32(fc, 0x00);
  params[1][1] = _mm_shuffle_epi32(fc, 0x55);
  params[1][2] = _mm_shuffle_epi32(fc, 0xaa);

  const __m128i mask = _mm_set_epi8(16, 16, 16, 16, 16, 16, 16, 16, 14, 12, 10, 8, 6, 4, 2, 0);

  for (size_t i = 0; i < height; i += STEP_Y)
  {
    for (size_t j = 0; j < width; j += STEP_X)
    {
      for (size_t ii = 0; ii < STEP_Y; ii++)
      {
        const kvz_pixel* pImg0, * pImg1, * pImg2, * pImg3, * pImg4;

        pImg0 = src + j + ii * srcStride;
        pImg1 = pImg0 + srcStride;
        pImg2 = pImg0 - srcStride;
        pImg3 = pImg1 + srcStride;
        pImg4 = pImg2 - srcStride;

        const int yVb = (blk_dst_y + i + ii) & (vb_ctu_height - 1);
        if (yVb < vb_pos && (yVb >= vb_pos - 2)) // above
        {
          pImg1 = (yVb == vb_pos - 1) ? pImg0 : pImg1;
          pImg3 = (yVb >= vb_pos - 2) ? pImg1 : pImg3;

          pImg2 = (yVb == vb_pos - 1) ? pImg0 : pImg2;
          pImg4 = (yVb >= vb_pos - 2) ? pImg2 : pImg4;
        }
        else if (yVb >= vb_pos && (yVb <= vb_pos + 1)) // bottom
        {
          pImg2 = (yVb == vb_pos) ? pImg0 : pImg2;
          pImg4 = (yVb <= vb_pos + 1) ? pImg2 : pImg4;

          pImg1 = (yVb == vb_pos) ? pImg0 : pImg1;
          pImg3 = (yVb <= vb_pos + 1) ? pImg1 : pImg3;
        }
        __m128i cur = _mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) pImg0), _mm_setzero_si128());

        __m128i accumA = mmOffset;
        __m128i accumB = mmOffset;

        process2coeffs_5x5(params, &cur, &accumA, &accumB, 0, pImg3 + 0, pImg4 + 0, pImg1 + 1, pImg2 - 1);
        process2coeffs_5x5(params, &cur, &accumA, &accumB, 1, pImg1 + 0, pImg2 + 0, pImg1 - 1, pImg2 + 1);
        process2coeffs_5x5(params, &cur, &accumA, &accumB, 2, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1);
        bool isNearVBabove = yVb < vb_pos && (yVb >= vb_pos - 1);
        bool isNearVBbelow = yVb >= vb_pos && (yVb <= vb_pos);
        if (!(isNearVBabove || isNearVBbelow))
        {
          accumA = _mm_srai_epi32(accumA, SHIFT);
          accumB = _mm_srai_epi32(accumB, SHIFT);
        }
        else
        {
          accumA = _mm_srai_epi32(_mm_add_epi32(accumA, mmOffset1), SHIFT + 3);
          accumB = _mm_srai_epi32(_mm_add_epi32(accumB, mmOffset1), SHIFT + 3);
        }
        accumA = _mm_packs_epi32(accumA, accumB);
        accumA = _mm_add_epi16(accumA, cur);
        accumA = _mm_min_epi16(mmMax, _mm_max_epi16(accumA, mmMin));

        if (j + STEP_X <= width)
        {
          //_mm_storeu_si128((__m128i*) (dst + ii * dstStride + j), accumA);
          _mm_storel_epi64((__m128i*) (dst + ii * dstStride + j), _mm_shuffle_epi8(accumA, mask));
        }
        else
        {
          //_mm_storel_epi64((__m128i*) (dst + ii * dstStride + j), accumA);
          _mm_store_ss((float*) (dst + ii * dstStride + j), _mm_castsi128_ps(_mm_shuffle_epi8(accumA, mask)));
        }
      }
    }

    src += srcStride * STEP_Y;
    dst += dstStride * STEP_Y;
  }
}
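Rows adjacent to the ALF virtual boundary use a reduced filter, and the result is scaled by an extra 1/8 with adjusted rounding: the accumulators are already pre-biased with ROUND for the plain >> SHIFT path, so adding mmOffset1 turns that bias into the correct half-unit of the wider >> (SHIFT + 3) path. A small standalone check of that identity (illustrative, not part of the commit):

#include <assert.h>

/* ROUND + mmOffset1 == 1 << (SHIFT + 2), i.e. 0.5 in units of
 * 1 << (SHIFT + 3), so (acc + mmOffset1) >> (SHIFT + 3) rounds to
 * nearest just as (acc) >> SHIFT did with the ROUND pre-bias. */
static void check_vb_rounding(int shift)
{
  const int round   = 1 << (shift - 1);
  const int offset1 = (1 << ((shift + 3) - 1)) - round;
  assert(round + offset1 == 1 << (shift + 2));
}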
#define sh(x) 0x0202 * (x & 7) + 0x0100 + 0x1010 * (x & 8)

static const uint16_t shuffleTab[4][2][8] = {
  {
    { sh(0), sh(1), sh(2), sh(3), sh(4), sh(5), sh(6), sh(7) },
    { sh(8), sh(9), sh(10), sh(11), sh(12), sh(13), sh(14), sh(15) },
  },
  {
    { sh(9), sh(4), sh(10), sh(8), sh(1), sh(5), sh(11), sh(7) },
    { sh(3), sh(0), sh(2), sh(6), sh(12), sh(13), sh(14), sh(15) },
  },
  {
    { sh(0), sh(3), sh(2), sh(1), sh(8), sh(7), sh(6), sh(5) },
    { sh(4), sh(9), sh(10), sh(11), sh(12), sh(13), sh(14), sh(15) },
  },
  {
    { sh(9), sh(8), sh(10), sh(4), sh(3), sh(7), sh(11), sh(5) },
    { sh(1), sh(0), sh(2), sh(6), sh(12), sh(13), sh(14), sh(15) },
  },
};
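The sh(x) macro packs the two pshufb byte selectors for 16-bit lane x into one 16-bit word: 0x0202*(x&7) + 0x0100 yields byte indices 2x and 2x+1, while 0x1010*(x&8) sets bit 7 of both bytes for lanes 8-15, which makes _mm_shuffle_epi8 zero them; XORing the control with 0x80 (s1/s3 in the 7x7 filter below) flips which source register's lanes survive, and OR-ing the two shuffle results merges coefficients from both registers. A tiny standalone check (illustrative only; parentheses added for safety):

#include <stdint.h>
#include <stdio.h>

#define SH(x) (0x0202 * ((x) & 7) + 0x0100 + 0x1010 * ((x) & 8))

int main(void)
{
  printf("SH(5) = 0x%04x\n", SH(5)); /* 0x0b0a: bytes 10,11 -> lane 5 */
  printf("SH(9) = 0x%04x\n", SH(9)); /* 0x8382: lane 1 of second register,
                                        zeroed until XORed with 0x80 */
  return 0;
}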

INLINE static void process2coeffs_7x7(__m128i params[2][2][6], __m128i *cur, __m128i *accumA, __m128i *accumB, const int i, const kvz_pixel* ptr0, const kvz_pixel* ptr1, const kvz_pixel* ptr2, const kvz_pixel* ptr3) {
  const __m128i val00 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr0), _mm_setzero_si128()), *cur);
  const __m128i val10 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr2), _mm_setzero_si128()), *cur);
  const __m128i val01 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr1), _mm_setzero_si128()), *cur);
  const __m128i val11 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr3), _mm_setzero_si128()), *cur);

  __m128i val01A = _mm_unpacklo_epi16(val00, val10);
  __m128i val01B = _mm_unpackhi_epi16(val00, val10);
  __m128i val01C = _mm_unpacklo_epi16(val01, val11);
  __m128i val01D = _mm_unpackhi_epi16(val01, val11);

  __m128i limit01A = params[0][1][i];
  __m128i limit01B = params[1][1][i];

  val01A = _mm_min_epi16(val01A, limit01A);
  val01B = _mm_min_epi16(val01B, limit01B);
  val01C = _mm_min_epi16(val01C, limit01A);
  val01D = _mm_min_epi16(val01D, limit01B);

  limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A);
  limit01B = _mm_sub_epi16(_mm_setzero_si128(), limit01B);

  val01A = _mm_max_epi16(val01A, limit01A);
  val01B = _mm_max_epi16(val01B, limit01B);
  val01C = _mm_max_epi16(val01C, limit01A);
  val01D = _mm_max_epi16(val01D, limit01B);

  val01A = _mm_add_epi16(val01A, val01C);
  val01B = _mm_add_epi16(val01B, val01D);

  const __m128i coeff01A = params[0][0][i];
  const __m128i coeff01B = params[1][0][i];

  *accumA = _mm_add_epi32(*accumA, _mm_madd_epi16(val01A, coeff01A));
  *accumB = _mm_add_epi32(*accumB, _mm_madd_epi16(val01B, coeff01B));
};


static void alf_filter_7x7_block_sse41(encoder_state_t* const state,
  const kvz_pixel* src_pixels,
  kvz_pixel* dst_pixels,
  const int src_stride,
  const int dst_stride,
  const short* filter_set,
  const int16_t* fClipSet,
  clp_rng clp_rng,
  const int width,
  const int height,
  int x_pos,
  int y_pos,
  int blk_dst_x,
  int blk_dst_y,
  int vb_pos,
  const int vb_ctu_height)
{
  assert((vb_ctu_height & (vb_ctu_height - 1)) == 0 && "vb_ctu_height must be a power of 2");
  alf_component_id compId = COMPONENT_Y;

  const size_t srcStride = src_stride;
  const size_t dstStride = dst_stride;

  const int SHIFT = state->encoder_control->bitdepth - 1;
  const int ROUND = 1 << (SHIFT - 1);

  const size_t STEP_X = 8;
  const size_t STEP_Y = 4;

  assert(y_pos % STEP_Y == 0 && "Wrong startHeight in filtering");
  assert(x_pos % STEP_X == 0 && "Wrong startWidth in filtering");
  assert(height % STEP_Y == 0 && "Wrong endHeight in filtering");
  assert(width % STEP_X == 0 && "Wrong endWidth in filtering");

  const kvz_pixel* src = src_pixels + y_pos * srcStride + x_pos;
  kvz_pixel* dst = dst_pixels + blk_dst_y * dstStride + blk_dst_x;

  const __m128i mmOffset = _mm_set1_epi32(ROUND);
  const __m128i mmOffset1 = _mm_set1_epi32((1 << ((SHIFT + 3) - 1)) - ROUND);
  const __m128i mmMin = _mm_set1_epi16(clp_rng.min);
  const __m128i mmMax = _mm_set1_epi16(clp_rng.max);

  const __m128i mask = _mm_set_epi8(16, 16, 16, 16, 16, 16, 16, 16, 14, 12, 10, 8, 6, 4, 2, 0);

  for (size_t i = 0; i < height; i += STEP_Y)
  {
    const alf_classifier* pClass = state->tile->frame->alf_info->classifier[blk_dst_y + i] + blk_dst_x;

    for (size_t j = 0; j < width; j += STEP_X)
    {
      __m128i params[2][2][6];

      for (int k = 0; k < 2; ++k)
      {
        const alf_classifier* cl = &pClass[j + 4 * k];

        const int transpose_idx = cl->transpose_idx;
        const int class_idx = cl->class_idx;

        static_assert(sizeof(*filter_set) == 2, "ALF coeffs must be 16-bit wide");
        static_assert(sizeof(*fClipSet) == 2, "ALF clip values must be 16-bit wide");

        __m128i rawCoeff0, rawCoeff1;
        __m128i rawClip0, rawClip1;

        rawCoeff0 = _mm_loadu_si128((const __m128i*) (filter_set + class_idx * MAX_NUM_ALF_LUMA_COEFF));
        rawCoeff1 = _mm_loadl_epi64((const __m128i*) (filter_set + class_idx * MAX_NUM_ALF_LUMA_COEFF + 8));

        rawClip0 = _mm_loadu_si128((const __m128i*) (fClipSet + class_idx * MAX_NUM_ALF_LUMA_COEFF));
        rawClip1 = _mm_loadl_epi64((const __m128i*) (fClipSet + class_idx * MAX_NUM_ALF_LUMA_COEFF + 8));

        const __m128i s0 = _mm_loadu_si128((const __m128i*) shuffleTab[transpose_idx][0]);
        const __m128i s1 = _mm_xor_si128(s0, _mm_set1_epi8((char)0x80));
        const __m128i s2 = _mm_loadu_si128((const __m128i*) shuffleTab[transpose_idx][1]);
        const __m128i s3 = _mm_xor_si128(s2, _mm_set1_epi8((char)0x80));

        const __m128i rawCoeffLo = _mm_or_si128(_mm_shuffle_epi8(rawCoeff0, s0), _mm_shuffle_epi8(rawCoeff1, s1));
        const __m128i rawCoeffHi = _mm_or_si128(_mm_shuffle_epi8(rawCoeff0, s2), _mm_shuffle_epi8(rawCoeff1, s3));
        const __m128i rawClipLo = _mm_or_si128(_mm_shuffle_epi8(rawClip0, s0), _mm_shuffle_epi8(rawClip1, s1));
        const __m128i rawClipHi = _mm_or_si128(_mm_shuffle_epi8(rawClip0, s2), _mm_shuffle_epi8(rawClip1, s3));

        params[k][0][0] = _mm_shuffle_epi32(rawCoeffLo, 0x00);
        params[k][0][1] = _mm_shuffle_epi32(rawCoeffLo, 0x55);
        params[k][0][2] = _mm_shuffle_epi32(rawCoeffLo, 0xaa);
        params[k][0][3] = _mm_shuffle_epi32(rawCoeffLo, 0xff);
        params[k][0][4] = _mm_shuffle_epi32(rawCoeffHi, 0x00);
        params[k][0][5] = _mm_shuffle_epi32(rawCoeffHi, 0x55);
        params[k][1][0] = _mm_shuffle_epi32(rawClipLo, 0x00);
        params[k][1][1] = _mm_shuffle_epi32(rawClipLo, 0x55);
        params[k][1][2] = _mm_shuffle_epi32(rawClipLo, 0xaa);
        params[k][1][3] = _mm_shuffle_epi32(rawClipLo, 0xff);
        params[k][1][4] = _mm_shuffle_epi32(rawClipHi, 0x00);
        params[k][1][5] = _mm_shuffle_epi32(rawClipHi, 0x55);
      }

      for (size_t ii = 0; ii < STEP_Y; ii++)
      {
        const kvz_pixel* pImg0, * pImg1, * pImg2, * pImg3, * pImg4, * pImg5, * pImg6;

        pImg0 = src + j + ii * srcStride;
        pImg1 = pImg0 + srcStride;
        pImg2 = pImg0 - srcStride;
        pImg3 = pImg1 + srcStride;
        pImg4 = pImg2 - srcStride;
        pImg5 = pImg3 + srcStride;
        pImg6 = pImg4 - srcStride;

        const int yVb = (blk_dst_y + i + ii) & (vb_ctu_height - 1);
        if (yVb < vb_pos && (yVb >= vb_pos - 4)) // above
        {
          pImg1 = (yVb == vb_pos - 1) ? pImg0 : pImg1;
          pImg3 = (yVb >= vb_pos - 2) ? pImg1 : pImg3;
          pImg5 = (yVb >= vb_pos - 3) ? pImg3 : pImg5;

          pImg2 = (yVb == vb_pos - 1) ? pImg0 : pImg2;
          pImg4 = (yVb >= vb_pos - 2) ? pImg2 : pImg4;
          pImg6 = (yVb >= vb_pos - 3) ? pImg4 : pImg6;
        }
        else if (yVb >= vb_pos && (yVb <= vb_pos + 3)) // bottom
        {
          pImg2 = (yVb == vb_pos) ? pImg0 : pImg2;
          pImg4 = (yVb <= vb_pos + 1) ? pImg2 : pImg4;
          pImg6 = (yVb <= vb_pos + 2) ? pImg4 : pImg6;

          pImg1 = (yVb == vb_pos) ? pImg0 : pImg1;
          pImg3 = (yVb <= vb_pos + 1) ? pImg1 : pImg3;
          pImg5 = (yVb <= vb_pos + 2) ? pImg3 : pImg5;
        }
        __m128i cur = _mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) pImg0), _mm_setzero_si128());

        __m128i accumA = mmOffset;
        __m128i accumB = mmOffset;

        process2coeffs_7x7(params, &cur, &accumA, &accumB, 0, pImg5 + 0, pImg6 + 0, pImg3 + 1, pImg4 - 1);
        process2coeffs_7x7(params, &cur, &accumA, &accumB, 1, pImg3 + 0, pImg4 + 0, pImg3 - 1, pImg4 + 1);
        process2coeffs_7x7(params, &cur, &accumA, &accumB, 2, pImg1 + 2, pImg2 - 2, pImg1 + 1, pImg2 - 1);
        process2coeffs_7x7(params, &cur, &accumA, &accumB, 3, pImg1 + 0, pImg2 + 0, pImg1 - 1, pImg2 + 1);
        process2coeffs_7x7(params, &cur, &accumA, &accumB, 4, pImg1 - 2, pImg2 + 2, pImg0 + 3, pImg0 - 3);
        process2coeffs_7x7(params, &cur, &accumA, &accumB, 5, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1);


        bool isNearVBabove = yVb < vb_pos && (yVb >= vb_pos - 1);
        bool isNearVBbelow = yVb >= vb_pos && (yVb <= vb_pos);
        if (!(isNearVBabove || isNearVBbelow))
        {
          accumA = _mm_srai_epi32(accumA, SHIFT);
          accumB = _mm_srai_epi32(accumB, SHIFT);
        }
        else
        {
          accumA = _mm_srai_epi32(_mm_add_epi32(accumA, mmOffset1), SHIFT + 3);
          accumB = _mm_srai_epi32(_mm_add_epi32(accumB, mmOffset1), SHIFT + 3);
        }
        accumA = _mm_packs_epi32(accumA, accumB);
        accumA = _mm_add_epi16(accumA, cur);
        accumA = _mm_min_epi16(mmMax, _mm_max_epi16(accumA, mmMin));

        //_mm_storeu_si128((__m128i*) (dst + ii * dstStride + j), accumA);
        _mm_storel_epi64((__m128i*) (dst + ii * dstStride + j), _mm_shuffle_epi8(accumA, mask));
      }
    }

    src += srcStride * STEP_Y;
    dst += dstStride * STEP_Y;
  }
}



#endif // KVZ_BIT_DEPTH == 8
#endif //COMPILE_INTEL_SSE41


int kvz_strategy_register_alf_sse41(void* opaque, uint8_t bitdepth) {
  bool success = true;
#if COMPILE_INTEL_SSE41
#if KVZ_BIT_DEPTH == 8
  if (bitdepth == 8){
    success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "sse41", 20, &alf_derive_classification_blk_sse41);
    success &= kvz_strategyselector_register(opaque, "alf_filter_5x5_blk", "sse41", 0, &alf_filter_5x5_block_sse41);
    success &= kvz_strategyselector_register(opaque, "alf_filter_7x7_blk", "sse41", 0, &alf_filter_7x7_block_sse41);
  }
#endif // KVZ_BIT_DEPTH == 8
#endif
  return success;
}

src/strategies/sse41/alf-sse41.h (new file, 32 lines)

@@ -0,0 +1,32 @@
#pragma once
/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (C) 2013-2021 Tampere University of Technology and others (see
 * COPYING file).
 *
 * Kvazaar is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the
 * Free Software Foundation; either version 2.1 of the License, or (at your
 * option) any later version.
 *
 * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/

/**
 * \ingroup Optimization
 * \file
 * Optimizations for SSE4.1.
 */

#include "global.h" // IWYU pragma: keep
#include "kvazaar.h"

int kvz_strategy_register_alf_sse41(void* opaque, uint8_t bitdepth);

src/strategies/strategies-alf.c (new file, 47 lines)

@@ -0,0 +1,47 @@
|
|||
/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (C) 2013-2021 Tampere University of Technology and others (see
 * COPYING file).
 *
 * Kvazaar is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the
 * Free Software Foundation; either version 2.1 of the License, or (at your
 * option) any later version.
 *
 * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/

#include "strategies/strategies-alf.h"
#include "strategies/sse41/alf-sse41.h"
#include "strategies/avx2/alf-avx2.h"
#include "strategies/generic/alf-generic.h"
#include "strategyselector.h"

// Define function pointers.
alf_derive_classification_blk_func* kvz_alf_derive_classification_blk;
alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk;
alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk;
alf_get_blk_stats_func* kvz_alf_get_blk_stats;

int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) {
  bool success = true;

  // The generic implementation is always registered as a fallback;
  // SIMD variants are added only when the CPU supports them.
  success &= kvz_strategy_register_alf_generic(opaque, bitdepth);

  if (kvz_g_hardware_flags.intel_flags.sse41) {
    success &= kvz_strategy_register_alf_sse41(opaque, bitdepth);
  }
  if (kvz_g_hardware_flags.intel_flags.avx2) {
    success &= kvz_strategy_register_alf_avx2(opaque, bitdepth);
  }

  return success;
}
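
The registration order above encodes the fallback design: the generic ALF kernels are always available, and the SSE4.1/AVX2 variants only when the runtime CPU flags allow them. `kvz_g_hardware_flags` is populated elsewhere in Kvazaar from CPUID; as a rough, compiler-specific illustration (GCC/Clang only, not the encoder's actual detection code), the same capability checks look like this:

```c
#include <stdbool.h>
#include <stdio.h>

// GCC/Clang-only sketch of the runtime checks behind flags like
// kvz_g_hardware_flags.intel_flags.avx2 (Kvazaar's own CPUID-based
// detection code lives elsewhere and is not reproduced here).
int main(void) {
  bool sse41 = __builtin_cpu_supports("sse4.1");
  bool avx2 = __builtin_cpu_supports("avx2");
  printf("sse4.1: %d, avx2: %d\n", sse41, avx2);
  return 0;
}
```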
114 src/strategies/strategies-alf.h Normal file
@@ -0,0 +1,114 @@
#pragma once
/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (C) 2013-2021 Tampere University of Technology and others (see
 * COPYING file).
 *
 * Kvazaar is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the
 * Free Software Foundation; either version 2.1 of the License, or (at your
 * option) any later version.
 *
 * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/

/**
 * \ingroup Optimization
 * \file
 * Interface for ALF functions.
 */

#include "encoder.h"
#include "encoderstate.h"
#include "global.h" // IWYU pragma: keep
#include "kvazaar.h"
#include "alf.h"


// Declare function pointer types.
typedef void (alf_derive_classification_blk_func)(encoder_state_t * const state,
  const int shift,
  const int n_height,
  const int n_width,
  const int blk_pos_x,
  const int blk_pos_y,
  const int blk_dst_x,
  const int blk_dst_y,
  const int vb_ctu_height,
  int vb_pos);

typedef void (alf_filter_5x5_blk_func)(encoder_state_t* const state,
  const kvz_pixel* src_pixels,
  kvz_pixel* dst_pixels,
  const int src_stride,
  const int dst_stride,
  const short* filter_set,
  const int16_t* fClipSet,
  clp_rng clp_rng,
  const int width,
  const int height,
  int x_pos,
  int y_pos,
  int blk_dst_x,
  int blk_dst_y,
  int vb_pos,
  const int vb_ctu_height);

typedef void (alf_filter_7x7_blk_func)(encoder_state_t* const state,
  const kvz_pixel* src_pixels,
  kvz_pixel* dst_pixels,
  const int src_stride,
  const int dst_stride,
  const short* filter_set,
  const int16_t* fClipSet,
  clp_rng clp_rng,
  const int width,
  const int height,
  int x_pos,
  int y_pos,
  int blk_dst_x,
  int blk_dst_y,
  int vb_pos,
  const int vb_ctu_height);

typedef void (alf_get_blk_stats_func)(encoder_state_t* const state,
  channel_type channel,
  alf_covariance* alf_covariance,
  alf_classifier** g_classifier,
  kvz_pixel* org,
  int32_t org_stride,
  kvz_pixel* rec,
  int32_t rec_stride,
  const int x_pos,
  const int y_pos,
  const int x_dst,
  const int y_dst,
  const int width,
  const int height,
  int vb_ctu_height,
  int vb_pos,
  short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]);

// Declare function pointers.
extern alf_derive_classification_blk_func* kvz_alf_derive_classification_blk;
extern alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk;
extern alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk;
extern alf_get_blk_stats_func* kvz_alf_get_blk_stats;

int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth);


#define STRATEGIES_ALF_EXPORTS \
  {"alf_derive_classification_blk", (void**) &kvz_alf_derive_classification_blk}, \
  {"alf_filter_5x5_blk", (void**) &kvz_alf_filter_5x5_blk}, \
  {"alf_filter_7x7_blk", (void**) &kvz_alf_filter_7x7_blk}, \
  {"alf_get_blk_stats", (void**) &kvz_alf_get_blk_stats},
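
Each `STRATEGIES_ALF_EXPORTS` row pairs a strategy-type name with the address of the matching global function pointer, so the selector can patch every pointer in one loop (the real loop appears in the `kvz_strategyselector_init` hunk below). A minimal, self-contained sketch of that table shape, using hypothetical `example_*` names:

```c
#include <stddef.h>

// Hypothetical mirror of a strategies_to_select[] row: a strategy-type name
// paired with the address of the function pointer to patch.
typedef struct {
  const char *strategy_type;
  void **fptr;
} example_export_t;

// A global function pointer standing in for e.g. kvz_alf_filter_7x7_blk.
static void (*example_alf_filter_7x7)(void) = NULL;

static const example_export_t example_exports[] = {
  { "alf_filter_7x7_blk", (void**) &example_alf_filter_7x7 },
  { NULL, NULL },
};

// Walk the table and install whatever implementation the chooser returns
// for each strategy type.
static void example_patch_all(void *(*choose_for)(const char *type)) {
  for (const example_export_t *e = example_exports; e->strategy_type; ++e) {
    *(e->fptr) = choose_for(e->strategy_type);
  }
}
```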
@@ -90,6 +90,11 @@ int kvz_strategyselector_init(int32_t cpuid, uint8_t bitdepth) {
    fprintf(stderr, "kvz_strategy_register_encode failed!\n");
    return 0;
  }

  if (!kvz_strategy_register_alf(&strategies, bitdepth)) {
    fprintf(stderr, "kvz_strategy_register_alf failed!\n");
    return 0;
  }

  while (cur_strategy_to_select->fptr) {
    *(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type);
@@ -96,6 +96,7 @@ int kvz_strategyselector_register(void *opaque, const char *type, const char *st
#include "strategies/strategies-intra.h"
#include "strategies/strategies-sao.h"
#include "strategies/strategies-encode.h"
#include "strategies/strategies-alf.h"

static const strategy_to_select_t strategies_to_select[] = {
  STRATEGIES_NAL_EXPORTS
@ -106,6 +107,7 @@ static const strategy_to_select_t strategies_to_select[] = {
|
|||
STRATEGIES_INTRA_EXPORTS
|
||||
STRATEGIES_SAO_EXPORTS
|
||||
STRATEGIES_ENCODE_EXPORTS
|
||||
STRATEGIES_ALF_EXPORTS
|
||||
{ NULL, NULL },
|
||||
};
|
||||
|
||||
|
|