Utilize registers more efficiently for 8x8 and larger blocks

parent 192cee95b2
commit efbdc5dade
@@ -34,6 +34,7 @@
 #include "strategies/generic/picture-generic.h"
 #include "strategies/strategies-ipol.h"
 #include "strategyselector.h"
+#include "strategies/strategies-common.h"


 #define FILTER_OFFSET 3
@@ -267,6 +268,22 @@ void kvz_four_tap_filter_x4_hor_avx2(int8_t *filter, kvz_pixel *data, int shift,
   _mm_storel_epi64((__m128i*)dst, temp);
 }

+void kvz_four_tap_filter_x8_hor_avx2(int8_t *filter, kvz_pixel *data, int shift, int16_t* dst)
+{
+  __m256i packed_data = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)data)), _mm_loadl_epi64((__m128i*)(data + 4)), 1);
+  __m256i packed_filter = _mm256_set1_epi32(*(int32_t*)filter);
+  __m256i idx_lookup = _mm256_broadcastsi128_si256(_mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6));
+
+  __m256i temp = _mm256_shuffle_epi8(packed_data, idx_lookup);
+
+  temp = _mm256_maddubs_epi16(temp, packed_filter);
+  temp = _mm256_hadd_epi16(temp, temp);
+  temp = _mm256_srai_epi16(temp, shift);
+
+  _mm_storel_epi64((__m128i*)dst, _mm256_castsi256_si128(temp));
+  _mm_storel_epi64((__m128i*)(dst + 4), _mm256_extracti128_si256(temp, 1));
+}
+
 int32_t kvz_four_tap_filter_x4_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride, int offset, int shift2, int shift3)
 {

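The new 8-wide horizontal kernel above packs pixels 0..7 into the two 128-bit lanes, gathers the four taps for each output with a byte shuffle, and reduces them with maddubs/hadd. As a reading aid, here is a minimal scalar sketch of the arithmetic it implements, assuming kvz_pixel is 8-bit and that the 4-tap sums fit in 16 bits (they do for the chroma filters used here); the helper name is hypothetical and not part of the codebase.

static void four_tap_filter_x8_hor_ref(const int8_t *filter, const kvz_pixel *data,
                                       int shift, int16_t *dst)
{
  // For each of the 8 output positions: a 4-tap FIR over consecutive pixels,
  // then an arithmetic right shift. No rounding or clipping at this stage;
  // intermediate sums are assumed to fit in 16 bits.
  for (int i = 0; i < 8; ++i) {
    int sum = 0;
    for (int k = 0; k < 4; ++k) {
      sum += filter[k] * data[i + k];
    }
    dst[i] = (int16_t)(sum >> shift);
  }
}

Producing eight outputs per call keeps both 128-bit lanes of the YMM registers busy, which is the register-utilization gain the commit title refers to.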
@@ -291,6 +308,31 @@ int32_t kvz_four_tap_filter_x4_ver_16bit_avx2(int8_t *filter, int16_t *data, int
   return _mm_cvtsi128_si32(temp);
 }

+void kvz_four_tap_filter_x8_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride, int offset, int shift2, int shift3, kvz_pixel* dst)
+{
+
+  __m256i v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[0])));
+  __m256i v_data0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 0)));
+  __m256i v_data1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 1)));
+  __m256i v_data = _mm256_or_si256(v_data0, _mm256_slli_epi32(v_data1, 16));
+  __m256i temp = _mm256_madd_epi16(v_filter, v_data);
+
+  v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[2])));
+  __m256i v_data2 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 2)));
+  __m256i v_data3 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 3)));
+  v_data = _mm256_or_si256(v_data2, _mm256_slli_epi32(v_data3, 16));
+  temp = _mm256_add_epi32(temp, _mm256_madd_epi16(v_filter, v_data) );
+
+  temp = _mm256_add_epi32(temp, _mm256_set1_epi32(offset));
+  temp = _mm256_srai_epi32(temp, shift2 + shift3);
+
+  temp = _mm256_packus_epi32(temp, temp);
+  temp = _mm256_packus_epi16(temp, temp);
+
+  *(int32_t*)dst = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp));
+  *(int32_t*)(dst + 4) = _mm_cvtsi128_si32(_mm256_extracti128_si256(temp, 1));
+}
+
 void kvz_filter_inter_quarterpel_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag)
 {

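The new vertical kernel interleaves two rows into each 32-bit element (row1 << 16 | row0) so that a single _mm256_madd_epi16 yields f0*row0 + f1*row1 per column, repeats this for rows 2 and 3, adds the rounding offset, shifts, and saturates to 8 bits through the two pack instructions. Below is a minimal scalar sketch of the same computation, assuming 8-bit kvz_pixel; the explicit clamp to [0, 255] stands in for the packus saturation and the helper name is hypothetical.

static void four_tap_filter_x8_ver_16bit_ref(const int8_t *filter, const int16_t *data,
                                             int16_t stride, int offset,
                                             int shift2, int shift3, kvz_pixel *dst)
{
  // 8 columns, 4 taps down the rows of the 16-bit intermediate buffer,
  // rounding offset, combined shift, and clamp to the 8-bit output range.
  for (int i = 0; i < 8; ++i) {
    int sum = 0;
    for (int k = 0; k < 4; ++k) {
      sum += filter[k] * data[k * stride + i];
    }
    sum = (sum + offset) >> (shift2 + shift3);
    dst[i] = (kvz_pixel)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
  }
}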
@@ -538,13 +580,14 @@ void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encoder, kvz_
 #define FILTER_OFFSET_C (FILTER_OFFSET / 2)
 int16_t hor_filtered[(LCU_WIDTH_C + 1) + FILTER_SIZE_C][(LCU_WIDTH_C + 1) + FILTER_SIZE_C];

+  if (width == 4) {
   // Filter horizontally and flip x and y
   for (y = 0; y < height + FILTER_SIZE_C - 1; ++y) {
     for (x = 0; x < width; x += 4) {
       int ypos = y - FILTER_OFFSET_C;
       int xpos = x - FILTER_OFFSET_C;
-      int16_t *dst = &(hor_filtered[y][x]);
-      kvz_four_tap_filter_x4_hor_avx2(hor_filter, &src[src_stride*ypos + xpos], shift1, dst);
+      int16_t *out = &(hor_filtered[y][x]);
+      kvz_four_tap_filter_x4_hor_avx2(hor_filter, &src[src_stride*ypos + xpos], shift1, out);
     }
   }

@@ -556,6 +599,28 @@ void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encoder, kvz_
       *(int32_t*)&(dst[y*dst_stride + x]) = kvz_four_tap_filter_x4_ver_16bit_avx2(ver_filter, &hor_filtered[ypos][xpos], sizeof(hor_filtered[0])/sizeof(int16_t), offset23, shift2, shift3);
     }
   }
+
+  } else {
+    // Filter horizontally and flip x and y
+    for (y = 0; y < height + FILTER_SIZE_C - 1; ++y) {
+      for (x = 0; x < width; x += 8) {
+        int ypos = y - FILTER_OFFSET_C;
+        int xpos = x - FILTER_OFFSET_C;
+        int16_t *dst = &(hor_filtered[y][x]);
+        kvz_four_tap_filter_x8_hor_avx2(hor_filter, &src[src_stride*ypos + xpos], shift1, dst);
+      }
+    }
+
+    // Filter vertically and flip x and y
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; x+=8) {
+        int ypos = y;
+        int xpos = x;
+        kvz_pixel *out = &(dst[y*dst_stride + x]);
+        kvz_four_tap_filter_x8_ver_16bit_avx2(ver_filter, &hor_filtered[ypos][xpos], sizeof(hor_filtered[0])/sizeof(int16_t), offset23, shift2, shift3, out);
+      }
+    }
+  }
 }

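For the width >= 8 path added in the last two hunks, the two new kernels compose into a horizontal pass over height + FILTER_SIZE_C - 1 rows followed by a vertical pass over the output rows. The sketch below shows that composition using the hypothetical scalar helpers from the earlier notes. FILTER_OFFSET_C == 1 follows from FILTER_OFFSET 3 in this file, but FILTER_SIZE_C == 4, the buffer dimensions, and the function name are illustrative assumptions, not the encoder's actual constants; src is assumed to have the margins the real call site provides, since ypos/xpos can be negative.

void sample_chroma_8wide_ref(const int8_t *hor_filter, const int8_t *ver_filter,
                             const kvz_pixel *src, int src_stride,
                             int width, int height,
                             kvz_pixel *dst, int dst_stride,
                             int shift1, int offset23, int shift2, int shift3)
{
  enum { FILTER_SIZE_C_REF = 4, FILTER_OFFSET_C_REF = 1, BUF_DIM = 64 + FILTER_SIZE_C_REF };
  int16_t hor_filtered[BUF_DIM][BUF_DIM];

  // Horizontal pass: also filter the extra rows the vertical taps will need.
  for (int y = 0; y < height + FILTER_SIZE_C_REF - 1; ++y) {
    for (int x = 0; x < width; x += 8) {
      int ypos = y - FILTER_OFFSET_C_REF;
      int xpos = x - FILTER_OFFSET_C_REF;
      four_tap_filter_x8_hor_ref(hor_filter, &src[src_stride * ypos + xpos],
                                 shift1, &hor_filtered[y][x]);
    }
  }

  // Vertical pass over the 16-bit intermediate, writing final pixels.
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; x += 8) {
      four_tap_filter_x8_ver_16bit_ref(ver_filter, &hor_filtered[y][x],
                                       (int16_t)BUF_DIM, offset23, shift2, shift3,
                                       &dst[y * dst_stride + x]);
    }
  }
}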