Merge branch 'master' of https://github.com/ultravideo/kvazaar

commit 949ec57849
@@ -333,16 +333,18 @@ static void encoder_state_worker_encode_lcu(void * opaque) {
 }
 
 static void encoder_state_encode_leaf(encoder_state_t * const state) {
-  const encoder_control_t * const encoder = state->encoder_control;
-  int i = 0;
-
   assert(state->is_leaf);
   assert(state->lcu_order_count > 0);
 
-  //If we're not using wavefronts, or we have a WAVEFRONT_ROW which is the single child of its parent, than we should not use parallelism
-  if (state->type != ENCODER_STATE_TYPE_WAVEFRONT_ROW || (state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW && !state->parent->children[1].encoder_control)) {
-    for (i = 0; i < state->lcu_order_count; ++i) {
+  // Select whether to encode the frame/tile in current thread or to define
+  // wavefront jobs for other threads to handle.
+  bool wavefront = state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW;
+  bool use_parallel_encoding = (wavefront && state->parent->children[1].encoder_control);
+  if (!use_parallel_encoding) {
+    // Encode every LCU in order and perform SAO reconstruction after every
+    // frame is encoded. Deblocking and SAO search is done during LCU encoding.
+
+    for (int i = 0; i < state->lcu_order_count; ++i) {
       PERFORMANCE_MEASURE_START(_DEBUG_PERF_ENCODE_LCU);
 
       encoder_state_worker_encode_lcu(&state->lcu_order[i]);
@@ -355,7 +357,7 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) {
 #endif //_DEBUG
     }
 
-    if (encoder->sao_enable) {
+    if (state->encoder_control->sao_enable) {
       PERFORMANCE_MEASURE_START(_DEBUG_PERF_SAO_RECONSTRUCT_FRAME);
       sao_reconstruct_frame(state);
       PERFORMANCE_MEASURE_END(_DEBUG_PERF_SAO_RECONSTRUCT_FRAME, state->encoder_control->threadqueue, "type=sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", state->global->frame, state->tile->id, state->slice->id, state->lcu_order[0].position.y + state->tile->lcu_offset_y, state->lcu_order[state->lcu_order_count-1].position.y + state->tile->lcu_offset_y,
@@ -364,7 +366,10 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) {
       );
     }
   } else {
-    for (i = 0; i < state->lcu_order_count; ++i) {
+    // Add every LCU in the frame as a job to a queue, along with
+    // their dependancies, so they can be processed in parallel.
+
+    for (int i = 0; i < state->lcu_order_count; ++i) {
       const lcu_order_element_t * const lcu = &state->lcu_order[i];
 #ifdef _DEBUG
       char job_description[256];
@@ -373,40 +378,41 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) {
       char* job_description = NULL;
 #endif
       state->tile->wf_jobs[lcu->id] = threadqueue_submit(state->encoder_control->threadqueue, encoder_state_worker_encode_lcu, (void*)lcu, 1, job_description);
-      if (state->previous_encoder_state != state && state->previous_encoder_state->tqj_recon_done && !state->global->is_radl_frame) {
+      assert(state->tile->wf_jobs[lcu->id] != NULL);
 
-        //Only for the first in the row (we reconstruct row-wise)
+      // Add dependancy for inter frames to the reconstruction of the row
+      // below current row in the previous frame. This ensures that we can
+      // search for motion vectors in the previous frame as long as we don't
+      // go more than one LCU below current row.
+      if (state->previous_encoder_state != state && state->previous_encoder_state->tqj_recon_done && !state->global->is_radl_frame) {
+        // Only add the dependancy to the first LCU in the row.
         if (!lcu->left) {
-          //If we have a row below, then we wait till it's completed
           if (lcu->below) {
             threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], lcu->below->encoder_state->previous_encoder_state->tqj_recon_done);
-          }
-          //Also add always a dep on current line
-          threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], lcu->encoder_state->previous_encoder_state->tqj_recon_done);
-          if (lcu->above) {
-            threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], lcu->above->encoder_state->previous_encoder_state->tqj_recon_done);
+          } else {
+            threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], lcu->encoder_state->previous_encoder_state->tqj_recon_done);
           }
         }
       }
-      if (state->tile->wf_jobs[lcu->id]) {
-        if (lcu->position.x > 0) {
-          // Wait for the LCU on the left.
-          threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - 1]);
-        }
-        if (lcu->position.y > 0) {
-          if (lcu->position.x < state->tile->frame->width_in_lcu - 1) {
-            // Wait for the LCU to the top-right of this one.
-            threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu + 1]);
-          } else {
-            // If there is no top-right LCU, wait for the one above.
-            threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu]);
-          }
-        }
-
-        threadqueue_job_unwait_job(state->encoder_control->threadqueue, state->tile->wf_jobs[lcu->id]);
-      }
+      // Add local WPP dependancy to the LCU on the left.
+      if (lcu->left) {
+        threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - 1]);
+      }
+      // Add local WPP dependancy to the LCU on the top right.
+      if (lcu->above) {
+        if (lcu->above->right) {
+          threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu + 1]);
+        } else {
+          threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu]);
+        }
+      }
+
+      threadqueue_job_unwait_job(state->encoder_control->threadqueue, state->tile->wf_jobs[lcu->id]);
 
       if (lcu->position.x == state->tile->frame->width_in_lcu - 1) {
-        if (!encoder->sao_enable) {
-          //No SAO + last LCU: the row is reconstructed
+        if (!state->encoder_control->sao_enable) {
+          // No SAO + last LCU: the row is reconstructed
           assert(!state->tqj_recon_done);
           state->tqj_recon_done = state->tile->wf_jobs[lcu->id];
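The wavefront dependency pattern above boils down to a simple rule: an LCU job may start only after the job for the LCU to its left and the job for the LCU to its top right (or directly above, on the last column) have finished, plus the inter-frame dependency on the previous frame's reconstruction. Below is a minimal standalone sketch of the local, within-frame part of that rule; wpp_local_deps and the row-major indexing are hypothetical illustrations, not kvazaar's lcu_order_element_t / threadqueue API.

#include <stdio.h>

/* Hypothetical helper: given an LCU's row-major index in a grid that is
 * width_in_lcu LCUs wide, list the indices of the jobs it must wait for
 * under the WPP rule used above (left neighbour, and top-right neighbour,
 * falling back to the LCU directly above on the last column).
 * Returns the number of dependencies written to deps. */
static int wpp_local_deps(int id, int width_in_lcu, int deps[2])
{
  int x = id % width_in_lcu;
  int y = id / width_in_lcu;
  int n = 0;
  if (x > 0) deps[n++] = id - 1;                     /* LCU on the left */
  if (y > 0) deps[n++] = (x < width_in_lcu - 1)
                           ? id - width_in_lcu + 1   /* LCU on the top right */
                           : id - width_in_lcu;      /* last column: LCU above */
  return n;
}

int main(void)
{
  int deps[2];
  int n = wpp_local_deps(7, 4, deps);  /* LCU at x=3, y=1 in a 4-LCU-wide frame */
  for (int i = 0; i < n; ++i) printf("wait for job %d\n", deps[i]);
  return 0;
}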
@@ -22,25 +22,156 @@
  * \file
  */
 
-#include <stdlib.h>
-
 #include "ipol-avx2.h"
 #include "strategyselector.h"
 
+#if COMPILE_INTEL_AVX2
+#include <stdlib.h>
+
+#include <immintrin.h>
+
 #include "encoder.h"
 #include "strategies/generic/picture-generic.h"
 
+#define FILTER_OFFSET 3
+#define FILTER_SIZE 8
+
+#define MAX_HEIGHT (4 * (LCU_WIDTH + 1) + FILTER_SIZE)
+#define MAX_WIDTH ((LCU_WIDTH + 1) + FILTER_SIZE)
+
 extern int8_t g_luma_filter[4][8];
 extern int8_t g_chroma_filter[8][4];
 
+void eight_tap_filter_x8_and_flip(__m128i data01, __m128i data23, __m128i data45, __m128i data67, __m128i* filter, __m128i* dst)
+{
+  __m128i a, b, c, d;
+  __m128i fir = _mm_broadcastq_epi64(_mm_loadl_epi64(filter));
+
+  a = _mm_maddubs_epi16(data01, fir);
+  b = _mm_maddubs_epi16(data23, fir);
+  a = _mm_hadd_epi16(a, b);
+
+  c = _mm_maddubs_epi16(data45, fir);
+  d = _mm_maddubs_epi16(data67, fir);
+  c = _mm_hadd_epi16(c, d);
+
+  a = _mm_hadd_epi16(a, c);
+
+  _mm_storeu_si128(dst, a);
+}
+
+__m128i eight_tap_filter_x4_and_flip_16bit(__m128i data0, __m128i data1, __m128i data2, __m128i data3, __m128i* filter)
+{
+  __m128i a, b, c, d;
+  __m128i fir = _mm_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(filter)));
+
+  a = _mm_madd_epi16(data0, fir);
+  b = _mm_madd_epi16(data1, fir);
+  a = _mm_hadd_epi32(a, b);
+
+  c = _mm_madd_epi16(data2, fir);
+  d = _mm_madd_epi16(data3, fir);
+  c = _mm_hadd_epi32(c, d);
+
+  a = _mm_hadd_epi32(a, c);
+
+  return a;
+}
+
+void eight_tap_filter_and_flip_avx2(int8_t filter[4][8], pixel_t *src, int16_t src_stride, int16_t* __restrict dst)
+{
+  //Load 2 rows per xmm register
+  __m128i rows01 = _mm_loadl_epi64((__m128i*)(src + 0 * src_stride));
+  rows01 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows01), (double*)(src + 1 * src_stride)));
+
+  __m128i rows23 = _mm_loadl_epi64((__m128i*)(src + 2 * src_stride));
+  rows23 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows23), (double*)(src + 3 * src_stride)));
+
+  __m128i rows45 = _mm_loadl_epi64((__m128i*)(src + 4 * src_stride));
+  rows45 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows45), (double*)(src + 5 * src_stride)));
+
+  __m128i rows67 = _mm_loadl_epi64((__m128i*)(src + 6 * src_stride));
+  rows67 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows67), (double*)(src + 7 * src_stride)));
+
+  //Filter rows
+  const int dst_stride = MAX_WIDTH;
+  eight_tap_filter_x8_and_flip(rows01, rows23, rows45, rows67, (__m128i*)(&filter[0]), (__m128i*)(dst + 0));
+  eight_tap_filter_x8_and_flip(rows01, rows23, rows45, rows67, (__m128i*)(&filter[1]), (__m128i*)(dst + 1 * dst_stride));
+  eight_tap_filter_x8_and_flip(rows01, rows23, rows45, rows67, (__m128i*)(&filter[2]), (__m128i*)(dst + 2 * dst_stride));
+  eight_tap_filter_x8_and_flip(rows01, rows23, rows45, rows67, (__m128i*)(&filter[3]), (__m128i*)(dst + 3 * dst_stride));
+}
+
+static INLINE void eight_tap_filter_and_flip_16bit_avx2(int8_t filter[4][8], int16_t *src, int16_t src_stride, int offset, int combined_shift, pixel_t* __restrict dst, int16_t dst_stride)
+{
+  //Load a row per xmm register
+  __m128i row0 = _mm_loadu_si128((__m128i*)(src + 0 * src_stride));
+  __m128i row1 = _mm_loadu_si128((__m128i*)(src + 1 * src_stride));
+  __m128i row2 = _mm_loadu_si128((__m128i*)(src + 2 * src_stride));
+  __m128i row3 = _mm_loadu_si128((__m128i*)(src + 3 * src_stride));
+
+  //Filter rows
+  union {
+    __m128i vector;
+    int32_t array[4];
+  } temp[4];
+
+  temp[0].vector = eight_tap_filter_x4_and_flip_16bit(row0, row1, row2, row3, (__m128i*)(&filter[0]));
+  temp[1].vector = eight_tap_filter_x4_and_flip_16bit(row0, row1, row2, row3, (__m128i*)(&filter[1]));
+  temp[2].vector = eight_tap_filter_x4_and_flip_16bit(row0, row1, row2, row3, (__m128i*)(&filter[2]));
+  temp[3].vector = eight_tap_filter_x4_and_flip_16bit(row0, row1, row2, row3, (__m128i*)(&filter[3]));
+
+  __m128i packed_offset = _mm_set1_epi32(offset);
+
+  temp[0].vector = _mm_add_epi32(temp[0].vector, packed_offset);
+  temp[0].vector = _mm_srai_epi32(temp[0].vector, combined_shift);
+  temp[1].vector = _mm_add_epi32(temp[1].vector, packed_offset);
+  temp[1].vector = _mm_srai_epi32(temp[1].vector, combined_shift);
+
+  temp[0].vector = _mm_packus_epi32(temp[0].vector, temp[1].vector);
+
+  temp[2].vector = _mm_add_epi32(temp[2].vector, packed_offset);
+  temp[2].vector = _mm_srai_epi32(temp[2].vector, combined_shift);
+  temp[3].vector = _mm_add_epi32(temp[3].vector, packed_offset);
+  temp[3].vector = _mm_srai_epi32(temp[3].vector, combined_shift);
+
+  temp[2].vector = _mm_packus_epi32(temp[2].vector, temp[3].vector);
+
+  temp[0].vector = _mm_packus_epi16(temp[0].vector, temp[2].vector);
+
+  int32_t* four_pixels = (int32_t*)&(dst[0 * dst_stride]);
+  *four_pixels = temp[0].array[0];
+
+  four_pixels = (int32_t*)&(dst[1 * dst_stride]);
+  *four_pixels = _mm_extract_epi32(temp[0].vector, 1);
+
+  four_pixels = (int32_t*)&(dst[2 * dst_stride]);
+  *four_pixels = _mm_extract_epi32(temp[0].vector, 2);
+
+  four_pixels = (int32_t*)&(dst[3 * dst_stride]);
+  *four_pixels = _mm_extract_epi32(temp[0].vector, 3);
+}
+
 int16_t eight_tap_filter_hor_avx2(int8_t *filter, pixel_t *data)
 {
-  int16_t temp = 0;
-  for (int i = 0; i < 8; ++i)
-  {
-    temp += filter[i] * data[i];
-  }
+  union {
+    __m128i vector;
+    int16_t array[8];
+  } sample;
 
-  return temp;
+  __m128i packed_data = _mm_loadu_si128((__m128i*)data);
+  __m128i packed_filter = _mm_loadu_si128((__m128i*)filter);
+
+  sample.vector = _mm_maddubs_epi16(packed_data, packed_filter);
+  sample.vector = _mm_hadd_epi16(sample.vector, sample.vector);
+  sample.vector = _mm_hadd_epi16(sample.vector, sample.vector);
+
+  return sample.array[0];
 }
 
 int32_t eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data)
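The new eight_tap_filter_hor_avx2 above computes the same 8-tap dot product as the scalar loop it replaces: _mm_maddubs_epi16 multiplies unsigned 8-bit samples by signed 8-bit coefficients and adds adjacent products into 16-bit lanes, and the two _mm_hadd_epi16 calls fold the four pair sums into one value. A scalar model of that reduction, for reference only and ignoring the saturation the intrinsics can apply:

#include <stdint.h>

/* Illustration, not kvazaar code: scalar model of maddubs followed by two
 * horizontal-add steps. */
static int16_t eight_tap_dot_model(const uint8_t data[8], const int8_t filter[8])
{
  int16_t pairs[4];
  for (int i = 0; i < 4; ++i) {
    /* _mm_maddubs_epi16: unsigned sample times signed coefficient, with
     * adjacent pairs summed into one 16-bit lane. */
    pairs[i] = (int16_t)(data[2 * i] * filter[2 * i] +
                         data[2 * i + 1] * filter[2 * i + 1]);
  }
  /* Two _mm_hadd_epi16 steps collapse the four pair sums into one result. */
  int16_t fold01 = pairs[0] + pairs[1];
  int16_t fold23 = pairs[2] + pairs[3];
  return fold01 + fold23;
}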
@@ -124,79 +255,49 @@ void filter_inter_quarterpel_luma_avx2(const encoder_control_t * const encoder,
 {
   int32_t x, y;
-  int32_t shift1 = BIT_DEPTH - 8;
+  int16_t shift1 = BIT_DEPTH - 8;
   int32_t shift2 = 6;
   int32_t shift3 = 14 - BIT_DEPTH;
-  int32_t offset3 = 1 << (shift3 - 1);
   int32_t offset23 = 1 << (shift2 + shift3 - 1);
 
   //coefficients for 1/4, 2/4 and 3/4 positions
-  int8_t *c1, *c2, *c3;
-
-  int i;
+  int8_t *c0, *c1, *c2, *c3;
+
+  c0 = g_luma_filter[0];
   c1 = g_luma_filter[1];
   c2 = g_luma_filter[2];
   c3 = g_luma_filter[3];
 
-  int16_t temp[3][8];
-
-  // Loop source pixels and generate sixteen filtered quarter-pel pixels on each round
-  for (y = 0; y < height; y++) {
-    int dst_pos_y = (y << 2)*dst_stride;
-    int src_pos_y = y*src_stride;
-    for (x = 0; x < width; x++) {
-      // Calculate current dst and src pixel positions
-      int dst_pos = dst_pos_y + (x << 2);
-      int src_pos = src_pos_y + x;
-
-      // Original pixel
-      dst[dst_pos] = src[src_pos];
-
-      //
-      if (hor_flag && !ver_flag) {
-        temp[0][3] = eight_tap_filter_hor_avx2(c1, &src[src_pos - 3]) >> shift1;
-        temp[1][3] = eight_tap_filter_hor_avx2(c2, &src[src_pos - 3]) >> shift1;
-        temp[2][3] = eight_tap_filter_hor_avx2(c3, &src[src_pos - 3]) >> shift1;
-      }
-      // ea0,0 - needed only when ver_flag
-      if (ver_flag) {
-        dst[dst_pos + 1 * dst_stride] = fast_clip_16bit_to_pixel(((eight_tap_filter_ver_avx2(c1, &src[src_pos - 3 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3);
-        dst[dst_pos + 2 * dst_stride] = fast_clip_16bit_to_pixel(((eight_tap_filter_ver_avx2(c2, &src[src_pos - 3 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3);
-        dst[dst_pos + 3 * dst_stride] = fast_clip_16bit_to_pixel(((eight_tap_filter_ver_avx2(c3, &src[src_pos - 3 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3);
-      }
-
-      // When both flags, we use _only_ this pixel (but still need ae0,0 for it)
-      if (hor_flag && ver_flag) {
-        // Calculate temporary values..
-        src_pos -= 3 * src_stride; //0,-3
-        for (i = 0; i < 8; ++i) {
-          temp[0][i] = eight_tap_filter_hor_avx2(c1, &src[src_pos + i * src_stride - 3]) >> shift1; // h0(0,-3+i)
-          temp[1][i] = eight_tap_filter_hor_avx2(c2, &src[src_pos + i * src_stride - 3]) >> shift1; // h1(0,-3+i)
-          temp[2][i] = eight_tap_filter_hor_avx2(c3, &src[src_pos + i * src_stride - 3]) >> shift1; // h2(0,-3+i)
-        }
-
-        for (i = 0; i<3; ++i){
-          dst[dst_pos + 1 * dst_stride + i + 1] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_avx2(c1, &temp[i][0]) + offset23) >> shift2) >> shift3);
-          dst[dst_pos + 2 * dst_stride + i + 1] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_avx2(c2, &temp[i][0]) + offset23) >> shift2) >> shift3);
-          dst[dst_pos + 3 * dst_stride + i + 1] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_avx2(c3, &temp[i][0]) + offset23) >> shift2) >> shift3);
-        }
-      }
-
-      if (hor_flag) {
-        dst[dst_pos + 1] = fast_clip_32bit_to_pixel((temp[0][3] + offset3) >> shift3);
-        dst[dst_pos + 2] = fast_clip_32bit_to_pixel((temp[1][3] + offset3) >> shift3);
-        dst[dst_pos + 3] = fast_clip_32bit_to_pixel((temp[2][3] + offset3) >> shift3);
-      }
-    }
-  }
+  int16_t flipped_hor_filtered[MAX_HEIGHT][MAX_WIDTH];
+
+  // Filter horizontally and flip x and y
+  for (x = 0; x < width; ++x) {
+    for (y = 0; y < height; y += 8) {
+      int ypos = y - FILTER_OFFSET;
+      int xpos = x - FILTER_OFFSET;
+      eight_tap_filter_and_flip_avx2(g_luma_filter, &src[src_stride*ypos + xpos], src_stride, (int16_t*)&(flipped_hor_filtered[4 * x + 0][y]));
+    }
+    for (; y < height + FILTER_SIZE; ++y) {
+      int ypos = y - FILTER_OFFSET;
+      int xpos = x - FILTER_OFFSET;
+      flipped_hor_filtered[4 * x + 0][y] = eight_tap_filter_hor_avx2(c0, &src[src_stride*ypos + xpos]) << shift1;
+      flipped_hor_filtered[4 * x + 1][y] = eight_tap_filter_hor_avx2(c1, &src[src_stride*ypos + xpos]) << shift1;
+      flipped_hor_filtered[4 * x + 2][y] = eight_tap_filter_hor_avx2(c2, &src[src_stride*ypos + xpos]) << shift1;
+      flipped_hor_filtered[4 * x + 3][y] = eight_tap_filter_hor_avx2(c3, &src[src_stride*ypos + xpos]) << shift1;
+    }
+  }
+
+  // Filter vertically and flip x and y
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < 4 * width - 3; x += 4) {
+      eight_tap_filter_and_flip_16bit_avx2(g_luma_filter, &flipped_hor_filtered[x][y], MAX_WIDTH, offset23, shift2 + shift3, &(dst[(4 * y + 0)*dst_stride + x]), dst_stride);
+    }
+  }
 }
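Both this AVX2 path and the generic quarter-pel path later in this commit now share the same two-pass, separable structure: the horizontal 8-tap pass writes its results with x and y swapped into an intermediate buffer, so the vertical pass can again run as a filter over contiguous rows of that buffer. A toy sketch of that transpose-between-passes layout, with a 3-tap box filter standing in for the real HEVC coefficients, fixed 16x16 dimensions, and border handling omitted:

#include <stdint.h>

#define W 16
#define H 16

/* Illustration only: separable filtering with a transposed intermediate so
 * both passes walk memory row-wise. Coefficients and scaling are
 * placeholders, not the 8-tap luma filter used above. */
static void separable_box_filter(const uint8_t src[H][W], uint8_t dst[H][W])
{
  static int16_t flipped[W][H];  /* note the swapped dimensions */

  /* Horizontal pass, stored transposed as flipped[x][y]. */
  for (int y = 0; y < H; ++y)
    for (int x = 1; x < W - 1; ++x)
      flipped[x][y] = src[y][x - 1] + src[y][x] + src[y][x + 1];

  /* The "vertical" pass is again a horizontal run over flipped[x]. */
  for (int x = 1; x < W - 1; ++x)
    for (int y = 1; y < H - 1; ++y)
      dst[y][x] = (uint8_t)((flipped[x][y - 1] + flipped[x][y] + flipped[x][y + 1]) / 9);
}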
@@ -416,15 +517,16 @@ void extend_borders_avx2(int xpos, int ypos, int mv_x, int mv_y, int off_x, int
   }
 }
 
+#endif //COMPILE_INTEL_AVX2
+
 int strategy_register_ipol_avx2(void* opaque)
 {
   bool success = true;
-  success &= strategyselector_register(opaque, "filter_inter_quarterpel_luma", "avx2", 0, &filter_inter_quarterpel_luma_avx2);
-  success &= strategyselector_register(opaque, "filter_inter_halfpel_chroma", "avx2", 0, &filter_inter_halfpel_chroma_avx2);
-  success &= strategyselector_register(opaque, "filter_inter_octpel_chroma", "avx2", 0, &filter_inter_octpel_chroma_avx2);
-  success &= strategyselector_register(opaque, "extend_borders", "avx2", 0, &extend_borders_avx2);
+#if COMPILE_INTEL_AVX2
+  success &= strategyselector_register(opaque, "filter_inter_quarterpel_luma", "avx2", 40, &filter_inter_quarterpel_luma_avx2);
+  success &= strategyselector_register(opaque, "filter_inter_halfpel_chroma", "avx2", 40, &filter_inter_halfpel_chroma_avx2);
+  success &= strategyselector_register(opaque, "filter_inter_octpel_chroma", "avx2", 40, &filter_inter_octpel_chroma_avx2);
+  success &= strategyselector_register(opaque, "extend_borders", "avx2", 40, &extend_borders_avx2);
+#endif //COMPILE_INTEL_AVX2
   return success;
 }
@@ -122,83 +122,52 @@ int32_t four_tap_filter_ver_16bit_generic(int8_t *filter, int16_t *data, int16_t
 
 void filter_inter_quarterpel_luma_generic(const encoder_control_t * const encoder, pixel_t *src, int16_t src_stride, int width, int height, pixel_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag)
 {
+  //TODO: horizontal and vertical only filtering
   int32_t x, y;
-  int32_t shift1 = BIT_DEPTH - 8;
+  int16_t shift1 = BIT_DEPTH - 8;
   int32_t shift2 = 6;
   int32_t shift3 = 14 - BIT_DEPTH;
-  int32_t offset3 = 1 << (shift3 - 1);
   int32_t offset23 = 1 << (shift2 + shift3 - 1);
 
   //coefficients for 1/4, 2/4 and 3/4 positions
-  int8_t *c1, *c2, *c3;
-
-  int i;
+  int8_t *c0, *c1, *c2, *c3;
+
+  c0 = g_luma_filter[0];
   c1 = g_luma_filter[1];
   c2 = g_luma_filter[2];
   c3 = g_luma_filter[3];
 
-  int16_t temp[3][8];
-
-  // Loop source pixels and generate sixteen filtered quarter-pel pixels on each round
-  for (y = 0; y < height; y++) {
-    int dst_pos_y = (y << 2)*dst_stride;
-    int src_pos_y = y*src_stride;
-    for (x = 0; x < width; x++) {
-      // Calculate current dst and src pixel positions
-      int dst_pos = dst_pos_y + (x << 2);
-      int src_pos = src_pos_y + x;
-
-      // Original pixel
-      dst[dst_pos] = src[src_pos];
-
-      //
-      if (hor_flag && !ver_flag) {
-        temp[0][3] = eight_tap_filter_hor_generic(c1, &src[src_pos - 3]) >> shift1;
-        temp[1][3] = eight_tap_filter_hor_generic(c2, &src[src_pos - 3]) >> shift1;
-        temp[2][3] = eight_tap_filter_hor_generic(c3, &src[src_pos - 3]) >> shift1;
-      }
-      // ea0,0 - needed only when ver_flag
-      if (ver_flag) {
-        dst[dst_pos + 1 * dst_stride] = fast_clip_16bit_to_pixel(((eight_tap_filter_ver_generic(c1, &src[src_pos - 3 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3);
-        dst[dst_pos + 2 * dst_stride] = fast_clip_16bit_to_pixel(((eight_tap_filter_ver_generic(c2, &src[src_pos - 3 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3);
-        dst[dst_pos + 3 * dst_stride] = fast_clip_16bit_to_pixel(((eight_tap_filter_ver_generic(c3, &src[src_pos - 3 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3);
-      }
-
-      // When both flags, we use _only_ this pixel (but still need ae0,0 for it)
-      if (hor_flag && ver_flag) {
-        // Calculate temporary values..
-        src_pos -= 3 * src_stride; //0,-3
-        for (i = 0; i < 8; ++i) {
-          temp[0][i] = eight_tap_filter_hor_generic(c1, &src[src_pos + i * src_stride - 3]) >> shift1; // h0(0,-3+i)
-          temp[1][i] = eight_tap_filter_hor_generic(c2, &src[src_pos + i * src_stride - 3]) >> shift1; // h1(0,-3+i)
-          temp[2][i] = eight_tap_filter_hor_generic(c3, &src[src_pos + i * src_stride - 3]) >> shift1; // h2(0,-3+i)
-        }
-
-        for (i = 0; i<3; ++i){
-          dst[dst_pos + 1 * dst_stride + i + 1] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_generic(c1, &temp[i][0]) + offset23) >> shift2) >> shift3);
-          dst[dst_pos + 2 * dst_stride + i + 1] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_generic(c2, &temp[i][0]) + offset23) >> shift2) >> shift3);
-          dst[dst_pos + 3 * dst_stride + i + 1] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_generic(c3, &temp[i][0]) + offset23) >> shift2) >> shift3);
-        }
-      }
-
-      if (hor_flag) {
-        dst[dst_pos + 1] = fast_clip_32bit_to_pixel((temp[0][3] + offset3) >> shift3);
-        dst[dst_pos + 2] = fast_clip_32bit_to_pixel((temp[1][3] + offset3) >> shift3);
-        dst[dst_pos + 3] = fast_clip_32bit_to_pixel((temp[2][3] + offset3) >> shift3);
-      }
+#define FILTER_OFFSET 3
+#define FILTER_SIZE 8
+
+  int16_t flipped_hor_filtered[4 * (LCU_WIDTH + 1) + FILTER_SIZE][(LCU_WIDTH + 1) + FILTER_SIZE];
+
+  // Filter horizontally and flip x and y
+  for (x = 0; x < width; ++x) {
+    for (y = 0; y < height + FILTER_SIZE; ++y) {
+      int ypos = y - FILTER_OFFSET;
+      int xpos = x - FILTER_OFFSET;
+      // Original pixel
+      flipped_hor_filtered[4 * x + 0][y] = (c0[FILTER_OFFSET] * src[src_stride*ypos + xpos + FILTER_OFFSET]) << shift1;
+      flipped_hor_filtered[4 * x + 1][y] = eight_tap_filter_hor_generic(c1, &src[src_stride*ypos + xpos]) << shift1;
+      flipped_hor_filtered[4 * x + 2][y] = eight_tap_filter_hor_generic(c2, &src[src_stride*ypos + xpos]) << shift1;
+      flipped_hor_filtered[4 * x + 3][y] = eight_tap_filter_hor_generic(c3, &src[src_stride*ypos + xpos]) << shift1;
     }
   }
 
+  // Filter vertically and flip x and y
+  for (x = 0; x < 4 * width; ++x) {
+    for (y = 0; y < height; ++y) {
+      int ypos = y;
+      int xpos = x;
+      dst[(4 * y + 0)*dst_stride + x] = fast_clip_32bit_to_pixel(((c0[FILTER_OFFSET] * flipped_hor_filtered[xpos][ypos + FILTER_OFFSET] + offset23) >> shift2) >> shift3);
+      dst[(4 * y + 1)*dst_stride + x] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_generic(c1, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3);
+      dst[(4 * y + 2)*dst_stride + x] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_generic(c2, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3);
+      dst[(4 * y + 3)*dst_stride + x] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_generic(c3, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3);
+    }
+  }
 }
 
 /**