From 39eceec38daadca1cedbd3c861f82f6a0a81973b Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 27 Feb 2015 15:12:03 +0200 Subject: [PATCH 1/3] Rewrite of luma fractional pixel filtering. Utilizes intermediate values instead of calculating everything again. --- src/strategies/generic/ipol-generic.c | 87 +++++++++------------------ 1 file changed, 28 insertions(+), 59 deletions(-) diff --git a/src/strategies/generic/ipol-generic.c b/src/strategies/generic/ipol-generic.c index 42027260..576e4a49 100644 --- a/src/strategies/generic/ipol-generic.c +++ b/src/strategies/generic/ipol-generic.c @@ -122,83 +122,52 @@ int32_t four_tap_filter_ver_16bit_generic(int8_t *filter, int16_t *data, int16_t void filter_inter_quarterpel_luma_generic(const encoder_control_t * const encoder, pixel_t *src, int16_t src_stride, int width, int height, pixel_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) { - + //TODO: horizontal and vertical only filtering int32_t x, y; - int32_t shift1 = BIT_DEPTH - 8; + int16_t shift1 = BIT_DEPTH - 8; int32_t shift2 = 6; int32_t shift3 = 14 - BIT_DEPTH; - int32_t offset3 = 1 << (shift3 - 1); int32_t offset23 = 1 << (shift2 + shift3 - 1); //coefficients for 1/4, 2/4 and 3/4 positions - int8_t *c1, *c2, *c3; + int8_t *c0, *c1, *c2, *c3; - int i; + c0 = g_luma_filter[0]; c1 = g_luma_filter[1]; c2 = g_luma_filter[2]; c3 = g_luma_filter[3]; - int16_t temp[3][8]; + #define FILTER_OFFSET 3 + #define FILTER_SIZE 8 - // Loop source pixels and generate sixteen filtered quarter-pel pixels on each round - for (y = 0; y < height; y++) { - int dst_pos_y = (y << 2)*dst_stride; - int src_pos_y = y*src_stride; - for (x = 0; x < width; x++) { - // Calculate current dst and src pixel positions - int dst_pos = dst_pos_y + (x << 2); - int src_pos = src_pos_y + x; + int16_t flipped_hor_filtered[4 * (LCU_WIDTH + 1) + FILTER_SIZE][(LCU_WIDTH + 1) + FILTER_SIZE]; + // Filter horizontally and flip x and y + for (x = 0; x < width; ++x) { + for (y = 0; y < height + FILTER_SIZE; ++y) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; // Original pixel - dst[dst_pos] = src[src_pos]; - - // - if (hor_flag && !ver_flag) { - - temp[0][3] = eight_tap_filter_hor_generic(c1, &src[src_pos - 3]) >> shift1; - temp[1][3] = eight_tap_filter_hor_generic(c2, &src[src_pos - 3]) >> shift1; - temp[2][3] = eight_tap_filter_hor_generic(c3, &src[src_pos - 3]) >> shift1; - } - // ea0,0 - needed only when ver_flag - if (ver_flag) { - dst[dst_pos + 1 * dst_stride] = fast_clip_16bit_to_pixel(((eight_tap_filter_ver_generic(c1, &src[src_pos - 3 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 2 * dst_stride] = fast_clip_16bit_to_pixel(((eight_tap_filter_ver_generic(c2, &src[src_pos - 3 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 3 * dst_stride] = fast_clip_16bit_to_pixel(((eight_tap_filter_ver_generic(c3, &src[src_pos - 3 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - } - - // When both flags, we use _only_ this pixel (but still need ae0,0 for it) - if (hor_flag && ver_flag) { - - // Calculate temporary values.. 
- src_pos -= 3 * src_stride; //0,-3 - for (i = 0; i < 8; ++i) { - - temp[0][i] = eight_tap_filter_hor_generic(c1, &src[src_pos + i * src_stride - 3]) >> shift1; // h0(0,-3+i) - temp[1][i] = eight_tap_filter_hor_generic(c2, &src[src_pos + i * src_stride - 3]) >> shift1; // h1(0,-3+i) - temp[2][i] = eight_tap_filter_hor_generic(c3, &src[src_pos + i * src_stride - 3]) >> shift1; // h2(0,-3+i) - } - - - - for (i = 0; i<3; ++i){ - dst[dst_pos + 1 * dst_stride + i + 1] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_generic(c1, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 2 * dst_stride + i + 1] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_generic(c2, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 3 * dst_stride + i + 1] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_generic(c3, &temp[i][0]) + offset23) >> shift2) >> shift3); - - } - - } - - if (hor_flag) { - dst[dst_pos + 1] = fast_clip_32bit_to_pixel((temp[0][3] + offset3) >> shift3); - dst[dst_pos + 2] = fast_clip_32bit_to_pixel((temp[1][3] + offset3) >> shift3); - dst[dst_pos + 3] = fast_clip_32bit_to_pixel((temp[2][3] + offset3) >> shift3); - } - + flipped_hor_filtered[4 * x + 0][y] = (c0[FILTER_OFFSET] * src[src_stride*ypos + xpos + FILTER_OFFSET]) << shift1; + flipped_hor_filtered[4 * x + 1][y] = eight_tap_filter_hor_generic(c1, &src[src_stride*ypos + xpos]) << shift1; + flipped_hor_filtered[4 * x + 2][y] = eight_tap_filter_hor_generic(c2, &src[src_stride*ypos + xpos]) << shift1; + flipped_hor_filtered[4 * x + 3][y] = eight_tap_filter_hor_generic(c3, &src[src_stride*ypos + xpos]) << shift1; } } + // Filter vertically and flip x and y + for (x = 0; x < 4 * width; ++x) { + for (y = 0; y < height; ++y) { + int ypos = y; + int xpos = x; + dst[(4 * y + 0)*dst_stride + x] = fast_clip_32bit_to_pixel(((c0[FILTER_OFFSET] * flipped_hor_filtered[xpos][ypos + FILTER_OFFSET] + offset23) >> shift2) >> shift3); + dst[(4 * y + 1)*dst_stride + x] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_generic(c1, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3); + dst[(4 * y + 2)*dst_stride + x] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_generic(c2, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3); + dst[(4 * y + 3)*dst_stride + x] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_generic(c3, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3); + + } + } } /** From b9ec4b0a542589c90abdc919c437bf2c2d20b22a Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 6 Mar 2015 17:14:58 +0200 Subject: [PATCH 2/3] AVX2 acceleration for new luma filtering. 
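For context: this patch vectorizes the two-pass scheme introduced in PATCH 1/3, where the horizontal filter writes a transposed intermediate buffer so that the vertical filter becomes another stride-1 horizontal pass. Below is a minimal scalar sketch of that scheme; the names (quarterpel_sketch, mid, fir) are illustrative only, the bit-depth shifts and rounding offsets of the real code are folded into a single normalizing shift, and src is assumed to be border-extended by the filter reach.

#include <stdint.h>

#define W 64      /* block width; LCU-sized in the real code */
#define H 64      /* block height */
#define TAPS 8    /* FILTER_SIZE */
#define OFF 3     /* FILTER_OFFSET: taps reaching left of/above the sample */

/* Pass 1 output, stored transposed: mid[4 * x + phase][y]. */
static int16_t mid[4 * W][H + TAPS];

static void quarterpel_sketch(const uint8_t *src, int stride,
                              uint8_t *dst, int dst_stride,
                              const int8_t fir[4][TAPS])
{
  /* Pass 1: an 8-tap horizontal filter per quarter-pel phase. Writing to
   * mid[4 * x + p][y] flips x and y, so pass 2 can read 8 consecutive
   * int16_t values instead of strided ones. */
  for (int x = 0; x < W; ++x) {
    for (int y = 0; y < H + TAPS; ++y) {
      for (int p = 0; p < 4; ++p) {
        int32_t sum = 0;
        for (int k = 0; k < TAPS; ++k)
          sum += fir[p][k] * src[(y - OFF) * stride + (x - OFF) + k];
        mid[4 * x + p][y] = (int16_t)sum;
      }
    }
  }
  /* Pass 2: the vertical filter is again a stride-1 walk because the data
   * is already transposed. fir[0] is the scaled identity tap
   * ({0,0,0,64,0,0,0,0}), which the real code special-cases. */
  for (int x = 0; x < 4 * W; ++x) {
    for (int y = 0; y < H; ++y) {
      for (int p = 0; p < 4; ++p) {
        int32_t sum = 0;
        for (int k = 0; k < TAPS; ++k)
          sum += fir[p][k] * mid[x][y + k];
        sum >>= 12; /* each pass has gain 64, so normalize by 64 * 64 */
        dst[(4 * y + p) * dst_stride + x] =
            (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
      }
    }
  }
}

The AVX2 code below keeps exactly this structure: the horizontal pass filters eight rows per call with _mm_maddubs_epi16 and transposes via _mm_hadd_epi16, and the vertical pass consumes the 16-bit intermediates with _mm_madd_epi16.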
--- src/strategies/avx2/ipol-avx2.c | 242 +++++++++++++++++++++++--------- 1 file changed, 172 insertions(+), 70 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 9705b2f6..40ff27b4 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -22,25 +22,156 @@ * \file */ -#include <stdlib.h> - #include "ipol-avx2.h" #include "strategyselector.h" + +#if COMPILE_INTEL_AVX2 +#include <immintrin.h> + +#include <stdlib.h> + + #include "encoder.h" #include "strategies/generic/picture-generic.h" + +#define FILTER_OFFSET 3 +#define FILTER_SIZE 8 + +#define MAX_HEIGHT (4 * (LCU_WIDTH + 1) + FILTER_SIZE) +#define MAX_WIDTH ((LCU_WIDTH + 1) + FILTER_SIZE) + extern int8_t g_luma_filter[4][8]; extern int8_t g_chroma_filter[8][4]; +void eight_tap_filter_x8_and_flip(__m128i data01, __m128i data23, __m128i data45, __m128i data67, __m128i* filter, __m128i* dst) +{ + __m128i a, b, c, d; + __m128i fir = _mm_broadcastq_epi64(_mm_loadl_epi64(filter)); + + a = _mm_maddubs_epi16(data01, fir); + b = _mm_maddubs_epi16(data23, fir); + a = _mm_hadd_epi16(a, b); + + c = _mm_maddubs_epi16(data45, fir); + d = _mm_maddubs_epi16(data67, fir); + c = _mm_hadd_epi16(c, d); + + a = _mm_hadd_epi16(a, c); + + _mm_storeu_si128(dst, a); +} + +__m128i eight_tap_filter_x4_and_flip_16bit(__m128i data0, __m128i data1, __m128i data2, __m128i data3, __m128i* filter) +{ + __m128i a, b, c, d; + __m128i fir = _mm_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(filter))); + + a = _mm_madd_epi16(data0, fir); + b = _mm_madd_epi16(data1, fir); + a = _mm_hadd_epi32(a, b); + + c = _mm_madd_epi16(data2, fir); + d = _mm_madd_epi16(data3, fir); + c = _mm_hadd_epi32(c, d); + + a = _mm_hadd_epi32(a, c); + + return a; +} + +void eight_tap_filter_and_flip_avx2(int8_t filter[4][8], pixel_t *src, int16_t src_stride, int16_t* __restrict dst) +{ + + //Load 2 rows per xmm register + __m128i rows01 = _mm_loadl_epi64((__m128i*)(src + 0 * src_stride)); + rows01 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows01), (double*)(src + 1 * src_stride))); + + __m128i rows23 = _mm_loadl_epi64((__m128i*)(src + 2 * src_stride)); + rows23 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows23), (double*)(src + 3 * src_stride))); + + __m128i rows45 = _mm_loadl_epi64((__m128i*)(src + 4 * src_stride)); + rows45 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows45), (double*)(src + 5 * src_stride))); + + __m128i rows67 = _mm_loadl_epi64((__m128i*)(src + 6 * src_stride)); + rows67 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows67), (double*)(src + 7 * src_stride))); + + //Filter rows + const int dst_stride = MAX_WIDTH; + eight_tap_filter_x8_and_flip(rows01, rows23, rows45, rows67, (__m128i*)(&filter[0]), (__m128i*)(dst + 0)); + eight_tap_filter_x8_and_flip(rows01, rows23, rows45, rows67, (__m128i*)(&filter[1]), (__m128i*)(dst + 1 * dst_stride)); + eight_tap_filter_x8_and_flip(rows01, rows23, rows45, rows67, (__m128i*)(&filter[2]), (__m128i*)(dst + 2 * dst_stride)); + eight_tap_filter_x8_and_flip(rows01, rows23, rows45, rows67, (__m128i*)(&filter[3]), (__m128i*)(dst + 3 * dst_stride)); +} + +static INLINE void eight_tap_filter_and_flip_16bit_avx2(int8_t filter[4][8], int16_t *src, int16_t src_stride, int offset, int combined_shift, pixel_t* __restrict dst, int16_t dst_stride) +{ + + //Load a row per xmm register + __m128i row0 = _mm_loadu_si128((__m128i*)(src + 0 * src_stride)); + __m128i row1 = _mm_loadu_si128((__m128i*)(src + 1 * src_stride)); + __m128i row2 = _mm_loadu_si128((__m128i*)(src + 2 * src_stride)); + __m128i row3 = 
_mm_loadu_si128((__m128i*)(src + 3 * src_stride)); + + //Filter rows + union { + __m128i vector; + int32_t array[4]; + } temp[4]; + + temp[0].vector = eight_tap_filter_x4_and_flip_16bit(row0, row1, row2, row3, (__m128i*)(&filter[0])); + temp[1].vector = eight_tap_filter_x4_and_flip_16bit(row0, row1, row2, row3, (__m128i*)(&filter[1])); + temp[2].vector = eight_tap_filter_x4_and_flip_16bit(row0, row1, row2, row3, (__m128i*)(&filter[2])); + temp[3].vector = eight_tap_filter_x4_and_flip_16bit(row0, row1, row2, row3, (__m128i*)(&filter[3])); + + __m128i packed_offset = _mm_set1_epi32(offset); + + temp[0].vector = _mm_add_epi32(temp[0].vector, packed_offset); + temp[0].vector = _mm_srai_epi32(temp[0].vector, combined_shift); + temp[1].vector = _mm_add_epi32(temp[1].vector, packed_offset); + temp[1].vector = _mm_srai_epi32(temp[1].vector, combined_shift); + + temp[0].vector = _mm_packus_epi32(temp[0].vector, temp[1].vector); + + temp[2].vector = _mm_add_epi32(temp[2].vector, packed_offset); + temp[2].vector = _mm_srai_epi32(temp[2].vector, combined_shift); + temp[3].vector = _mm_add_epi32(temp[3].vector, packed_offset); + temp[3].vector = _mm_srai_epi32(temp[3].vector, combined_shift); + + temp[2].vector = _mm_packus_epi32(temp[2].vector, temp[3].vector); + + temp[0].vector = _mm_packus_epi16(temp[0].vector, temp[2].vector); + + int32_t* four_pixels = (int32_t*)&(dst[0 * dst_stride]); + *four_pixels = temp[0].array[0]; + + four_pixels = (int32_t*)&(dst[1 * dst_stride]); + *four_pixels = _mm_extract_epi32(temp[0].vector, 1); + + four_pixels = (int32_t*)&(dst[2 * dst_stride]); + *four_pixels = _mm_extract_epi32(temp[0].vector, 2); + + four_pixels = (int32_t*)&(dst[3 * dst_stride]); + *four_pixels = _mm_extract_epi32(temp[0].vector, 3); + + +} + int16_t eight_tap_filter_hor_avx2(int8_t *filter, pixel_t *data) { - int16_t temp = 0; - for (int i = 0; i < 8; ++i) - { - temp += filter[i] * data[i]; - } + union { + __m128i vector; + int16_t array[8]; + } sample; - return temp; + __m128i packed_data = _mm_loadu_si128((__m128i*)data); + __m128i packed_filter = _mm_loadu_si128((__m128i*)filter); + + sample.vector = _mm_maddubs_epi16(packed_data, packed_filter); + sample.vector = _mm_hadd_epi16(sample.vector, sample.vector); + sample.vector = _mm_hadd_epi16(sample.vector, sample.vector); + + return sample.array[0]; } int32_t eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data) @@ -124,79 +255,49 @@ void filter_inter_quarterpel_luma_avx2(const encoder_control_t * const encoder, { int32_t x, y; - int32_t shift1 = BIT_DEPTH - 8; + int16_t shift1 = BIT_DEPTH - 8; int32_t shift2 = 6; int32_t shift3 = 14 - BIT_DEPTH; - int32_t offset3 = 1 << (shift3 - 1); int32_t offset23 = 1 << (shift2 + shift3 - 1); //coefficients for 1/4, 2/4 and 3/4 positions - int8_t *c1, *c2, *c3; + int8_t *c0, *c1, *c2, *c3; - int i; + c0 = g_luma_filter[0]; c1 = g_luma_filter[1]; c2 = g_luma_filter[2]; c3 = g_luma_filter[3]; - int16_t temp[3][8]; + int16_t flipped_hor_filtered[MAX_HEIGHT][MAX_WIDTH]; - // Loop source pixels and generate sixteen filtered quarter-pel pixels on each round - for (y = 0; y < height; y++) { - int dst_pos_y = (y << 2)*dst_stride; - int src_pos_y = y*src_stride; - for (x = 0; x < width; x++) { - // Calculate current dst and src pixel positions - int dst_pos = dst_pos_y + (x << 2); - int src_pos = src_pos_y + x; + // Filter horizontally and flip x and y + for (x = 0; x < width; ++x) { + for (y = 0; y < height; y += 8) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; - // Original pixel - 
dst[dst_pos] = src[src_pos]; + eight_tap_filter_and_flip_avx2(g_luma_filter, &src[src_stride*ypos + xpos], src_stride, (int16_t*)&(flipped_hor_filtered[4 * x + 0][y])); + + } - // - if (hor_flag && !ver_flag) { + for (; y < height + FILTER_SIZE; ++y) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; + flipped_hor_filtered[4 * x + 0][y] = eight_tap_filter_hor_avx2(c0, &src[src_stride*ypos + xpos]) << shift1; + flipped_hor_filtered[4 * x + 1][y] = eight_tap_filter_hor_avx2(c1, &src[src_stride*ypos + xpos]) << shift1; + flipped_hor_filtered[4 * x + 2][y] = eight_tap_filter_hor_avx2(c2, &src[src_stride*ypos + xpos]) << shift1; + flipped_hor_filtered[4 * x + 3][y] = eight_tap_filter_hor_avx2(c3, &src[src_stride*ypos + xpos]) << shift1; + } + } - temp[0][3] = eight_tap_filter_hor_avx2(c1, &src[src_pos - 3]) >> shift1; - temp[1][3] = eight_tap_filter_hor_avx2(c2, &src[src_pos - 3]) >> shift1; - temp[2][3] = eight_tap_filter_hor_avx2(c3, &src[src_pos - 3]) >> shift1; - } - // ea0,0 - needed only when ver_flag - if (ver_flag) { - dst[dst_pos + 1 * dst_stride] = fast_clip_16bit_to_pixel(((eight_tap_filter_ver_avx2(c1, &src[src_pos - 3 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 2 * dst_stride] = fast_clip_16bit_to_pixel(((eight_tap_filter_ver_avx2(c2, &src[src_pos - 3 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 3 * dst_stride] = fast_clip_16bit_to_pixel(((eight_tap_filter_ver_avx2(c3, &src[src_pos - 3 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - } - - // When both flags, we use _only_ this pixel (but still need ae0,0 for it) - if (hor_flag && ver_flag) { - - // Calculate temporary values.. - src_pos -= 3 * src_stride; //0,-3 - for (i = 0; i < 8; ++i) { - - temp[0][i] = eight_tap_filter_hor_avx2(c1, &src[src_pos + i * src_stride - 3]) >> shift1; // h0(0,-3+i) - temp[1][i] = eight_tap_filter_hor_avx2(c2, &src[src_pos + i * src_stride - 3]) >> shift1; // h1(0,-3+i) - temp[2][i] = eight_tap_filter_hor_avx2(c3, &src[src_pos + i * src_stride - 3]) >> shift1; // h2(0,-3+i) - } - - - - for (i = 0; i<3; ++i){ - dst[dst_pos + 1 * dst_stride + i + 1] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_avx2(c1, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 2 * dst_stride + i + 1] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_avx2(c2, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 3 * dst_stride + i + 1] = fast_clip_32bit_to_pixel(((eight_tap_filter_hor_16bit_avx2(c3, &temp[i][0]) + offset23) >> shift2) >> shift3); - - } - - } - - if (hor_flag) { - dst[dst_pos + 1] = fast_clip_32bit_to_pixel((temp[0][3] + offset3) >> shift3); - dst[dst_pos + 2] = fast_clip_32bit_to_pixel((temp[1][3] + offset3) >> shift3); - dst[dst_pos + 3] = fast_clip_32bit_to_pixel((temp[2][3] + offset3) >> shift3); - } + // Filter vertically and flip x and y + for (y = 0; y < height; ++y) { + for (x = 0; x < 4 * width - 3; x += 4) { + eight_tap_filter_and_flip_16bit_avx2(g_luma_filter, &flipped_hor_filtered[x][y], MAX_WIDTH, offset23, shift2 + shift3, &(dst[(4 * y + 0)*dst_stride + x]), dst_stride); } + } } @@ -416,15 +517,16 @@ void extend_borders_avx2(int xpos, int ypos, int mv_x, int mv_y, int off_x, int } } +#endif //COMPILE_INTEL_AVX2 int strategy_register_ipol_avx2(void* opaque) { bool success = true; - - success &= strategyselector_register(opaque, "filter_inter_quarterpel_luma", "avx2", 0, &filter_inter_quarterpel_luma_avx2); - success &= 
strategyselector_register(opaque, "filter_inter_halfpel_chroma", "avx2", 0, &filter_inter_halfpel_chroma_avx2); - success &= strategyselector_register(opaque, "filter_inter_octpel_chroma", "avx2", 0, &filter_inter_octpel_chroma_avx2); - success &= strategyselector_register(opaque, "extend_borders", "avx2", 0, &extend_borders_avx2); - +#if COMPILE_INTEL_AVX2 + success &= strategyselector_register(opaque, "filter_inter_quarterpel_luma", "avx2", 40, &filter_inter_quarterpel_luma_avx2); + success &= strategyselector_register(opaque, "filter_inter_halfpel_chroma", "avx2", 40, &filter_inter_halfpel_chroma_avx2); + success &= strategyselector_register(opaque, "filter_inter_octpel_chroma", "avx2", 40, &filter_inter_octpel_chroma_avx2); + success &= strategyselector_register(opaque, "extend_borders", "avx2", 40, &extend_borders_avx2); +#endif //COMPILE_INTEL_AVX2 return success; } From d2bb71739fd4c0602c13628f4bd3f744b32b7ead Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Wed, 11 Mar 2015 15:56:15 +0200 Subject: [PATCH 3/3] Clean up and comment WPP threading code. - Remove the WPP row reconstruction dependency on the row above the current one in the previous frame. It's obviously unnecessary. - Remove the WPP row reconstruction dependency on the current row in the previous frame, unless the current row is the last row. --- src/encoderstate.c | 74 +++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/src/encoderstate.c b/src/encoderstate.c index d1902b3a..75846dce 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -333,16 +333,18 @@ static void encoder_state_worker_encode_lcu(void * opaque) { } static void encoder_state_encode_leaf(encoder_state_t * const state) { - const encoder_control_t * const encoder = state->encoder_control; - - int i = 0; - assert(state->is_leaf); assert(state->lcu_order_count > 0); - //If we're not using wavefronts, or we have a WAVEFRONT_ROW which is the single child of its parent, than we should not use parallelism - if (state->type != ENCODER_STATE_TYPE_WAVEFRONT_ROW || (state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW && !state->parent->children[1].encoder_control)) { - for (i = 0; i < state->lcu_order_count; ++i) { + // Select whether to encode the frame/tile in the current thread or to define + // wavefront jobs for other threads to handle. + bool wavefront = state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW; + bool use_parallel_encoding = (wavefront && state->parent->children[1].encoder_control); + if (!use_parallel_encoding) { + // Encode every LCU in order and perform SAO reconstruction after the + // whole frame is encoded. Deblocking and SAO search is done during LCU encoding. 
+ + for (int i = 0; i < state->lcu_order_count; ++i) { PERFORMANCE_MEASURE_START(_DEBUG_PERF_ENCODE_LCU); encoder_state_worker_encode_lcu(&state->lcu_order[i]); @@ -355,7 +357,7 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) { #endif //_DEBUG } - if (encoder->sao_enable) { + if (state->encoder_control->sao_enable) { PERFORMANCE_MEASURE_START(_DEBUG_PERF_SAO_RECONSTRUCT_FRAME); sao_reconstruct_frame(state); PERFORMANCE_MEASURE_END(_DEBUG_PERF_SAO_RECONSTRUCT_FRAME, state->encoder_control->threadqueue, "type=sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", state->global->frame, state->tile->id, state->slice->id, state->lcu_order[0].position.y + state->tile->lcu_offset_y, state->lcu_order[state->lcu_order_count-1].position.y + state->tile->lcu_offset_y, @@ -364,7 +366,10 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) { ); } } else { - for (i = 0; i < state->lcu_order_count; ++i) { + // Add every LCU in the frame as a job to a queue, along with + // their dependencies, so they can be processed in parallel. + + for (int i = 0; i < state->lcu_order_count; ++i) { const lcu_order_element_t * const lcu = &state->lcu_order[i]; #ifdef _DEBUG char job_description[256]; @@ -373,40 +378,41 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) { char* job_description = NULL; #endif state->tile->wf_jobs[lcu->id] = threadqueue_submit(state->encoder_control->threadqueue, encoder_state_worker_encode_lcu, (void*)lcu, 1, job_description); + assert(state->tile->wf_jobs[lcu->id] != NULL); + + // Add a dependency for inter frames on the reconstruction of the row + // below the current row in the previous frame. This ensures that we can + // search for motion vectors in the previous frame as long as we don't + // go more than one LCU below the current row. if (state->previous_encoder_state != state && state->previous_encoder_state->tqj_recon_done && !state->global->is_radl_frame) { - - //Only for the first in the row (we reconstruct row-wise) + // Only add the dependency to the first LCU in the row. if (!lcu->left) { - //If we have a row below, then we wait till it's completed if (lcu->below) { threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], lcu->below->encoder_state->previous_encoder_state->tqj_recon_done); - } - //Also add always a dep on current line - threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], lcu->encoder_state->previous_encoder_state->tqj_recon_done); - if (lcu->above) { - threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], lcu->above->encoder_state->previous_encoder_state->tqj_recon_done); - } - } - } - if (state->tile->wf_jobs[lcu->id]) { - if (lcu->position.x > 0) { - // Wait for the LCU on the left. - threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - 1]); - } - if (lcu->position.y > 0) { - if (lcu->position.x < state->tile->frame->width_in_lcu - 1) { - // Wait for the LCU to the top-right of this one. - threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu + 1]); } else { - // If there is no top-right LCU, wait for the one above. 
- threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu]); + threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], lcu->encoder_state->previous_encoder_state->tqj_recon_done); } } - threadqueue_job_unwait_job(state->encoder_control->threadqueue, state->tile->wf_jobs[lcu->id]); } + + // Add a local WPP dependency on the LCU to the left. + if (lcu->left) { + threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - 1]); + } + // Add a local WPP dependency on the LCU to the top right. + if (lcu->above) { + if (lcu->above->right) { + threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu + 1]); + } else { + threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu]); + } + } + + threadqueue_job_unwait_job(state->encoder_control->threadqueue, state->tile->wf_jobs[lcu->id]); + if (lcu->position.x == state->tile->frame->width_in_lcu - 1) { - if (!encoder->sao_enable) { - //No SAO + last LCU: the row is reconstructed + if (!state->encoder_control->sao_enable) { + // No SAO + last LCU: the row is reconstructed assert(!state->tqj_recon_done); state->tqj_recon_done = state->tile->wf_jobs[lcu->id]; }
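In summary, the wavefront dependency graph left in place by this patch can be condensed as the following sketch. The helper names (job_t, add_dep, add_wpp_deps, prev_row_done) are hypothetical stand-ins; the real code calls threadqueue_job_dep_add() on state->tile->wf_jobs[] and on the previous frame's tqj_recon_done jobs.

#include <stdbool.h>

typedef struct job job_t;             /* stand-in for the threadqueue job type */
void add_dep(job_t *job, job_t *dep); /* stand-in for threadqueue_job_dep_add() */

/* Dependencies for the LCU job at (x, y) in a frame of
 * width_in_lcu x height_in_lcu LCUs. prev_row_done[r] is the previous
 * frame's "row r reconstructed" job (tqj_recon_done). */
static void add_wpp_deps(job_t **jobs, job_t **prev_row_done,
                         int x, int y, int width_in_lcu, int height_in_lcu,
                         bool inter_frame)
{
  int id = y * width_in_lcu + x;

  /* Local WPP rules: wait for the LCU on the left, and for the LCU above
   * and to the right (or directly above, when in the last column). */
  if (x > 0)
    add_dep(jobs[id], jobs[id - 1]);
  if (y > 0)
    add_dep(jobs[id], x < width_in_lcu - 1 ? jobs[id - width_in_lcu + 1]
                                           : jobs[id - width_in_lcu]);

  /* Inter-frame rule, added only to the first LCU of each row: wait until
   * the previous frame has reconstructed the row below this one (or this
   * same row, for the last row), so motion search may reach one LCU row
   * down into the reference frame. */
  if (inter_frame && x == 0)
    add_dep(jobs[id], y + 1 < height_in_lcu ? prev_row_done[y + 1]
                                            : prev_row_done[y]);
}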