diff --git a/src/strategies/sse41/alf-sse41.c b/src/strategies/sse41/alf-sse41.c index 2d7ded14..a803e4cc 100644 --- a/src/strategies/sse41/alf-sse41.c +++ b/src/strategies/sse41/alf-sse41.c @@ -236,13 +236,13 @@ static void alf_derive_classification_blk_sse41(encoder_state_t * const state, // const uint32_t activity = std::min(15, tempAct * scale >> shift); // static const uint8_t th[16] = { 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 }; - // uint8_t classIdx = th[activity]; + // uint8_t class_idx = th[activity]; const uint32_t scale = (z == vb_pos - 4 || z == vb_pos) ? 96 : 64; const uint32_t scale2 = (z2 == vb_pos - 4 || z2 == vb_pos) ? 96 : 64; __m128i activity = _mm_mullo_epi32(tempAct, _mm_unpacklo_epi64(_mm_set1_epi32(scale), _mm_set1_epi32(scale2))); activity = _mm_srl_epi32(activity, _mm_cvtsi32_si128(shift)); activity = _mm_min_epi32(activity, _mm_set1_epi32(15)); - __m128i classIdx = _mm_shuffle_epi8(_mm_setr_epi8(0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4), activity); + __m128i class_idx = _mm_shuffle_epi8(_mm_setr_epi8(0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4), activity); // if (sumV > sumH) // { @@ -297,48 +297,452 @@ static void alf_derive_classification_blk_sse41(encoder_state_t * const state, // if (hvd1 * 2 > 9 * hvd0) // { - // classIdx += (dirIdx + 2) * 5; + // class_idx += (dirIdx + 2) * 5; // } // else if (hvd1 > 2 * hvd0) // { - // classIdx += (dirIdx + 1) * 5; + // class_idx += (dirIdx + 1) * 5; // } __m128i strength1 = _mm_cmpgt_epi32(hvd1, _mm_add_epi32(hvd0, hvd0)); __m128i strength2 = _mm_cmpgt_epi32(_mm_add_epi32(hvd1, hvd1), _mm_add_epi32(hvd0, _mm_slli_epi32(hvd0, 3))); __m128i offset = _mm_and_si128(strength1, _mm_set1_epi32(5)); - classIdx = _mm_add_epi32(classIdx, offset); - classIdx = _mm_add_epi32(classIdx, _mm_and_si128(strength2, _mm_set1_epi32(5))); + class_idx = _mm_add_epi32(class_idx, offset); + class_idx = _mm_add_epi32(class_idx, _mm_and_si128(strength2, _mm_set1_epi32(5))); offset = _mm_andnot_si128(dirIdx, offset); offset = _mm_add_epi32(offset, offset); - classIdx = _mm_add_epi32(classIdx, offset); + class_idx = _mm_add_epi32(class_idx, offset); - // uint8_t transposeIdx = 2 * dirTempD + dirTempHV; - __m128i transposeIdx = _mm_set1_epi32(3); - transposeIdx = _mm_add_epi32(transposeIdx, dirTempHVMinus1); - transposeIdx = _mm_add_epi32(transposeIdx, dirTempDMinus1); - transposeIdx = _mm_add_epi32(transposeIdx, dirTempDMinus1); + // uint8_t transpose_idx = 2 * dirTempD + dirTempHV; + __m128i transpose_idx = _mm_set1_epi32(3); + transpose_idx = _mm_add_epi32(transpose_idx, dirTempHVMinus1); + transpose_idx = _mm_add_epi32(transpose_idx, dirTempDMinus1); + transpose_idx = _mm_add_epi32(transpose_idx, dirTempDMinus1); int yOffset = 2 * i + blk_pos_y; int xOffset = j + blk_pos_x; - static_assert(sizeof(alf_classifier) == 2, "ALFClassifier type must be 16 bits wide"); + static_assert(sizeof(alf_classifier) == 2, "alf_classifier type must be 16 bits wide"); __m128i v; - v = _mm_unpacklo_epi8(classIdx, transposeIdx); + v = _mm_unpacklo_epi8(class_idx, transpose_idx); v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9)); - _mm_storeu_si128((__m128i *) (classifier[yOffset] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 1] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 2] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 3] + xOffset), v); - v = _mm_unpackhi_epi8(classIdx, transposeIdx); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 1] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 2] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 3] + xOffset), v); + v = _mm_unpackhi_epi8(class_idx, transpose_idx); v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9)); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 4] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 5] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 6] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 7] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 4] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 5] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 6] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 7] + xOffset), v); } } } + +INLINE static void process2coeffs_5x5(__m128i params[2][3], __m128i *cur, __m128i *accumA, __m128i *accumB, const int i, const kvz_pixel* ptr0, const kvz_pixel* ptr1, const kvz_pixel* ptr2, const kvz_pixel* ptr3) { + const __m128i val00 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr0), _mm_setzero_si128()), *cur); + const __m128i val10 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr2), _mm_setzero_si128()), *cur); + const __m128i val01 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr1), _mm_setzero_si128()), *cur); + const __m128i val11 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr3), _mm_setzero_si128()), *cur); + __m128i val01A = _mm_unpacklo_epi16(val00, val10); + __m128i val01B = _mm_unpackhi_epi16(val00, val10); + __m128i val01C = _mm_unpacklo_epi16(val01, val11); + __m128i val01D = _mm_unpackhi_epi16(val01, val11); + + __m128i limit01A = params[1][i]; + + val01A = _mm_min_epi16(val01A, limit01A); + val01B = _mm_min_epi16(val01B, limit01A); + val01C = _mm_min_epi16(val01C, limit01A); + val01D = _mm_min_epi16(val01D, limit01A); + + limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A); + + val01A = _mm_max_epi16(val01A, limit01A); + val01B = _mm_max_epi16(val01B, limit01A); + val01C = _mm_max_epi16(val01C, limit01A); + val01D = _mm_max_epi16(val01D, limit01A); + + val01A = _mm_add_epi16(val01A, val01C); + val01B = _mm_add_epi16(val01B, val01D); + + __m128i coeff01A = params[0][i]; + + *accumA = _mm_add_epi32(*accumA, _mm_madd_epi16(val01A, coeff01A)); + *accumB = _mm_add_epi32(*accumB, _mm_madd_epi16(val01B, coeff01A)); +}; + + +static void alf_filter_5x5_block_sse41(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height) +{ + + + assert((vb_ctu_height & (vb_ctu_height - 1)) == 0 && "vb_ctu_height must be a power of 2"); + + alf_component_id compId = COMPONENT_Cb; + + const size_t srcStride = src_stride; + const size_t dstStride = dst_stride; + + const int SHIFT = state->encoder_control->bitdepth - 1; + const int ROUND = 1 << (SHIFT - 1); + const __m128i mmOffset1 = _mm_set1_epi32((1 << ((SHIFT + 3) - 1)) - ROUND); + + const size_t STEP_X = 8; + const size_t STEP_Y = 4; + + assert(y_pos % STEP_Y == 0 && "Wrong startHeight in filtering"); + assert(x_pos % STEP_X == 0 && "Wrong startWidth in filtering"); + assert(height % STEP_Y == 0 && "Wrong endHeight in filtering"); + assert(width % 4 == 0 && "Wrong endWidth in filtering"); + + const kvz_pixel* src = src_pixels + y_pos * srcStride + x_pos; + kvz_pixel* dst = dst_pixels + blk_dst_y * dstStride + blk_dst_x; + + + + const __m128i mmOffset = _mm_set1_epi32(ROUND); + const __m128i mmMin = _mm_set1_epi16(clp_rng.min); + const __m128i mmMax = _mm_set1_epi16(clp_rng.max); + + __m128i params[2][3]; + __m128i fs = _mm_loadu_si128((__m128i*) filter_set); + params[0][0] = _mm_shuffle_epi32(fs, 0x00); + params[0][1] = _mm_shuffle_epi32(fs, 0x55); + params[0][2] = _mm_shuffle_epi32(fs, 0xaa); + __m128i fc = _mm_loadu_si128((__m128i*) fClipSet); + params[1][0] = _mm_shuffle_epi32(fc, 0x00); + params[1][1] = _mm_shuffle_epi32(fc, 0x55); + params[1][2] = _mm_shuffle_epi32(fc, 0xaa); + + const __m128i mask = _mm_set_epi8(16, 16, 16, 16, 16, 16, 16, 16, 14, 12, 10, 8, 6, 4, 2, 0); + + for (size_t i = 0; i < height; i += STEP_Y) + { + for (size_t j = 0; j < width; j += STEP_X) + { + + for (size_t ii = 0; ii < STEP_Y; ii++) + { + const kvz_pixel* pImg0, * pImg1, * pImg2, * pImg3, * pImg4; + + pImg0 = src + j + ii * srcStride; + pImg1 = pImg0 + srcStride; + pImg2 = pImg0 - srcStride; + pImg3 = pImg1 + srcStride; + pImg4 = pImg2 - srcStride; + + const int yVb = (blk_dst_y + i + ii) & (vb_ctu_height - 1); + if (yVb < vb_pos && (yVb >= vb_pos - 2)) // above + { + pImg1 = (yVb == vb_pos - 1) ? pImg0 : pImg1; + pImg3 = (yVb >= vb_pos - 2) ? pImg1 : pImg3; + + pImg2 = (yVb == vb_pos - 1) ? pImg0 : pImg2; + pImg4 = (yVb >= vb_pos - 2) ? pImg2 : pImg4; + } + else if (yVb >= vb_pos && (yVb <= vb_pos + 1)) // bottom + { + pImg2 = (yVb == vb_pos) ? pImg0 : pImg2; + pImg4 = (yVb <= vb_pos + 1) ? pImg2 : pImg4; + + pImg1 = (yVb == vb_pos) ? pImg0 : pImg1; + pImg3 = (yVb <= vb_pos + 1) ? pImg1 : pImg3; + } + __m128i cur = _mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) pImg0), _mm_setzero_si128()); + + __m128i accumA = mmOffset; + __m128i accumB = mmOffset; + + + + process2coeffs_5x5(params, &cur, &accumA, &accumB, 0, pImg3 + 0, pImg4 + 0, pImg1 + 1, pImg2 - 1); + process2coeffs_5x5(params, &cur, &accumA, &accumB, 1, pImg1 + 0, pImg2 + 0, pImg1 - 1, pImg2 + 1); + process2coeffs_5x5(params, &cur, &accumA, &accumB, 2, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + bool isNearVBabove = yVb < vb_pos && (yVb >= vb_pos - 1); + bool isNearVBbelow = yVb >= vb_pos && (yVb <= vb_pos); + if (!(isNearVBabove || isNearVBbelow)) + { + accumA = _mm_srai_epi32(accumA, SHIFT); + accumB = _mm_srai_epi32(accumB, SHIFT); + } + else + { + accumA = _mm_srai_epi32(_mm_add_epi32(accumA, mmOffset1), SHIFT + 3); + accumB = _mm_srai_epi32(_mm_add_epi32(accumB, mmOffset1), SHIFT + 3); + } + accumA = _mm_packs_epi32(accumA, accumB); + accumA = _mm_add_epi16(accumA, cur); + accumA = _mm_min_epi16(mmMax, _mm_max_epi16(accumA, mmMin)); + + if (j + STEP_X <= width) + { + //_mm_storeu_si128((__m128i*) (dst + ii * dstStride + j), accumA); + _mm_storel_epi64((__m128i*) (dst + ii * dstStride + j), _mm_shuffle_epi8(accumA, mask)); + } + else + { + //_mm_storel_epi64((__m128i*) (dst + ii * dstStride + j), accumA); + _mm_store_ss((float*) (dst + ii * dstStride + j), _mm_castsi128_ps(_mm_shuffle_epi8(accumA, mask))); + } + } + + } + + src += srcStride * STEP_Y; + dst += dstStride * STEP_Y; + } +} + +#define sh(x) 0x0202 * (x & 7) + 0x0100 + 0x1010 * (x & 8) + +static const uint16_t shuffleTab[4][2][8] = { + { + { sh(0), sh(1), sh(2), sh(3), sh(4), sh(5), sh(6), sh(7) }, + { sh(8), sh(9), sh(10), sh(11), sh(12), sh(13), sh(14), sh(15) }, + }, + { + { sh(9), sh(4), sh(10), sh(8), sh(1), sh(5), sh(11), sh(7) }, + { sh(3), sh(0), sh(2), sh(6), sh(12), sh(13), sh(14), sh(15) }, + }, + { + { sh(0), sh(3), sh(2), sh(1), sh(8), sh(7), sh(6), sh(5) }, + { sh(4), sh(9), sh(10), sh(11), sh(12), sh(13), sh(14), sh(15) }, + }, + { + { sh(9), sh(8), sh(10), sh(4), sh(3), sh(7), sh(11), sh(5) }, + { sh(1), sh(0), sh(2), sh(6), sh(12), sh(13), sh(14), sh(15) }, + }, +}; + + + +INLINE static void process2coeffs_7x7(__m128i params[2][2][6], __m128i *cur, __m128i *accumA, __m128i *accumB, const int i, const kvz_pixel* ptr0, const kvz_pixel* ptr1, const kvz_pixel* ptr2, const kvz_pixel* ptr3) { + const __m128i val00 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr0), _mm_setzero_si128()), *cur); + const __m128i val10 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr2), _mm_setzero_si128()), *cur); + const __m128i val01 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr1), _mm_setzero_si128()), *cur); + const __m128i val11 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr3), _mm_setzero_si128()), *cur); + + __m128i val01A = _mm_unpacklo_epi16(val00, val10); + __m128i val01B = _mm_unpackhi_epi16(val00, val10); + __m128i val01C = _mm_unpacklo_epi16(val01, val11); + __m128i val01D = _mm_unpackhi_epi16(val01, val11); + + __m128i limit01A = params[0][1][i]; + __m128i limit01B = params[1][1][i]; + + val01A = _mm_min_epi16(val01A, limit01A); + val01B = _mm_min_epi16(val01B, limit01B); + val01C = _mm_min_epi16(val01C, limit01A); + val01D = _mm_min_epi16(val01D, limit01B); + + limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A); + limit01B = _mm_sub_epi16(_mm_setzero_si128(), limit01B); + + val01A = _mm_max_epi16(val01A, limit01A); + val01B = _mm_max_epi16(val01B, limit01B); + val01C = _mm_max_epi16(val01C, limit01A); + val01D = _mm_max_epi16(val01D, limit01B); + + val01A = _mm_add_epi16(val01A, val01C); + val01B = _mm_add_epi16(val01B, val01D); + + const __m128i coeff01A = params[0][0][i]; + const __m128i coeff01B = params[1][0][i]; + + *accumA = _mm_add_epi32(*accumA, _mm_madd_epi16(val01A, coeff01A)); + *accumB = _mm_add_epi32(*accumB, _mm_madd_epi16(val01B, coeff01B)); +}; + + + + +static void alf_filter_7x7_block_sse41(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height) +{ + assert((vb_ctu_height & (vb_ctu_height - 1)) == 0 && "vb_ctu_height must be a power of 2"); + alf_component_id compId = COMPONENT_Y; + + + const size_t srcStride = src_stride; + const size_t dstStride = dst_stride; + + const int SHIFT = state->encoder_control->bitdepth - 1; + const int ROUND = 1 << (SHIFT - 1); + + const size_t STEP_X = 8; + const size_t STEP_Y = 4; + + assert(y_pos % STEP_Y == 0 && "Wrong startHeight in filtering"); + assert(x_pos % STEP_X == 0 && "Wrong startWidth in filtering"); + assert(height % STEP_Y == 0 && "Wrong endHeight in filtering"); + assert(width % STEP_X == 0 && "Wrong endWidth in filtering"); + + const kvz_pixel* src = src_pixels + y_pos * srcStride + x_pos; + kvz_pixel* dst = dst_pixels + blk_dst_y * dstStride + blk_dst_x; + + const __m128i mmOffset = _mm_set1_epi32(ROUND); + const __m128i mmOffset1 = _mm_set1_epi32((1 << ((SHIFT + 3) - 1)) - ROUND); + const __m128i mmMin = _mm_set1_epi16(clp_rng.min); + const __m128i mmMax = _mm_set1_epi16(clp_rng.max); + + const __m128i mask = _mm_set_epi8(16, 16, 16, 16, 16, 16, 16, 16, 14, 12, 10, 8, 6, 4, 2, 0); + + for (size_t i = 0; i < height; i += STEP_Y) + { + const alf_classifier* pClass = state->tile->frame->alf_info->classifier[blk_dst_y + i] + blk_dst_x; + + for (size_t j = 0; j < width; j += STEP_X) + { + __m128i params[2][2][6]; + + for (int k = 0; k < 2; ++k) + { + const alf_classifier* cl = &pClass[j + 4 * k]; + + const int transpose_idx = cl->transpose_idx; + const int class_idx = cl->class_idx; + + static_assert(sizeof(*filter_set) == 2, "ALF coeffs must be 16-bit wide"); + static_assert(sizeof(*fClipSet) == 2, "ALF clip values must be 16-bit wide"); + + __m128i rawCoeff0, rawCoeff1; + __m128i rawClip0, rawClip1; + + rawCoeff0 = _mm_loadu_si128((const __m128i*) (filter_set + class_idx * MAX_NUM_ALF_LUMA_COEFF)); + rawCoeff1 = _mm_loadl_epi64((const __m128i*) (filter_set + class_idx * MAX_NUM_ALF_LUMA_COEFF + 8)); + + rawClip0 = _mm_loadu_si128((const __m128i*) (fClipSet + class_idx * MAX_NUM_ALF_LUMA_COEFF)); + rawClip1 = _mm_loadl_epi64((const __m128i*) (fClipSet + class_idx * MAX_NUM_ALF_LUMA_COEFF + 8)); + + const __m128i s0 = _mm_loadu_si128((const __m128i*) shuffleTab[transpose_idx][0]); + const __m128i s1 = _mm_xor_si128(s0, _mm_set1_epi8((char)0x80)); + const __m128i s2 = _mm_loadu_si128((const __m128i*) shuffleTab[transpose_idx][1]); + const __m128i s3 = _mm_xor_si128(s2, _mm_set1_epi8((char)0x80)); + + const __m128i rawCoeffLo = _mm_or_si128(_mm_shuffle_epi8(rawCoeff0, s0), _mm_shuffle_epi8(rawCoeff1, s1)); + const __m128i rawCoeffHi = _mm_or_si128(_mm_shuffle_epi8(rawCoeff0, s2), _mm_shuffle_epi8(rawCoeff1, s3)); + const __m128i rawClipLo = _mm_or_si128(_mm_shuffle_epi8(rawClip0, s0), _mm_shuffle_epi8(rawClip1, s1)); + const __m128i rawClipHi = _mm_or_si128(_mm_shuffle_epi8(rawClip0, s2), _mm_shuffle_epi8(rawClip1, s3)); + + params[k][0][0] = _mm_shuffle_epi32(rawCoeffLo, 0x00); + params[k][0][1] = _mm_shuffle_epi32(rawCoeffLo, 0x55); + params[k][0][2] = _mm_shuffle_epi32(rawCoeffLo, 0xaa); + params[k][0][3] = _mm_shuffle_epi32(rawCoeffLo, 0xff); + params[k][0][4] = _mm_shuffle_epi32(rawCoeffHi, 0x00); + params[k][0][5] = _mm_shuffle_epi32(rawCoeffHi, 0x55); + params[k][1][0] = _mm_shuffle_epi32(rawClipLo, 0x00); + params[k][1][1] = _mm_shuffle_epi32(rawClipLo, 0x55); + params[k][1][2] = _mm_shuffle_epi32(rawClipLo, 0xaa); + params[k][1][3] = _mm_shuffle_epi32(rawClipLo, 0xff); + params[k][1][4] = _mm_shuffle_epi32(rawClipHi, 0x00); + params[k][1][5] = _mm_shuffle_epi32(rawClipHi, 0x55); + } + + for (size_t ii = 0; ii < STEP_Y; ii++) + { + const kvz_pixel* pImg0, * pImg1, * pImg2, * pImg3, * pImg4, * pImg5, * pImg6; + + pImg0 = src + j + ii * srcStride; + pImg1 = pImg0 + srcStride; + pImg2 = pImg0 - srcStride; + pImg3 = pImg1 + srcStride; + pImg4 = pImg2 - srcStride; + pImg5 = pImg3 + srcStride; + pImg6 = pImg4 - srcStride; + + const int yVb = (blk_dst_y + i + ii) & (vb_ctu_height - 1); + if (yVb < vb_pos && (yVb >= vb_pos - 4)) // above + { + pImg1 = (yVb == vb_pos - 1) ? pImg0 : pImg1; + pImg3 = (yVb >= vb_pos - 2) ? pImg1 : pImg3; + pImg5 = (yVb >= vb_pos - 3) ? pImg3 : pImg5; + + pImg2 = (yVb == vb_pos - 1) ? pImg0 : pImg2; + pImg4 = (yVb >= vb_pos - 2) ? pImg2 : pImg4; + pImg6 = (yVb >= vb_pos - 3) ? pImg4 : pImg6; + } + else if (yVb >= vb_pos && (yVb <= vb_pos + 3)) // bottom + { + pImg2 = (yVb == vb_pos) ? pImg0 : pImg2; + pImg4 = (yVb <= vb_pos + 1) ? pImg2 : pImg4; + pImg6 = (yVb <= vb_pos + 2) ? pImg4 : pImg6; + + pImg1 = (yVb == vb_pos) ? pImg0 : pImg1; + pImg3 = (yVb <= vb_pos + 1) ? pImg1 : pImg3; + pImg5 = (yVb <= vb_pos + 2) ? pImg3 : pImg5; + } + __m128i cur = _mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) pImg0), _mm_setzero_si128()); + + __m128i accumA = mmOffset; + __m128i accumB = mmOffset; + + process2coeffs_7x7(params, &cur, &accumA, &accumB, 0, pImg5 + 0, pImg6 + 0, pImg3 + 1, pImg4 - 1); + process2coeffs_7x7(params, &cur, &accumA, &accumB, 1, pImg3 + 0, pImg4 + 0, pImg3 - 1, pImg4 + 1); + process2coeffs_7x7(params, &cur, &accumA, &accumB, 2, pImg1 + 2, pImg2 - 2, pImg1 + 1, pImg2 - 1); + process2coeffs_7x7(params, &cur, &accumA, &accumB, 3, pImg1 + 0, pImg2 + 0, pImg1 - 1, pImg2 + 1); + process2coeffs_7x7(params, &cur, &accumA, &accumB, 4, pImg1 - 2, pImg2 + 2, pImg0 + 3, pImg0 - 3); + process2coeffs_7x7(params, &cur, &accumA, &accumB, 5, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + + + bool isNearVBabove = yVb < vb_pos && (yVb >= vb_pos - 1); + bool isNearVBbelow = yVb >= vb_pos && (yVb <= vb_pos); + if (!(isNearVBabove || isNearVBbelow)) + { + accumA = _mm_srai_epi32(accumA, SHIFT); + accumB = _mm_srai_epi32(accumB, SHIFT); + } + else + { + accumA = _mm_srai_epi32(_mm_add_epi32(accumA, mmOffset1), SHIFT + 3); + accumB = _mm_srai_epi32(_mm_add_epi32(accumB, mmOffset1), SHIFT + 3); + } + accumA = _mm_packs_epi32(accumA, accumB); + accumA = _mm_add_epi16(accumA, cur); + accumA = _mm_min_epi16(mmMax, _mm_max_epi16(accumA, mmMin)); + + //_mm_storeu_si128((__m128i*) (dst + ii * dstStride + j), accumA); + _mm_storel_epi64((__m128i*) (dst + ii * dstStride + j), _mm_shuffle_epi8(accumA, mask)); + } + } + + src += srcStride * STEP_Y; + dst += dstStride * STEP_Y; + } +} + + + #endif // KVZ_BIT_DEPTH == 8 #endif //COMPILE_INTEL_SSE41 @@ -349,6 +753,8 @@ int kvz_strategy_register_alf_sse41(void* opaque, uint8_t bitdepth) { #if KVZ_BIT_DEPTH == 8 if (bitdepth == 8){ success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "sse41", 20, &alf_derive_classification_blk_sse41); + success &= kvz_strategyselector_register(opaque, "alf_filter_5x5_blk", "sse41", 0, &alf_filter_5x5_block_sse41); + success &= kvz_strategyselector_register(opaque, "alf_filter_7x7_blk", "sse41", 0, &alf_filter_7x7_block_sse41); } #endif // KVZ_BIT_DEPTH == 8 #endif