[alf] Import SSE4.1 optimized 5x5 and 7x7 filters from VTM13

* Modified to work with 8-bit pixels
This commit is contained in:
Marko Viitanen 2021-08-25 11:36:04 +03:00
parent dc6a29b0d8
commit f61b9138cd

View file

@ -236,13 +236,13 @@ static void alf_derive_classification_blk_sse41(encoder_state_t * const state,
// const uint32_t activity = std::min<uint32_t>(15, tempAct * scale >> shift); // const uint32_t activity = std::min<uint32_t>(15, tempAct * scale >> shift);
// static const uint8_t th[16] = { 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 }; // static const uint8_t th[16] = { 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 };
// uint8_t classIdx = th[activity]; // uint8_t class_idx = th[activity];
const uint32_t scale = (z == vb_pos - 4 || z == vb_pos) ? 96 : 64; const uint32_t scale = (z == vb_pos - 4 || z == vb_pos) ? 96 : 64;
const uint32_t scale2 = (z2 == vb_pos - 4 || z2 == vb_pos) ? 96 : 64; const uint32_t scale2 = (z2 == vb_pos - 4 || z2 == vb_pos) ? 96 : 64;
__m128i activity = _mm_mullo_epi32(tempAct, _mm_unpacklo_epi64(_mm_set1_epi32(scale), _mm_set1_epi32(scale2))); __m128i activity = _mm_mullo_epi32(tempAct, _mm_unpacklo_epi64(_mm_set1_epi32(scale), _mm_set1_epi32(scale2)));
activity = _mm_srl_epi32(activity, _mm_cvtsi32_si128(shift)); activity = _mm_srl_epi32(activity, _mm_cvtsi32_si128(shift));
activity = _mm_min_epi32(activity, _mm_set1_epi32(15)); activity = _mm_min_epi32(activity, _mm_set1_epi32(15));
__m128i classIdx = _mm_shuffle_epi8(_mm_setr_epi8(0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4), activity); __m128i class_idx = _mm_shuffle_epi8(_mm_setr_epi8(0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4), activity);
// if (sumV > sumH) // if (sumV > sumH)
// { // {
@ -297,48 +297,452 @@ static void alf_derive_classification_blk_sse41(encoder_state_t * const state,
// if (hvd1 * 2 > 9 * hvd0) // if (hvd1 * 2 > 9 * hvd0)
// { // {
// classIdx += (dirIdx + 2) * 5; // class_idx += (dirIdx + 2) * 5;
// } // }
// else if (hvd1 > 2 * hvd0) // else if (hvd1 > 2 * hvd0)
// { // {
// classIdx += (dirIdx + 1) * 5; // class_idx += (dirIdx + 1) * 5;
// } // }
__m128i strength1 = _mm_cmpgt_epi32(hvd1, _mm_add_epi32(hvd0, hvd0)); __m128i strength1 = _mm_cmpgt_epi32(hvd1, _mm_add_epi32(hvd0, hvd0));
__m128i strength2 = _mm_cmpgt_epi32(_mm_add_epi32(hvd1, hvd1), _mm_add_epi32(hvd0, _mm_slli_epi32(hvd0, 3))); __m128i strength2 = _mm_cmpgt_epi32(_mm_add_epi32(hvd1, hvd1), _mm_add_epi32(hvd0, _mm_slli_epi32(hvd0, 3)));
__m128i offset = _mm_and_si128(strength1, _mm_set1_epi32(5)); __m128i offset = _mm_and_si128(strength1, _mm_set1_epi32(5));
classIdx = _mm_add_epi32(classIdx, offset); class_idx = _mm_add_epi32(class_idx, offset);
classIdx = _mm_add_epi32(classIdx, _mm_and_si128(strength2, _mm_set1_epi32(5))); class_idx = _mm_add_epi32(class_idx, _mm_and_si128(strength2, _mm_set1_epi32(5)));
offset = _mm_andnot_si128(dirIdx, offset); offset = _mm_andnot_si128(dirIdx, offset);
offset = _mm_add_epi32(offset, offset); offset = _mm_add_epi32(offset, offset);
classIdx = _mm_add_epi32(classIdx, offset); class_idx = _mm_add_epi32(class_idx, offset);
// uint8_t transposeIdx = 2 * dirTempD + dirTempHV; // uint8_t transpose_idx = 2 * dirTempD + dirTempHV;
__m128i transposeIdx = _mm_set1_epi32(3); __m128i transpose_idx = _mm_set1_epi32(3);
transposeIdx = _mm_add_epi32(transposeIdx, dirTempHVMinus1); transpose_idx = _mm_add_epi32(transpose_idx, dirTempHVMinus1);
transposeIdx = _mm_add_epi32(transposeIdx, dirTempDMinus1); transpose_idx = _mm_add_epi32(transpose_idx, dirTempDMinus1);
transposeIdx = _mm_add_epi32(transposeIdx, dirTempDMinus1); transpose_idx = _mm_add_epi32(transpose_idx, dirTempDMinus1);
int yOffset = 2 * i + blk_pos_y; int yOffset = 2 * i + blk_pos_y;
int xOffset = j + blk_pos_x; int xOffset = j + blk_pos_x;
static_assert(sizeof(alf_classifier) == 2, "ALFClassifier type must be 16 bits wide"); static_assert(sizeof(alf_classifier) == 2, "alf_classifier type must be 16 bits wide");
__m128i v; __m128i v;
v = _mm_unpacklo_epi8(classIdx, transposeIdx); v = _mm_unpacklo_epi8(class_idx, transpose_idx);
v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9)); v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9));
_mm_storeu_si128((__m128i *) (classifier[yOffset] + xOffset), v); _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset] + xOffset), v);
_mm_storeu_si128((__m128i *) (classifier[yOffset + 1] + xOffset), v); _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 1] + xOffset), v);
_mm_storeu_si128((__m128i *) (classifier[yOffset + 2] + xOffset), v); _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 2] + xOffset), v);
_mm_storeu_si128((__m128i *) (classifier[yOffset + 3] + xOffset), v); _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 3] + xOffset), v);
v = _mm_unpackhi_epi8(classIdx, transposeIdx); v = _mm_unpackhi_epi8(class_idx, transpose_idx);
v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9)); v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9));
_mm_storeu_si128((__m128i *) (classifier[yOffset + 4] + xOffset), v); _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 4] + xOffset), v);
_mm_storeu_si128((__m128i *) (classifier[yOffset + 5] + xOffset), v); _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 5] + xOffset), v);
_mm_storeu_si128((__m128i *) (classifier[yOffset + 6] + xOffset), v); _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 6] + xOffset), v);
_mm_storeu_si128((__m128i *) (classifier[yOffset + 7] + xOffset), v); _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 7] + xOffset), v);
} }
} }
} }
/**
 * Accumulate one coefficient pair (pair index i, 0..2) of the 5x5 diamond ALF
 * filter for a row of 8 chroma pixels.
 *
 * Loads 8 pixels from each of the four tap pointers (two symmetric tap pairs),
 * widens them to 16 bits, subtracts the centre pixel (*cur), clips each
 * difference to +/- the clip value in params[1][i], adds the two clipped taps
 * of each pair, and multiply-accumulates the sums against the 16-bit
 * coefficient pair in params[0][i] into the two 32-bit accumulators
 * (*accumA = pixels 0..3, *accumB = pixels 4..7).
 */
INLINE static void process2coeffs_5x5(__m128i params[2][3], __m128i *cur, __m128i *accumA, __m128i *accumB, const int i, const kvz_pixel* ptr0, const kvz_pixel* ptr1, const kvz_pixel* ptr2, const kvz_pixel* ptr3) {
  // Tap differences against the centre pixel, zero-extended from 8 to 16 bits.
  const __m128i val00 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr0), _mm_setzero_si128()), *cur);
  const __m128i val10 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr2), _mm_setzero_si128()), *cur);
  const __m128i val01 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr1), _mm_setzero_si128()), *cur);
  const __m128i val11 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr3), _mm_setzero_si128()), *cur);

  // Interleave so each 32-bit lane pairs the two taps that share a coefficient.
  __m128i val01A = _mm_unpacklo_epi16(val00, val10);
  __m128i val01B = _mm_unpackhi_epi16(val00, val10);
  __m128i val01C = _mm_unpacklo_epi16(val01, val11);
  __m128i val01D = _mm_unpackhi_epi16(val01, val11);

  // Clip all four interleaved difference vectors to [-limit, +limit].
  __m128i limit01A = params[1][i];

  val01A = _mm_min_epi16(val01A, limit01A);
  val01B = _mm_min_epi16(val01B, limit01A);
  val01C = _mm_min_epi16(val01C, limit01A);
  val01D = _mm_min_epi16(val01D, limit01A);

  limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A);

  val01A = _mm_max_epi16(val01A, limit01A);
  val01B = _mm_max_epi16(val01B, limit01A);
  val01C = _mm_max_epi16(val01C, limit01A);
  val01D = _mm_max_epi16(val01D, limit01A);

  // Sum the two symmetric taps of each pair.
  val01A = _mm_add_epi16(val01A, val01C);
  val01B = _mm_add_epi16(val01B, val01D);

  // Multiply-accumulate against the coefficient pair (pairwise 16x16 -> 32 add).
  __m128i coeff01A = params[0][i];

  *accumA = _mm_add_epi32(*accumA, _mm_madd_epi16(val01A, coeff01A));
  *accumB = _mm_add_epi32(*accumB, _mm_madd_epi16(val01B, coeff01A));
}
/**
 * Apply the 5x5 diamond ALF (chroma) filter to a block, SSE4.1, 8-bit pixels.
 *
 * Filters the width x height region at (x_pos, y_pos) in src_pixels and writes
 * the result at (blk_dst_x, blk_dst_y) in dst_pixels, processing 4 rows by
 * 8 columns per iteration (only the low 4 output pixels are stored when the
 * remaining width is 4). Rows within two lines of the ALF virtual boundary
 * (vb_pos, repeated every vb_ctu_height rows) use mirrored tap pointers, and
 * the rows immediately adjacent to it use a weaker (SHIFT + 3) normalization.
 *
 * filter_set holds the 16-bit coefficients and fClipSet the matching clip
 * values; the first 6 of each are used (3 coefficient pairs). Output is
 * clamped to [clp_rng.min, clp_rng.max].
 */
static void alf_filter_5x5_block_sse41(encoder_state_t* const state,
  const kvz_pixel* src_pixels,
  kvz_pixel* dst_pixels,
  const int src_stride,
  const int dst_stride,
  const short* filter_set,
  const int16_t* fClipSet,
  clp_rng clp_rng,
  const int width,
  const int height,
  int x_pos,
  int y_pos,
  int blk_dst_x,
  int blk_dst_y,
  int vb_pos,
  const int vb_ctu_height)
{
  assert((vb_ctu_height & (vb_ctu_height - 1)) == 0 && "vb_ctu_height must be a power of 2");

  const size_t srcStride = src_stride;
  const size_t dstStride = dst_stride;

  // Fixed-point precision of the coefficients: bitdepth - 1 fractional bits.
  const int SHIFT = state->encoder_control->bitdepth - 1;
  const int ROUND = 1 << (SHIFT - 1);
  // Rounding offset for the weaker (SHIFT + 3) shift used next to the virtual
  // boundary; combined with the already-added ROUND it yields 1 << (SHIFT + 2).
  const __m128i mmOffset1 = _mm_set1_epi32((1 << ((SHIFT + 3) - 1)) - ROUND);

  const size_t STEP_X = 8;  // columns per inner iteration
  const size_t STEP_Y = 4;  // rows per outer iteration

  assert(y_pos % STEP_Y == 0 && "Wrong startHeight in filtering");
  assert(x_pos % STEP_X == 0 && "Wrong startWidth in filtering");
  assert(height % STEP_Y == 0 && "Wrong endHeight in filtering");
  assert(width % 4 == 0 && "Wrong endWidth in filtering");

  const kvz_pixel* src = src_pixels + y_pos * srcStride + x_pos;
  kvz_pixel* dst = dst_pixels + blk_dst_y * dstStride + blk_dst_x;

  const __m128i mmOffset = _mm_set1_epi32(ROUND);
  const __m128i mmMin = _mm_set1_epi16(clp_rng.min);
  const __m128i mmMax = _mm_set1_epi16(clp_rng.max);

  // params[0][*] = coefficient pairs, params[1][*] = clip-value pairs,
  // each broadcast across the register for _mm_madd_epi16.
  __m128i params[2][3];
  __m128i fs = _mm_loadu_si128((__m128i*) filter_set);
  params[0][0] = _mm_shuffle_epi32(fs, 0x00);
  params[0][1] = _mm_shuffle_epi32(fs, 0x55);
  params[0][2] = _mm_shuffle_epi32(fs, 0xaa);
  __m128i fc = _mm_loadu_si128((__m128i*) fClipSet);
  params[1][0] = _mm_shuffle_epi32(fc, 0x00);
  params[1][1] = _mm_shuffle_epi32(fc, 0x55);
  params[1][2] = _mm_shuffle_epi32(fc, 0xaa);

  // Packs the eight 16-bit results down to bytes in the low half; only the low
  // 8 (or 4) bytes are stored, so the upper half of the shuffle is don't-care.
  const __m128i mask = _mm_set_epi8(16, 16, 16, 16, 16, 16, 16, 16, 14, 12, 10, 8, 6, 4, 2, 0);

  for (size_t i = 0; i < height; i += STEP_Y)
  {
    for (size_t j = 0; j < width; j += STEP_X)
    {
      for (size_t ii = 0; ii < STEP_Y; ii++)
      {
        const kvz_pixel* pImg0, * pImg1, * pImg2, * pImg3, * pImg4;

        pImg0 = src + j + ii * srcStride;  // centre row
        pImg1 = pImg0 + srcStride;         // +1 row
        pImg2 = pImg0 - srcStride;         // -1 row
        pImg3 = pImg1 + srcStride;         // +2 rows
        pImg4 = pImg2 - srcStride;         // -2 rows

        // Position relative to the virtual boundary inside the current CTU row.
        const int yVb = (blk_dst_y + i + ii) & (vb_ctu_height - 1);

        if (yVb < vb_pos && (yVb >= vb_pos - 2))   // above: mirror taps downwards
        {
          pImg1 = (yVb == vb_pos - 1) ? pImg0 : pImg1;
          pImg3 = (yVb >= vb_pos - 2) ? pImg1 : pImg3;

          pImg2 = (yVb == vb_pos - 1) ? pImg0 : pImg2;
          pImg4 = (yVb >= vb_pos - 2) ? pImg2 : pImg4;
        }
        else if (yVb >= vb_pos && (yVb <= vb_pos + 1))   // below: mirror taps upwards
        {
          pImg2 = (yVb == vb_pos) ? pImg0 : pImg2;
          pImg4 = (yVb <= vb_pos + 1) ? pImg2 : pImg4;

          pImg1 = (yVb == vb_pos) ? pImg0 : pImg1;
          pImg3 = (yVb <= vb_pos + 1) ? pImg1 : pImg3;
        }

        __m128i cur = _mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) pImg0), _mm_setzero_si128());
        __m128i accumA = mmOffset;
        __m128i accumB = mmOffset;

        // Three coefficient pairs covering the 5x5 diamond (symmetric taps).
        process2coeffs_5x5(params, &cur, &accumA, &accumB, 0, pImg3 + 0, pImg4 + 0, pImg1 + 1, pImg2 - 1);
        process2coeffs_5x5(params, &cur, &accumA, &accumB, 1, pImg1 + 0, pImg2 + 0, pImg1 - 1, pImg2 + 1);
        process2coeffs_5x5(params, &cur, &accumA, &accumB, 2, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1);

        // Rows touching the virtual boundary get a weaker normalization.
        bool isNearVBabove = yVb < vb_pos && (yVb >= vb_pos - 1);
        bool isNearVBbelow = yVb >= vb_pos && (yVb <= vb_pos);
        if (!(isNearVBabove || isNearVBbelow))
        {
          accumA = _mm_srai_epi32(accumA, SHIFT);
          accumB = _mm_srai_epi32(accumB, SHIFT);
        }
        else
        {
          accumA = _mm_srai_epi32(_mm_add_epi32(accumA, mmOffset1), SHIFT + 3);
          accumB = _mm_srai_epi32(_mm_add_epi32(accumB, mmOffset1), SHIFT + 3);
        }

        // Add the filtered residual back onto the centre pixel and clamp.
        accumA = _mm_packs_epi32(accumA, accumB);
        accumA = _mm_add_epi16(accumA, cur);
        accumA = _mm_min_epi16(mmMax, _mm_max_epi16(accumA, mmMin));

        if (j + STEP_X <= width)
        {
          // Full 8-pixel store.
          _mm_storel_epi64((__m128i*) (dst + ii * dstStride + j), _mm_shuffle_epi8(accumA, mask));
        }
        else
        {
          // Tail of 4 pixels (width is only guaranteed to be a multiple of 4).
          _mm_store_ss((float*) (dst + ii * dstStride + j), _mm_castsi128_ps(_mm_shuffle_epi8(accumA, mask)));
        }
      }
    }

    src += srcStride * STEP_Y;
    dst += dstStride * STEP_Y;
  }
}
// sh(x) encodes one 16-bit lane of a _mm_shuffle_epi8 control: byte index x
// (0..7 within a register) duplicated into both byte positions as 2*x and
// 2*x + 1 (the 0x0202/0x0100 terms), with bit 0x80 set in both bytes (the
// 0x1010 term) when x >= 8, i.e. when the lane comes from the second register.
// Arguments and the expansion are parenthesized so any expression is safe.
#define sh(x) (0x0202 * ((x) & 7) + 0x0100 + 0x1010 * ((x) & 8))

// Per-transpose_idx shuffle controls for reordering the 12 ALF luma
// coefficients/clip values split across two registers ([0] = low 8 lanes,
// [1] = high lanes). Row 0 is the identity ordering.
static const uint16_t shuffleTab[4][2][8] = {
  {
    { sh(0), sh(1), sh(2), sh(3), sh(4), sh(5), sh(6), sh(7) },
    { sh(8), sh(9), sh(10), sh(11), sh(12), sh(13), sh(14), sh(15) },
  },
  {
    { sh(9), sh(4), sh(10), sh(8), sh(1), sh(5), sh(11), sh(7) },
    { sh(3), sh(0), sh(2), sh(6), sh(12), sh(13), sh(14), sh(15) },
  },
  {
    { sh(0), sh(3), sh(2), sh(1), sh(8), sh(7), sh(6), sh(5) },
    { sh(4), sh(9), sh(10), sh(11), sh(12), sh(13), sh(14), sh(15) },
  },
  {
    { sh(9), sh(8), sh(10), sh(4), sh(3), sh(7), sh(11), sh(5) },
    { sh(1), sh(0), sh(2), sh(6), sh(12), sh(13), sh(14), sh(15) },
  },
};
/**
 * Accumulate one coefficient pair (pair index i, 0..5) of the 7x7 diamond ALF
 * filter for a row of 8 luma pixels.
 *
 * Same scheme as process2coeffs_5x5, except the two 4-pixel halves of the row
 * may belong to different 4x4 classification blocks, so separate
 * coefficient/clip sets are used per half: params[0][...] for pixels 0..3
 * (accumulated into *accumA) and params[1][...] for pixels 4..7 (*accumB).
 */
INLINE static void process2coeffs_7x7(__m128i params[2][2][6], __m128i *cur, __m128i *accumA, __m128i *accumB, const int i, const kvz_pixel* ptr0, const kvz_pixel* ptr1, const kvz_pixel* ptr2, const kvz_pixel* ptr3) {
  // Tap differences against the centre pixel, zero-extended from 8 to 16 bits.
  const __m128i val00 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr0), _mm_setzero_si128()), *cur);
  const __m128i val10 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr2), _mm_setzero_si128()), *cur);
  const __m128i val01 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr1), _mm_setzero_si128()), *cur);
  const __m128i val11 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr3), _mm_setzero_si128()), *cur);

  // Interleave so each 32-bit lane pairs the two taps that share a coefficient.
  __m128i val01A = _mm_unpacklo_epi16(val00, val10);
  __m128i val01B = _mm_unpackhi_epi16(val00, val10);
  __m128i val01C = _mm_unpacklo_epi16(val01, val11);
  __m128i val01D = _mm_unpackhi_epi16(val01, val11);

  // Clip to [-limit, +limit] with per-half clip values.
  __m128i limit01A = params[0][1][i];
  __m128i limit01B = params[1][1][i];

  val01A = _mm_min_epi16(val01A, limit01A);
  val01B = _mm_min_epi16(val01B, limit01B);
  val01C = _mm_min_epi16(val01C, limit01A);
  val01D = _mm_min_epi16(val01D, limit01B);

  limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A);
  limit01B = _mm_sub_epi16(_mm_setzero_si128(), limit01B);

  val01A = _mm_max_epi16(val01A, limit01A);
  val01B = _mm_max_epi16(val01B, limit01B);
  val01C = _mm_max_epi16(val01C, limit01A);
  val01D = _mm_max_epi16(val01D, limit01B);

  // Sum the two symmetric taps of each pair.
  val01A = _mm_add_epi16(val01A, val01C);
  val01B = _mm_add_epi16(val01B, val01D);

  // Multiply-accumulate against the per-half coefficient pairs.
  const __m128i coeff01A = params[0][0][i];
  const __m128i coeff01B = params[1][0][i];

  *accumA = _mm_add_epi32(*accumA, _mm_madd_epi16(val01A, coeff01A));
  *accumB = _mm_add_epi32(*accumB, _mm_madd_epi16(val01B, coeff01B));
}
/**
 * Apply the 7x7 diamond ALF (luma) filter to a block, SSE4.1, 8-bit pixels.
 *
 * Filters the width x height region at (x_pos, y_pos) in src_pixels and writes
 * the result at (blk_dst_x, blk_dst_y) in dst_pixels, 4 rows by 8 columns per
 * iteration. Each 4x4 sub-block uses the coefficient set selected by its
 * classifier entry (class_idx picks the row in filter_set/fClipSet,
 * transpose_idx picks a shuffleTab reordering of the 12 coefficients).
 * Rows within three lines of the ALF virtual boundary (vb_pos, repeated every
 * vb_ctu_height rows) use mirrored tap pointers, and the rows immediately
 * adjacent to it use a weaker (SHIFT + 3) normalization. Output is clamped to
 * [clp_rng.min, clp_rng.max].
 */
static void alf_filter_7x7_block_sse41(encoder_state_t* const state,
  const kvz_pixel* src_pixels,
  kvz_pixel* dst_pixels,
  const int src_stride,
  const int dst_stride,
  const short* filter_set,
  const int16_t* fClipSet,
  clp_rng clp_rng,
  const int width,
  const int height,
  int x_pos,
  int y_pos,
  int blk_dst_x,
  int blk_dst_y,
  int vb_pos,
  const int vb_ctu_height)
{
  assert((vb_ctu_height & (vb_ctu_height - 1)) == 0 && "vb_ctu_height must be a power of 2");

  const size_t srcStride = src_stride;
  const size_t dstStride = dst_stride;

  // Fixed-point precision of the coefficients: bitdepth - 1 fractional bits.
  const int SHIFT = state->encoder_control->bitdepth - 1;
  const int ROUND = 1 << (SHIFT - 1);

  const size_t STEP_X = 8;  // columns per inner iteration
  const size_t STEP_Y = 4;  // rows per outer iteration

  assert(y_pos % STEP_Y == 0 && "Wrong startHeight in filtering");
  assert(x_pos % STEP_X == 0 && "Wrong startWidth in filtering");
  assert(height % STEP_Y == 0 && "Wrong endHeight in filtering");
  assert(width % STEP_X == 0 && "Wrong endWidth in filtering");

  const kvz_pixel* src = src_pixels + y_pos * srcStride + x_pos;
  kvz_pixel* dst = dst_pixels + blk_dst_y * dstStride + blk_dst_x;

  const __m128i mmOffset = _mm_set1_epi32(ROUND);
  // Rounding offset for the weaker (SHIFT + 3) shift next to the virtual
  // boundary; combined with the already-added ROUND it yields 1 << (SHIFT + 2).
  const __m128i mmOffset1 = _mm_set1_epi32((1 << ((SHIFT + 3) - 1)) - ROUND);
  const __m128i mmMin = _mm_set1_epi16(clp_rng.min);
  const __m128i mmMax = _mm_set1_epi16(clp_rng.max);

  // Packs the eight 16-bit results down to bytes in the low half; only the low
  // 8 bytes are stored, so the upper half of the shuffle is don't-care.
  const __m128i mask = _mm_set_epi8(16, 16, 16, 16, 16, 16, 16, 16, 14, 12, 10, 8, 6, 4, 2, 0);

  for (size_t i = 0; i < height; i += STEP_Y)
  {
    const alf_classifier* pClass = state->tile->frame->alf_info->classifier[blk_dst_y + i] + blk_dst_x;

    for (size_t j = 0; j < width; j += STEP_X)
    {
      // params[k][0][*] = coefficient pairs, params[k][1][*] = clip-value
      // pairs for the k-th 4-pixel half of the 8-pixel column strip.
      __m128i params[2][2][6];

      for (int k = 0; k < 2; ++k)
      {
        // One classifier entry per 4x4 block selects filter and transpose.
        const alf_classifier* cl = &pClass[j + 4 * k];

        const int transpose_idx = cl->transpose_idx;
        const int class_idx = cl->class_idx;

        static_assert(sizeof(*filter_set) == 2, "ALF coeffs must be 16-bit wide");
        static_assert(sizeof(*fClipSet) == 2, "ALF clip values must be 16-bit wide");

        // 12 coefficients/clip values per class: 8 in the first register,
        // 4 more in the second.
        __m128i rawCoeff0, rawCoeff1;
        __m128i rawClip0, rawClip1;

        rawCoeff0 = _mm_loadu_si128((const __m128i*) (filter_set + class_idx * MAX_NUM_ALF_LUMA_COEFF));
        rawCoeff1 = _mm_loadl_epi64((const __m128i*) (filter_set + class_idx * MAX_NUM_ALF_LUMA_COEFF + 8));

        rawClip0 = _mm_loadu_si128((const __m128i*) (fClipSet + class_idx * MAX_NUM_ALF_LUMA_COEFF));
        rawClip1 = _mm_loadl_epi64((const __m128i*) (fClipSet + class_idx * MAX_NUM_ALF_LUMA_COEFF + 8));

        // shuffleTab entries select from raw*0; XOR with 0x80 flips the
        // zeroing bit so the complementary lanes select from raw*1 instead,
        // letting an OR merge the two shuffles.
        const __m128i s0 = _mm_loadu_si128((const __m128i*) shuffleTab[transpose_idx][0]);
        const __m128i s1 = _mm_xor_si128(s0, _mm_set1_epi8((char)0x80));
        const __m128i s2 = _mm_loadu_si128((const __m128i*) shuffleTab[transpose_idx][1]);
        const __m128i s3 = _mm_xor_si128(s2, _mm_set1_epi8((char)0x80));

        const __m128i rawCoeffLo = _mm_or_si128(_mm_shuffle_epi8(rawCoeff0, s0), _mm_shuffle_epi8(rawCoeff1, s1));
        const __m128i rawCoeffHi = _mm_or_si128(_mm_shuffle_epi8(rawCoeff0, s2), _mm_shuffle_epi8(rawCoeff1, s3));
        const __m128i rawClipLo = _mm_or_si128(_mm_shuffle_epi8(rawClip0, s0), _mm_shuffle_epi8(rawClip1, s1));
        const __m128i rawClipHi = _mm_or_si128(_mm_shuffle_epi8(rawClip0, s2), _mm_shuffle_epi8(rawClip1, s3));

        // Broadcast each coefficient/clip pair for _mm_madd_epi16.
        params[k][0][0] = _mm_shuffle_epi32(rawCoeffLo, 0x00);
        params[k][0][1] = _mm_shuffle_epi32(rawCoeffLo, 0x55);
        params[k][0][2] = _mm_shuffle_epi32(rawCoeffLo, 0xaa);
        params[k][0][3] = _mm_shuffle_epi32(rawCoeffLo, 0xff);
        params[k][0][4] = _mm_shuffle_epi32(rawCoeffHi, 0x00);
        params[k][0][5] = _mm_shuffle_epi32(rawCoeffHi, 0x55);
        params[k][1][0] = _mm_shuffle_epi32(rawClipLo, 0x00);
        params[k][1][1] = _mm_shuffle_epi32(rawClipLo, 0x55);
        params[k][1][2] = _mm_shuffle_epi32(rawClipLo, 0xaa);
        params[k][1][3] = _mm_shuffle_epi32(rawClipLo, 0xff);
        params[k][1][4] = _mm_shuffle_epi32(rawClipHi, 0x00);
        params[k][1][5] = _mm_shuffle_epi32(rawClipHi, 0x55);
      }

      for (size_t ii = 0; ii < STEP_Y; ii++)
      {
        const kvz_pixel* pImg0, * pImg1, * pImg2, * pImg3, * pImg4, * pImg5, * pImg6;

        pImg0 = src + j + ii * srcStride;  // centre row
        pImg1 = pImg0 + srcStride;         // +1 row
        pImg2 = pImg0 - srcStride;         // -1 row
        pImg3 = pImg1 + srcStride;         // +2 rows
        pImg4 = pImg2 - srcStride;         // -2 rows
        pImg5 = pImg3 + srcStride;         // +3 rows
        pImg6 = pImg4 - srcStride;         // -3 rows

        // Position relative to the virtual boundary inside the current CTU row.
        const int yVb = (blk_dst_y + i + ii) & (vb_ctu_height - 1);

        if (yVb < vb_pos && (yVb >= vb_pos - 4))   // above: mirror taps downwards
        {
          pImg1 = (yVb == vb_pos - 1) ? pImg0 : pImg1;
          pImg3 = (yVb >= vb_pos - 2) ? pImg1 : pImg3;
          pImg5 = (yVb >= vb_pos - 3) ? pImg3 : pImg5;

          pImg2 = (yVb == vb_pos - 1) ? pImg0 : pImg2;
          pImg4 = (yVb >= vb_pos - 2) ? pImg2 : pImg4;
          pImg6 = (yVb >= vb_pos - 3) ? pImg4 : pImg6;
        }
        else if (yVb >= vb_pos && (yVb <= vb_pos + 3))   // bottom: mirror taps upwards
        {
          pImg2 = (yVb == vb_pos) ? pImg0 : pImg2;
          pImg4 = (yVb <= vb_pos + 1) ? pImg2 : pImg4;
          pImg6 = (yVb <= vb_pos + 2) ? pImg4 : pImg6;

          pImg1 = (yVb == vb_pos) ? pImg0 : pImg1;
          pImg3 = (yVb <= vb_pos + 1) ? pImg1 : pImg3;
          pImg5 = (yVb <= vb_pos + 2) ? pImg3 : pImg5;
        }

        __m128i cur = _mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) pImg0), _mm_setzero_si128());
        __m128i accumA = mmOffset;
        __m128i accumB = mmOffset;

        // Six coefficient pairs covering the 7x7 diamond (symmetric taps).
        process2coeffs_7x7(params, &cur, &accumA, &accumB, 0, pImg5 + 0, pImg6 + 0, pImg3 + 1, pImg4 - 1);
        process2coeffs_7x7(params, &cur, &accumA, &accumB, 1, pImg3 + 0, pImg4 + 0, pImg3 - 1, pImg4 + 1);
        process2coeffs_7x7(params, &cur, &accumA, &accumB, 2, pImg1 + 2, pImg2 - 2, pImg1 + 1, pImg2 - 1);
        process2coeffs_7x7(params, &cur, &accumA, &accumB, 3, pImg1 + 0, pImg2 + 0, pImg1 - 1, pImg2 + 1);
        process2coeffs_7x7(params, &cur, &accumA, &accumB, 4, pImg1 - 2, pImg2 + 2, pImg0 + 3, pImg0 - 3);
        process2coeffs_7x7(params, &cur, &accumA, &accumB, 5, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1);

        // Rows touching the virtual boundary get a weaker normalization.
        bool isNearVBabove = yVb < vb_pos && (yVb >= vb_pos - 1);
        bool isNearVBbelow = yVb >= vb_pos && (yVb <= vb_pos);
        if (!(isNearVBabove || isNearVBbelow))
        {
          accumA = _mm_srai_epi32(accumA, SHIFT);
          accumB = _mm_srai_epi32(accumB, SHIFT);
        }
        else
        {
          accumA = _mm_srai_epi32(_mm_add_epi32(accumA, mmOffset1), SHIFT + 3);
          accumB = _mm_srai_epi32(_mm_add_epi32(accumB, mmOffset1), SHIFT + 3);
        }

        // Add the filtered residual back onto the centre pixel and clamp.
        accumA = _mm_packs_epi32(accumA, accumB);
        accumA = _mm_add_epi16(accumA, cur);
        accumA = _mm_min_epi16(mmMax, _mm_max_epi16(accumA, mmMin));

        // Pack to bytes and store the 8 output pixels.
        _mm_storel_epi64((__m128i*) (dst + ii * dstStride + j), _mm_shuffle_epi8(accumA, mask));
      }
    }

    src += srcStride * STEP_Y;
    dst += dstStride * STEP_Y;
  }
}
#endif // KVZ_BIT_DEPTH == 8 #endif // KVZ_BIT_DEPTH == 8
#endif //COMPILE_INTEL_SSE41 #endif //COMPILE_INTEL_SSE41
@ -349,6 +753,8 @@ int kvz_strategy_register_alf_sse41(void* opaque, uint8_t bitdepth) {
#if KVZ_BIT_DEPTH == 8 #if KVZ_BIT_DEPTH == 8
if (bitdepth == 8){ if (bitdepth == 8){
success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "sse41", 20, &alf_derive_classification_blk_sse41); success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "sse41", 20, &alf_derive_classification_blk_sse41);
success &= kvz_strategyselector_register(opaque, "alf_filter_5x5_blk", "sse41", 0, &alf_filter_5x5_block_sse41);
success &= kvz_strategyselector_register(opaque, "alf_filter_7x7_blk", "sse41", 0, &alf_filter_7x7_block_sse41);
} }
#endif // KVZ_BIT_DEPTH == 8 #endif // KVZ_BIT_DEPTH == 8
#endif #endif