_mm256_cvtsi256_si32 breaks system, too much bits. back to extract

This commit is contained in:
Reima Hyvönen 2018-08-10 09:59:33 +03:00
parent 4f1f80b2cb
commit 99dc43074f
2 changed files with 68 additions and 18 deletions

View file

@ -751,10 +751,7 @@ static void inter_recon_bipred_no_mov_avx2(
case 4: case 4:
lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu + 0] = _mm256_extract_epi8(temp_y_epi8, 0); lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu] = _mm256_cvtsi256_si32(temp_y_epi8);
lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu + 1] = _mm256_extract_epi8(temp_y_epi8, 1);
lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu + 2] = _mm256_extract_epi8(temp_y_epi8, 2);
lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu + 3] = _mm256_extract_epi8(temp_y_epi8, 3);
break; break;
@ -813,8 +810,8 @@ static void inter_recon_bipred_no_mov_avx2(
case 8: case 8:
lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu] = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_u_epi8)); lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu] = _mm256_cvtsi256_si32(temp_u_epi8);
lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu] = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_v_epi8)); lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu] = _mm256_cvtsi256_si32(temp_v_epi8);
break; break;
@ -921,7 +918,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
case 4: case 4:
temp_epi8 = _mm256_packus_epi16(temp_y_epi16, temp_y_epi16); temp_epi8 = _mm256_packus_epi16(temp_y_epi16, temp_y_epi16);
lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu] = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8)); //lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu] = _mm256_cvtsi256_si32(temp_epi8);
lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu] = _mm256_extract_epi8(temp_epi8, 0);
lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu+1] = _mm256_extract_epi8(temp_epi8, 1);
lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu+2] = _mm256_extract_epi8(temp_epi8, 2);
lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu+3] = _mm256_extract_epi8(temp_epi8, 3);
break; break;
case 8: case 8:
@ -936,7 +937,7 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
break; break;
case 16: case 16:
temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_y_epi16, temp_y_epi16), _MM_SHUFFLE(0, 2, 1, 3)); temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_y_epi16, temp_y_epi16), _MM_SHUFFLE(1, 3, 2, 0));
_mm_storeu_si128((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); _mm_storeu_si128((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
break; break;
@ -952,7 +953,7 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
} }
else { else {
temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16_y, temp_y_epi16), _MM_SHUFFLE(0, 2, 1, 3)); temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16_y, temp_y_epi16), _MM_SHUFFLE(1, 3, 2, 0));
// Store 256-bits of integer data into memory // Store 256-bits of integer data into memory
_mm256_storeu_si256((__m256i*)&(lcu->rec.y[start_point]), temp_epi8); _mm256_storeu_si256((__m256i*)&(lcu->rec.y[start_point]), temp_epi8);
@ -1009,10 +1010,22 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16); temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16);
lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu] = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8)); // lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu] =_mm256_cvtsi256_si32(temp_epi8);
lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu] = _mm256_extract_epi8(temp_epi8, 0);
lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu+1] = _mm256_extract_epi8(temp_epi8, 1);
lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu+2] = _mm256_extract_epi8(temp_epi8, 2);
lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu+3] = _mm256_extract_epi8(temp_epi8, 3);
temp_epi8 = _mm256_packus_epi16(temp_v_epi16, temp_v_epi16); temp_epi8 = _mm256_packus_epi16(temp_v_epi16, temp_v_epi16);
lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu] = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8)); lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu] = _mm256_extract_epi8(temp_epi8, 0);
lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + 1] = _mm256_extract_epi8(temp_epi8, 1);
lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + 2] = _mm256_extract_epi8(temp_epi8, 2);
lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + 3] = _mm256_extract_epi8(temp_epi8, 3);
//lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu] = _mm256_cvtsi256_si32(temp_epi8);
break; break;
@ -1032,10 +1045,10 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
case 32: case 32:
temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_u_epi16, temp_u_epi16), _MM_SHUFFLE(0, 2, 1, 3)); temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_u_epi16, temp_u_epi16), _MM_SHUFFLE(1, 3, 2, 0));
_mm_storeu_si128((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); _mm_storeu_si128((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_v_epi16, temp_v_epi16), _MM_SHUFFLE(0, 2, 1, 3)); temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_v_epi16, temp_v_epi16), _MM_SHUFFLE(1, 3, 2, 0));
_mm_storeu_si128((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); _mm_storeu_si128((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
break; break;
@ -1054,11 +1067,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
} }
else { else {
temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16_u, temp_u_epi16), _MM_SHUFFLE(0, 2, 1, 3)); temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16_u, temp_u_epi16), _MM_SHUFFLE(1, 3, 2, 0));
_mm256_storeu_si256((__m256i*)&(lcu->rec.u[start_point_uv]), temp_epi8); _mm256_storeu_si256((__m256i*)&(lcu->rec.u[start_point_uv]), temp_epi8);
temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16_v, temp_v_epi16), _MM_SHUFFLE(0, 2, 1, 3)); temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16_v, temp_v_epi16), _MM_SHUFFLE(1, 3, 2, 0));
_mm256_storeu_si256((__m256i*)&(lcu->rec.v[start_point_uv]), temp_epi8); _mm256_storeu_si256((__m256i*)&(lcu->rec.v[start_point_uv]), temp_epi8);

View file

@ -141,13 +141,50 @@ TEST test_inter_recon_bipred()
memcpy(result.rec.y, lcu1.rec.y, sizeof(kvz_pixel) * 64 * 64); memcpy(result.rec.y, lcu1.rec.y, sizeof(kvz_pixel) * 64 * 64);
memcpy(result.rec.u, lcu1.rec.u, sizeof(kvz_pixel) * 32 * 32); memcpy(result.rec.u, lcu1.rec.u, sizeof(kvz_pixel) * 32 * 32);
memcpy(result.rec.v, lcu1.rec.v, sizeof(kvz_pixel) * 32 * 32); memcpy(result.rec.v, lcu1.rec.v, sizeof(kvz_pixel) * 32 * 32);
for (temp_y = 0; temp_y < height; ++temp_y) {
int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
for (temp_x = 0; temp_x < width; temp_x += 1) {
int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
printf("%d ", expected_test_result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]);
}
}
printf("\n");
for (temp_y = 0; temp_y < height >> 1; ++temp_y) {
int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
for (temp_x = 0; temp_x < width >> 1; ++temp_x) {
int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
printf("%d ", expected_test_result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]);
}
}
printf("\n");
kvz_inter_recon_bipred_generic(hi_prec_luma_rec0, hi_prec_luma_rec1, hi_prec_chroma_rec0, hi_prec_chroma_rec1, width, height, xpos, ypos, high_precision_rec0, high_precision_rec1, &result, temp_lcu_y, temp_lcu_u, temp_lcu_v); kvz_inter_recon_bipred_generic(hi_prec_luma_rec0, hi_prec_luma_rec1, hi_prec_chroma_rec0, hi_prec_chroma_rec1, width, height, xpos, ypos, high_precision_rec0, high_precision_rec1, &result, temp_lcu_y, temp_lcu_u, temp_lcu_v);
for (temp_y = 0; temp_y < height; ++temp_y) {
int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
for (temp_x = 0; temp_x < width; temp_x += 1) {
int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
printf("%d ", result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]);
}
}
printf("\n");
for (temp_y = 0; temp_y < height >> 1; ++temp_y) {
int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
for (temp_x = 0; temp_x < width >> 1; ++temp_x) {
int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
printf("%d ", result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]);
}
}
printf("\n");
for (temp_y = 0; temp_y < height; ++temp_y) { for (temp_y = 0; temp_y < height; ++temp_y) {
int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
for (temp_x = 0; temp_x < width; ++temp_x) { for (temp_x = 0; temp_x < width; temp_x+=1) {
int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
ASSERT_EQ_FMT(expected_test_result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu], result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu], "%d"); ASSERT_EQ_FMT(expected_test_result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu], result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu], "%d");
} }