mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-27 19:24:06 +00:00
first working optimation
This commit is contained in:
parent
c148aff9fb
commit
96dc60a1ed
|
@ -394,56 +394,6 @@ static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data,
|
|||
}
|
||||
|
||||
|
||||
static void sao_reconstruct_color_avx(const encoder_control_t * const encoder,
|
||||
const kvz_pixel *rec_data,
|
||||
kvz_pixel *new_rec_data,
|
||||
const sao_info_t *sao,
|
||||
int stride,
|
||||
int new_stride,
|
||||
int block_width,
|
||||
int block_height,
|
||||
color_t color_i)
|
||||
{
|
||||
// Arrays orig_data and rec_data are quarter size for chroma.
|
||||
int offset_v = color_i == COLOR_V ? 5 : 0;
|
||||
|
||||
if (sao->type == SAO_TYPE_BAND) {
|
||||
int offsets[1 << KVZ_BIT_DEPTH];
|
||||
kvz_calc_sao_offset_array(encoder, sao, offsets, color_i);
|
||||
for (int y = 0; y < block_height; ++y) {
|
||||
for (int x = 0; x < block_width; ++x) {
|
||||
new_rec_data[y * new_stride + x] = offsets[rec_data[y * stride + x]];
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Don't sample the edge pixels because this function doesn't have access to
|
||||
// their neighbours.
|
||||
for (int y = 0; y < block_height; ++y) {
|
||||
for (int x = 0; x < block_width; x += 8) {
|
||||
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
|
||||
int test = x + i;
|
||||
vector2d_t a_ofs = g_sao_edge_offsets[sao->eo_class][0];
|
||||
vector2d_t b_ofs = g_sao_edge_offsets[sao->eo_class][1];
|
||||
const kvz_pixel *c_data = &rec_data[y * stride + test];
|
||||
kvz_pixel *new_data = &new_rec_data[y * new_stride + test];
|
||||
kvz_pixel a = c_data[a_ofs.y * stride + a_ofs.x];
|
||||
kvz_pixel c = c_data[0];
|
||||
kvz_pixel b = c_data[b_ofs.y * stride + b_ofs.x];
|
||||
|
||||
int eo_cat = sao_calc_eo_cat(a, b, c);
|
||||
|
||||
new_data[0] = (kvz_pixel)CLIP(0, (1 << KVZ_BIT_DEPTH) - 1, c_data[0] + sao->offsets[eo_cat + offset_v]);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
|
||||
const kvz_pixel *rec_data,
|
||||
kvz_pixel *new_rec_data,
|
||||
|
@ -458,11 +408,15 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
|
|||
int offset_v = color_i == COLOR_V ? 5 : 0;
|
||||
|
||||
|
||||
|
||||
/* Optimate this
|
||||
*/
|
||||
if (sao->type == SAO_TYPE_BAND) {
|
||||
int offsets[1 << KVZ_BIT_DEPTH];
|
||||
kvz_calc_sao_offset_array(encoder, sao, offsets, color_i);
|
||||
for (int y = 0; y < block_height; ++y) {
|
||||
for (int x = 0; x < block_width; ++x) {
|
||||
|
||||
new_rec_data[y * new_stride + x] = offsets[rec_data[y * stride + x]];
|
||||
}
|
||||
}
|
||||
|
@ -502,33 +456,20 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
|
|||
__m256i vector_sao_offsets_epi32 = _mm256_set_epi32(sao->offsets[temp[7]], sao->offsets[temp[6]], sao->offsets[temp[5]], sao->offsets[temp[4]], sao->offsets[temp[3]], sao->offsets[temp[2]], sao->offsets[temp[1]], sao->offsets[temp[0]]);
|
||||
vector_sao_offsets_epi32 = _mm256_add_epi32(vector_sao_offsets_epi32, vector_c_data0_epi32);
|
||||
|
||||
|
||||
// Convert int to int8_t
|
||||
__m256i temp_epi16 = _mm256_packus_epi32(vector_sao_offsets_epi32, vector_sao_offsets_epi32);
|
||||
temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
__m256i temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16);
|
||||
|
||||
// Store 64-bits from vector to memory
|
||||
_mm_storel_epi64((__m128i*)&(new_rec_data[y * new_stride + x]), _mm256_castsi256_si128(temp_epi8));
|
||||
|
||||
|
||||
int*temp2 = (int*)&vector_sao_offsets_epi32;
|
||||
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
|
||||
const kvz_pixel *c_data = &rec_data[y * stride + x + i];
|
||||
|
||||
kvz_pixel *new_data = &new_rec_data[y * new_stride + x + i];
|
||||
|
||||
//printf("%d ", c_data[0] + sao->offsets[temp[i]]);
|
||||
//printf("%d \n", temp2[i]);
|
||||
|
||||
|
||||
new_data[0] = (kvz_pixel)CLIP(0, (1 << KVZ_BIT_DEPTH) - 1, temp2[i]);//c_data[0] + sao->offsets[temp[i]]);
|
||||
test = x;
|
||||
}
|
||||
//Low = 0
|
||||
//High = (1 << KVZ_BIT_DEPTH)
|
||||
//Value = c_data[0] + sao->offsets[eo_cat + offset_v]
|
||||
//new_data[0] = (kvz_pixel)CLIP(0, (1 << KVZ_BIT_DEPTH) - 1, c_data[0] + sao->offsets[eo_cat + offset_v]);
|
||||
test = x;
|
||||
}
|
||||
|
||||
|
||||
/* Some optimation still need to be done, because this function uses only 6 pixels
|
||||
*/
|
||||
for (int i = 0; i < (block_width - test); ++i) {
|
||||
|
||||
const kvz_pixel *c_data = &rec_data[y * stride + test + i];
|
||||
|
@ -550,7 +491,8 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------------
|
||||
// Remove when done
|
||||
static int sao_band_ddistortion_avx2(const encoder_state_t * const state,
|
||||
const kvz_pixel *orig_data,
|
||||
const kvz_pixel *rec_data,
|
||||
|
@ -580,6 +522,68 @@ static int sao_band_ddistortion_avx2(const encoder_state_t * const state,
|
|||
|
||||
return sum;
|
||||
}
|
||||
//--------------------------------------------------------------------------------------
|
||||
static int sao_band_ddistortion_avx(const encoder_state_t * const state,
|
||||
const kvz_pixel *orig_data,
|
||||
const kvz_pixel *rec_data,
|
||||
int block_width,
|
||||
int block_height,
|
||||
int band_pos,
|
||||
int sao_bands[4])
|
||||
{
|
||||
int y, x;
|
||||
int shift = state->encoder_control->bitdepth - 5;
|
||||
int sum = 0;
|
||||
|
||||
__m256i sum_epi32 = _mm256_setzero_si256();
|
||||
|
||||
__m256i band_pos_epi32 = _mm256_set1_epi32(band_pos);
|
||||
|
||||
for (y = 0; y < block_height; ++y) {
|
||||
for (x = 0; x < block_width; x += 8) {
|
||||
|
||||
//int band = (rec_data[y * block_width + x] >> shift) - band_pos;
|
||||
|
||||
__m256i band_epi32 = _mm256_loadu_si256((__m256i*)&rec_data[y * block_width + x]);
|
||||
band_epi32 = _mm256_srli_epi32(band_epi32, shift);
|
||||
band_epi32 = _mm256_sub_epi32(band_epi32, band_pos_epi32);
|
||||
|
||||
__m256i offset_epi32 = _mm256_setzero_si256();
|
||||
__m256i temp1 = _mm256_cmpeq_epi32(offset_epi32, band_epi32);
|
||||
temp1 = _mm256_or_si256(temp1, _mm256_cmpgt_epi32(band_epi32, offset_epi32));
|
||||
__m256i temp2 = _mm256_cmpgt_epi32(_mm256_set1_epi32(4), band_epi32);
|
||||
|
||||
__m256i mask_epi32 = _mm256_andnot_si256(temp2, temp1);
|
||||
int*band = (int*)&band_epi32;
|
||||
|
||||
offset_epi32 = _mm256_setr_epi32(band[0], band[1], band[2], band[3], band[4], band[5], band[6], band[7]);
|
||||
|
||||
|
||||
__m256i orig_data_epi32 = _mm256_loadu_si256((__m256i*)&orig_data[y * block_width + x]);
|
||||
__m256i rec_data_epi32 = _mm256_loadu_si256((__m256i*)&rec_data[y * block_width + x]);
|
||||
__m256i diff_epi32 = _mm256_sub_epi32(orig_data_epi32, rec_data_epi32);
|
||||
|
||||
temp1 = _mm256_sub_epi32(diff_epi32, offset_epi32);
|
||||
temp1 = _mm256_mullo_epi32(temp1, temp1);
|
||||
|
||||
temp2 = _mm256_mullo_epi32(diff_epi32, diff_epi32);
|
||||
|
||||
temp1 = _mm256_sub_epi32(temp1, temp2);
|
||||
temp1 = _mm256_and_si256(temp1, mask_epi32);
|
||||
|
||||
sum_epi32 = _mm256_add_epi32(sum_epi32, temp1);
|
||||
}
|
||||
}
|
||||
|
||||
sum_epi32 = _mm256_hadd_epi32(sum_epi32, sum_epi32);
|
||||
sum_epi32 = _mm256_hadd_epi32(sum_epi32, sum_epi32);
|
||||
|
||||
int*temp = (int*)&sum_epi32;
|
||||
|
||||
sum = temp[0] + temp[3];
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
#endif //COMPILE_INTEL_AVX2
|
||||
|
||||
|
|
Loading…
Reference in a new issue