mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-27 19:24:06 +00:00
Take 8/16/24b loads and stores into separate functions
This commit is contained in:
parent
10979f58fe
commit
de0e97f711
|
@ -170,6 +170,32 @@ static INLINE __m128i truncate_epi32_epi8(const __m128i v)
|
||||||
return sbs_8;
|
return sbs_8;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Read 0-3 bytes (pixels) into uint32_t
|
||||||
|
static INLINE uint32_t load_border_bytes(const uint8_t *buf,
|
||||||
|
const int32_t start_pos,
|
||||||
|
const int32_t width_rest)
|
||||||
|
{
|
||||||
|
uint32_t last_dword = 0;
|
||||||
|
for (int32_t i = 0; i < width_rest; i++) {
|
||||||
|
uint8_t currb = buf[start_pos + i];
|
||||||
|
uint32_t currd = ((uint32_t)currb) << (i * 8);
|
||||||
|
last_dword |= currd;
|
||||||
|
}
|
||||||
|
return last_dword;
|
||||||
|
}
|
||||||
|
|
||||||
|
static INLINE void store_border_bytes( uint8_t *buf,
|
||||||
|
const uint32_t start_pos,
|
||||||
|
const int32_t width_rest,
|
||||||
|
uint32_t data)
|
||||||
|
{
|
||||||
|
for (uint32_t i = 0; i < width_rest; i++) {
|
||||||
|
uint8_t currb = data & 0xff;
|
||||||
|
buf[start_pos + i] = currb;
|
||||||
|
data >>= 8;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Used for edge_ddistortion and band_ddistortion
|
// Used for edge_ddistortion and band_ddistortion
|
||||||
static __m256i calc_diff_off_delta(const __m256i diff_lo,
|
static __m256i calc_diff_off_delta(const __m256i diff_lo,
|
||||||
const __m256i diff_hi,
|
const __m256i diff_hi,
|
||||||
|
@ -306,23 +332,10 @@ static int32_t sao_edge_ddistortion_avx2(const kvz_pixel *orig_data,
|
||||||
const int32_t rest_bpos = (y + b_ofs.y) * block_width + width_db4 + b_ofs.x + 1;
|
const int32_t rest_bpos = (y + b_ofs.y) * block_width + width_db4 + b_ofs.x + 1;
|
||||||
|
|
||||||
// Same trick to read a narrow line as there is in the band SAO routine
|
// Same trick to read a narrow line as there is in the band SAO routine
|
||||||
uint32_t a_last = 0, b_last = 0, c_last = 0, orig_last = 0;
|
uint32_t a_last = load_border_bytes(rec_data, rest_apos, width_rest);
|
||||||
for (uint32_t i = 0; i < width_rest; i++) {
|
uint32_t b_last = load_border_bytes(rec_data, rest_bpos, width_rest);
|
||||||
uint8_t currb_a = rec_data[rest_apos + (int32_t)i];
|
uint32_t c_last = load_border_bytes(rec_data, rest_cpos, width_rest);
|
||||||
uint8_t currb_b = rec_data[rest_bpos + (int32_t)i];
|
uint32_t orig_last = load_border_bytes(orig_data, rest_cpos, width_rest);
|
||||||
uint8_t currb_c = rec_data[rest_cpos + (int32_t)i];
|
|
||||||
uint8_t currb_orig = orig_data[rest_cpos + (int32_t)i];
|
|
||||||
|
|
||||||
uint32_t currd_a = ((uint32_t)currb_a) << (i * 8);
|
|
||||||
uint32_t currd_b = ((uint32_t)currb_b) << (i * 8);
|
|
||||||
uint32_t currd_c = ((uint32_t)currb_c) << (i * 8);
|
|
||||||
uint32_t currd_orig = ((uint32_t)currb_orig) << (i * 8);
|
|
||||||
|
|
||||||
a_last |= currd_a;
|
|
||||||
b_last |= currd_b;
|
|
||||||
c_last |= currd_c;
|
|
||||||
orig_last |= currd_orig;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int32_t *a_ptr = (const int32_t *)(rec_data + curr_apos);
|
const int32_t *a_ptr = (const int32_t *)(rec_data + curr_apos);
|
||||||
const int32_t *b_ptr = (const int32_t *)(rec_data + curr_bpos);
|
const int32_t *b_ptr = (const int32_t *)(rec_data + curr_bpos);
|
||||||
|
@ -437,29 +450,15 @@ static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data,
|
||||||
const int32_t curr_bpos = (y + b_ofs.y) * block_width + x + b_ofs.x;
|
const int32_t curr_bpos = (y + b_ofs.y) * block_width + x + b_ofs.x;
|
||||||
const int32_t rest_bpos = (y + b_ofs.y) * block_width + width_db4 + b_ofs.x + 1;
|
const int32_t rest_bpos = (y + b_ofs.y) * block_width + width_db4 + b_ofs.x + 1;
|
||||||
|
|
||||||
// Same trick to read a narrow line as there is in the band SAO routine
|
uint32_t a_last = load_border_bytes(rec_data, rest_apos, width_rest);
|
||||||
uint32_t a_last = 0, b_last = 0, c_last = 0, orig_last = 0;
|
uint32_t b_last = load_border_bytes(rec_data, rest_bpos, width_rest);
|
||||||
for (uint32_t i = 0; i < width_rest; i++) {
|
uint32_t c_last = load_border_bytes(rec_data, rest_cpos, width_rest);
|
||||||
uint8_t currb_a = rec_data[rest_apos + (int32_t)i];
|
uint32_t orig_last = load_border_bytes(orig_data, rest_cpos, width_rest);
|
||||||
uint8_t currb_b = rec_data[rest_bpos + (int32_t)i];
|
|
||||||
uint8_t currb_c = rec_data[rest_cpos + (int32_t)i];
|
|
||||||
uint8_t currb_orig = orig_data[rest_cpos + (int32_t)i];
|
|
||||||
|
|
||||||
uint32_t currd_a = ((uint32_t)currb_a) << (i * 8);
|
const int32_t *a_ptr = (const int32_t *)(rec_data + curr_apos);
|
||||||
uint32_t currd_b = ((uint32_t)currb_b) << (i * 8);
|
const int32_t *b_ptr = (const int32_t *)(rec_data + curr_bpos);
|
||||||
uint32_t currd_c = ((uint32_t)currb_c) << (i * 8);
|
const int32_t *c_ptr = (const int32_t *)(rec_data + curr_cpos);
|
||||||
uint32_t currd_orig = ((uint32_t)currb_orig) << (i * 8);
|
const int32_t *orig_ptr = (const int32_t *)(orig_data + curr_cpos);
|
||||||
|
|
||||||
a_last |= currd_a;
|
|
||||||
b_last |= currd_b;
|
|
||||||
c_last |= currd_c;
|
|
||||||
orig_last |= currd_orig;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int32_t *a_ptr = (const int32_t *)(rec_data + curr_apos);
|
|
||||||
const int32_t *b_ptr = (const int32_t *)(rec_data + curr_bpos);
|
|
||||||
const int32_t *c_ptr = (const int32_t *)(rec_data + curr_cpos);
|
|
||||||
const int32_t *orig_ptr = (const int32_t *)(orig_data + curr_cpos);
|
|
||||||
|
|
||||||
__m256i a = _mm256_maskload_epi32(a_ptr, db4_mask);
|
__m256i a = _mm256_maskload_epi32(a_ptr, db4_mask);
|
||||||
__m256i b = _mm256_maskload_epi32(b_ptr, db4_mask);
|
__m256i b = _mm256_maskload_epi32(b_ptr, db4_mask);
|
||||||
|
@ -649,12 +648,8 @@ static INLINE void reconstruct_color_band(const encoder_control_t *encoder,
|
||||||
// that particular place can never be loaded into by the maskmove
|
// that particular place can never be loaded into by the maskmove
|
||||||
// (otherwise that vector would go through the divisible-by-32 code
|
// (otherwise that vector would go through the divisible-by-32 code
|
||||||
// path).
|
// path).
|
||||||
uint32_t last_dword = 0;
|
uint32_t last_dword = load_border_bytes(rec_data, rest_srcpos, width_rest);
|
||||||
for (uint32_t i = 0; i < width_rest; i++) {
|
|
||||||
uint8_t currb = rec_data[rest_srcpos + i];
|
|
||||||
uint32_t currd = ((uint32_t)currb) << (i * 8);
|
|
||||||
last_dword |= currd;
|
|
||||||
}
|
|
||||||
const int32_t *src_ptr = (const int32_t *)( rec_data + curr_srcpos);
|
const int32_t *src_ptr = (const int32_t *)( rec_data + curr_srcpos);
|
||||||
int32_t *dst_ptr = ( int32_t *)(new_rec_data + curr_dstpos);
|
int32_t *dst_ptr = ( int32_t *)(new_rec_data + curr_dstpos);
|
||||||
|
|
||||||
|
@ -665,11 +660,7 @@ static INLINE void reconstruct_color_band(const encoder_control_t *encoder,
|
||||||
_mm256_maskstore_epi32(dst_ptr, db4_mask, result);
|
_mm256_maskstore_epi32(dst_ptr, db4_mask, result);
|
||||||
uint32_t last_dword_dst = _mm256_extract_epi32(result, 7);
|
uint32_t last_dword_dst = _mm256_extract_epi32(result, 7);
|
||||||
|
|
||||||
for (uint32_t i = 0; i < width_rest; i++) {
|
store_border_bytes(new_rec_data, rest_dstpos, width_rest, last_dword_dst);
|
||||||
uint8_t currb = last_dword_dst & 0xff;
|
|
||||||
new_rec_data[rest_dstpos + i] = currb;
|
|
||||||
last_dword_dst >>= 8;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -761,21 +752,10 @@ static INLINE void reconstruct_color_other(const encoder_control_t *encoder,
|
||||||
const uint32_t curr_dstpos = y * new_stride + x;
|
const uint32_t curr_dstpos = y * new_stride + x;
|
||||||
const uint32_t rest_dstpos = y * new_stride + width_db4;
|
const uint32_t rest_dstpos = y * new_stride + width_db4;
|
||||||
|
|
||||||
// Same trick to read a narrow line as there is in the band SAO routine
|
uint32_t a_last = load_border_bytes(rec_data, rest_apos, width_rest);
|
||||||
uint32_t a_last = 0, b_last = 0, c_last = 0;
|
uint32_t b_last = load_border_bytes(rec_data, rest_bpos, width_rest);
|
||||||
for (uint32_t i = 0; i < width_rest; i++) {
|
uint32_t c_last = load_border_bytes(rec_data, rest_srcpos, width_rest);
|
||||||
uint8_t currb_a = rec_data[rest_apos + (int32_t)i];
|
|
||||||
uint8_t currb_b = rec_data[rest_bpos + (int32_t)i];
|
|
||||||
uint8_t currb_c = rec_data[rest_srcpos + (int32_t)i];
|
|
||||||
|
|
||||||
uint32_t currd_a = ((uint32_t)currb_a) << (i * 8);
|
|
||||||
uint32_t currd_b = ((uint32_t)currb_b) << (i * 8);
|
|
||||||
uint32_t currd_c = ((uint32_t)currb_c) << (i * 8);
|
|
||||||
|
|
||||||
a_last |= currd_a;
|
|
||||||
b_last |= currd_b;
|
|
||||||
c_last |= currd_c;
|
|
||||||
}
|
|
||||||
const int32_t *a_ptr = (const int32_t *)( rec_data + curr_apos);
|
const int32_t *a_ptr = (const int32_t *)( rec_data + curr_apos);
|
||||||
const int32_t *b_ptr = (const int32_t *)( rec_data + curr_bpos);
|
const int32_t *b_ptr = (const int32_t *)( rec_data + curr_bpos);
|
||||||
const int32_t *c_ptr = (const int32_t *)( rec_data + curr_srcpos);
|
const int32_t *c_ptr = (const int32_t *)( rec_data + curr_srcpos);
|
||||||
|
@ -794,11 +774,7 @@ static INLINE void reconstruct_color_other(const encoder_control_t *encoder,
|
||||||
|
|
||||||
uint32_t last_dword = _mm256_extract_epi32(res, 7);
|
uint32_t last_dword = _mm256_extract_epi32(res, 7);
|
||||||
|
|
||||||
for (uint32_t i = 0; i < width_rest; i++) {
|
store_border_bytes(new_rec_data, rest_dstpos, width_rest, last_dword);
|
||||||
uint8_t currb = last_dword & 0xff;
|
|
||||||
new_rec_data[rest_dstpos + i] = currb;
|
|
||||||
last_dword >>= 8;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue