Take 8/16/24b loads and stores into separate functions

This commit is contained in:
Pauli Oikkonen 2019-08-05 21:11:25 +03:00
parent 10979f58fe
commit de0e97f711

View file

@ -170,6 +170,32 @@ static INLINE __m128i truncate_epi32_epi8(const __m128i v)
return sbs_8; return sbs_8;
} }
// Read 0-3 bytes (pixels) into uint32_t
static INLINE uint32_t load_border_bytes(const uint8_t *buf,
const int32_t start_pos,
const int32_t width_rest)
{
uint32_t last_dword = 0;
for (int32_t i = 0; i < width_rest; i++) {
uint8_t currb = buf[start_pos + i];
uint32_t currd = ((uint32_t)currb) << (i * 8);
last_dword |= currd;
}
return last_dword;
}
static INLINE void store_border_bytes( uint8_t *buf,
const uint32_t start_pos,
const int32_t width_rest,
uint32_t data)
{
for (uint32_t i = 0; i < width_rest; i++) {
uint8_t currb = data & 0xff;
buf[start_pos + i] = currb;
data >>= 8;
}
}
// Used for edge_ddistortion and band_ddistortion // Used for edge_ddistortion and band_ddistortion
static __m256i calc_diff_off_delta(const __m256i diff_lo, static __m256i calc_diff_off_delta(const __m256i diff_lo,
const __m256i diff_hi, const __m256i diff_hi,
@ -306,23 +332,10 @@ static int32_t sao_edge_ddistortion_avx2(const kvz_pixel *orig_data,
const int32_t rest_bpos = (y + b_ofs.y) * block_width + width_db4 + b_ofs.x + 1; const int32_t rest_bpos = (y + b_ofs.y) * block_width + width_db4 + b_ofs.x + 1;
// Same trick to read a narrow line as there is in the band SAO routine // Same trick to read a narrow line as there is in the band SAO routine
uint32_t a_last = 0, b_last = 0, c_last = 0, orig_last = 0; uint32_t a_last = load_border_bytes(rec_data, rest_apos, width_rest);
for (uint32_t i = 0; i < width_rest; i++) { uint32_t b_last = load_border_bytes(rec_data, rest_bpos, width_rest);
uint8_t currb_a = rec_data[rest_apos + (int32_t)i]; uint32_t c_last = load_border_bytes(rec_data, rest_cpos, width_rest);
uint8_t currb_b = rec_data[rest_bpos + (int32_t)i]; uint32_t orig_last = load_border_bytes(orig_data, rest_cpos, width_rest);
uint8_t currb_c = rec_data[rest_cpos + (int32_t)i];
uint8_t currb_orig = orig_data[rest_cpos + (int32_t)i];
uint32_t currd_a = ((uint32_t)currb_a) << (i * 8);
uint32_t currd_b = ((uint32_t)currb_b) << (i * 8);
uint32_t currd_c = ((uint32_t)currb_c) << (i * 8);
uint32_t currd_orig = ((uint32_t)currb_orig) << (i * 8);
a_last |= currd_a;
b_last |= currd_b;
c_last |= currd_c;
orig_last |= currd_orig;
}
const int32_t *a_ptr = (const int32_t *)(rec_data + curr_apos); const int32_t *a_ptr = (const int32_t *)(rec_data + curr_apos);
const int32_t *b_ptr = (const int32_t *)(rec_data + curr_bpos); const int32_t *b_ptr = (const int32_t *)(rec_data + curr_bpos);
@ -437,29 +450,15 @@ static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data,
const int32_t curr_bpos = (y + b_ofs.y) * block_width + x + b_ofs.x; const int32_t curr_bpos = (y + b_ofs.y) * block_width + x + b_ofs.x;
const int32_t rest_bpos = (y + b_ofs.y) * block_width + width_db4 + b_ofs.x + 1; const int32_t rest_bpos = (y + b_ofs.y) * block_width + width_db4 + b_ofs.x + 1;
// Same trick to read a narrow line as there is in the band SAO routine uint32_t a_last = load_border_bytes(rec_data, rest_apos, width_rest);
uint32_t a_last = 0, b_last = 0, c_last = 0, orig_last = 0; uint32_t b_last = load_border_bytes(rec_data, rest_bpos, width_rest);
for (uint32_t i = 0; i < width_rest; i++) { uint32_t c_last = load_border_bytes(rec_data, rest_cpos, width_rest);
uint8_t currb_a = rec_data[rest_apos + (int32_t)i]; uint32_t orig_last = load_border_bytes(orig_data, rest_cpos, width_rest);
uint8_t currb_b = rec_data[rest_bpos + (int32_t)i];
uint8_t currb_c = rec_data[rest_cpos + (int32_t)i];
uint8_t currb_orig = orig_data[rest_cpos + (int32_t)i];
uint32_t currd_a = ((uint32_t)currb_a) << (i * 8); const int32_t *a_ptr = (const int32_t *)(rec_data + curr_apos);
uint32_t currd_b = ((uint32_t)currb_b) << (i * 8); const int32_t *b_ptr = (const int32_t *)(rec_data + curr_bpos);
uint32_t currd_c = ((uint32_t)currb_c) << (i * 8); const int32_t *c_ptr = (const int32_t *)(rec_data + curr_cpos);
uint32_t currd_orig = ((uint32_t)currb_orig) << (i * 8); const int32_t *orig_ptr = (const int32_t *)(orig_data + curr_cpos);
a_last |= currd_a;
b_last |= currd_b;
c_last |= currd_c;
orig_last |= currd_orig;
}
const int32_t *a_ptr = (const int32_t *)(rec_data + curr_apos);
const int32_t *b_ptr = (const int32_t *)(rec_data + curr_bpos);
const int32_t *c_ptr = (const int32_t *)(rec_data + curr_cpos);
const int32_t *orig_ptr = (const int32_t *)(orig_data + curr_cpos);
__m256i a = _mm256_maskload_epi32(a_ptr, db4_mask); __m256i a = _mm256_maskload_epi32(a_ptr, db4_mask);
__m256i b = _mm256_maskload_epi32(b_ptr, db4_mask); __m256i b = _mm256_maskload_epi32(b_ptr, db4_mask);
@ -649,12 +648,8 @@ static INLINE void reconstruct_color_band(const encoder_control_t *encoder,
// that particular place can never be loaded into by the maskmove // that particular place can never be loaded into by the maskmove
// (otherwise that vector would go through the divisible-by-32 code // (otherwise that vector would go through the divisible-by-32 code
// path). // path).
uint32_t last_dword = 0; uint32_t last_dword = load_border_bytes(rec_data, rest_srcpos, width_rest);
for (uint32_t i = 0; i < width_rest; i++) {
uint8_t currb = rec_data[rest_srcpos + i];
uint32_t currd = ((uint32_t)currb) << (i * 8);
last_dword |= currd;
}
const int32_t *src_ptr = (const int32_t *)( rec_data + curr_srcpos); const int32_t *src_ptr = (const int32_t *)( rec_data + curr_srcpos);
int32_t *dst_ptr = ( int32_t *)(new_rec_data + curr_dstpos); int32_t *dst_ptr = ( int32_t *)(new_rec_data + curr_dstpos);
@ -665,11 +660,7 @@ static INLINE void reconstruct_color_band(const encoder_control_t *encoder,
_mm256_maskstore_epi32(dst_ptr, db4_mask, result); _mm256_maskstore_epi32(dst_ptr, db4_mask, result);
uint32_t last_dword_dst = _mm256_extract_epi32(result, 7); uint32_t last_dword_dst = _mm256_extract_epi32(result, 7);
for (uint32_t i = 0; i < width_rest; i++) { store_border_bytes(new_rec_data, rest_dstpos, width_rest, last_dword_dst);
uint8_t currb = last_dword_dst & 0xff;
new_rec_data[rest_dstpos + i] = currb;
last_dword_dst >>= 8;
}
} }
} }
} }
@ -761,21 +752,10 @@ static INLINE void reconstruct_color_other(const encoder_control_t *encoder,
const uint32_t curr_dstpos = y * new_stride + x; const uint32_t curr_dstpos = y * new_stride + x;
const uint32_t rest_dstpos = y * new_stride + width_db4; const uint32_t rest_dstpos = y * new_stride + width_db4;
// Same trick to read a narrow line as there is in the band SAO routine uint32_t a_last = load_border_bytes(rec_data, rest_apos, width_rest);
uint32_t a_last = 0, b_last = 0, c_last = 0; uint32_t b_last = load_border_bytes(rec_data, rest_bpos, width_rest);
for (uint32_t i = 0; i < width_rest; i++) { uint32_t c_last = load_border_bytes(rec_data, rest_srcpos, width_rest);
uint8_t currb_a = rec_data[rest_apos + (int32_t)i];
uint8_t currb_b = rec_data[rest_bpos + (int32_t)i];
uint8_t currb_c = rec_data[rest_srcpos + (int32_t)i];
uint32_t currd_a = ((uint32_t)currb_a) << (i * 8);
uint32_t currd_b = ((uint32_t)currb_b) << (i * 8);
uint32_t currd_c = ((uint32_t)currb_c) << (i * 8);
a_last |= currd_a;
b_last |= currd_b;
c_last |= currd_c;
}
const int32_t *a_ptr = (const int32_t *)( rec_data + curr_apos); const int32_t *a_ptr = (const int32_t *)( rec_data + curr_apos);
const int32_t *b_ptr = (const int32_t *)( rec_data + curr_bpos); const int32_t *b_ptr = (const int32_t *)( rec_data + curr_bpos);
const int32_t *c_ptr = (const int32_t *)( rec_data + curr_srcpos); const int32_t *c_ptr = (const int32_t *)( rec_data + curr_srcpos);
@ -794,11 +774,7 @@ static INLINE void reconstruct_color_other(const encoder_control_t *encoder,
uint32_t last_dword = _mm256_extract_epi32(res, 7); uint32_t last_dword = _mm256_extract_epi32(res, 7);
for (uint32_t i = 0; i < width_rest; i++) { store_border_bytes(new_rec_data, rest_dstpos, width_rest, last_dword);
uint8_t currb = last_dword & 0xff;
new_rec_data[rest_dstpos + i] = currb;
last_dword >>= 8;
}
} }
} }
} }