Use the efficient method to find first and last nz coeffs in block

This commit is contained in:
Pauli Oikkonen 2018-12-10 19:21:27 +02:00
parent 7e9203f566
commit 50a888e789

View file

@ -26,6 +26,23 @@
#include "kvz_math.h" #include "kvz_math.h"
#include <immintrin.h> #include <immintrin.h>
// If ints is completely zero, returns 16 in *first and -1 in *last
static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t *last)
{
// Note that nonzero_bytes will always have both bytes set for a set word
// even if said word only had one of its bytes set, because we're doing 16
// bit wide comparisons. No big deal, just shift results to the right by one
// bit to have the results represent indexes of first set words, not bytes.
// Another note, it has to use right shift instead of division to preserve
// behavior on an all-zero vector (-1 / 2 == 0, but -1 >> 1 == -1)
const __m256i zero = _mm256_setzero_si256();
__m256i zeros = _mm256_cmpeq_epi16(ints, zero);
uint32_t nonzero_bytes = ~((uint32_t)_mm256_movemask_epi8(zeros));
*first = ( (int32_t)_tzcnt_u32(nonzero_bytes)) >> 1;
*last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1;
}
/** /**
* \brief Encode (X,Y) position of the last significant coefficient * \brief Encode (X,Y) position of the last significant coefficient
* *
@ -254,7 +271,7 @@ void kvz_encode_coeff_nxn_avx2(encoder_state_t * const state,
// TODO: reorder coeff and vectorize? // TODO: reorder coeff and vectorize?
const __m256i ns = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const __m256i ns = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
__m256i coeffs = _mm256_load_si256((__m256i *)coeff_reord); __m256i coeffs = _mm256_load_si256((__m256i *)(coeff_reord + sub_pos));
__m256i sigs_inv = _mm256_cmpeq_epi16(coeffs, zero); __m256i sigs_inv = _mm256_cmpeq_epi16(coeffs, zero);
__m256i is = _mm256_set1_epi16(i); __m256i is = _mm256_set1_epi16(i);
__m256i is_zero = _mm256_cmpeq_epi16(is, zero); __m256i is_zero = _mm256_cmpeq_epi16(is, zero);
@ -275,17 +292,14 @@ void kvz_encode_coeff_nxn_avx2(encoder_state_t * const state,
} }
if (sig) { if (sig) {
abs_coeff[num_non_zero] = abs(coeff[blk_pos]); abs_coeff[num_non_zero] = abs(coeff_reord[scan_pos_sig]);
coeff_signs = 2 * coeff_signs + (coeff[blk_pos] < 0); coeff_signs = 2 * coeff_signs + (coeff[blk_pos] < 0);
num_non_zero++; num_non_zero++;
}
}
__m256i masked_coeffs = _mm256_andnot_si256(sigs_inv, coeffs);
get_first_last_nz_int16(masked_coeffs, &first_nz_pos_in_cg, &last_nz_pos_in_cg);
if (last_nz_pos_in_cg == -1) {
last_nz_pos_in_cg = scan_pos_sig;
}
first_nz_pos_in_cg = scan_pos_sig;
}
}
} else { } else {
scan_pos_sig = sub_pos - 1; scan_pos_sig = sub_pos - 1;
} }