Exclude 8-bit-only code from 10-bit builds and use uint8_t instead of kvz_pixel for code that assumes 8-bit pixels

Pauli Oikkonen 2020-09-02 17:44:19 +03:00
parent 31ef4e4216
commit 780da4568a
12 changed files with 234 additions and 179 deletions
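
A note on the pattern, before the per-file diffs: kvz_pixel in kvazaar is a typedef whose width follows the compile-time KVZ_BIT_DEPTH, so AVX2 routines built around byte-wide loads and stores are only correct when that typedef is uint8_t. Spelling their pixel arguments as uint8_t documents the assumption, and wrapping each routine in #if KVZ_BIT_DEPTH == 8 removes the dead code from 10-bit builds entirely. A minimal sketch of the mechanism, simplified from the typedef in kvazaar.h:

    /* Simplified sketch of the bit-depth selection in kvazaar.h. */
    #if KVZ_BIT_DEPTH == 8
    typedef uint8_t kvz_pixel;   /* one byte per sample */
    #else
    typedef uint16_t kvz_pixel;  /* high-bit-depth samples take two bytes */
    #endif

    /* In a 10-bit build, byte-oriented intrinsics such as _mm256_sad_epu8
     * would misread 16-bit samples, so 8-bit-only strategies have to be
     * compiled out rather than merely skipped at runtime: */
    #if KVZ_BIT_DEPTH == 8
    /* ... AVX2 strategies that assume byte-sized pixels, now taking uint8_t ... */
    #endif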

View file

@@ -25,6 +25,8 @@
 #include "strategies/avx2/dct-avx2.h"
 #if COMPILE_INTEL_AVX2
+#include "kvazaar.h"
+#if KVZ_BIT_DEPTH == 8
 #include <immintrin.h>
 #include "strategyselector.h"
@@ -924,12 +926,14 @@ static void matrix_i ## type ## _## n ## x ## n ## _avx2(int8_t bitdepth, const
 TRANSFORM(dct, 32);
 ITRANSFORM(dct, 32);
+#endif // KVZ_BIT_DEPTH == 8
 #endif //COMPILE_INTEL_AVX2
 int kvz_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth)
 {
   bool success = true;
 #if COMPILE_INTEL_AVX2
+#if KVZ_BIT_DEPTH == 8
   if (bitdepth == 8){
     success &= kvz_strategyselector_register(opaque, "fast_forward_dst_4x4", "avx2", 40, &matrix_dst_4x4_avx2);
@@ -945,6 +949,7 @@ int kvz_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth)
     success &= kvz_strategyselector_register(opaque, "idct_16x16", "avx2", 40, &matrix_idct_16x16_avx2);
     success &= kvz_strategyselector_register(opaque, "idct_32x32", "avx2", 40, &matrix_idct_32x32_avx2);
   }
+#endif // KVZ_BIT_DEPTH == 8
 #endif //COMPILE_INTEL_AVX2
   return success;
 }
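
The registration function above ends up with two distinct checks, which is intentional: #if KVZ_BIT_DEPTH == 8 decides at compile time whether the AVX2 strategies exist in the binary at all, while if (bitdepth == 8) checks the runtime configuration passed through the API. In a 10-bit build the preprocessor therefore reduces the function to roughly the following stub (a sketch of the preprocessed result, not literal output):

    int kvz_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth)
    {
      bool success = true;
      /* The AVX2 DCT strategies were compiled out, so nothing is
       * registered and the generic fallbacks remain selected. */
      return success;
    }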

View file

@@ -21,10 +21,12 @@
 #include "strategies/avx2/intra-avx2.h"
 #if COMPILE_INTEL_AVX2 && defined X86_64
+#include "kvazaar.h"
+#if KVZ_BIT_DEPTH == 8
 #include <immintrin.h>
 #include <stdlib.h>
-#include "kvazaar.h"
 #include "strategyselector.h"
 #include "strategies/missing-intel-intrinsics.h"
@@ -35,7 +37,7 @@
  * \param delta_pos Fractional pixel precise position of sample displacement
  * \param x Sample offset in direction x in ref_main array
  */
-static INLINE __m128i filter_4x1_avx2(const kvz_pixel *ref_main, int16_t delta_pos, int x){
+static INLINE __m128i filter_4x1_avx2(const uint8_t *ref_main, int16_t delta_pos, int x){
   int8_t delta_int = delta_pos >> 5;
   int8_t delta_fract = delta_pos & (32-1);
@@ -58,7 +60,7 @@ static INLINE __m128i filter_4x1_avx2(const kvz_pixel *ref_main, int16_t delta_p
  * \param sample_disp Sample displacement per row
  * \param vertical_mode Mode direction, true if vertical
  */
-static void filter_4x4_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sample_disp, bool vertical_mode){
+static void filter_4x4_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode){
   __m128i row0 = filter_4x1_avx2(ref_main, 1 * sample_disp, 0);
   __m128i row1 = filter_4x1_avx2(ref_main, 2 * sample_disp, 0);
@@ -86,7 +88,7 @@ static void filter_4x4_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sampl
  * \param delta_pos Fractional pixel precise position of sample displacement
  * \param x Sample offset in direction x in ref_main array
  */
-static INLINE __m128i filter_8x1_avx2(const kvz_pixel *ref_main, int16_t delta_pos, int x){
+static INLINE __m128i filter_8x1_avx2(const uint8_t *ref_main, int16_t delta_pos, int x){
   int8_t delta_int = delta_pos >> 5;
   int8_t delta_fract = delta_pos & (32-1);
@@ -110,7 +112,7 @@ static INLINE __m128i filter_8x1_avx2(const kvz_pixel *ref_main, int16_t delta_p
  * \param sample_disp Sample displacement per row
  * \param vertical_mode Mode direction, true if vertical
  */
-static void filter_8x8_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sample_disp, bool vertical_mode){
+static void filter_8x8_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode){
   __m128i row0 = filter_8x1_avx2(ref_main, 1 * sample_disp, 0);
   __m128i row1 = filter_8x1_avx2(ref_main, 2 * sample_disp, 0);
   __m128i row2 = filter_8x1_avx2(ref_main, 3 * sample_disp, 0);
@@ -163,7 +165,7 @@ static void filter_8x8_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sampl
  * \param delta_pos Fractional pixel precise position of sample displacement
  * \param x Sample offset in direction x in ref_main array
  */
-static INLINE __m256i filter_16x1_avx2(const kvz_pixel *ref_main, int16_t delta_pos, int x){
+static INLINE __m256i filter_16x1_avx2(const uint8_t *ref_main, int16_t delta_pos, int x){
   int8_t delta_int = delta_pos >> 5;
   int8_t delta_fract = delta_pos & (32-1);
@@ -189,7 +191,7 @@ static INLINE __m256i filter_16x1_avx2(const kvz_pixel *ref_main, int16_t delta_
  * \param sample_disp Sample displacement per row
  * \param vertical_mode Mode direction, true if vertical
  */
-static void filter_16x16_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sample_disp, bool vertical_mode){
+static void filter_16x16_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode){
   for (int y = 0; y < 16; y += 8) {
     __m256i row0 = filter_16x1_avx2(ref_main, (y + 1) * sample_disp, 0);
     __m256i row1 = filter_16x1_avx2(ref_main, (y + 2) * sample_disp, 0);
@@ -281,7 +283,7 @@ static void filter_16x16_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sam
  * \param vertical_mode Mode direction, true if vertical
  * \param width Block width
  */
-static void filter_NxN_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sample_disp, bool vertical_mode, int width){
+static void filter_NxN_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode, int width){
   for (int y = 0; y < width; y += 8) {
     for (int x = 0; x < width; x += 16) {
       __m256i row0 = filter_16x1_avx2(ref_main, (y + 1) * sample_disp, x);
@@ -376,9 +378,9 @@ static void filter_NxN_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sampl
 static void kvz_angular_pred_avx2(
   const int_fast8_t log2_width,
   const int_fast8_t intra_mode,
-  const kvz_pixel *const in_ref_above,
-  const kvz_pixel *const in_ref_left,
-  kvz_pixel *const dst)
+  const uint8_t *const in_ref_above,
+  const uint8_t *const in_ref_left,
+  uint8_t *const dst)
 {
   assert(log2_width >= 2 && log2_width <= 5);
   assert(intra_mode >= 2 && intra_mode <= 34);
@@ -388,7 +390,7 @@ static void kvz_angular_pred_avx2(
   // Temporary buffer for modes 11-25.
   // It only needs to be big enough to hold indices from -width to width-1.
-  kvz_pixel tmp_ref[2 * 32];
+  uint8_t tmp_ref[2 * 32];
   const int_fast8_t width = 1 << log2_width;
   // Whether to swap references to always project on the left reference row.
@@ -399,9 +401,9 @@ static void kvz_angular_pred_avx2(
   const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
   // Pointer for the reference we are interpolating from.
-  const kvz_pixel *ref_main;
+  const uint8_t *ref_main;
   // Pointer for the other reference.
-  const kvz_pixel *ref_side;
+  const uint8_t *ref_side;
   // Set ref_main and ref_side such that, when indexed with 0, they point to
   // index 0 in block coordinates.
@@ -463,15 +465,15 @@ static void kvz_angular_pred_avx2(
  */
 static void kvz_intra_pred_planar_avx2(
   const int_fast8_t log2_width,
-  const kvz_pixel *const ref_top,
-  const kvz_pixel *const ref_left,
-  kvz_pixel *const dst)
+  const uint8_t *const ref_top,
+  const uint8_t *const ref_left,
+  uint8_t *const dst)
 {
   assert(log2_width >= 2 && log2_width <= 5);
   const int_fast8_t width = 1 << log2_width;
-  const kvz_pixel top_right = ref_top[width + 1];
-  const kvz_pixel bottom_left = ref_left[width + 1];
+  const uint8_t top_right = ref_top[width + 1];
+  const uint8_t bottom_left = ref_left[width + 1];
   if (log2_width > 2) {
@@ -888,12 +890,11 @@ static void pred_filtered_dc_32x32(const uint8_t *ref_top,
  */
 static void kvz_intra_pred_filtered_dc_avx2(
   const int_fast8_t log2_width,
-  const kvz_pixel *ref_top,
-  const kvz_pixel *ref_left,
-  kvz_pixel *out_block)
+  const uint8_t *ref_top,
+  const uint8_t *ref_left,
+  uint8_t *out_block)
 {
   assert(log2_width >= 2 && log2_width <= 5);
-  assert(sizeof(kvz_pixel) == sizeof(uint8_t));
   if (log2_width == 2) {
     pred_filtered_dc_4x4(ref_top, ref_left, out_block);
@@ -906,17 +907,20 @@ static void kvz_intra_pred_filtered_dc_avx2(
   }
 }
+#endif //KVZ_BIT_DEPTH == 8
 #endif //COMPILE_INTEL_AVX2 && defined X86_64
 int kvz_strategy_register_intra_avx2(void* opaque, uint8_t bitdepth)
 {
   bool success = true;
 #if COMPILE_INTEL_AVX2 && defined X86_64
+#if KVZ_BIT_DEPTH == 8
   if (bitdepth == 8) {
     success &= kvz_strategyselector_register(opaque, "angular_pred", "avx2", 40, &kvz_angular_pred_avx2);
     success &= kvz_strategyselector_register(opaque, "intra_pred_planar", "avx2", 40, &kvz_intra_pred_planar_avx2);
     success &= kvz_strategyselector_register(opaque, "intra_pred_filtered_dc", "avx2", 40, &kvz_intra_pred_filtered_dc_avx2);
   }
+#endif //KVZ_BIT_DEPTH == 8
 #endif //COMPILE_INTEL_AVX2 && defined X86_64
   return success;
 }
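
One detail in this file: the runtime check assert(sizeof(kvz_pixel) == sizeof(uint8_t)) deleted from kvz_intra_pred_filtered_dc_avx2 is made redundant by the new guard, which establishes the same property at build time. If one wanted it restated in code anyway, a compile-time assertion would cost nothing (hypothetical, not part of the patch):

    #if KVZ_BIT_DEPTH == 8
    _Static_assert(sizeof(kvz_pixel) == sizeof(uint8_t),
                   "8-bit AVX2 strategies require byte-sized pixels");
    #endif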

View file

@@ -25,23 +25,25 @@
 #include "strategies/avx2/ipol-avx2.h"
 #if COMPILE_INTEL_AVX2
+#include "kvazaar.h"
 #include <immintrin.h>
 #include <stdio.h>
 #include <string.h>
 #include "encoder.h"
-#include "kvazaar.h"
 #include "search_inter.h"
 #include "strategies/generic/picture-generic.h"
 #include "strategies/strategies-ipol.h"
 #include "strategyselector.h"
 #include "strategies/generic/ipol-generic.h"
+#if KVZ_BIT_DEPTH == 8
 extern int8_t kvz_g_luma_filter[4][8];
 extern int8_t kvz_g_chroma_filter[8][4];
-static int32_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data)
+static int32_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, uint8_t *data)
 {
   __m128i fir = _mm_loadl_epi64((__m128i*)filter);
   __m128i row = _mm_loadl_epi64((__m128i*)data);
@@ -100,7 +102,7 @@ static void kvz_init_ver_filter_taps(int8_t *filter, __m256i *filters) {
   filters[3] = _mm256_inserti128_si256(filters[3], _mm256_castsi256_si128(filters[2]), 1); // Pairs 67 45
 }
-static void kvz_eight_tap_filter_hor_8x1_avx2(kvz_pixel *data, int16_t * out,
+static void kvz_eight_tap_filter_hor_8x1_avx2(uint8_t *data, int16_t * out,
   __m256i *shuf_01_23, __m256i *shuf_45_67,
   __m256i *taps_01_23, __m256i *taps_45_67) {
@@ -117,7 +119,7 @@ static void kvz_eight_tap_filter_hor_8x1_avx2(kvz_pixel *data, int16_t * out,
   _mm_storeu_si128((__m128i*)out, filtered);
 }
-static void kvz_four_tap_filter_hor_4x4_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride,
+static void kvz_four_tap_filter_hor_4x4_avx2(uint8_t *data, int stride, int16_t * out, int out_stride,
   __m256i *shuf_01, __m256i *shuf_23,
   __m256i *taps_01, __m256i *taps_23) {
@@ -143,7 +145,7 @@ static void kvz_four_tap_filter_hor_4x4_avx2(kvz_pixel *data, int stride, int16_
   _mm_storeh_pd((double*)(out + 3 * out_stride), _mm_castsi128_pd(upper));
 }
-static void kvz_four_tap_filter_hor_4xN_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride,
+static void kvz_four_tap_filter_hor_4xN_avx2(uint8_t *data, int stride, int16_t * out, int out_stride,
   __m256i *shuf_01_23, __m256i *taps_01_23,
   int rows) {
@@ -177,7 +179,7 @@ static int32_t kvz_eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data
   return filtered;
 }
-static void kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *data, int16_t stride, kvz_pixel *out)
+static void kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *data, int16_t stride, uint8_t *out)
 {
   // Interpolation filter shifts
   int32_t shift2 = 6;
@@ -243,7 +245,7 @@ static void kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *dat
   _mm_storel_epi64((__m128i*)out, filtered);
 }
-static void kvz_four_tap_filter_ver_16bit_4x4_avx2(int8_t *filter, int16_t *data, int16_t stride, kvz_pixel *out, int16_t out_stride)
+static void kvz_four_tap_filter_ver_16bit_4x4_avx2(int8_t *filter, int16_t *data, int16_t stride, uint8_t *out, int16_t out_stride)
 {
   // Interpolation filter shifts
   int32_t shift2 = 6;
@@ -366,7 +368,7 @@ static void kvz_four_tap_filter_ver_16bit_4x4_no_round_avx2(int8_t *filter, int1
   _mm_storeh_pi((__m64*)&out[3 * out_stride], _mm_castsi128_ps(filtered23));
 }
-INLINE static void filter_row_ver_16b_8x1_avx2(int16_t *data, int64_t stride, __m256i* taps, kvz_pixel * out, int64_t out_stride)
+INLINE static void filter_row_ver_16b_8x1_avx2(int16_t *data, int64_t stride, __m256i* taps, uint8_t * out, int64_t out_stride)
 {
   // Interpolation filter shifts
   int32_t shift2 = 6;
@@ -589,7 +591,7 @@ INLINE static void filter_row_ver_16b_8x1_no_round_avx2(int16_t *data, int64_t s
   _mm_storeu_si128((__m128i*)(out + 6 * out_stride), filtered6);
 }
-INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_avx2(__m256i *filters, int16_t *data, int16_t stride, kvz_pixel *out, int out_stride)
+INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_avx2(__m256i *filters, int16_t *data, int16_t stride, uint8_t *out, int out_stride)
 {
   // Filter even rows
   filter_row_ver_16b_8x1_avx2(data, stride, filters, out, out_stride); // 0 2 4 6
@@ -610,11 +612,11 @@ INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(__m256i *fil
 }
 static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * encoder,
-  kvz_pixel *src,
+  uint8_t *src,
   int16_t src_stride,
   int width,
   int height,
-  kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
+  uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH],
   int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
   int8_t fme_level,
   int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
@@ -695,10 +697,10 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e
   }
   // VERTICAL STEP
-  kvz_pixel *out_l = filtered[0];
-  kvz_pixel *out_r = filtered[1];
-  kvz_pixel *out_t = filtered[2];
-  kvz_pixel *out_b = filtered[3];
+  uint8_t *out_l = filtered[0];
+  uint8_t *out_r = filtered[1];
+  uint8_t *out_t = filtered[2];
+  uint8_t *out_b = filtered[3];
   __m256i taps[4];
   kvz_init_ver_filter_taps(fir0, taps);
@@ -746,11 +748,11 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e
 }
 static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * encoder,
-  kvz_pixel *src,
+  uint8_t *src,
   int16_t src_stride,
   int width,
   int height,
-  kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
+  uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH],
   int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
   int8_t fme_level,
   int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
@@ -774,10 +776,10 @@ static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * enco
   int16_t *col_pos2 = hor_first_cols[2];
   // VERTICAL STEP
-  kvz_pixel *out_tl = filtered[0];
-  kvz_pixel *out_tr = filtered[1];
-  kvz_pixel *out_bl = filtered[2];
-  kvz_pixel *out_br = filtered[3];
+  uint8_t *out_tl = filtered[0];
+  uint8_t *out_tr = filtered[1];
+  uint8_t *out_bl = filtered[2];
+  uint8_t *out_br = filtered[3];
   __m256i taps[4];
   kvz_init_ver_filter_taps(fir2, taps);
@@ -829,11 +831,11 @@ static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * enco
 }
 static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * encoder,
-  kvz_pixel *src,
+  uint8_t *src,
   int16_t src_stride,
   int width,
   int height,
-  kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
+  uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH],
   int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
   int8_t fme_level,
   int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
@@ -928,10 +930,10 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e
   }
   // VERTICAL STEP
-  kvz_pixel *out_l = filtered[0];
-  kvz_pixel *out_r = filtered[1];
-  kvz_pixel *out_t = filtered[2];
-  kvz_pixel *out_b = filtered[3];
+  uint8_t *out_l = filtered[0];
+  uint8_t *out_r = filtered[1];
+  uint8_t *out_t = filtered[2];
+  uint8_t *out_b = filtered[3];
   int8_t *ver_fir_l = hpel_off_y != 0 ? fir2 : fir0;
   int8_t *ver_fir_r = hpel_off_y != 0 ? fir2 : fir0;
@@ -1056,11 +1058,11 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e
 }
 static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * encoder,
-  kvz_pixel *src,
+  uint8_t *src,
   int16_t src_stride,
   int width,
   int height,
-  kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
+  uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH],
   int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
   int8_t fme_level,
   int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
@@ -1088,10 +1090,10 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco
   int16_t hor_stride = LCU_WIDTH;
   // VERTICAL STEP
-  kvz_pixel *out_tl = filtered[0];
-  kvz_pixel *out_tr = filtered[1];
-  kvz_pixel *out_bl = filtered[2];
-  kvz_pixel *out_br = filtered[3];
+  uint8_t *out_tl = filtered[0];
+  uint8_t *out_tr = filtered[1];
+  uint8_t *out_bl = filtered[2];
+  uint8_t *out_br = filtered[3];
   int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3;
   int8_t *ver_fir_b = hpel_off_y != 0 ? fir3 : fir1;
@@ -1214,11 +1216,11 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco
 }
 static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const encoder,
-  kvz_pixel *src,
+  uint8_t *src,
   int16_t src_stride,
   int width,
   int height,
-  kvz_pixel *dst,
+  uint8_t *dst,
   int16_t dst_stride,
   int8_t hor_flag,
   int8_t ver_flag,
@@ -1268,7 +1270,7 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco
 }
 static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * const encoder,
-  kvz_pixel *src,
+  uint8_t *src,
   int16_t src_stride,
   int width,
   int height,
@@ -1323,11 +1325,11 @@ static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * cons
 static void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encoder,
-  kvz_pixel *src,
+  uint8_t *src,
   int16_t src_stride,
   int width,
   int height,
-  kvz_pixel *dst,
+  uint8_t *dst,
   int16_t dst_stride,
   int8_t hor_flag,
   int8_t ver_flag,
@@ -1385,7 +1387,7 @@ static void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encode
 }
 static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t * const encoder,
-  kvz_pixel *src,
+  uint8_t *src,
   int16_t src_stride,
   int width,
   int height,
@@ -1447,6 +1449,8 @@ static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t * const
   }
 }
+#endif //KVZ_BIT_DEPTH == 8
 void kvz_get_extended_block_avx2(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
   int filter_size, int width, int height, kvz_extended_block *out) {
@@ -1510,6 +1514,8 @@ int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth)
 {
   bool success = true;
 #if COMPILE_INTEL_AVX2
+#if KVZ_BIT_DEPTH == 8
   if (bitdepth == 8){
     success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_hor_ver_luma", "avx2", 40, &kvz_filter_hpel_blocks_hor_ver_luma_avx2);
     success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_diag_luma", "avx2", 40, &kvz_filter_hpel_blocks_diag_luma_avx2);
@@ -1520,7 +1526,10 @@ int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth)
     success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "avx2", 40, &kvz_sample_14bit_quarterpel_luma_avx2);
     success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "avx2", 40, &kvz_sample_14bit_octpel_chroma_avx2);
   }
+#endif //KVZ_BIT_DEPTH == 8
   success &= kvz_strategyselector_register(opaque, "get_extended_block", "avx2", 40, &kvz_get_extended_block_avx2);
 #endif //COMPILE_INTEL_AVX2
   return success;
 }
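
Note the asymmetry this file is left with: kvz_get_extended_block_avx2 keeps its kvz_pixel parameters and sits outside the new guard, presumably because it moves whole kvz_pixel elements around without assuming their size, so it works at any bit depth. The registration function thus gains two tiers; a condensed sketch of its shape after the patch:

    int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth)
    {
      bool success = true;
    #if COMPILE_INTEL_AVX2
    #if KVZ_BIT_DEPTH == 8
      if (bitdepth == 8) {
        /* Byte-oriented filters: exist only in 8-bit builds. */
        success &= kvz_strategyselector_register(opaque,
            "filter_hpel_blocks_hor_ver_luma", "avx2", 40,
            &kvz_filter_hpel_blocks_hor_ver_luma_avx2);
        /* ... remaining 8-bit-only registrations ... */
      }
    #endif // KVZ_BIT_DEPTH == 8
      /* Bit-depth-agnostic: registered in every AVX2 build. */
      success &= kvz_strategyselector_register(opaque,
          "get_extended_block", "avx2", 40, &kvz_get_extended_block_avx2);
    #endif // COMPILE_INTEL_AVX2
      return success;
    }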

View file

@@ -25,6 +25,8 @@
 #include "global.h"
 #if COMPILE_INTEL_AVX2
+#include "kvazaar.h"
+#if KVZ_BIT_DEPTH == 8
 #include "strategies/avx2/picture-avx2.h"
 #include "strategies/avx2/reg_sad_pow2_widths-avx2.h"
@@ -33,7 +35,6 @@
 #include <mmintrin.h>
 #include <xmmintrin.h>
 #include <string.h>
-#include "kvazaar.h"
 #include "strategies/strategies-picture.h"
 #include "strategyselector.h"
 #include "strategies/generic/picture-generic.h"
@@ -52,7 +53,7 @@
  *
  * \returns Sum of Absolute Differences
  */
-uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const data2,
+uint32_t kvz_reg_sad_avx2(const uint8_t * const data1, const uint8_t * const data2,
   const int width, const int height, const unsigned stride1, const unsigned stride2)
 {
   if (width == 0)
@@ -123,7 +124,7 @@ static INLINE uint32_t m256i_horizontal_sum(const __m256i sum)
 }
-static unsigned sad_8bit_8x8_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2)
+static unsigned sad_8bit_8x8_avx2(const uint8_t *buf1, const uint8_t *buf2)
 {
   const __m256i *const a = (const __m256i *)buf1;
   const __m256i *const b = (const __m256i *)buf2;
@@ -133,7 +134,7 @@ static unsigned sad_8bit_8x8_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2)
 }
-static unsigned sad_8bit_16x16_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2)
+static unsigned sad_8bit_16x16_avx2(const uint8_t *buf1, const uint8_t *buf2)
 {
   const __m256i *const a = (const __m256i *)buf1;
   const __m256i *const b = (const __m256i *)buf2;
@@ -143,7 +144,7 @@ static unsigned sad_8bit_16x16_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2
 }
-static unsigned sad_8bit_32x32_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2)
+static unsigned sad_8bit_32x32_avx2(const uint8_t *buf1, const uint8_t *buf2)
 {
   const __m256i *const a = (const __m256i *)buf1;
   const __m256i *const b = (const __m256i *)buf2;
@@ -163,7 +164,7 @@ static unsigned sad_8bit_32x32_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2
 }
-static unsigned sad_8bit_64x64_avx2(const kvz_pixel * buf1, const kvz_pixel * buf2)
+static unsigned sad_8bit_64x64_avx2(const uint8_t * buf1, const uint8_t * buf2)
 {
   const __m256i *const a = (const __m256i *)buf1;
   const __m256i *const b = (const __m256i *)buf2;
@@ -182,7 +183,7 @@ static unsigned sad_8bit_64x64_avx2(const kvz_pixel * buf1, const kvz_pixel * bu
   return m256i_horizontal_sum(sum0);
 }
-static unsigned satd_4x4_8bit_avx2(const kvz_pixel *org, const kvz_pixel *cur)
+static unsigned satd_4x4_8bit_avx2(const uint8_t *org, const uint8_t *cur)
 {
   __m128i original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)org));
@@ -228,7 +229,7 @@ static unsigned satd_4x4_8bit_avx2(const kvz_pixel *org, const kvz_pixel *cur)
 static void satd_8bit_4x4_dual_avx2(
-  const pred_buffer preds, const kvz_pixel * const orig, unsigned num_modes, unsigned *satds_out)
+  const pred_buffer preds, const uint8_t * const orig, unsigned num_modes, unsigned *satds_out)
 {
   __m256i original = _mm256_broadcastsi128_si256(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)orig)));
@@ -423,14 +424,14 @@ INLINE static void sum_block_dual_avx2(__m256i *ver_row, unsigned *sum0, unsigne
   *sum1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sad, 1));
 }
-INLINE static __m128i diff_row_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2)
+INLINE static __m128i diff_row_avx2(const uint8_t *buf1, const uint8_t *buf2)
 {
   __m128i buf1_row = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)buf1));
   __m128i buf2_row = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)buf2));
   return _mm_sub_epi16(buf1_row, buf2_row);
 }
-INLINE static __m256i diff_row_dual_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2, const kvz_pixel *orig)
+INLINE static __m256i diff_row_dual_avx2(const uint8_t *buf1, const uint8_t *buf2, const uint8_t *orig)
 {
   __m128i temp1 = _mm_loadl_epi64((__m128i*)buf1);
   __m128i temp2 = _mm_loadl_epi64((__m128i*)buf2);
@@ -442,8 +443,8 @@ INLINE static __m256i diff_row_dual_avx2(const kvz_pixel *buf1, const kvz_pixel
 }
 INLINE static void diff_blocks_avx2(__m128i (*row_diff)[8],
-  const kvz_pixel * buf1, unsigned stride1,
-  const kvz_pixel * orig, unsigned stride_orig)
+  const uint8_t * buf1, unsigned stride1,
+  const uint8_t * orig, unsigned stride_orig)
 {
   (*row_diff)[0] = diff_row_avx2(buf1 + 0 * stride1, orig + 0 * stride_orig);
   (*row_diff)[1] = diff_row_avx2(buf1 + 1 * stride1, orig + 1 * stride_orig);
@@ -457,9 +458,9 @@ INLINE static void diff_blocks_avx2(__m128i (*row_diff)[8],
 }
 INLINE static void diff_blocks_dual_avx2(__m256i (*row_diff)[8],
-  const kvz_pixel * buf1, unsigned stride1,
-  const kvz_pixel * buf2, unsigned stride2,
-  const kvz_pixel * orig, unsigned stride_orig)
+  const uint8_t * buf1, unsigned stride1,
+  const uint8_t * buf2, unsigned stride2,
+  const uint8_t * orig, unsigned stride_orig)
 {
   (*row_diff)[0] = diff_row_dual_avx2(buf1 + 0 * stride1, buf2 + 0 * stride2, orig + 0 * stride_orig);
   (*row_diff)[1] = diff_row_dual_avx2(buf1 + 1 * stride1, buf2 + 1 * stride2, orig + 1 * stride_orig);
@@ -496,9 +497,9 @@ INLINE static void hor_transform_block_dual_avx2(__m256i (*row_diff)[8])
   hor_transform_row_dual_avx2((*row_diff) + 7);
 }
-static void kvz_satd_8bit_8x8_general_dual_avx2(const kvz_pixel * buf1, unsigned stride1,
-  const kvz_pixel * buf2, unsigned stride2,
-  const kvz_pixel * orig, unsigned stride_orig,
+static void kvz_satd_8bit_8x8_general_dual_avx2(const uint8_t * buf1, unsigned stride1,
+  const uint8_t * buf2, unsigned stride2,
+  const uint8_t * orig, unsigned stride_orig,
   unsigned *sum0, unsigned *sum1)
 {
   __m256i temp[8];
@@ -516,18 +517,18 @@ static void kvz_satd_8bit_8x8_general_dual_avx2(const kvz_pixel * buf1, unsigned
 /**
  * \brief Calculate SATD between two 4x4 blocks inside bigger arrays.
  */
-static unsigned kvz_satd_4x4_subblock_8bit_avx2(const kvz_pixel * buf1,
+static unsigned kvz_satd_4x4_subblock_8bit_avx2(const uint8_t * buf1,
   const int32_t stride1,
-  const kvz_pixel * buf2,
+  const uint8_t * buf2,
   const int32_t stride2)
 {
   // TODO: AVX2 implementation
   return kvz_satd_4x4_subblock_generic(buf1, stride1, buf2, stride2);
 }
-static void kvz_satd_4x4_subblock_quad_avx2(const kvz_pixel *preds[4],
+static void kvz_satd_4x4_subblock_quad_avx2(const uint8_t *preds[4],
   const int stride,
-  const kvz_pixel *orig,
+  const uint8_t *orig,
   const int orig_stride,
   unsigned costs[4])
 {
@@ -535,7 +536,7 @@ static void kvz_satd_4x4_subblock_quad_avx2(const kvz_pixel *preds[4],
   kvz_satd_4x4_subblock_quad_generic(preds, stride, orig, orig_stride, costs);
 }
-static unsigned satd_8x8_subblock_8bit_avx2(const kvz_pixel * buf1, unsigned stride1, const kvz_pixel * buf2, unsigned stride2)
+static unsigned satd_8x8_subblock_8bit_avx2(const uint8_t * buf1, unsigned stride1, const uint8_t * buf2, unsigned stride2)
 {
   __m128i temp[8];
@@ -549,9 +550,9 @@ static unsigned satd_8x8_subblock_8bit_avx2(const kvz_pixel * buf1, unsigned str
   return result;
 }
-static void satd_8x8_subblock_quad_avx2(const kvz_pixel **preds,
+static void satd_8x8_subblock_quad_avx2(const uint8_t **preds,
   const int stride,
-  const kvz_pixel *orig,
+  const uint8_t *orig,
   const int orig_stride,
   unsigned *costs)
 {
@@ -570,7 +571,7 @@ SATD_ANY_SIZE(8bit_avx2)
 // multiples of 8x8 with the 8x8 hadamard function.
 #define SATD_NXN_DUAL_AVX2(n) \
 static void satd_8bit_ ## n ## x ## n ## _dual_avx2( \
-  const pred_buffer preds, const kvz_pixel * const orig, unsigned num_modes, unsigned *satds_out) \
+  const pred_buffer preds, const uint8_t * const orig, unsigned num_modes, unsigned *satds_out) \
 { \
   unsigned x, y; \
   satds_out[0] = 0; \
@@ -590,7 +591,7 @@ static void satd_8bit_ ## n ## x ## n ## _dual_avx2( \
 }
 static void satd_8bit_8x8_dual_avx2(
-  const pred_buffer preds, const kvz_pixel * const orig, unsigned num_modes, unsigned *satds_out)
+  const pred_buffer preds, const uint8_t * const orig, unsigned num_modes, unsigned *satds_out)
 {
   unsigned x, y;
   satds_out[0] = 0;
@@ -618,17 +619,17 @@ SATD_NXN_DUAL_AVX2(64)
 static cost_pixel_any_size_multi_func satd_any_size_## suffix; \
 static void satd_any_size_ ## suffix ( \
   int width, int height, \
-  const kvz_pixel **preds, \
+  const uint8_t **preds, \
   const int stride, \
-  const kvz_pixel *orig, \
+  const uint8_t *orig, \
   const int orig_stride, \
   unsigned num_modes, \
   unsigned *costs_out, \
   int8_t *valid) \
 { \
   unsigned sums[num_parallel_blocks] = { 0 }; \
-  const kvz_pixel *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] };\
-  const kvz_pixel *orig_ptr = orig; \
+  const uint8_t *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] };\
+  const uint8_t *orig_ptr = orig; \
   costs_out[0] = 0; costs_out[1] = 0; costs_out[2] = 0; costs_out[3] = 0; \
   if (width % 8 != 0) { \
     /* Process the first column using 4x4 blocks. */ \
@@ -681,7 +682,7 @@ SATD_NXN_DUAL_AVX2(64)
 SATD_ANY_SIZE_MULTI_AVX2(quad_avx2, 4)
-static unsigned pixels_calc_ssd_avx2(const kvz_pixel *const ref, const kvz_pixel *const rec,
+static unsigned pixels_calc_ssd_avx2(const uint8_t *const ref, const uint8_t *const rec,
   const int ref_stride, const int rec_stride,
   const int width)
 {
@@ -767,9 +768,9 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
   const hi_prec_buf_t*high_precision_rec0,
   const hi_prec_buf_t*high_precision_rec1,
   lcu_t* lcu,
-  kvz_pixel* temp_lcu_y,
-  kvz_pixel* temp_lcu_u,
-  kvz_pixel* temp_lcu_v,
+  uint8_t* temp_lcu_y,
+  uint8_t* temp_lcu_u,
+  uint8_t* temp_lcu_v,
   bool predict_luma,
   bool predict_chroma)
 {
@@ -799,7 +800,7 @@ bool predict_chroma)
   int sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
   int sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-  lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
+  lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
   }
 }
@@ -834,7 +835,7 @@ bool predict_chroma)
   int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
   int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-  lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
+  lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
   }
 }
@@ -877,11 +878,11 @@ bool predict_chroma)
   int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1));
   int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
   int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-  lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift);
+  lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift);
   int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
   int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-  lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift);
+  lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift);
   }
 }
@@ -940,11 +941,11 @@ bool predict_chroma)
   int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1));
   int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
   int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-  lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift);
+  lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift);
   int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
   int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-  lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift);
+  lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift);
   }
   } else {
   // Load 8 pixels to vector
@@ -1013,7 +1014,7 @@ static optimized_sad_func_ptr_t get_optimized_sad_avx2(int32_t width)
   return NULL;
 }
-static uint32_t ver_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+static uint32_t ver_sad_avx2(const uint8_t *pic_data, const uint8_t *ref_data,
   int32_t width, int32_t height, uint32_t stride)
 {
   if (width == 0)
@@ -1030,7 +1031,7 @@ static uint32_t ver_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_dat
   return ver_sad_arbitrary(pic_data, ref_data, width, height, stride);
 }
-static uint32_t hor_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+static uint32_t hor_sad_avx2(const uint8_t *pic_data, const uint8_t *ref_data,
   int32_t width, int32_t height, uint32_t pic_stride,
   uint32_t ref_stride, uint32_t left, uint32_t right)
 {
@@ -1051,7 +1052,7 @@ static uint32_t hor_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_dat
   pic_stride, ref_stride, left, right);
 }
-static double pixel_var_avx2_largebuf(const kvz_pixel *buf, const uint32_t len)
+static double pixel_var_avx2_largebuf(const uint8_t *buf, const uint32_t len)
 {
   const float len_f = (float)len;
   const __m256i zero = _mm256_setzero_si256();
@@ -1154,7 +1155,7 @@ static __m256i hsum_epi32_to_epi64(const __m256i v)
   return sums_64;
 }
-static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len)
+static double pixel_var_avx2(const uint8_t *buf, const uint32_t len)
 {
   assert(sizeof(*buf) == 1);
   assert((len & 31) == 0);
@@ -1223,19 +1224,21 @@ static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len)
 #else // INACCURATE_VARIANCE_CALCULATION
-static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len)
+static double pixel_var_avx2(const uint8_t *buf, const uint32_t len)
 {
   return pixel_var_avx2_largebuf(buf, len);
 }
 #endif // !INACCURATE_VARIANCE_CALCULATION
+#endif // KVZ_BIT_DEPTH == 8
 #endif //COMPILE_INTEL_AVX2
 int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
 {
   bool success = true;
 #if COMPILE_INTEL_AVX2
+#if KVZ_BIT_DEPTH == 8
   // We don't actually use SAD for intra right now, other than 4x4 for
   // transform skip, but we might again one day and this is some of the
   // simplest code to look at for anyone interested in doing more
@@ -1271,6 +1274,7 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
     success &= kvz_strategyselector_register(opaque, "pixel_var", "avx2", 40, &pixel_var_avx2);
   }
+#endif // KVZ_BIT_DEPTH == 8
 #endif
   return success;
 }

View file

@@ -506,35 +506,37 @@ void kvz_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr
 #undef LOG2_SCAN_SET_SIZE
 }
-static INLINE __m128i get_residual_4x1_avx2(const kvz_pixel *a_in, const kvz_pixel *b_in){
+#if KVZ_BIT_DEPTH == 8
+static INLINE __m128i get_residual_4x1_avx2(const uint8_t *a_in, const uint8_t *b_in){
   __m128i a = _mm_cvtsi32_si128(*(int32_t*)a_in);
   __m128i b = _mm_cvtsi32_si128(*(int32_t*)b_in);
   __m128i diff = _mm_sub_epi16(_mm_cvtepu8_epi16(a), _mm_cvtepu8_epi16(b) );
   return diff;
 }
-static INLINE __m128i get_residual_8x1_avx2(const kvz_pixel *a_in, const kvz_pixel *b_in){
+static INLINE __m128i get_residual_8x1_avx2(const uint8_t *a_in, const uint8_t *b_in){
   __m128i a = _mm_cvtsi64_si128(*(int64_t*)a_in);
   __m128i b = _mm_cvtsi64_si128(*(int64_t*)b_in);
   __m128i diff = _mm_sub_epi16(_mm_cvtepu8_epi16(a), _mm_cvtepu8_epi16(b) );
   return diff;
 }
-static INLINE int32_t get_quantized_recon_4x1_avx2(int16_t *residual, const kvz_pixel *pred_in){
+static INLINE int32_t get_quantized_recon_4x1_avx2(int16_t *residual, const uint8_t *pred_in){
   __m128i res = _mm_loadl_epi64((__m128i*)residual);
   __m128i pred = _mm_cvtsi32_si128(*(int32_t*)pred_in);
   __m128i rec = _mm_add_epi16(res, _mm_cvtepu8_epi16(pred));
   return _mm_cvtsi128_si32(_mm_packus_epi16(rec, rec));
 }
-static INLINE int64_t get_quantized_recon_8x1_avx2(int16_t *residual, const kvz_pixel *pred_in){
+static INLINE int64_t get_quantized_recon_8x1_avx2(int16_t *residual, const uint8_t *pred_in){
   __m128i res = _mm_loadu_si128((__m128i*)residual);
   __m128i pred = _mm_cvtsi64_si128(*(int64_t*)pred_in);
   __m128i rec = _mm_add_epi16(res, _mm_cvtepu8_epi16(pred));
   return _mm_cvtsi128_si64(_mm_packus_epi16(rec, rec));
 }
-static void get_residual_avx2(const kvz_pixel *ref_in, const kvz_pixel *pred_in, int16_t *residual, int width, int in_stride){
+static void get_residual_avx2(const uint8_t *ref_in, const uint8_t *pred_in, int16_t *residual, int width, int in_stride){
   __m128i diff = _mm_setzero_si128();
   switch (width) {
@@ -579,7 +581,7 @@ static void get_residual_avx2(const kvz_pixel *ref_in, const kvz_pixel *pred_in,
   }
 }
-static void get_quantized_recon_avx2(int16_t *residual, const kvz_pixel *pred_in, int in_stride, kvz_pixel *rec_out, int out_stride, int width){
+static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, int in_stride, uint8_t *rec_out, int out_stride, int width){
   switch (width) {
   case 4:
@@ -629,8 +631,8 @@ int kvz_quantize_residual_avx2(encoder_state_t *const state,
   const cu_info_t *const cur_cu, const int width, const color_t color,
   const coeff_scan_order_t scan_order, const int use_trskip,
   const int in_stride, const int out_stride,
-  const kvz_pixel *const ref_in, const kvz_pixel *const pred_in,
-  kvz_pixel *rec_out, coeff_t *coeff_out,
+  const uint8_t *const ref_in, const uint8_t *const pred_in,
+  uint8_t *rec_out, coeff_t *coeff_out,
   bool early_skip)
 {
   // Temporary arrays to pass data to and from kvz_quant and transform functions.
@@ -766,6 +768,8 @@ void kvz_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef
   }
 }
+#endif // KVZ_BIT_DEPTH == 8
 static uint32_t coeff_abs_sum_avx2(const coeff_t *coeffs, const size_t length)
 {
   assert(length % 8 == 0);
@@ -870,11 +874,13 @@ int kvz_strategy_register_quant_avx2(void* opaque, uint8_t bitdepth)
   bool success = true;
 #if COMPILE_INTEL_AVX2 && defined X86_64
-  success &= kvz_strategyselector_register(opaque, "quant", "avx2", 40, &kvz_quant_avx2);
+#if KVZ_BIT_DEPTH == 8
   if (bitdepth == 8) {
     success &= kvz_strategyselector_register(opaque, "quantize_residual", "avx2", 40, &kvz_quantize_residual_avx2);
     success &= kvz_strategyselector_register(opaque, "dequant", "avx2", 40, &kvz_dequant_avx2);
   }
+#endif // KVZ_BIT_DEPTH == 8
+  success &= kvz_strategyselector_register(opaque, "quant", "avx2", 40, &kvz_quant_avx2);
   success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "avx2", 0, &coeff_abs_sum_avx2);
   success &= kvz_strategyselector_register(opaque, "fast_coeff_cost", "avx2", 40, &fast_coeff_cost_avx2);
 #endif //COMPILE_INTEL_AVX2 && defined X86_64
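
The reordering at the end of the registration function is the substantive change in this file: kvz_quant_avx2, coeff_abs_sum_avx2 and fast_coeff_cost_avx2 operate on coeff_t coefficient arrays rather than pixel buffers, so their registrations now sit outside the bit-depth guard and stay active in 10-bit builds; quantize_residual (whose signature reads and writes pixel buffers) and dequant remain guarded and are compiled out. A sketch of why coefficient-domain code is safe at any depth, with a hypothetical loop standing in for the AVX2 kernel:

    /* coeff_t is 16 bits regardless of pixel depth (simplified from
     * kvazaar's global.h), so coefficient-domain SIMD never depends on
     * the kvz_pixel typedef. */
    typedef int16_t coeff_t;

    /* Hypothetical scalar stand-in, valid in 8- and 10-bit builds alike: */
    static uint32_t coeff_abs_sum_sketch(const coeff_t *coeffs, size_t length)
    {
      uint32_t sum = 0;
      for (size_t i = 0; i < length; ++i)
        sum += (uint32_t)(coeffs[i] < 0 ? -coeffs[i] : coeffs[i]);
      return sum;
    }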

View file

@ -21,10 +21,13 @@
#ifndef REG_SAD_POW2_WIDTHS_AVX2_H_ #ifndef REG_SAD_POW2_WIDTHS_AVX2_H_
#define REG_SAD_POW2_WIDTHS_AVX2_H_ #define REG_SAD_POW2_WIDTHS_AVX2_H_
#include "strategies/sse41/reg_sad_pow2_widths-sse41.h"
#include "kvazaar.h" #include "kvazaar.h"
static INLINE uint32_t reg_sad_w32(const kvz_pixel * const data1, const kvz_pixel * const data2, #if KVZ_BIT_DEPTH == 8
#include "strategies/sse41/reg_sad_pow2_widths-sse41.h"
static INLINE uint32_t reg_sad_w32(const uint8_t * const data1, const uint8_t * const data2,
const int32_t height, const uint32_t stride1, const int32_t height, const uint32_t stride1,
const uint32_t stride2) const uint32_t stride2)
{ {
@ -74,7 +77,7 @@ static INLINE uint32_t reg_sad_w32(const kvz_pixel * const data1, const kvz_pixe
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static INLINE uint32_t reg_sad_w64(const kvz_pixel * const data1, const kvz_pixel * const data2, static INLINE uint32_t reg_sad_w64(const uint8_t * const data1, const uint8_t * const data2,
const int32_t height, const uint32_t stride1, const int32_t height, const uint32_t stride1,
const uint32_t stride2) const uint32_t stride2)
{ {
@ -129,7 +132,7 @@ static INLINE uint32_t reg_sad_w64(const kvz_pixel * const data1, const kvz_pixe
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static uint32_t hor_sad_avx2_w32(const kvz_pixel *pic_data, const kvz_pixel *ref_data, static uint32_t hor_sad_avx2_w32(const uint8_t *pic_data, const uint8_t *ref_data,
int32_t height, uint32_t pic_stride, uint32_t ref_stride, int32_t height, uint32_t pic_stride, uint32_t ref_stride,
const uint32_t left, const uint32_t right) const uint32_t left, const uint32_t right)
{ {
@ -206,4 +209,6 @@ static uint32_t hor_sad_avx2_w32(const kvz_pixel *pic_data, const kvz_pixel *ref
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
#endif // KVZ_BIT_DEPTH == 8
#endif #endif
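
All reg_sad_w* helpers in this header share one contract; for reference, here is a plain-C sketch (my code, not kvazaar's generic fallback) of what reg_sad_w32 computes:

#include <stdint.h>
#include <stdlib.h>

/* Scalar equivalent of reg_sad_w32: sum of absolute differences over a
 * 32-pixel-wide block; the two strides let the blocks live in frames of
 * different widths. The AVX2 version returns the same value. */
static uint32_t reg_sad_w32_ref(const uint8_t *data1, const uint8_t *data2,
                                int32_t height, uint32_t stride1,
                                uint32_t stride2)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; y++)
    for (int32_t x = 0; x < 32; x++)
      sad += abs((int)data1[y * stride1 + x] - (int)data2[y * stride2 + x]);
  return sad;
}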

View file

@ -21,6 +21,8 @@
#include "strategies/avx2/sao-avx2.h" #include "strategies/avx2/sao-avx2.h"
#if COMPILE_INTEL_AVX2 #if COMPILE_INTEL_AVX2
#include "kvazaar.h"
#if KVZ_BIT_DEPTH == 8
#include <immintrin.h> #include <immintrin.h>
#include <nmmintrin.h> #include <nmmintrin.h>
@ -31,7 +33,6 @@
#include "cu.h" #include "cu.h"
#include "encoder.h" #include "encoder.h"
#include "encoderstate.h" #include "encoderstate.h"
#include "kvazaar.h"
#include "sao.h" #include "sao.h"
#include "strategyselector.h" #include "strategyselector.h"
@ -271,8 +272,8 @@ static INLINE __m256i FIX_W32 do_one_edge_ymm(const __m256i a,
return calc_diff_off_delta(diff_lo, diff_hi, offset, orig); return calc_diff_off_delta(diff_lo, diff_hi, offset, orig);
} }
static int32_t sao_edge_ddistortion_avx2(const kvz_pixel *orig_data, static int32_t sao_edge_ddistortion_avx2(const uint8_t *orig_data,
const kvz_pixel *rec_data, const uint8_t *rec_data,
int32_t block_width, int32_t block_width,
int32_t block_height, int32_t block_height,
int32_t eo_class, int32_t eo_class,
@ -407,8 +408,8 @@ static void FIX_W32 calc_edge_dir_one_ymm(const __m256i a,
} }
} }
static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data, static void calc_sao_edge_dir_avx2(const uint8_t *orig_data,
const kvz_pixel *rec_data, const uint8_t *rec_data,
int32_t eo_class, int32_t eo_class,
int32_t block_width, int32_t block_width,
int32_t block_height, int32_t block_height,
@ -608,8 +609,8 @@ static __m256i lookup_color_band_ymm(const __m256i curr_row,
} }
static INLINE void reconstruct_color_band(const encoder_control_t *encoder, static INLINE void reconstruct_color_band(const encoder_control_t *encoder,
const kvz_pixel *rec_data, const uint8_t *rec_data,
kvz_pixel *new_rec_data, uint8_t *new_rec_data,
const sao_info_t *sao, const sao_info_t *sao,
int32_t stride, int32_t stride,
int32_t new_stride, int32_t new_stride,
@ -695,8 +696,8 @@ static __m256i FIX_W32 do_one_nonband_ymm(const __m256i a,
} }
static INLINE void reconstruct_color_other(const encoder_control_t *encoder, static INLINE void reconstruct_color_other(const encoder_control_t *encoder,
const kvz_pixel *rec_data, const uint8_t *rec_data,
kvz_pixel *new_rec_data, uint8_t *new_rec_data,
const sao_info_t *sao, const sao_info_t *sao,
int32_t stride, int32_t stride,
int32_t new_stride, int32_t new_stride,
@ -784,8 +785,8 @@ static INLINE void reconstruct_color_other(const encoder_control_t *encoder,
} }
static void sao_reconstruct_color_avx2(const encoder_control_t *encoder, static void sao_reconstruct_color_avx2(const encoder_control_t *encoder,
const kvz_pixel *rec_data, const uint8_t *rec_data,
kvz_pixel *new_rec_data, uint8_t *new_rec_data,
const sao_info_t *sao, const sao_info_t *sao,
int32_t stride, int32_t stride,
int32_t new_stride, int32_t new_stride,
@ -884,18 +885,21 @@ use_generic:
block_height, band_pos, sao_bands); block_height, band_pos, sao_bands);
} }
#endif // KVZ_BIT_DEPTH == 8
#endif //COMPILE_INTEL_AVX2 #endif //COMPILE_INTEL_AVX2
int kvz_strategy_register_sao_avx2(void* opaque, uint8_t bitdepth) int kvz_strategy_register_sao_avx2(void* opaque, uint8_t bitdepth)
{ {
bool success = true; bool success = true;
#if COMPILE_INTEL_AVX2 #if COMPILE_INTEL_AVX2
#if KVZ_BIT_DEPTH == 8
if (bitdepth == 8) { if (bitdepth == 8) {
success &= kvz_strategyselector_register(opaque, "sao_edge_ddistortion", "avx2", 40, &sao_edge_ddistortion_avx2); success &= kvz_strategyselector_register(opaque, "sao_edge_ddistortion", "avx2", 40, &sao_edge_ddistortion_avx2);
success &= kvz_strategyselector_register(opaque, "calc_sao_edge_dir", "avx2", 40, &calc_sao_edge_dir_avx2); success &= kvz_strategyselector_register(opaque, "calc_sao_edge_dir", "avx2", 40, &calc_sao_edge_dir_avx2);
success &= kvz_strategyselector_register(opaque, "sao_reconstruct_color", "avx2", 40, &sao_reconstruct_color_avx2); success &= kvz_strategyselector_register(opaque, "sao_reconstruct_color", "avx2", 40, &sao_reconstruct_color_avx2);
success &= kvz_strategyselector_register(opaque, "sao_band_ddistortion", "avx2", 40, &sao_band_ddistortion_avx2); success &= kvz_strategyselector_register(opaque, "sao_band_ddistortion", "avx2", 40, &sao_band_ddistortion_avx2);
} }
#endif // KVZ_BIT_DEPTH == 8
#endif //COMPILE_INTEL_AVX2 #endif //COMPILE_INTEL_AVX2
return success; return success;
} }
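
The edge kernels in this file implement HEVC SAO edge-offset classification: each pixel is binned by comparison with its two neighbors along the eo_class direction. A scalar sketch of the standard classification rule (my code, assuming the usual 2 + sign + sign formula rather than kvazaar's exact tables):

#include <stdint.h>

static int sign3(int v) { return (v > 0) - (v < 0); }

/* a and b are the two neighbors of c along the chosen eo_class direction;
 * the result selects one of five edge-offset categories. */
static int sao_edge_index(uint8_t c, uint8_t a, uint8_t b)
{
  return 2 + sign3((int)c - (int)a) + sign3((int)c - (int)b);  /* 0..4 */
}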

View file

@ -21,14 +21,15 @@
#include "strategies/sse2/picture-sse2.h" #include "strategies/sse2/picture-sse2.h"
#if COMPILE_INTEL_SSE2 #if COMPILE_INTEL_SSE2
#include "kvazaar.h"
#if KVZ_BIT_DEPTH == 8
#include <immintrin.h> #include <immintrin.h>
#include <stdlib.h> #include <stdlib.h>
#include "kvazaar.h"
#include "strategyselector.h" #include "strategyselector.h"
static unsigned reg_sad_sse2(const kvz_pixel * const data1, const kvz_pixel * const data2, static unsigned reg_sad_sse2(const uint8_t * const data1, const uint8_t * const data2,
const int width, const int height, const unsigned stride1, const unsigned stride2) const int width, const int height, const unsigned stride1, const unsigned stride2)
{ {
int y, x; int y, x;
@ -53,7 +54,7 @@ static unsigned reg_sad_sse2(const kvz_pixel * const data1, const kvz_pixel * co
return sad; return sad;
} }
static unsigned sad_8bit_4x4_sse2(const kvz_pixel *buf1, const kvz_pixel *buf2) static unsigned sad_8bit_4x4_sse2(const uint8_t *buf1, const uint8_t *buf2)
{ {
const __m128i *const mbuf1 = (const __m128i *)buf1; const __m128i *const mbuf1 = (const __m128i *)buf1;
const __m128i *const mbuf2 = (const __m128i *)buf2; const __m128i *const mbuf2 = (const __m128i *)buf2;
@ -65,15 +66,18 @@ static unsigned sad_8bit_4x4_sse2(const kvz_pixel *buf1, const kvz_pixel *buf2)
return result[0] + result[2]; return result[0] + result[2];
} }
#endif // KVZ_BIT_DEPTH == 8
#endif //COMPILE_INTEL_SSE2 #endif //COMPILE_INTEL_SSE2
int kvz_strategy_register_picture_sse2(void* opaque, uint8_t bitdepth) { int kvz_strategy_register_picture_sse2(void* opaque, uint8_t bitdepth) {
bool success = true; bool success = true;
#if COMPILE_INTEL_SSE2 #if COMPILE_INTEL_SSE2
#if KVZ_BIT_DEPTH == 8
if (bitdepth == 8){ if (bitdepth == 8){
success &= kvz_strategyselector_register(opaque, "reg_sad", "sse2", 10, &reg_sad_sse2); success &= kvz_strategyselector_register(opaque, "reg_sad", "sse2", 10, &reg_sad_sse2);
success &= kvz_strategyselector_register(opaque, "sad_4x4", "sse2", 10, &sad_8bit_4x4_sse2); success &= kvz_strategyselector_register(opaque, "sad_4x4", "sse2", 10, &sad_8bit_4x4_sse2);
} }
#endif // KVZ_BIT_DEPTH == 8
#endif #endif
return success; return success;
} }
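
The 4x4 kernel above leans on a single PSADBW: a 4x4 block of 8-bit pixels is exactly 16 contiguous bytes, so one load per buffer covers it. _mm_sad_epu8 produces two partial sums, one per 8-byte half, in the low bits of each 64-bit lane — hence summing words 0 and 2 of the result. A self-contained sketch of the same trick; names are mine, and I use unaligned loads to stay safe for arbitrary buffers:

#include <emmintrin.h>   /* SSE2 */
#include <stdint.h>
#include <stdio.h>

static unsigned sad_4x4(const uint8_t *buf1, const uint8_t *buf2)
{
  __m128i a = _mm_loadu_si128((const __m128i *)buf1);
  __m128i b = _mm_loadu_si128((const __m128i *)buf2);
  __m128i sads = _mm_sad_epu8(a, b);        /* two 8-byte partial SADs */
  uint32_t out[4];
  _mm_storeu_si128((__m128i *)out, sads);
  return out[0] + out[2];
}

int main(void)
{
  uint8_t x[16] = {0}, y[16];
  for (int i = 0; i < 16; i++) y[i] = (uint8_t)i;  /* SAD = 0+1+...+15 */
  printf("%u\n", sad_4x4(x, y));                   /* prints 120 */
  return 0;
}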

View file

@ -21,16 +21,17 @@
#include "global.h" #include "global.h"
#if COMPILE_INTEL_SSE41 #if COMPILE_INTEL_SSE41
#include "kvazaar.h"
#if KVZ_BIT_DEPTH == 8
#include "strategies/sse41/picture-sse41.h" #include "strategies/sse41/picture-sse41.h"
#include "strategies/sse41/reg_sad_pow2_widths-sse41.h" #include "strategies/sse41/reg_sad_pow2_widths-sse41.h"
#include <immintrin.h> #include <immintrin.h>
#include <stdlib.h> #include <stdlib.h>
#include "kvazaar.h"
#include "strategyselector.h" #include "strategyselector.h"
uint32_t kvz_reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2, uint32_t kvz_reg_sad_sse41(const uint8_t * const data1, const uint8_t * const data2,
const int32_t width, const int32_t height, const uint32_t stride1, const int32_t width, const int32_t height, const uint32_t stride1,
const uint32_t stride2) const uint32_t stride2)
{ {
@ -68,7 +69,7 @@ static optimized_sad_func_ptr_t get_optimized_sad_sse41(int32_t width)
return NULL; return NULL;
} }
static uint32_t ver_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_data, static uint32_t ver_sad_sse41(const uint8_t *pic_data, const uint8_t *ref_data,
int32_t width, int32_t height, uint32_t stride) int32_t width, int32_t height, uint32_t stride)
{ {
if (width == 0) if (width == 0)
@ -85,7 +86,7 @@ static uint32_t ver_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_da
return ver_sad_arbitrary(pic_data, ref_data, width, height, stride); return ver_sad_arbitrary(pic_data, ref_data, width, height, stride);
} }
static uint32_t hor_sad_sse41_w32(const kvz_pixel *pic_data, const kvz_pixel *ref_data, static uint32_t hor_sad_sse41_w32(const uint8_t *pic_data, const uint8_t *ref_data,
int32_t height, uint32_t pic_stride, uint32_t ref_stride, int32_t height, uint32_t pic_stride, uint32_t ref_stride,
uint32_t left, uint32_t right) uint32_t left, uint32_t right)
{ {
@ -194,7 +195,7 @@ static uint32_t hor_sad_sse41_w32(const kvz_pixel *pic_data, const kvz_pixel *re
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static uint32_t hor_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_data, static uint32_t hor_sad_sse41(const uint8_t *pic_data, const uint8_t *ref_data,
int32_t width, int32_t height, uint32_t pic_stride, int32_t width, int32_t height, uint32_t pic_stride,
uint32_t ref_stride, uint32_t left, uint32_t right) uint32_t ref_stride, uint32_t left, uint32_t right)
{ {
@ -215,18 +216,21 @@ static uint32_t hor_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_da
pic_stride, ref_stride, left, right); pic_stride, ref_stride, left, right);
} }
#endif // KVZ_BIT_DEPTH == 8
#endif //COMPILE_INTEL_SSE41 #endif //COMPILE_INTEL_SSE41
int kvz_strategy_register_picture_sse41(void* opaque, uint8_t bitdepth) { int kvz_strategy_register_picture_sse41(void* opaque, uint8_t bitdepth) {
bool success = true; bool success = true;
#if COMPILE_INTEL_SSE41 #if COMPILE_INTEL_SSE41
#if KVZ_BIT_DEPTH == 8
if (bitdepth == 8){ if (bitdepth == 8){
success &= kvz_strategyselector_register(opaque, "reg_sad", "sse41", 20, &kvz_reg_sad_sse41); success &= kvz_strategyselector_register(opaque, "reg_sad", "sse41", 20, &kvz_reg_sad_sse41);
success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "sse41", 20, &get_optimized_sad_sse41); success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "sse41", 20, &get_optimized_sad_sse41);
success &= kvz_strategyselector_register(opaque, "ver_sad", "sse41", 20, &ver_sad_sse41); success &= kvz_strategyselector_register(opaque, "ver_sad", "sse41", 20, &ver_sad_sse41);
success &= kvz_strategyselector_register(opaque, "hor_sad", "sse41", 20, &hor_sad_sse41); success &= kvz_strategyselector_register(opaque, "hor_sad", "sse41", 20, &hor_sad_sse41);
} }
#endif // KVZ_BIT_DEPTH == 8
#endif #endif
return success; return success;
} }
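
get_optimized_sad_sse41 above resolves a width-specialized kernel once, outside the motion-search loop, and returns NULL when no specialization exists so the caller can fall back to the arbitrary-width path. A sketch of the dispatch idea with approximated types — sad_fn and the helpers are stand-ins, not kvazaar's optimized_sad_func_ptr_t:

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

typedef uint32_t (*sad_fn)(const uint8_t *, const uint8_t *,
                           int32_t height, uint32_t stride1, uint32_t stride2);

static uint32_t sad_w(const uint8_t *a, const uint8_t *b, int32_t w,
                      int32_t h, uint32_t s1, uint32_t s2)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < h; y++)
    for (int32_t x = 0; x < w; x++)
      sad += abs((int)a[y * s1 + x] - (int)b[y * s2 + x]);
  return sad;
}

static uint32_t sad_w8(const uint8_t *a, const uint8_t *b, int32_t h,
                       uint32_t s1, uint32_t s2)  { return sad_w(a, b,  8, h, s1, s2); }
static uint32_t sad_w16(const uint8_t *a, const uint8_t *b, int32_t h,
                        uint32_t s1, uint32_t s2) { return sad_w(a, b, 16, h, s1, s2); }

static sad_fn get_optimized_sad_sketch(int32_t width)
{
  switch (width) {
    case 8:  return sad_w8;
    case 16: return sad_w16;
    default: return NULL;    /* caller uses the arbitrary-width routine */
  }
}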

View file

@ -22,17 +22,20 @@
#define REG_SAD_POW2_WIDTHS_SSE41_H_ #define REG_SAD_POW2_WIDTHS_SSE41_H_
#include "kvazaar.h" #include "kvazaar.h"
#if KVZ_BIT_DEPTH == 8
#include "strategies/missing-intel-intrinsics.h" #include "strategies/missing-intel-intrinsics.h"
#include <immintrin.h> #include <immintrin.h>
static INLINE uint32_t reg_sad_w0(const kvz_pixel * const data1, const kvz_pixel * const data2, static INLINE uint32_t reg_sad_w0(const uint8_t * const data1, const uint8_t * const data2,
const int32_t height, const uint32_t stride1, const int32_t height, const uint32_t stride1,
const uint32_t stride2) const uint32_t stride2)
{ {
return 0; return 0;
} }
static INLINE uint32_t reg_sad_w4(const kvz_pixel * const data1, const kvz_pixel * const data2, static INLINE uint32_t reg_sad_w4(const uint8_t * const data1, const uint8_t * const data2,
const int32_t height, const uint32_t stride1, const int32_t height, const uint32_t stride1,
const uint32_t stride2) const uint32_t stride2)
{ {
@ -71,7 +74,7 @@ static INLINE uint32_t reg_sad_w4(const kvz_pixel * const data1, const kvz_pixel
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static INLINE uint32_t reg_sad_w8(const kvz_pixel * const data1, const kvz_pixel * const data2, static INLINE uint32_t reg_sad_w8(const uint8_t * const data1, const uint8_t * const data2,
const int32_t height, const uint32_t stride1, const int32_t height, const uint32_t stride1,
const uint32_t stride2) const uint32_t stride2)
{ {
@ -122,7 +125,7 @@ static INLINE uint32_t reg_sad_w8(const kvz_pixel * const data1, const kvz_pixel
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static INLINE uint32_t reg_sad_w12(const kvz_pixel * const data1, const kvz_pixel * const data2, static INLINE uint32_t reg_sad_w12(const uint8_t * const data1, const uint8_t * const data2,
const int32_t height, const uint32_t stride1, const int32_t height, const uint32_t stride1,
const uint32_t stride2) const uint32_t stride2)
{ {
@ -141,7 +144,7 @@ static INLINE uint32_t reg_sad_w12(const kvz_pixel * const data1, const kvz_pixe
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static INLINE uint32_t reg_sad_w16(const kvz_pixel * const data1, const kvz_pixel * const data2, static INLINE uint32_t reg_sad_w16(const uint8_t * const data1, const uint8_t * const data2,
const int32_t height, const uint32_t stride1, const int32_t height, const uint32_t stride1,
const uint32_t stride2) const uint32_t stride2)
{ {
@ -186,7 +189,7 @@ static INLINE uint32_t reg_sad_w16(const kvz_pixel * const data1, const kvz_pixe
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static INLINE uint32_t reg_sad_w24(const kvz_pixel * const data1, const kvz_pixel * const data2, static INLINE uint32_t reg_sad_w24(const uint8_t * const data1, const uint8_t * const data2,
const int32_t height, const uint32_t stride1, const int32_t height, const uint32_t stride1,
const uint32_t stride2) const uint32_t stride2)
{ {
@ -238,7 +241,7 @@ static INLINE uint32_t reg_sad_w24(const kvz_pixel * const data1, const kvz_pixe
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static INLINE uint32_t reg_sad_arbitrary(const kvz_pixel * const data1, const kvz_pixel * const data2, static INLINE uint32_t reg_sad_arbitrary(const uint8_t * const data1, const uint8_t * const data2,
const int32_t width, const int32_t height, const uint32_t stride1, const int32_t width, const int32_t height, const uint32_t stride1,
const uint32_t stride2) const uint32_t stride2)
{ {
@ -334,7 +337,7 @@ static INLINE uint32_t reg_sad_arbitrary(const kvz_pixel * const data1, const kv
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static uint32_t ver_sad_w4(const kvz_pixel *pic_data, const kvz_pixel *ref_data, static uint32_t ver_sad_w4(const uint8_t *pic_data, const uint8_t *ref_data,
int32_t height, uint32_t stride) int32_t height, uint32_t stride)
{ {
__m128i ref_row = _mm_set1_epi32(*(const uint32_t *)ref_data); __m128i ref_row = _mm_set1_epi32(*(const uint32_t *)ref_data);
@ -371,7 +374,7 @@ static uint32_t ver_sad_w4(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static uint32_t ver_sad_w8(const kvz_pixel *pic_data, const kvz_pixel *ref_data, static uint32_t ver_sad_w8(const uint8_t *pic_data, const uint8_t *ref_data,
int32_t height, uint32_t stride) int32_t height, uint32_t stride)
{ {
const __m128i ref_row = _mm_set1_epi64x(*(const uint64_t *)ref_data); const __m128i ref_row = _mm_set1_epi64x(*(const uint64_t *)ref_data);
@ -415,7 +418,7 @@ static uint32_t ver_sad_w8(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static uint32_t ver_sad_w12(const kvz_pixel *pic_data, const kvz_pixel *ref_data, static uint32_t ver_sad_w12(const uint8_t *pic_data, const uint8_t *ref_data,
int32_t height, uint32_t stride) int32_t height, uint32_t stride)
{ {
const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data); const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
@ -434,7 +437,7 @@ static uint32_t ver_sad_w12(const kvz_pixel *pic_data, const kvz_pixel *ref_data
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static uint32_t ver_sad_w16(const kvz_pixel *pic_data, const kvz_pixel *ref_data, static uint32_t ver_sad_w16(const uint8_t *pic_data, const uint8_t *ref_data,
int32_t height, uint32_t stride) int32_t height, uint32_t stride)
{ {
const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data); const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
@ -474,7 +477,7 @@ static uint32_t ver_sad_w16(const kvz_pixel *pic_data, const kvz_pixel *ref_data
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static uint32_t ver_sad_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *ref_data, static uint32_t ver_sad_arbitrary(const uint8_t *pic_data, const uint8_t *ref_data,
int32_t width, int32_t height, uint32_t stride) int32_t width, int32_t height, uint32_t stride)
{ {
int32_t y, x; int32_t y, x;
@ -561,7 +564,7 @@ static uint32_t ver_sad_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *re
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static uint32_t hor_sad_sse41_w4(const kvz_pixel *pic_data, const kvz_pixel *ref_data, static uint32_t hor_sad_sse41_w4(const uint8_t *pic_data, const uint8_t *ref_data,
int32_t height, uint32_t pic_stride, uint32_t ref_stride, int32_t height, uint32_t pic_stride, uint32_t ref_stride,
uint32_t left, uint32_t right) uint32_t left, uint32_t right)
{ {
@ -625,7 +628,7 @@ static uint32_t hor_sad_sse41_w4(const kvz_pixel *pic_data, const kvz_pixel *ref
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static uint32_t hor_sad_sse41_w8(const kvz_pixel *pic_data, const kvz_pixel *ref_data, static uint32_t hor_sad_sse41_w8(const uint8_t *pic_data, const uint8_t *ref_data,
int32_t height, uint32_t pic_stride, uint32_t ref_stride, int32_t height, uint32_t pic_stride, uint32_t ref_stride,
uint32_t left, uint32_t right) uint32_t left, uint32_t right)
{ {
@ -738,7 +741,7 @@ static uint32_t hor_sad_sse41_w8(const kvz_pixel *pic_data, const kvz_pixel *ref
* border pixel, and use a suitable mask to fill all the other pixels with * border pixel, and use a suitable mask to fill all the other pixels with
* that value. * that value.
*/ */
static uint32_t hor_sad_sse41_w16(const kvz_pixel *pic_data, const kvz_pixel *ref_data, static uint32_t hor_sad_sse41_w16(const uint8_t *pic_data, const uint8_t *ref_data,
int32_t height, uint32_t pic_stride, uint32_t ref_stride, int32_t height, uint32_t pic_stride, uint32_t ref_stride,
const uint32_t left, const uint32_t right) const uint32_t left, const uint32_t right)
{ {
@ -821,7 +824,7 @@ static uint32_t hor_sad_sse41_w16(const kvz_pixel *pic_data, const kvz_pixel *re
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
static INLINE uint32_t hor_sad_sse41_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *ref_data, static INLINE uint32_t hor_sad_sse41_arbitrary(const uint8_t *pic_data, const uint8_t *ref_data,
int32_t width, int32_t height, uint32_t pic_stride, int32_t width, int32_t height, uint32_t pic_stride,
uint32_t ref_stride, uint32_t left, uint32_t right) uint32_t ref_stride, uint32_t left, uint32_t right)
{ {
@ -1024,4 +1027,6 @@ static INLINE uint32_t hor_sad_sse41_arbitrary(const kvz_pixel *pic_data, const
return _mm_cvtsi128_si32(sad); return _mm_cvtsi128_si32(sad);
} }
#endif // KVZ_BIT_DEPTH == 8
#endif #endif
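
As the comment before hor_sad_sse41_w16 explains, reference columns that fall outside the frame reuse the border pixel. A scalar sketch of that contract as I read it — `left` and `right` count out-of-frame columns, and the SIMD versions build the clamp with shuffles and masks (hypothetical reference code, not kvazaar's generic path):

#include <stdint.h>
#include <stdlib.h>

static uint32_t hor_sad_ref(const uint8_t *pic_data, const uint8_t *ref_data,
                            int32_t width, int32_t height,
                            uint32_t pic_stride, uint32_t ref_stride,
                            uint32_t left, uint32_t right)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; y++) {
    for (int32_t x = 0; x < width; x++) {
      int32_t rx = x;
      if ((uint32_t)x < left)                      /* clamp to left border  */
        rx = (int32_t)left;
      else if (x >= width - (int32_t)right)        /* clamp to right border */
        rx = width - (int32_t)right - 1;
      sad += abs((int)pic_data[y * pic_stride + x] -
                 (int)ref_data[y * ref_stride + rx]);
    }
  }
  return sad;
}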

View file

@ -27,17 +27,18 @@
*/ */
#include "global.h" // IWYU pragma: keep #include "global.h" // IWYU pragma: keep
#include "kvazaar.h"
#if KVZ_BIT_DEPTH == 8
unsigned kvz_sad_4x4_avx(const uint8_t*, const uint8_t*);
unsigned kvz_sad_8x8_avx(const uint8_t*, const uint8_t*);
unsigned kvz_sad_16x16_avx(const uint8_t*, const uint8_t*);
unsigned kvz_sad_4x4_avx(const kvz_pixel*, const kvz_pixel*); unsigned kvz_sad_4x4_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride);
unsigned kvz_sad_8x8_avx(const kvz_pixel*, const kvz_pixel*); unsigned kvz_sad_8x8_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride);
unsigned kvz_sad_16x16_avx(const kvz_pixel*, const kvz_pixel*); unsigned kvz_sad_16x16_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride);
unsigned kvz_sad_32x32_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride);
unsigned kvz_sad_4x4_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride); unsigned kvz_sad_64x64_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride);
unsigned kvz_sad_8x8_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride); #endif // KVZ_BIT_DEPTH == 8
unsigned kvz_sad_16x16_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_32x32_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
unsigned kvz_sad_64x64_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
#endif #endif

View file

@ -21,16 +21,17 @@
#include "strategies/x86_asm/picture-x86-asm.h" #include "strategies/x86_asm/picture-x86-asm.h"
#if defined(KVZ_COMPILE_ASM) #if defined(KVZ_COMPILE_ASM)
#include "kvazaar.h"
#if KVZ_BIT_DEPTH == 8
#include <stdlib.h> #include <stdlib.h>
#include "kvazaar.h"
#include "strategies/x86_asm/picture-x86-asm-sad.h" #include "strategies/x86_asm/picture-x86-asm-sad.h"
#include "strategies/x86_asm/picture-x86-asm-satd.h" #include "strategies/x86_asm/picture-x86-asm-satd.h"
#include "strategies/sse41/picture-sse41.h" #include "strategies/sse41/picture-sse41.h"
#include "strategyselector.h" #include "strategyselector.h"
static unsigned kvz_sad_32x32_avx(const kvz_pixel *data1, const kvz_pixel *data2) static unsigned kvz_sad_32x32_avx(const uint8_t *data1, const uint8_t *data2)
{ {
unsigned sad = 0; unsigned sad = 0;
sad += kvz_sad_16x16_avx(data1, data2); sad += kvz_sad_16x16_avx(data1, data2);
@ -40,7 +41,7 @@ static unsigned kvz_sad_32x32_avx(const kvz_pixel *data1, const kvz_pixel *data2
return sad; return sad;
} }
static unsigned kvz_sad_64x64_avx(const kvz_pixel *data1, const kvz_pixel *data2) static unsigned kvz_sad_64x64_avx(const uint8_t *data1, const uint8_t *data2)
{ {
unsigned sad = 0; unsigned sad = 0;
sad += kvz_sad_32x32_avx(data1, data2); sad += kvz_sad_32x32_avx(data1, data2);
@ -50,7 +51,7 @@ static unsigned kvz_sad_64x64_avx(const kvz_pixel *data1, const kvz_pixel *data2
return sad; return sad;
} }
static unsigned kvz_sad_other_avx(const kvz_pixel *data1, const kvz_pixel *data2, static unsigned kvz_sad_other_avx(const uint8_t *data1, const uint8_t *data2,
int width, int height, int width, int height,
unsigned stride) unsigned stride)
{ {
@ -65,7 +66,7 @@ static unsigned kvz_sad_other_avx(const kvz_pixel *data1, const kvz_pixel *data2
return sad; return sad;
} }
static unsigned reg_sad_x86_asm(const kvz_pixel *data1, const kvz_pixel * data2, static unsigned reg_sad_x86_asm(const uint8_t *data1, const uint8_t * data2,
const int width, const int height, const int width, const int height,
const unsigned stride1, const unsigned stride2) const unsigned stride1, const unsigned stride2)
{ {
@ -90,12 +91,14 @@ static unsigned reg_sad_x86_asm(const kvz_pixel *data1, const kvz_pixel * data2,
} }
} }
#endif // KVZ_BIT_DEPTH == 8
#endif //defined(KVZ_COMPILE_ASM) #endif //defined(KVZ_COMPILE_ASM)
int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth) int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth)
{ {
bool success = true; bool success = true;
#if defined(KVZ_COMPILE_ASM) #if defined(KVZ_COMPILE_ASM)
#if KVZ_BIT_DEPTH == 8
if (bitdepth == 8){ if (bitdepth == 8){
success &= kvz_strategyselector_register(opaque, "reg_sad", "x86_asm_avx", 30, &reg_sad_x86_asm); success &= kvz_strategyselector_register(opaque, "reg_sad", "x86_asm_avx", 30, &reg_sad_x86_asm);
@ -111,6 +114,7 @@ int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth)
success &= kvz_strategyselector_register(opaque, "satd_32x32", "x86_asm_avx", 30, &kvz_satd_32x32_avx); success &= kvz_strategyselector_register(opaque, "satd_32x32", "x86_asm_avx", 30, &kvz_satd_32x32_avx);
success &= kvz_strategyselector_register(opaque, "satd_64x64", "x86_asm_avx", 30, &kvz_satd_64x64_avx); success &= kvz_strategyselector_register(opaque, "satd_64x64", "x86_asm_avx", 30, &kvz_satd_64x64_avx);
} }
#endif // KVZ_BIT_DEPTH == 8
#endif //defined(KVZ_COMPILE_ASM) #endif //defined(KVZ_COMPILE_ASM)
return success; return success;
} }
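
kvz_sad_32x32_avx and kvz_sad_64x64_avx take no stride: they appear to assume densely packed blocks, so the 64x64 SAD is composed from consecutive packed 32x32 (1024-byte) chunks. A scalar sketch of that composition under that assumption, with sad_32x32 standing in for the assembly kernel:

#include <stdint.h>
#include <stdlib.h>

static unsigned sad_32x32(const uint8_t *a, const uint8_t *b)
{
  unsigned sad = 0;
  for (int i = 0; i < 32 * 32; i++)
    sad += abs((int)a[i] - (int)b[i]);
  return sad;
}

static unsigned sad_64x64(const uint8_t *a, const uint8_t *b)
{
  unsigned sad = 0;
  for (int part = 0; part < 4; part++)      /* four packed 32x32 chunks */
    sad += sad_32x32(a + part * 32 * 32, b + part * 32 * 32);
  return sad;
}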