/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (C) 2013-2015 Tampere University of Technology and others (see
 * COPYING file).
 *
 * Kvazaar is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the
 * Free Software Foundation; either version 2.1 of the License, or (at your
 * option) any later version.
 *
 * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
 * more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/

/*
 * \file
 * Optimizations for intra prediction using AVX2 instructions.
 */

#include <stdlib.h>

#include "intra-avx2.h"
#include "strategyselector.h"

#if COMPILE_INTEL_AVX2
#include <immintrin.h>

/**
 * \brief Generate angular predictions.
 * \param log2_width Log2 of width, range 2..5.
 * \param intra_mode Angular mode in range 2..34.
 * \param in_ref_above Pointer to -1 index of above reference, length=width*2+1.
 * \param in_ref_left Pointer to -1 index of left reference, length=width*2+1.
 * \param dst Buffer of size width*width.
 */
static void kvz_angular_pred_avx2(
  const int_fast8_t log2_width,
  const int_fast8_t intra_mode,
  const kvz_pixel *const in_ref_above,
  const kvz_pixel *const in_ref_left,
  kvz_pixel *const dst)
{
  assert(log2_width >= 2 && log2_width <= 5);
  assert(intra_mode >= 2 && intra_mode <= 34);

  static const int8_t modedisp2sampledisp[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
  static const int16_t modedisp2invsampledisp[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / sampledisp
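  // For example, mode_disp 2 maps to a sample displacement of 5/32 pixel per
  // row, and its inverse is (256 * 32) / 5 = 1638.4, stored rounded as 1638.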

  // Temporary buffer for modes 11-25.
  // It only needs to be big enough to hold indices from -width to width-1.
  kvz_pixel tmp_ref[2 * 32];
  const int_fast8_t width = 1 << log2_width;

  // Whether the mode is a vertical one. Horizontal modes swap the references
  // below so that both share the same prediction code path, and the result is
  // transposed back at the end.
  const bool vertical_mode = intra_mode >= 18;
  // The mode's distance to the pure horizontal or vertical mode.
  const int_fast8_t mode_disp = vertical_mode ? intra_mode - 26 : 10 - intra_mode;
  // Sample displacement per column in fractions of 32.
  const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
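  // For example, intra mode 30 gives mode_disp 30 - 26 = 4, so sample_disp is
  // modedisp2sampledisp[4] = 13: each row is read 13/32 of a pixel further
  // along the reference than the previous one.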

  // Pointer for the reference we are interpolating from.
  const kvz_pixel *ref_main;
  // Pointer for the other reference.
  const kvz_pixel *ref_side;

  // Set ref_main and ref_side such that, when indexed with 0, they point to
  // index 0 in block coordinates.
  if (sample_disp < 0) {
    // A negative sample_disp means we need to use both references.

    ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1;
    ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1;

    // Copy the main reference to the later half of tmp_ref so that negative
    // indices into the first half remain available.
    for (int_fast8_t x = -1; x < width; ++x) {
      tmp_ref[x + width] = ref_main[x];
    }
    // Get a pointer to block index 0 in tmp_ref.
    ref_main = &tmp_ref[width];

    // Extend the side reference to the negative indices of main reference.
    int_fast32_t col_sample_disp = 128; // rounding for the ">> 8"
    int_fast16_t inv_abs_sample_disp = modedisp2invsampledisp[abs(mode_disp)];
    int_fast8_t most_negative_index = (width * sample_disp) >> 5;
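    // Each loop iteration below adds the inverse displacement (in 1/256
    // units) so that col_sample_disp >> 8 projects the negative
    // main-reference index x back onto the side reference. For example, with
    // sample_disp -13 the inverse is 630, so x = -2 gives
    // (128 + 630) >> 8 = 2 and reads ref_side[1].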
    for (int_fast8_t x = -2; x >= most_negative_index; --x) {
      col_sample_disp += inv_abs_sample_disp;
      int_fast8_t side_index = col_sample_disp >> 8;
      tmp_ref[x + width] = ref_side[side_index - 1];
    }
  }
  else {
    // sample_disp >= 0 means we don't need to refer to negative indices,
    // which means we can just use the references as is.
    ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1;
    ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1;
  }

  if (sample_disp != 0) {
    // The mode is not horizontal or vertical, so we have to interpolate.

    int_fast16_t delta_pos = 0;
    for (int_fast8_t y = 0; y < width; ++y) {
      delta_pos += sample_disp;
      int_fast8_t delta_int = delta_pos >> 5;
      int_fast8_t delta_fract = delta_pos & (32 - 1);
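      // delta_pos accumulates the row offset in 1/32 pixel units: delta_int
      // is the integer pixel offset into ref_main and delta_fract is the
      // fractional part used as the interpolation weight.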

      if (delta_fract) {
        // Do linear filtering.
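        // This is the two-tap linear filter of HEVC angular prediction:
        // p = ((32 - f) * ref[i] + f * ref[i + 1] + 16) >> 5.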
        if (width < 8) {
          for (int_fast8_t x = 0; x < width; ++x) {
            kvz_pixel ref1 = ref_main[x + delta_int];
            kvz_pixel ref2 = ref_main[x + delta_int + 1];
            dst[y * width + x] = ((32 - delta_fract) * ref1 + delta_fract * ref2 + 16) >> 5;
          }
        } else {
          struct { uint8_t w1; uint8_t w2; } packed_weights = { 32 - delta_fract, delta_fract };
          __m128i v_weights = _mm_set1_epi16(*(int16_t*)&packed_weights);
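          // _mm_maddubs_epi16 multiplies the unsigned bytes of its first
          // operand with the signed bytes of its second and adds adjacent
          // products into 16-bit lanes. With the two weights packed into each
          // 16-bit lane and the ref1/ref2 bytes interleaved, one instruction
          // computes (32 - f) * ref1 + f * ref2 for eight pixels. Both
          // weights are at most 32, so they are valid signed bytes and the
          // sums fit in 16 bits (at most 32 * 255 = 8160).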

          for (int_fast8_t x = 0; x < width; x += 8) {
            __m128i v_ref1 = _mm_loadl_epi64((__m128i*)&(ref_main[x + delta_int]));
            __m128i v_ref2 = _mm_loadl_epi64((__m128i*)&(ref_main[x + delta_int + 1]));
            // Interleave the refs so each 16-bit lane holds a (ref1, ref2) pair.
            __m128i v_refs = _mm_unpacklo_epi8(v_ref1, v_ref2);
            __m128i v_tmp = _mm_maddubs_epi16(v_refs, v_weights);
            // Add the rounding offset, shift out the fraction and pack the
            // 16-bit results back to 8-bit pixels.
            v_tmp = _mm_add_epi16(v_tmp, _mm_set1_epi16(16));
            v_tmp = _mm_srli_epi16(v_tmp, 5);
            v_tmp = _mm_packus_epi16(v_tmp, v_tmp);
            _mm_storel_epi64((__m128i*)(dst + y * width + x), v_tmp);
          }
        }
      }
      else {
        // Just copy the integer samples.
        // Width is always a multiple of 4, so copy four pixels at a time.
        for (int_fast8_t x = 0; x < width; x += 4) {
          *(int32_t*)(&dst[y * width + x]) = *(int32_t*)(&ref_main[x + delta_int]);
        }
      }
    }
  }
  else {
    // Mode is horizontal or vertical, just copy the pixels.

    for (int_fast8_t y = 0; y < width; ++y) {
      for (int_fast8_t x = 0; x < width; x += 4) {
        *(int32_t*)&(dst[y * width + x]) = *(int32_t*)&(ref_main[x]);
      }
    }
  }

  // Flip the block if this was a horizontal mode. The prediction above was
  // computed as if the mode were vertical, so transposing the block produces
  // the horizontal prediction.
  if (!vertical_mode) {
    for (int_fast8_t y = 0; y < width - 1; ++y) {
      for (int_fast8_t x = y + 1; x < width; ++x) {
        SWAP(dst[y * width + x], dst[x * width + y], kvz_pixel);
      }
    }
  }
}

#endif //COMPILE_INTEL_AVX2


int kvz_strategy_register_intra_avx2(void* opaque, uint8_t bitdepth)
{
  bool success = true;
#if COMPILE_INTEL_AVX2
  // The AVX2 implementation only handles 8-bit pixels.
  if (bitdepth == 8) {
    success &= kvz_strategyselector_register(opaque, "angular_pred", "avx2", 40, &kvz_angular_pred_avx2);
  }
#endif //COMPILE_INTEL_AVX2
  return success;
}