From bcf12567d021ed73252cbf7ba8e8554699c269ef Mon Sep 17 00:00:00 2001
From: Ari Lemmetti
Date: Fri, 3 Oct 2014 17:47:51 +0300
Subject: [PATCH] Added some comments.

---
 src/strategies/avx2/dct-avx2.c | 100 ++++++++++++++++++++-------------
 1 file changed, 61 insertions(+), 39 deletions(-)

diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c
index 47425e9f..5038e441 100644
--- a/src/strategies/avx2/dct-avx2.c
+++ b/src/strategies/avx2/dct-avx2.c
@@ -1,25 +1,25 @@
 /*****************************************************************************
- * This file is part of Kvazaar HEVC encoder.
- *
- * Copyright (C) 2013-2014 Tampere University of Technology and others (see
- * COPYING file).
- *
- * Kvazaar is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * Kvazaar is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
- ****************************************************************************/
+* This file is part of Kvazaar HEVC encoder.
+*
+* Copyright (C) 2013-2014 Tampere University of Technology and others (see
+* COPYING file).
+*
+* Kvazaar is free software: you can redistribute it and/or modify
+* it under the terms of the GNU General Public License version 2 as published
+* by the Free Software Foundation.
+*
+* Kvazaar is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+****************************************************************************/
 
 /*
- * \file
- */
+* \file
+*/
 
 #include <stdlib.h>
 
@@ -42,39 +42,43 @@ extern const int16_t g_dct_8_t[8][8];
 extern const int16_t g_dct_16_t[16][16];
 extern const int16_t g_dct_32_t[32][32];
 
-/**
-* \brief AVX2 transform functions
-*
-* TODO: description
-*
-* \param TODO
-*
-* \returns TODO
+/*
+* \file
+* \brief AVX2 transformations.
 */
 
+// 4x4 matrix multiplication with value clipping.
+// Parameters: Two 4x4 matrices containing 16-bit values in consecutive addresses,
+// destination for the result and the shift value for clipping.
 static void mul_clip_matrix_4x4_avx2(const int16_t *left, const int16_t *right, int16_t *dst, int32_t shift)
 {
-  __m256i b[2], a, result, even[2], odd[2];
+  __m256i b[2], a, result, even[2], odd[2];
 
   const int32_t add = 1 << (shift - 1);
 
   a = _mm256_loadu_si256((__m256i*) left);
   b[0] = _mm256_loadu_si256((__m256i*) right);
 
+  // Interleave values in both 128-bit lanes
   b[0] = _mm256_unpacklo_epi16(b[0], _mm256_srli_si256(b[0], 8));
   b[1] = _mm256_permute2x128_si256(b[0], b[0], 1 + 16);
   b[0] = _mm256_permute2x128_si256(b[0], b[0], 0);
 
+  // Fill both 128-lanes with the first pair of 16-bit factors in the lane.
   even[0] = _mm256_shuffle_epi32(a, 0);
   odd[0] = _mm256_shuffle_epi32(a, 1 + 4 + 16 + 64);
 
+  // Multiply packed elements and sum pairs. Input 16-bit output 32-bit.
   even[0] = _mm256_madd_epi16(even[0], b[0]);
   odd[0] = _mm256_madd_epi16(odd[0], b[1]);
 
+  // Add the halves of the dot product and
+  // round.
   result = _mm256_add_epi32(even[0], odd[0]);
   result = _mm256_add_epi32(result, _mm256_set1_epi32(add));
   result = _mm256_srai_epi32(result, shift);
 
+  //Repeat for the remaining parts
   even[1] = _mm256_shuffle_epi32(a, 2 + 8 + 32 + 128);
   odd[1] = _mm256_shuffle_epi32(a, 3 + 12 + 48 + 192);
 
@@ -85,11 +89,16 @@ static void mul_clip_matrix_4x4_avx2(const int16_t *left, const int16_t *right,
   odd[1] = _mm256_add_epi32(odd[1], _mm256_set1_epi32(add));
   odd[1] = _mm256_srai_epi32(odd[1], shift);
 
+  // Truncate to 16-bit values
   result = _mm256_packs_epi32(result, odd[1]);
 
   _mm256_storeu_si256((__m256i*)dst, result);
 }
 
+// 8x8 matrix multiplication with value clipping.
+// Parameters: Two 8x8 matrices containing 16-bit values in consecutive addresses,
+// destination for the result and the shift value for clipping.
+//
 static void mul_clip_matrix_8x8_avx2(const int16_t *left, const int16_t *right, int16_t *dst, const int32_t shift)
 {
   int i, j;
@@ -108,7 +117,7 @@ static void mul_clip_matrix_8x8_avx2(const int16_t *left, const int16_t *right,
     even[0] = _mm256_set1_epi32(((int32_t*)left)[4 * i]);
     even[0] = _mm256_madd_epi16(even[0], b[0]);
     accu[i] = even[0];
-    
+
     odd[0] = _mm256_set1_epi32(((int32_t*)left)[4 * (i + 1)]);
     odd[0] = _mm256_madd_epi16(odd[0], b[0]);
     accu[i + 1] = odd[0];
@@ -123,7 +132,7 @@ static void mul_clip_matrix_8x8_avx2(const int16_t *left, const int16_t *right,
     b[0] = _mm256_inserti128_si256(b[0], _mm256_castsi256_si128(b[1]), 1);
 
     for (i = 0; i < 8; i += 2) {
-      
+
       even[0] = _mm256_set1_epi32(((int32_t*)left)[4 * i + j]);
       even[0] = _mm256_madd_epi16(even[0], b[0]);
       accu[i] = _mm256_add_epi32(accu[i], even[0]);
@@ -135,16 +144,19 @@ static void mul_clip_matrix_8x8_avx2(const int16_t *left, const int16_t *right,
   }
 
   for (i = 0; i < 8; i += 2) {
-    __m256i result, first_half, second_half;
-    
+    __m256i result, first_half, second_half;
+
     first_half = _mm256_srai_epi32(_mm256_add_epi32(accu[i], _mm256_set1_epi32(add)), shift);
     second_half = _mm256_srai_epi32(_mm256_add_epi32(accu[i + 1], _mm256_set1_epi32(add)), shift);
 
     result = _mm256_permute4x64_epi64(_mm256_packs_epi32(first_half, second_half), 0 + 8 + 16 + 192);
     _mm256_storeu_si256((__m256i*)dst + i / 2, result);
-  }
+  }
 }
 
+// 16x16 matrix multiplication with value clipping.
+// Parameters: Two 16x16 matrices containing 16-bit values in consecutive addresses,
+// destination for the result and the shift value for clipping.
 static void mul_clip_matrix_16x16_avx2(const int16_t *left, const int16_t *right, int16_t *dst, const int32_t shift)
 {
   int i, j;
@@ -168,11 +180,11 @@ static void mul_clip_matrix_16x16_avx2(const int16_t *left, const int16_t *right
     accu[i][1] = _mm256_madd_epi16(even, row[1]);
 
     odd = _mm256_set1_epi32(((int32_t*)left)[stride * (i + 1)]);
-    accu[i+1][0] = _mm256_madd_epi16(odd, row[0]);
-    accu[i+1][1] = _mm256_madd_epi16(odd, row[1]);
+    accu[i + 1][0] = _mm256_madd_epi16(odd, row[0]);
+    accu[i + 1][1] = _mm256_madd_epi16(odd, row[1]);
   }
 
-  for (j = 2; j < 16; j+=2) {
+  for (j = 2; j < 16; j += 2) {
     row[0] = _mm256_loadu_si256((__m256i*)right + j);
     row[1] = _mm256_loadu_si256((__m256i*)right + j + 1);
 
@@ -183,11 +195,11 @@ static void mul_clip_matrix_16x16_avx2(const int16_t *left, const int16_t *right
 
     for (i = 0; i < 16; i += 2) {
 
-      even = _mm256_set1_epi32(((int32_t*)left)[stride * i + j/2]);
+      even = _mm256_set1_epi32(((int32_t*)left)[stride * i + j / 2]);
       accu[i][0] = _mm256_add_epi32(accu[i][0], _mm256_madd_epi16(even, row[0]));
       accu[i][1] = _mm256_add_epi32(accu[i][1], _mm256_madd_epi16(even, row[1]));
 
-      odd = _mm256_set1_epi32(((int32_t*)left)[stride * (i + 1) + j/2]);
+      odd = _mm256_set1_epi32(((int32_t*)left)[stride * (i + 1) + j / 2]);
       accu[i + 1][0] = _mm256_add_epi32(accu[i + 1][0], _mm256_madd_epi16(odd, row[0]));
       accu[i + 1][1] = _mm256_add_epi32(accu[i + 1][1], _mm256_madd_epi16(odd, row[1]));
 
@@ -205,6 +217,9 @@ static void mul_clip_matrix_16x16_avx2(const int16_t *left, const int16_t *right
   }
 }
 
+// 32x32 matrix multiplication with value clipping.
+// Parameters: Two 32x32 matrices containing 16-bit values in consecutive addresses,
+// destination for the result and the shift value for clipping.
 static void mul_clip_matrix_32x32_avx2(const int16_t *left, const int16_t *right, int16_t *dst, const int32_t shift)
 {
   int i, j;
@@ -288,9 +303,12 @@ static void mul_clip_matrix_32x32_avx2(const int16_t *left, const int16_t *right
     result = _mm256_permute4x64_epi64(_mm256_packs_epi32(third_quarter, fourth_quarter), 0 + 8 + 16 + 192);
     _mm256_storeu_si256((__m256i*)dst + 2 * i + 1, result);
 
-  }
+  }
 }
 
+// Macro that generates 2D transform functions with clipping values.
+// Sets correct shift values and matrices according to transform type and
+// block size. Performs matrix multiplication horizontally and vertically.
 #define TRANSFORM(type, n) static void matrix_ ## type ## _ ## n ## x ## n ## _avx2(int8_t bitdepth, const int16_t *input, int16_t *output)\
 {\
   int32_t shift_1st = g_convert_to_bit[n] + 1 + (bitdepth - 8); \
@@ -303,6 +321,9 @@ static void mul_clip_matrix_32x32_avx2(const int16_t *left, const int16_t *right
   mul_clip_matrix_ ## n ## x ## n ## _avx2(dct, tmp, output, shift_2nd);\
 }\
 
+// Macro that generates 2D inverse transform functions with clipping values.
+// Sets correct shift values and matrices according to transform type and
+// block size. Performs matrix multiplication horizontally and vertically.
 #define ITRANSFORM(type, n) \
 static void matrix_i ## type ## _## n ## x ## n ## _avx2(int8_t bitdepth, const int16_t *input, int16_t *output)\
 {\
@@ -316,6 +337,7 @@ static void matrix_i ## type ## _## n ## x ## n ## _avx2(int8_t bitdepth, const
   mul_clip_matrix_ ## n ## x ## n ## _avx2(tmp, dct, output, shift_2nd);\
 }\
 
+// Generate all the transform functions
 TRANSFORM(dst, 4);
 TRANSFORM(dct, 4);
 TRANSFORM(dct, 8);
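
For reference, the operation described by the new mul_clip_matrix_*_avx2 comments can be sketched in plain scalar C as below. This is an illustrative sketch only, not code from this patch: the function name and loop structure are assumptions. It mirrors what the comments and the intrinsic sequence state: a row-major n-by-n product of 16-bit matrices, rounded with add = 1 << (shift - 1), shifted right by shift, and saturated to the 16-bit range as _mm256_packs_epi32 does.

#include <stdint.h>

// Hypothetical scalar reference of the multiply-round-clip operation.
static void mul_clip_matrix_reference(const int16_t *left, const int16_t *right,
                                      int16_t *dst, int n, int32_t shift)
{
  const int32_t add = 1 << (shift - 1);  // rounding offset, as in the AVX2 code

  for (int y = 0; y < n; ++y) {
    for (int x = 0; x < n; ++x) {
      // 32-bit accumulation, matching the madd_epi16/add_epi32 chain;
      // inputs are assumed small enough not to overflow, as in the kernels.
      int32_t sum = 0;
      for (int k = 0; k < n; ++k) {
        sum += (int32_t)left[y * n + k] * right[k * n + x];
      }
      sum = (sum + add) >> shift;             // round and shift
      if (sum > INT16_MAX) sum = INT16_MAX;   // clip to 16 bits,
      if (sum < INT16_MIN) sum = INT16_MIN;   // like _mm256_packs_epi32
      dst[y * n + x] = (int16_t)sum;
    }
  }
}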