From a8f71037973bb89c4e0b7232f0d8bc519bd7aa1b Mon Sep 17 00:00:00 2001
From: Ari Koivula
Date: Mon, 14 Jul 2014 16:08:19 +0300
Subject: [PATCH] Add AVX2 implementations for sad_8bit_8x8, 16x16 and 32x32.

---
 build/kvazaar_lib/kvazaar_lib.vcxproj         |   7 +
 build/kvazaar_lib/kvazaar_lib.vcxproj.filters |  12 ++
 src/Makefile                                  |   4 +-
 src/strategies/avx2/picture-avx2.c            | 145 ++++++++++++++++++
 src/strategies/avx2/picture-avx2.h            |  24 +++
 src/strategies/strategies-picture.c           |   4 +
 tests/intra_sad_tests.c                       |  11 +-
 7 files changed, 202 insertions(+), 5 deletions(-)
 create mode 100644 src/strategies/avx2/picture-avx2.c
 create mode 100644 src/strategies/avx2/picture-avx2.h

diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj b/build/kvazaar_lib/kvazaar_lib.vcxproj
index 5a191679..f51d4c26 100644
--- a/build/kvazaar_lib/kvazaar_lib.vcxproj
+++ b/build/kvazaar_lib/kvazaar_lib.vcxproj
@@ -139,6 +139,12 @@
+    <ClCompile Include="..\..\src\strategies\avx2\picture-avx2.c">
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+      <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+    </ClCompile>
@@ -172,6 +178,7 @@
+    <ClInclude Include="..\..\src\strategies\avx2\picture-avx2.h" />
diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
index d29f63d3..6177bf56 100644
--- a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
+++ b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
@@ -49,6 +49,12 @@
       <UniqueIdentifier>{9e275b7f-3094-4614-b817-6ce0bee827b2}</UniqueIdentifier>
     </Filter>
+    <Filter Include="Source Files\strategies\avx2">
+      <UniqueIdentifier>{80a08f4c-ca27-488b-bd63-8df6eacd1f3a}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="Header Files\strategies\avx2">
+      <UniqueIdentifier>{4ffb5d27-c5bb-44d5-a935-fa93066a259e}</UniqueIdentifier>
+    </Filter>
@@ -153,6 +159,9 @@
       <Filter>Source Files\strategies\sse2</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\strategies\avx2\picture-avx2.c">
+      <Filter>Source Files\strategies\avx2</Filter>
+    </ClCompile>
@@ -269,6 +278,9 @@
       <Filter>Header Files\strategies\sse2</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\src\strategies\avx2\picture-avx2.h">
+      <Filter>Header Files\strategies\avx2</Filter>
+    </ClInclude>
diff --git a/src/Makefile b/src/Makefile
index 9bf13f0b..2efcd4bf 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -90,7 +90,8 @@ OBJS = interface_main.o \
   strategies/generic/picture-generic.o \
   strategies/sse2/picture-sse2.o \
   strategies/sse41/picture-sse41.o \
-  strategies/altivec/picture-altivec.o
+  strategies/altivec/picture-altivec.o \
+  strategies/avx2/picture-avx2.o
 
 PROG  = ./kvazaar
 PROGS = $(PROG)
@@ -108,6 +109,7 @@ ifeq ($(ARCH), ppc64)
 else
   strategies/sse2/%.o: EXTRA_FLAGS += -msse2
   strategies/sse41/%.o: EXTRA_FLAGS += -msse4.1
+  strategies/avx2/%.o: EXTRA_FLAGS += -mavx2
 endif

diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c
new file mode 100644
index 00000000..9a0745c6
--- /dev/null
+++ b/src/strategies/avx2/picture-avx2.c
@@ -0,0 +1,145 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/ + +/* + * \file + */ +#include "picture-avx2.h" +#include "strategyselector.h" + +#if COMPILE_INTEL_AVX2 +# include "image.h" +# include + + +static unsigned sad_8bit_8x8_avx2(const pixel *buf1, const pixel *buf2) +{ + __m256i sum; + { + // Get SADs for 8x8 pixels and add the results hierarchically into sum0. + const __m256i *const a = (const __m256i *)buf1; + const __m256i *const b = (const __m256i *)buf2; + + __m256i sum0, sum1; + sum0 = _mm256_sad_epu8(_mm256_load_si256(a + 0), _mm256_load_si256(b + 0)); + sum1 = _mm256_sad_epu8(_mm256_load_si256(a + 1), _mm256_load_si256(b + 1)); + sum = _mm256_add_epi32(sum0, sum1); + } + + // Add the high 128 bits to low 128 bits. + __m128i mm128_result = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extractf128_si256(sum, 1)); + // Add the high 64 bits to low 64 bits. + uint32_t result[4]; + _mm_storeu_si128((__m128i*)result, mm128_result); + return result[0] + result[2]; +} + + +static unsigned sad_8bit_16x16_avx2(const pixel *buf1, const pixel *buf2) +{ + __m256i sum; + { + // Get SADs for 16x16 pixels and add the results hierarchically into sum. + const __m256i *const a = (const __m256i *)buf1; + const __m256i *const b = (const __m256i *)buf2; + + __m256i sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; + sum0 = _mm256_sad_epu8(_mm256_load_si256(a + 0), _mm256_load_si256(b + 0)); + sum1 = _mm256_sad_epu8(_mm256_load_si256(a + 1), _mm256_load_si256(b + 1)); + sum2 = _mm256_sad_epu8(_mm256_load_si256(a + 2), _mm256_load_si256(b + 2)); + sum3 = _mm256_sad_epu8(_mm256_load_si256(a + 3), _mm256_load_si256(b + 3)); + sum4 = _mm256_sad_epu8(_mm256_load_si256(a + 4), _mm256_load_si256(b + 4)); + sum5 = _mm256_sad_epu8(_mm256_load_si256(a + 5), _mm256_load_si256(b + 5)); + sum6 = _mm256_sad_epu8(_mm256_load_si256(a + 6), _mm256_load_si256(b + 6)); + sum7 = _mm256_sad_epu8(_mm256_load_si256(a + 7), _mm256_load_si256(b + 7)); + + sum0 = _mm256_add_epi32(sum0, sum1); + sum2 = _mm256_add_epi32(sum2, sum3); + sum4 = _mm256_add_epi32(sum4, sum5); + sum6 = _mm256_add_epi32(sum6, sum7); + + sum0 = _mm256_add_epi32(sum0, sum2); + sum4 = _mm256_add_epi32(sum4, sum6); + + sum = _mm256_add_epi32(sum0, sum4); + } + + // Add the high 128 bits to low 128 bits. + __m128i mm128_result = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extractf128_si256(sum, 1)); + // Add the high 64 bits to low 64 bits. + uint32_t result[4]; + _mm_storeu_si128((__m128i*)result, mm128_result); + return result[0] + result[2]; +} + + +static unsigned sad_8bit_32x32_avx2(const pixel *buf1, const pixel *buf2) +{ + // Do 32x32 in 4 blocks. + __m256i sum = _mm256_setzero_si256(); + for (int i = 0; i < 32; i += 8) { + // Get SADs for 32x8 pixels and add the results hierarchically into sum. 
+    const __m256i *const a = (const __m256i *)buf1 + i;
+    const __m256i *const b = (const __m256i *)buf2 + i;
+
+    __m256i sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
+    sum0 = _mm256_sad_epu8(_mm256_load_si256(a + 0), _mm256_load_si256(b + 0));
+    sum1 = _mm256_sad_epu8(_mm256_load_si256(a + 1), _mm256_load_si256(b + 1));
+    sum2 = _mm256_sad_epu8(_mm256_load_si256(a + 2), _mm256_load_si256(b + 2));
+    sum3 = _mm256_sad_epu8(_mm256_load_si256(a + 3), _mm256_load_si256(b + 3));
+    sum4 = _mm256_sad_epu8(_mm256_load_si256(a + 4), _mm256_load_si256(b + 4));
+    sum5 = _mm256_sad_epu8(_mm256_load_si256(a + 5), _mm256_load_si256(b + 5));
+    sum6 = _mm256_sad_epu8(_mm256_load_si256(a + 6), _mm256_load_si256(b + 6));
+    sum7 = _mm256_sad_epu8(_mm256_load_si256(a + 7), _mm256_load_si256(b + 7));
+
+    sum0 = _mm256_add_epi32(sum0, sum1);
+    sum2 = _mm256_add_epi32(sum2, sum3);
+    sum4 = _mm256_add_epi32(sum4, sum5);
+    sum6 = _mm256_add_epi32(sum6, sum7);
+
+    sum0 = _mm256_add_epi32(sum0, sum2);
+    sum4 = _mm256_add_epi32(sum4, sum6);
+
+    sum = _mm256_add_epi32(sum, sum0);
+    sum = _mm256_add_epi32(sum, sum4);
+  }
+
+  // Add the high 128 bits to low 128 bits.
+  __m128i mm128_result = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extractf128_si256(sum, 1));
+  // Add together the two 64-bit sums that remain; their low 32 bits hold the totals.
+  uint32_t result[4];
+  _mm_storeu_si128((__m128i*)result, mm128_result);
+  return result[0] + result[2];
+}
+
+#endif //COMPILE_INTEL_AVX2
+
+
+int strategy_register_picture_avx2(void* opaque) {
+  bool success = true;
+#if COMPILE_INTEL_AVX2
+  success &= strategyselector_register(opaque, "sad_8bit_8x8", "avx2", 40, &sad_8bit_8x8_avx2);
+  success &= strategyselector_register(opaque, "sad_8bit_16x16", "avx2", 40, &sad_8bit_16x16_avx2);
+  success &= strategyselector_register(opaque, "sad_8bit_32x32", "avx2", 40, &sad_8bit_32x32_avx2);
+#endif
+  return success;
+}
diff --git a/src/strategies/avx2/picture-avx2.h b/src/strategies/avx2/picture-avx2.h
new file mode 100644
index 00000000..10474d1d
--- /dev/null
+++ b/src/strategies/avx2/picture-avx2.h
@@ -0,0 +1,24 @@
+#ifndef STRATEGIES_PICTURE_AVX2_H_
+#define STRATEGIES_PICTURE_AVX2_H_
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/ + +int strategy_register_picture_avx2(void* opaque); + +#endif //STRATEGIES_PICTURE_AVX2_H_ diff --git a/src/strategies/strategies-picture.c b/src/strategies/strategies-picture.c index 1505659a..a52c9fc2 100644 --- a/src/strategies/strategies-picture.c +++ b/src/strategies/strategies-picture.c @@ -21,6 +21,7 @@ cost_pixel_nxn_func * satd_8bit_64x64 = 0; #include "generic/picture-generic.h" #include "sse2/picture-sse2.h" #include "sse41/picture-sse41.h" +#include "avx2/picture-avx2.h" #include "altivec/picture-altivec.h" @@ -35,6 +36,9 @@ int strategy_register_picture(void* opaque) { if (g_hardware_flags.intel_flags.sse41) { success &= strategy_register_picture_sse41(opaque); } + if (g_hardware_flags.intel_flags.avx2) { + success &= strategy_register_picture_avx2(opaque); + } if (g_hardware_flags.powerpc_flags.altivec) { success &= strategy_register_picture_altivec(opaque); } diff --git a/tests/intra_sad_tests.c b/tests/intra_sad_tests.c index 50c1274b..331eade3 100644 --- a/tests/intra_sad_tests.c +++ b/tests/intra_sad_tests.c @@ -52,8 +52,11 @@ static void setup_tests() for (int w = LCU_MIN_LOG_W; w <= LCU_MAX_LOG_W; ++w) { unsigned size = 1 << (w * 2); - bufs[test][w][0] = malloc(size * sizeof(pixel)); - bufs[test][w][1] = malloc(size * sizeof(pixel)); + bufs[test][w][0] = malloc(size * sizeof(pixel) + SIMD_ALIGNMENT); + bufs[test][w][0] = ALIGNED_POINTER(bufs[test][w][0], SIMD_ALIGNMENT); + + bufs[test][w][1] = malloc(size * sizeof(pixel) + SIMD_ALIGNMENT); + bufs[test][w][1] = ALIGNED_POINTER(bufs[test][w][1], SIMD_ALIGNMENT); } } @@ -69,8 +72,8 @@ static void tear_down_tests() { for (int test = 0; test < NUM_TESTS; ++test) { for (int log_width = 2; log_width <= 6; ++log_width) { - free(bufs[test][log_width][0]); - free(bufs[test][log_width][1]); + //free(bufs[test][log_width][0]); + //free(bufs[test][log_width][1]); } } }