diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c
index aae28a7c..0ba7c189 100644
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@@ -34,6 +34,77 @@
 #include "strategyselector.h"
 #include "strategies/generic/picture-generic.h"
 
+/**
+ * \brief Calculate Sum of Absolute Differences (SAD)
+ *
+ * Calculate Sum of Absolute Differences (SAD) between two rectangular regions
+ * located at arbitrary points in the picture.
+ *
+ * \param data1   Starting point of the first picture.
+ * \param data2   Starting point of the second picture.
+ * \param width   Width of the region for which SAD is calculated.
+ * \param height  Height of the region for which SAD is calculated.
+ * \param stride1 Stride of the first picture.
+ * \param stride2 Stride of the second picture.
+ *
+ * \returns Sum of Absolute Differences
+ */
+uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const data2,
+                          const int width, const int height, const unsigned stride1, const unsigned stride2)
+{
+  int32_t y, x;
+
+  // Bytes per scanline that fit in whole 256-bit blocks, and the remainder
+  const int largeblock_bytes = width & ~31;
+  const int residual_bytes_1 = width &  31;
+  const int residual_xmms    = residual_bytes_1 >> 4;
+  const int residual_bytes   = residual_bytes_1 &  15;
+
+  const __m128i rds    = _mm_set1_epi8 (residual_bytes);
+  const __m128i ns     = _mm_setr_epi8 (0, 1, 2,  3,  4,  5,  6,  7,
+                                        8, 9, 10, 11, 12, 13, 14, 15);
+  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);
+
+  __m256i avx_inc = _mm256_setzero_si256();
+  __m128i sse_inc = _mm_setzero_si128();
+
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < largeblock_bytes; x += 32) {
+      __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y * stride1 + x)));
+      __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y * stride2 + x)));
+      __m256i curr_sads = _mm256_sad_epu8(a, b);
+      avx_inc = _mm256_add_epi64(avx_inc, curr_sads);
+    }
+    if (residual_xmms) {
+      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y * stride1 + x)));
+      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y * stride2 + x)));
+      __m128i curr_sads = _mm_sad_epu8(a, b);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+      x += 16;
+    }
+    if (residual_bytes) {
+      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y * stride1 + x)));
+      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y * stride2 + x)));
+
+      __m128i b_masked  = _mm_blendv_epi8(a, b, rdmask);
+      __m128i curr_sads = _mm_sad_epu8(a, b_masked);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+    }
+  }
+  __m256i avx_inc_2   = _mm256_permute4x64_epi64(avx_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i avx_inc_3   = _mm256_add_epi64        (avx_inc, avx_inc_2);
+  __m256i avx_inc_4   = _mm256_shuffle_epi32    (avx_inc_3, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i avx_inc_5   = _mm256_add_epi64        (avx_inc_3, avx_inc_4);
+  __m128i avx_inc_128 = _mm256_castsi256_si128  (avx_inc_5);
+
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sse_sads  = _mm_add_epi64    (sse_inc, sse_inc_2);
+  __m128i sads      = _mm_add_epi64    (sse_sads, avx_inc_128);
+
+  // 32 bits are always enough, even for the largest blocks with a SAD of 255
+  // in every pixel, although the accumulated SAD results themselves are 64 bits
+  return _mm_cvtsi128_si32(sads);
+}
 
 /**
  * \brief Calculate SAD for 8x8 bytes in continuous memory.
@@ -1230,6 +1301,11 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
   // simplest code to look at for anyone interested in doing more
   // optimizations, so it's worth it to keep this maintained.
   if (bitdepth == 8){
+
+    // It currently appears that this is actually slower than the SSE4.1
+    // version. Go figure.
+    success &= kvz_strategyselector_register(opaque, "reg_sad", "avx2", 19, &kvz_reg_sad_avx2);
+
     success &= kvz_strategyselector_register(opaque, "sad_8x8", "avx2", 40, &sad_8bit_8x8_avx2);
     success &= kvz_strategyselector_register(opaque, "sad_16x16", "avx2", 40, &sad_8bit_16x16_avx2);
     success &= kvz_strategyselector_register(opaque, "sad_32x32", "avx2", 40, &sad_8bit_32x32_avx2);
@@ -1250,7 +1326,7 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
     success &= kvz_strategyselector_register(opaque, "satd_any_size_quad", "avx2", 40, &satd_any_size_quad_avx2);
 
     success &= kvz_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2);
-    success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "avx2", 40, &inter_recon_bipred_avx2); 
+    success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "avx2", 40, &inter_recon_bipred_avx2);
   }
 #endif
 
diff --git a/src/strategies/sse41/picture-sse41.c b/src/strategies/sse41/picture-sse41.c
index d75b4d58..b5559e33 100644
--- a/src/strategies/sse41/picture-sse41.c
+++ b/src/strategies/sse41/picture-sse41.c
@@ -28,63 +28,43 @@
 #include "strategyselector.h"
 
-unsigned kvz_reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2,
-                           const int width, const int height, const unsigned stride1, const unsigned stride2)
+uint32_t kvz_reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2,
+                           const int32_t width, const int32_t height, const uint32_t stride1,
+                           const uint32_t stride2)
 {
-  int y, x;
-  unsigned sad = 0;
-  __m128i sse_inc = _mm_setzero_si128 ();
-  long long int sse_inc_array[2];
+  int32_t y, x;
+  __m128i sse_inc = _mm_setzero_si128();
+  // Bytes per scanline that fit in whole 128-bit blocks, and the remainder
+  const int32_t largeblock_bytes = width & ~15;
+  const int32_t residual_bytes   = width &  15;
+
+  const __m128i rds    = _mm_set1_epi8 (residual_bytes);
+  const __m128i ns     = _mm_setr_epi8 (0, 1, 2,  3,  4,  5,  6,  7,
+                                        8, 9, 10, 11, 12, 13, 14, 15);
+  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);
+
   for (y = 0; y < height; ++y) {
-    for (x = 0; x <= width-16; x+=16) {
-      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
-      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
-      sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a,b));
+    for (x = 0; x < largeblock_bytes; x += 16) {
+      __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
+      __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
+      __m128i curr_sads = _mm_sad_epu8(a, b);
+      sse_inc = _mm_add_epi32(sse_inc, curr_sads);
     }
-    {
-      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
-      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
-      switch (((width - (width%2)) - x)/2) {
-        case 0:
-          break;
-        case 1:
-          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x01)));
-          break;
-        case 2:
-          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x03)));
-          break;
-        case 3:
-          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x07)));
-          break;
-        case 4:
-          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x0f)));
-          break;
-        case 5:
-          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x1f)));
-          break;
-        case 6:
-          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x3f)));
-          break;
-        case 7:
-          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x7f)));
-          break;
-        default:
-          //Should not happen
-          assert(0);
-      }
-      x = (width - (width%2));
-    }
+    if (residual_bytes) {
+      __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
+      __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
 
-    for (; x < width; ++x) {
-      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
+      __m128i b_masked  = _mm_blendv_epi8(a, b, rdmask);
+      __m128i curr_sads = _mm_sad_epu8(a, b_masked);
+      sse_inc = _mm_add_epi32(sse_inc, curr_sads);
     }
   }
-  _mm_storeu_si128((__m128i*) sse_inc_array, sse_inc);
-  sad += sse_inc_array[0] + sse_inc_array[1];
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
 
-  return sad;
+  return _mm_cvtsi128_si32(sad);
 }
 
 #endif //COMPILE_INTEL_SSE41
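
A note on the residual-byte handling in both rewritten functions: the old compile-time _mm_blend_epi16 switch only covered even residual widths (16-bit pairs) and still needed a scalar loop for a final odd pixel. The new code instead builds one runtime byte mask by comparing the constant vector {0, 1, ..., 15} against residual_bytes; _mm_blendv_epi8(a, b, rdmask) then yields b in the first residual_bytes lanes and a in the rest, so the lanes past the residual compare equal and contribute nothing to _mm_sad_epu8. The standalone sketch below, which is not part of the patch and uses made-up names (scalar_sad, masked_tail_sad), checks that trick against a plain scalar SAD; it only assumes an SSE4.1-capable compiler (built with e.g. gcc -O2 -msse4.1).

/* Standalone sketch: verify the blendv-based tail masking against a scalar SAD.
 * Not part of the patch; function names here are purely illustrative. */
#include <assert.h>
#include <smmintrin.h>   /* SSE4.1: _mm_blendv_epi8 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Plain scalar reference: SAD over the first len bytes. */
static uint32_t scalar_sad(const uint8_t *a, const uint8_t *b, int len)
{
  uint32_t sum = 0;
  for (int i = 0; i < len; i++)
    sum += abs(a[i] - b[i]);
  return sum;
}

/* SAD over the first residual_bytes (< 16) bytes of a 16-byte load. Lanes past
 * the residual are taken from a instead of b, so they add zero to the SAD,
 * which is the same trick the patch uses for partial scanlines. */
static uint32_t masked_tail_sad(const uint8_t *data1, const uint8_t *data2,
                                int residual_bytes)
{
  const __m128i rds    = _mm_set1_epi8((char)residual_bytes);
  const __m128i ns     = _mm_setr_epi8(0, 1, 2,  3,  4,  5,  6,  7,
                                       8, 9, 10, 11, 12, 13, 14, 15);
  /* 0xff in lanes 0..residual_bytes-1, 0x00 elsewhere. */
  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);

  __m128i a        = _mm_loadu_si128((const __m128i *)data1);
  __m128i b        = _mm_loadu_si128((const __m128i *)data2);
  __m128i b_masked = _mm_blendv_epi8(a, b, rdmask);

  /* _mm_sad_epu8 leaves one partial sum in each 64-bit half; add the halves. */
  __m128i sads = _mm_sad_epu8(a, b_masked);
  __m128i swap = _mm_shuffle_epi32(sads, _MM_SHUFFLE(1, 0, 3, 2));
  return _mm_cvtsi128_si32(_mm_add_epi64(sads, swap));
}

int main(void)
{
  uint8_t buf1[16], buf2[16];
  for (int i = 0; i < 16; i++) {
    buf1[i] = (uint8_t)(rand() & 0xff);
    buf2[i] = (uint8_t)(rand() & 0xff);
  }
  for (int tail = 0; tail < 16; tail++)
    assert(masked_tail_sad(buf1, buf2, tail) == scalar_sad(buf1, buf2, tail));
  puts("masked tail SAD matches scalar reference");
  return 0;
}

Because the mask depends only on the region width, both patched functions hoist rds, ns and rdmask out of the row loop and reuse the same mask for every scanline.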