From 8b253209a8b0e05cb998bb0ed829ed3ba6ad9a53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Reima=20Hyv=C3=B6nen?= Date: Thu, 28 Mar 2019 12:46:41 +0200 Subject: [PATCH] Found false address load from calc_sao_edge_dir. Should now work like generic --- src/strategies/avx2/sao-avx2.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/sao-avx2.c b/src/strategies/avx2/sao-avx2.c index 4dd7eaf7..764a20eb 100644 --- a/src/strategies/avx2/sao-avx2.c +++ b/src/strategies/avx2/sao-avx2.c @@ -252,13 +252,13 @@ static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data, // Check wich values are right for specific cat amount. // It's done for every single value that cat could get {1, 2, 0, 3, 4} - + //-------------------------------------------------------------------------- // v_cat == 0 __m256i mask_epi32 = _mm256_cmpeq_epi32(zeros_epi32, v_cat_epi32); int temp_cnt = _mm_popcnt_u32(_mm256_movemask_epi8(mask_epi32)) / 4; cat_sum_cnt[1][0] += temp_cnt; - temp_mem_epi32 = _mm256_load_si256((__m256i*)&orig_data[y * block_width + x] - c_data[0]); + temp_mem_epi32 = _mm256_sub_epi32(_mm256_load_si256((__m256i*)&orig_data[y * block_width + x]), _mm256_cvtepu8_epi32(vector_c_epi8)); temp_epi32 = _mm256_and_si256(mask_epi32, temp_mem_epi32); tmp_zero_values_epi32 = _mm256_add_epi32(tmp_zero_values_epi32, temp_epi32); //-------------------------------------------------------------------------- @@ -336,9 +336,11 @@ static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data, __m256i v_cat_epi32 = sao_calc_eo_cat_avx2(&vector_a_epi8, &vector_b_epi8, &vector_c_epi8); - __m256i temp_mem_epi32 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&orig_data[y * block_width + x] - c_data[0])); - __m128i temp_mem_upper_epi32 = _mm_loadl_epi64((__m128i*)&orig_data[y * block_width + x + 4] - c_data[0]); + __m256i temp_mem_epi32 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&orig_data[y * block_width + x])); + __m128i temp_mem_upper_epi32 = _mm_loadl_epi64((__m128i*)&orig_data[y * block_width + x + 4]); _mm256_inserti128_si256(temp_mem_epi32, temp_mem_upper_epi32, 1); + + temp_mem_epi32 = _mm256_sub_epi32(temp_mem_epi32, _mm256_cvtepu8_epi32(vector_c_epi8)); // Check wich values are right for specific cat amount. // It's done for every single value that cat could get {1, 2, 0, 3, 4}