diff --git a/src/strategies/avx2/sao-avx2.c b/src/strategies/avx2/sao-avx2.c index a87d3beb..b3d6e764 100644 --- a/src/strategies/avx2/sao-avx2.c +++ b/src/strategies/avx2/sao-avx2.c @@ -138,11 +138,11 @@ static int sao_edge_ddistortion_avx2(const kvz_pixel *orig_data, __m256i v_cat_epi32 = sao_calc_eo_cat_avx2(vector_a_epi8, vector_b_epi8, vector_c_epi8); - __m128i diff_lower_epi32 = _mm_loadu_si128((__m128i*)&orig_data[y * block_width + x] - c); + tmp_diff_epi32 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&orig_data[y * block_width + x] - c)); __m128i diff_upper_epi32 = _mm_loadl_epi64((__m128i*)&orig_data[y * block_width + x + 4] - c); - - tmp_diff_epi32 = _mm256_set_m128i(diff_upper_epi32, diff_lower_epi32); + tmp_diff_epi32 = _mm256_insertf128_si256(tmp_diff_epi32, diff_upper_epi32, 0x1); /* insertf128 returns the merged vector and leaves its argument untouched; the cast left the upper 128 bits undefined, so the result must be stored */ + tmp_offset_epi32 = _mm256_permutevar8x32_epi32(offsets_epi32, v_cat_epi32); offset_zeros_epi32 = _mm256_cmpeq_epi32(zeros_epi32, tmp_offset_epi32); @@ -306,10 +306,9 @@ static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data, __m256i v_cat_epi32 = sao_calc_eo_cat_avx2(vector_a_epi8, vector_b_epi8, vector_c_epi8); - __m128i temp_mem_lower_epi32 = _mm_loadu_si128((__m128i*)&orig_data[y * block_width + x] - c); + __m256i temp_mem_epi32 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&orig_data[y * block_width + x] - c)); __m128i temp_mem_upper_epi32 = _mm_loadl_epi64((__m128i*)&orig_data[y * block_width + x + 4] - c); - - __m256i temp_mem_epi32 = _mm256_set_m128i(temp_mem_upper_epi32, temp_mem_lower_epi32); + temp_mem_epi32 = _mm256_insertf128_si256(temp_mem_epi32, temp_mem_upper_epi32, 0x1); /* store the returned vector: the intrinsic does not write the upper lane into its argument in place */ // Check wich values are right for specific cat amount. // It's done for every single value that cat could get {1, 2, 0, 3, 4}