//----------------------------------------------------------------------------------------
// Copyright (c) 2024 Walter Mascarenhas
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// The MPL has a "Disclaimer of Warranty" and a "Limitation of Liability",
// and we provide no guarantee regarding the correctness of this source code.
//
//----------------------------------------------------------------------------------------
#include <bit>
#include <cassert>
#include <cfenv>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <immintrin.h>

#include "lp_2d.hpp"
//----------------------------------------------------------------------------------------
namespace wm::nlp::lp2d
{

__m256i bounds( __m256i* tiny, __m256i* huge )
{
  __m256i ones = ones_i();
  __m256i sign = _mm256_slli_epi64( ones, 63 );
  __m256i a    = _mm256_srli_epi64( sign, 1 );   // 0b0100'0000'0000...
  __m256i b    = _mm256_srli_epi64( sign, 3 );   // 0b0001'0000'0000...
  __m256i c    = _mm256_or_si256( a, b );        // 0b0101'0000'0000...
  __m256i d    = _mm256_srli_epi64( c, 4 );      // 0b0000'0101'0000...
  *huge        = _mm256_or_si256( c, d );        // 0b0101'0101'0000... 0x5500...
  __m256i e    = _mm256_srli_epi64( *huge, 1 );  // 0b0010'1010'1000...
  __m256i f    = _mm256_srli_epi64( d, 2 );      // 0b0000'0001'0100...
  __m256i g    = _mm256_srli_epi64( d, 4 );      // 0b0000'0000'0101...
  __m256i h    = _mm256_srli_epi64( ones, 1 );
  __m256i i    = _mm256_srli_epi64( sign, 32 );
  __m256i j    = _mm256_or_si256( f, g );        // 0b0000'0001'0101...
  *tiny        = _mm256_or_si256( e, j );        // 0b0010'1011'1101... 0x2bd...
  __m256i k    = and_not_i( i, h );
  *huge = blend< 0, 0, 0, 1 >( *huge, k );
  *tiny = blend< 0, 0, 0, 1 >( *tiny, zero_i() );
  return sign;
}

//----------------------------------------------------------------------------------------
// exact_location returns the exact location of the intersection ( x, y ) of the lines
//
//    b0[ 0 ] x + b0[ 1 ] y >= b0[ 2 ]
//    b1[ 0 ] x + b1[ 1 ] y >= b1[ 2 ]
//
// with respect to the set
//
//    e0 x + e1 y >= e2
//
// under the assumption that
//
//    d = b0[ 0 ] b1[ 1 ] - b0[ 1 ] b1[ 0 ] > 0
//
// where ei = edge[ i ], b0 = border[ 0 ] and b1 = border[ 1 ].
//
// This amounts to evaluating the sign of
//
//    p = e0 r + e1 s - e2 d
//
//    r = b0[ 2 ] b1[ 1 ] - b0[ 1 ] b1[ 2 ]
//    s = b0[ 0 ] b1[ 2 ] - b0[ 2 ] b1[ 0 ],
//
// which motivates the evaluation of the six products
//
//        group 1                 group 2
//    e0 b0[ 2 ] b1[ 1 ],    -e0 b0[ 1 ] b1[ 2 ]
//    e1 b0[ 0 ] b1[ 2 ],    -e1 b0[ 2 ] b1[ 0 ]
//    e2 b0[ 1 ] b1[ 0 ],    -e2 b0[ 0 ] b1[ 1 ]
//
// and we want to find the sign of their sum. (The scalar sketch right below this comment
// illustrates the error-free splitting that the AVX code applies to these products.)
//
// The function assumes that the edges' entries are either normal or zero, i.e.,
// there are no subnormal entries.
//----------------------------------------------------------------------------------------
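//----------------------------------------------------------------------------------------
// Scalar sketch of the splitting mentioned above (illustration only: split_product and
// two_prod are ad hoc names, and nothing else in this file depends on them). A product
// x * y is written as the rounded product plus its fma remainder, which is exact as long
// as the product neither overflows nor underflows -- the situation that bounds() and
// normalize_equations() work to ensure. Applying the splitting twice, first to the
// b0 x b1 products and then to their products with the edge, turns p into the exact sum
// of 24 doubles; exact_location stores those addends in a[ 0 ] ... a[ 5 ] and decides the
// sign of their sum in its main loop.
//----------------------------------------------------------------------------------------
namespace
{
  struct split_product
  {
    double rounded;    // fl( x * y )
    double remainder;  // x * y - fl( x * y ), exact when there is no over/underflow
  };

  [[maybe_unused]] inline split_product two_prod( double x, double y )
  {
    double r = x * y;
    return { r, std::fma( x, y, -r ) };
  }
}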
location exact_location( __m256d e, __m256d b0, __m256d b1 )
{
  //----------------------------------------------------------------------------------------
  //----------------------------------------------------------------------------------------
  __m256d b0_a = permute< 1, 2, 0, 3 >( b0 );        // { b0[ 1 ], b0[ 2 ], b0[ 0 ], ? }
  __m256d b0_b = permute< 2, 0, 1, 3 >( b0 );        // { b0[ 2 ], b0[ 0 ], b0[ 1 ], ? }
  __m256d b1_a = permute< 1, 2, 0, 3 >( b1 );        // { b1[ 1 ], b1[ 2 ], b1[ 0 ], ? }
  __m256d b1_b = permute< 2, 0, 1, 3 >( b1 );        // { b1[ 2 ], b1[ 0 ], b1[ 1 ], ? }

  __m256d g1  = _mm256_mul_pd( b0_b, b1_a );         // { b0[ 2 ] b1[ 1 ], b0[ 0 ] b1[ 2 ], b0[ 1 ] b1[ 0 ], ? }
  __m256d g2  = _mm256_mul_pd( b0_a, b1_b );         // { b0[ 1 ] b1[ 2 ], b0[ 2 ] b1[ 0 ], b0[ 0 ] b1[ 1 ], ? }
  __m256d g1b = _mm256_fmsub_pd( b0_b, b1_a, g1 );   // rounding errors of the products in g1
  __m256d g2b = _mm256_fmsub_pd( b0_a, b1_b, g2 );   // rounding errors of the products in g2

  __m256i sign_i  = _mm256_slli_epi64( ones_i(), 63 );
  __m256d sign    = cast_d( sign_i );
  __m256d minus_e = xor_d( e, sign );

  __m256d c[ 8 ];
  c[ 0 ] = _mm256_mul_pd( e, g1 );
  c[ 1 ] = _mm256_mul_pd( minus_e, g2 );
  c[ 2 ] = _mm256_mul_pd( e, g1b );
  c[ 3 ] = _mm256_mul_pd( minus_e, g2b );
  c[ 4 ] = _mm256_fmsub_pd( e, g1, c[ 0 ] );
  c[ 5 ] = _mm256_fmsub_pd( minus_e, g2, c[ 1 ] );
  c[ 6 ] = _mm256_fmsub_pd( e, g1b, c[ 2 ] );
  c[ 7 ] = _mm256_fmsub_pd( minus_e, g2b, c[ 3 ] );

  __m256d a[ 6 ];

  // transposing the products
  __m256d tmp_0 = gather_low( c[ 0 ], c[ 1 ] );
  __m256d tmp_1 = gather_high( c[ 0 ], c[ 1 ] );
  __m256d tmp_2 = gather_low( c[ 2 ], c[ 3 ] );
  __m256d tmp_3 = gather_high( c[ 2 ], c[ 3 ] );

  a[ 0 ] = _mm256_permute2f128_pd( tmp_0, tmp_2, 0x20 );
  a[ 1 ] = _mm256_permute2f128_pd( tmp_1, tmp_3, 0x20 );
  a[ 2 ] = _mm256_permute2f128_pd( tmp_0, tmp_2, 0x31 );

  __m256d tmp_4 = gather_low( c[ 4 ], c[ 5 ] );
  __m256d tmp_5 = gather_high( c[ 4 ], c[ 5 ] );
  __m256d tmp_6 = gather_low( c[ 6 ], c[ 7 ] );
  __m256d tmp_7 = gather_high( c[ 6 ], c[ 7 ] );

  a[ 3 ] = _mm256_permute2f128_pd( tmp_4, tmp_6, 0x20 );
  a[ 4 ] = _mm256_permute2f128_pd( tmp_5, tmp_7, 0x20 );
  a[ 5 ] = _mm256_permute2f128_pd( tmp_4, tmp_6, 0x31 );

  __m256d gt_05 = is_greater( a[ 0 ], a[ 5 ] );
  __m256d tmp   = blend_max( a[ 0 ], a[ 5 ], gt_05 );
  a[ 0 ] = blend_min( a[ 0 ], a[ 5 ], gt_05 );
  a[ 5 ] = tmp;

  //----------------------------------------------------------------------------------------
  // In the loop below we assume that a0[ j ] <= a5[ j ], j = 0,1,2,3
  //----------------------------------------------------------------------------------------
  while( true )
  {
    __m256d gt_12 = is_greater( a[ 1 ], a[ 2 ] );
    __m256d gt_34 = is_greater( a[ 3 ], a[ 4 ] );

    __m256d min_12 = blend_min( a[ 1 ], a[ 2 ], gt_12 );
    __m256d max_12 = blend_max( a[ 1 ], a[ 2 ], gt_12 );
    __m256d min_34 = blend_min( a[ 3 ], a[ 4 ], gt_34 );
    __m256d max_34 = blend_max( a[ 3 ], a[ 4 ], gt_34 );

    __m256d gt_min = is_greater( min_12, min_34 );
    __m256d gt_max = is_greater( max_12, max_34 );

    __m256d min_1234 = blend_min( min_12, min_34, gt_min );
    a[ 2 ] = blend_max( min_12, min_34, gt_min );
    a[ 3 ] = blend_min( max_12, max_34, gt_max );
    __m256d max_1234 = blend_max( max_12, max_34, gt_max );

    __m256d gt_0_min_1234 = is_greater( a[ 0 ], min_1234 );
    __m256d gt_max_1234_5 = is_greater( max_1234, a[ 5 ] );

    a[ 1 ] = blend_max( a[ 0 ], min_1234, gt_0_min_1234 );
    a[ 0 ] = blend_min( a[ 0 ], min_1234, gt_0_min_1234 );
    a[ 4 ] = blend_min( max_1234, a[ 5 ], gt_max_1234_5 );
    a[ 5 ] = blend_max( max_1234, a[ 5 ], gt_max_1234_5 );

    // 1) a0[ j ] = min { ai[ j ], i = 0, .... 5 }
    // 2) a5[ j ] = max { ai[ j ], i = 0, ....
5 } __m256d zero = _mm256_xor_pd( zero, zero ); __m256d cmp0 = _mm256_cmp_pd( a[ 0 ], zero, _CMP_LT_OQ ); if( _mm256_testz_pd( cmp0, cmp0 ) ) // all a0[ j ] are >= 0 { __m256d cmp = is_greater( a[ 5 ], zero ); if( _mm256_testz_pd( cmp, cmp ) ) // no entry in a5 is positive return location::border; else return location::in; } __m256d cmp5 = _mm256_cmp_pd( a[ 5 ], zero, _CMP_GT_OQ ); if( _mm256_testz_pd( cmp5, cmp5 ) ) // all a5[ j ] are <= 0 { __m256d cmp = is_greater( zero, a[ 0 ] ); if( _mm256_testz_pd( cmp, cmp ) ) // no entry in a0 is negative return location::border; else return location::out; } // now // 1) min ai[j] has its sign bit set // 2) max ai[j] does not have its sign bit set __m256d lo = gather_low( a[ 0 ], a[ 5 ] ); // { a0[ 0 ], a5[ 0 ], a0[ 2 ], a5[ 2 ] } __m256d hi = gather_high( a[ 0 ], a[ 5 ] ); // { a0[ 1 ], a5[ 1 ], a0[ 3 ], a5[ 3 ] } __m256d gt_lo_hi = is_greater( lo, hi ); __m256d b0 = blend_min( lo, hi, gt_lo_hi ); __m256d b5 = blend_max( lo, hi, gt_lo_hi ); // 1) min{ ai[ j ], i,j = 0,...5 } is in { b0[ 0 ], b0[ 2 ] } // 2) max{ ai[ j ], i,j = 0,...5 } is in { b5[ 1 ], b5[ 3 ] } b0 = permute< 0, 2, 1, 3 >( b0 ); b5 = permute< 3, 1, 2, 0 >( b5 ); // 1) min{ ai[ j ], i,j = 0,...5 } is in { b0[ 0 ], b0[ 1 ] } // 2) max{ ai[ j ], i,j = 0,...5 } is in { b5[ 0 ], b5[ 1 ] } lo = gather_low( b0, b5 ); // { b0[ 0 ], b5[ 0 ], b0[ 2 ], b5[ 2 ] } hi = gather_high( b0, b5 ); // { b0[ 1 ], b5[ 1 ], b0[ 3 ], b5[ 3 ] } gt_lo_hi = is_greater( lo, hi ); b0 = blend_min( lo, hi, gt_lo_hi ); b5 = blend_max( lo, hi, gt_lo_hi ); // 1) min{ ai[ j ], i,j = 0,...5 } is b0[ 0 ] // 2) max{ ai[ j ], i,j = 0,...5 } is b5[ 1 ] b5 = permute< 1, 0, 3, 2 >( b5 ); // 1) min{ ai[ j ], i,j = 0,...5 } is b0[ 0 ] <= 0 // 2) max{ ai[ j ], i,j = 0,...5 } is b5[ 0 ] >= 0 s // fixing possible break of order in positions 1,2,3 __m256d chk123 = is_greater( b0, b5 ); __m256d aux = blend_min( b0, b5, chk123 ); b5 = blend_max( b0, b5, chk123 ); b0 = aux; __m256i b0i = cast_i( b0 ); __m256i b5i = cast_i( b5 ); uint64_t u0 = _mm256_extract_epi64( b0i, 0 ) & 0x7FFF'FFFF'FFFF'FFFFULL; uint64_t u5 = _mm256_extract_epi64( b5i, 0 ); if( u0 >= u5 ) // | b0[ 0 | >= | b5[ 0 ] | { if( u0 > u5 + 0x0060'0000'0000'0000ULL ) // |b0| > 2^5 |b1| (we use 0x006... to handle subnormals properly) return location::out; __m256d sum = _mm256_add_pd( b0, b5 ); // b0 + b5, rounded down __m256d cor = _mm256_sub_pd( sum, b0 ); // sum - b0, rounded down __m256d delta = _mm256_sub_pd( b5, cor ); // b5 - cor, rounded down. // now b0[ 0 ] + b5[ 0 ] = sum[ 0 ] + delta[ 0 ], exactly, with sum[ 0 ] <= 0 <= delta[ 0 ] a[ 0 ] = blend< 1, 0, 0, 0 >( b0, sum ); a[ 5 ] = blend< 1, 0, 0, 0 >( b5, delta ); } else { if( u5 > u0 + 0x0060'0000'0000'0000ULL ) return location::in; __m256i sign_i = _mm256_slli_epi64( ones_i(), 63 ); __m256d sign = cast_d( sign_i ); __m256d minus_b0 = xor_d( b0, sign ); __m256d sum = _mm256_sub_pd( minus_b0, b5 ); // (-b5) + (-b0) rounded down __m256d cor = _mm256_add_pd( sum, b5 ); // sum - (-b5) rounded down __m256d delta = _mm256_sub_pd( minus_b0, cor ); // (-b0) - cor, rounded down. 
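// Worked example (illustration only): with b0[ 0 ] = -2^-60, b5[ 0 ] = 1 and rounding
// directed downwards, minus_b0[ 0 ] = 2^-60, sum[ 0 ] = fl( 2^-60 - 1 ) = -1,
// cor[ 0 ] = fl( -1 + 1 ) = 0 and delta[ 0 ] = 2^-60, so that
// sum[ 0 ] + delta[ 0 ] = -1 + 2^-60 = -( b0[ 0 ] + b5[ 0 ] ) exactly. The early returns
// above ensure that | b0[ 0 ] | and | b5[ 0 ] | are within a few powers of two of each
// other before this splitting is attempted.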
// -( b0[ 0 ] + b5[ 0 ] ) = sum[ 0 ] + delta[ 0 ], exactly, with sum[ 0 ] <= 0 <= delta[ 0 ] sum = xor_d( sum, sign ); delta = xor_d( delta, sign ); // b0[ 0 ] + b5[ 0 ] ) = sum[ 0 ] + delta[ 0 ], exactly, with delta[ 0 ] <= 0 <= sum[ 0 ] a[ 0 ] = blend< 1, 0, 0, 0 >( b0, delta ); a[ 5 ] = blend< 1, 0, 0, 0 >( b5, sum ); } } } //---------------------------------------------------------------------------------------- // bounded_convex::box: finding the box of the bounded convex region, under the assumption // that the normals are normalized, that is, for each edge e, either // e[ i ] = +0.0 or e[ i ] >= tiny ~ 2^(-100), i.e. no signed zeros, underflow // or overflow or negative coordinates. //---------------------------------------------------------------------------------------- __m256d bounded_convex::box() const { constexpr int non_0 = 0; constexpr int non_1 = 2; constexpr int inf_0 = 0; constexpr int sup_0 = 1; constexpr int inf_1 = 2; constexpr int sup_1 = 3; assert( type != bounded_convex_type::empty ); __m256d den; __m256d num; __m256i sign = _mm256_slli_epi64( ones_i(), 63 ); // { -1, -1, -1, -1 } __m256i sign_h = _mm256_slli_si256( sign, 8 ); // { 0, -1, 0, -1 } if( type != bounded_convex_type::polygon ) { if( type == bounded_convex_type::point ) { // { vertex_pair[ 0 ] = { x_inf, -x_sup, ? , ? } // { vertex_pair[ 1 ] = { y_inf, -y_sup, ? , ? } // { vertex_pair[ 2 ] = { z_inf, -z_sup, ? , ? } den = xor_d( vertex_p[ 2 ], sign_h ); den = permute< sup_0, inf_0, sup_0, inf_0 >( den ); num = gather_lanes< 0, 0 >( vertex_p[ 0 ], vertex_p[ 1 ] ); } else { __m256d num_x; __m256d num_y; den = xor_d( vertex_p[ 2 ], sign_h ); int xy_signs = high_bits_64( edge[ 1 ] ) & 0x3; switch( xy_signs ) { case 0: // edge[ 0 ] >= 0, edge[ 1 ] >= 0: box = { x0, x1, y1, y0 } { num_x = permute< inf_0, sup_1, non_1, non_1 >( vertex_p[ 0 ] ); num_y = permute< non_0, non_0, inf_1, sup_0 >( vertex_p[ 1 ] ); den = permute< sup_0, inf_1, sup_1, inf_0 >( den ); break; } case 1: // edge[ 0 ] < 0, edge[ 1 ] >= 0: box = { x0, x1, y0, y1 } { num_x = permute< inf_0, sup_1, non_1, non_1 >( vertex_p[ 0 ] ); num_y = permute< non_0, non_0, inf_0, sup_1 >( vertex_p[ 1 ] ); den = permute< sup_0, inf_0, sup_0, inf_1 >( den ); break; } case 2: // edge[ 0 ] >= 0, edge[ 1 ] < 0: box = {x1, x0, y1, y0 } { num_x = permute< inf_1, sup_0, non_1, non_1 >( vertex_p[ 0 ] ); num_y = permute< non_0, non_0, inf_1, sup_0 >( vertex_p[ 1 ] ); den = permute< sup_1, inf_0, sup_1, inf_0 >( den ); break; } case 3: // edge[ 0 ] < 0, edge[ 1 ] < 0: box = { x1, x0, y0, y1 } { num_x = permute< inf_1, sup_0, non_1, non_1 >( vertex_p[ 0 ] ); num_y = permute< non_0, non_0, inf_0, sup_1 >( vertex_p[ 1 ] ); den = permute< sup_1, inf_0, sup_0, inf_1 >( den ); break; } } num = gather_lanes< 0, 1 >( num_x, num_y ); } } else { int index = begin; auto set_x_max = [ & ]() { int previous_vertex = ( ( index > 0 ) ? index : end ); int previous_pair = 3 * ( --previous_vertex / 2 ); __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h ); if( previous_vertex & 0x1 ) { z = permute< non_0, inf_1, non_1, non_1 >( z ); __m256d x = gather_lanes< 1, 1 >( vertex_p[ previous_pair ] ); num = blend< 0, 1, 0, 0 >( num, x ); } else { z = permute< non_0, inf_0, non_1, non_1 >( z ); num = blend< 0, 1, 0, 0 >( num, vertex_p[ previous_pair ] ); } den = blend< 0, 1, 0, 0 >( den, z ); }; auto set_x_min = [ & ]() { int previous_vertex = ( ( index > 0 ) ? 
index : end ); int previous_pair = 3 * ( --previous_vertex / 2 ); __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h ); if( previous_vertex & 0x1 ) { z = permute< sup_1, non_0, non_1, non_1 >( z ); __m256d x = gather_lanes< 1, 1 >( vertex_p[ previous_pair ] ); num = blend< 1, 0, 0, 0 >( num, x ); } else { z = permute< sup_0, non_0, non_1, non_1 >( z ); num = blend< 1, 0, 0, 0 >( num, vertex_p[ previous_pair ] ); } den = blend< 1, 0, 0, 0 >( den, z ); }; auto set_y_max = [ & ]() { int previous_vertex = ( ( index > 0 ) ? index : end ); int previous_pair = 3 * ( --previous_vertex / 2 ); __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h ); if( previous_vertex & 0x1 ) { z = permute< non_0, non_0, non_1, inf_1 >( z ); num = blend< 0, 0, 0, 1 >( num, vertex_p[ previous_pair + 1 ] ); } else { z = permute< non_0, non_0, non_1, inf_0 >( z ); __m256d y = gather_lanes< 0, 0 >( vertex_p[ previous_pair + 1 ] ); num = blend< 0, 0, 0, 1 >( num, y ); } den = blend< 0, 0, 0, 1 >( den, z ); }; auto set_y_min = [ & ]() { int previous_vertex = ( ( index > 0 ) ? index : end ); int previous_pair = 3 * ( --previous_vertex / 2 ); __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h ); if( previous_vertex & 0x1 ) { z = permute< non_1, non_1, sup_1, non_1 >( z ); num = blend< 0, 0, 1, 0 >( num, vertex_p[ previous_pair + 1 ] ); } else { z = permute< non_0, non_0, sup_0, non_0 >( z ); __m256d y = gather_lanes< 0, 0 >( vertex_p[ previous_pair + 1 ] ); num = blend< 0, 0, 1, 0 >( num, y ); } den = blend< 0, 0, 1, 0 >( den, z ); }; int xy_signs = high_bits_64( edge[ index ] ) & 0x3; switch( xy_signs ) { case 0: // normal[ 0 ] >= 0, normal[ 1 ] >= 0: first quadrant { if( index & 0x1 ) goto quadrant_1_odd; else goto quadrant_1_even; } case 1: // normal[ 0 ] < 0, normal[ 1 ] >= 0: second quadrant { if( index & 0x1 ) goto quadrant_2_odd; else goto quadrant_2_even; } case 2: // normal[ 0 ] >= 0, normal[ 1 ] < 0: fourth quadrant { if( index & 0x1 ) goto quadrant_4_odd; else goto quadrant_4_even; } case 3: // normal[ 0 ] < 0, normal[ 1 ] < 0: third quadrant { if( index & 0x1 ) goto quadrant_3_odd; else goto quadrant_3_even; } } quadrant_1_even: { if( ++index > end ) goto done; int xy_signs = high_bits_64( edge[ index ] ) & 0x3; switch( xy_signs ) { case 0: // quadrant_1 goto quadrant_1_odd; case 1: // quadrant_2 { set_y_min(); goto quadrant_2_odd; } case 3: // quadrant_3 { set_x_max(); set_y_min(); goto quadrant_3_odd; } } } quadrant_1_odd: { if( ++index > end ) goto done; int xy_signs = high_bits_64( edge[ index ] ) & 0x3; switch( xy_signs ) { case 0: // quadrant_1 goto quadrant_1_even; case 1: // quadrant_2 { set_y_min(); goto quadrant_2_even; } case 3: // quadrant_3 { set_x_max(); set_y_min(); goto quadrant_3_even; } } } quadrant_2_even: { if( ++index > end ) goto done; int xy_signs = high_bits_64( edge[ index ] ) & 0x3; switch( xy_signs ) { case 1: // quadrant_2 goto quadrant_2_odd; case 2:// quadrant_4 { set_x_max(); set_y_max(); goto quadrant_4_odd; } case 3: // quadrant_3 { set_x_max(); goto quadrant_3_odd; } } } quadrant_2_odd: { if( ++index > end ) goto done; int xy_signs = high_bits_64( edge[ index ] ) & 0x3; switch( xy_signs ) { case 1: // quadrant_2 goto quadrant_2_even; case 2:// quadrant_4 { set_x_max(); set_y_max(); goto quadrant_4_even; } case 3: // quadrant_3 { set_x_max(); goto quadrant_3_even; } } } quadrant_3_even: { if( ++index > end ) goto done; int xy_signs = high_bits_64( edge[ index ] ) & 0x3; switch( xy_signs ) { case 0: // quadrant_1 { set_y_max(); set_x_min(); goto quadrant_1_odd; } 
case 2:// quadrant_4 { set_y_max(); goto quadrant_4_odd; } case 3: // quadrant_3 goto quadrant_3_odd; } } quadrant_3_odd: { if( ++index > end ) goto done; int xy_signs = high_bits_64( edge[ index ] ) & 0x3; switch( xy_signs ) { case 0: // quadrant_1 { set_x_min(); set_y_max(); goto quadrant_1_even; } case 2:// quadrant_4 { set_y_max(); goto quadrant_4_even; } case 3: // quadrant_3 goto quadrant_3_even; } } quadrant_4_even: { if( ++index > end ) goto done; int xy_signs = high_bits_64( edge[ index ] ) & 0x3; switch( xy_signs ) { case 0: // quadrant_1 { set_x_min(); goto quadrant_1_odd; } case 1: // quadrant_2 { set_x_min(); set_y_min(); goto quadrant_2_odd; } case 2:// quadrant_4 goto quadrant_4_odd; } } quadrant_4_odd: { if( ++index > end ) goto done; int xy_signs = high_bits_64( edge[ index ] ) & 0x3; switch( xy_signs ) { case 0: // quadrant_1 { set_x_min(); goto quadrant_1_even; } case 1: // quadrant_2 { set_x_min(); set_y_min(); goto quadrant_2_even; } case 2:// quadrant_4 goto quadrant_4_even; } } done: {} } num = _mm256_div_pd( num, den ); num = xor_d( num, sign_h ); return num; } int bounded_convex::insert_vertex( int at ) { int high = end - at; int low = at - begin; assert( ( low > 0 ) && ( high > 0 ) ); if( low <= high ) { __m256d* ed = edge + ( begin - 1 ); __m256d* ed_max = ed + low; for( ; ed < ed_max; ++ed ) { __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( ed + 1 ) ); _mm256_store_pd( reinterpret_cast< double* >( ed ), tmp ); } int to_load; __m256d* vp = vertex_pair( begin ); __m256d a0 = _mm256_load_pd( reinterpret_cast< double* >( vp ) ); __m256d a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 1 ) ); __m256d a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 2 ) ); if( ( begin & 0x1 ) == 0 ) { __m256d c0 = gather_lanes< clear_lane, 0 >( a0 ); __m256d c1 = gather_lanes< clear_lane, 0 >( a1 ); __m256d c2 = gather_lanes< clear_lane, 0 >( a2 ); _mm256_store_pd( reinterpret_cast< double* >( vp - 3 ), c0 ); _mm256_store_pd( reinterpret_cast< double* >( vp - 2 ), c1 ); _mm256_store_pd( reinterpret_cast< double* >( vp - 1 ), c2 ); if( low == 1 ) { --begin; return at - 1; } to_load = low - 2; } else to_load = low - 1; int q = to_load / 4; __m256d* vp_max = vp + 6 * q; while( vp < vp_max ) { __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) ); __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) ); __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) ); __m256d c0 = gather_lanes< 1, 0 >( a0, b0 ); __m256d c1 = gather_lanes< 1, 0 >( a1, b1 ); __m256d c2 = gather_lanes< 1, 0 >( a2, b2 ); _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); vp += 3; a0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) ); a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) ); a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) ); __m256d d0 = gather_lanes< 1, 0 >( b0, a0 ); __m256d d1 = gather_lanes< 1, 0 >( b1, a1 ); __m256d d2 = gather_lanes< 1, 0 >( b2, a2 ); _mm256_store_pd( reinterpret_cast< double* >( vp ), d0 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 ); vp += 3; } int r = to_load % 4; if( r == 0 ) { __m256d c0 = gather_lanes< 1, clear_lane >( a0 ); __m256d c1 = gather_lanes< 1, clear_lane >( a1 ); __m256d c2 = gather_lanes< 1, clear_lane >( a2 ); _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); 
_mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); } else { __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) ); __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) ); __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) ); __m256d c0 = gather_lanes< 1, 0 >( a0, b0 ); __m256d c1 = gather_lanes< 1, 0 >( a1, b1 ); __m256d c2 = gather_lanes< 1, 0 >( a2, b2 ); _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); switch( r ) { case 1: goto low_end; case 2: { __m256d c0 = gather_lanes< 1, clear_lane >( b0 ); __m256d c1 = gather_lanes< 1, clear_lane >( b1 ); __m256d c2 = gather_lanes< 1, clear_lane >( b2 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 3 ), c0 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 4 ), c1 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 5 ), c2 ); goto low_end; } default: { vp += 3; a0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) ); a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) ); a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) ); __m256d c3 = gather_lanes< 1, 0 >( b0, a0 ); __m256d c4 = gather_lanes< 1, 0 >( b1, a1 ); __m256d c5 = gather_lanes< 1, 0 >( b2, a2 ); _mm256_store_pd( reinterpret_cast< double* >( vp ), c3 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c4 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c5 ); } } } low_end: { --begin; return at - 1; } } else { __m256d* ed = edge + end + 1; __m256d* ed_min = ed - ( high + 1 ); for( ; ed > ed_min; --ed ) { __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( ed - 1 ) ); _mm256_store_pd( reinterpret_cast< double* >( ed ), tmp ); } __m256d* vp = vertex_pair( end ); __m256d a0; __m256d a1; __m256d a2; int to_load; if( ( end & 0x1 ) == 0 ) { a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) ); a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) ); a2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) ); __m256d c0 = gather_lanes< 1, clear_lane >( a0 ); __m256d c1 = gather_lanes< 1, clear_lane >( a1 ); __m256d c2 = gather_lanes< 1, clear_lane >( a2 ); _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); if( high == 1 ) { ++end; return at; } vp -= 3; to_load = high - 2; } else { to_load = high - 1; a0 = _mm256_load_pd( reinterpret_cast< double* >( vp ) ); a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 1 ) ); a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 2 ) ); } int q = to_load / 4; __m256d* vp_min = vp - 6 * q; while( vp > vp_min ) { __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) ); __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) ); __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) ); __m256d c0 = gather_lanes< 1, 0 >( b0, a0 ); __m256d c1 = gather_lanes< 1, 0 >( b1, a1 ); __m256d c2 = gather_lanes< 1, 0 >( b2, a2 ); _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); vp -= 3; a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) ); a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) ); a2 = _mm256_load_pd( reinterpret_cast< double* >( 
vp - 1 ) ); __m256d d0 = gather_lanes< 1, 0 >( a0, b0 ); __m256d d1 = gather_lanes< 1, 0 >( a1, b1 ); __m256d d2 = gather_lanes< 1, 0 >( a2, b2 ); _mm256_store_pd( reinterpret_cast< double* >( vp ), d0 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 ); vp -= 3; } int r = to_load % 4; if( r == 0 ) { __m256d c0 = gather_lanes< clear_lane, 0 >( a0 ); __m256d c1 = gather_lanes< clear_lane, 0 >( a1 ); __m256d c2 = gather_lanes< clear_lane, 0 >( a2 ); _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); } else { __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) ); __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) ); __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) ); __m256d c0 = gather_lanes< 1, 0 >( b0, a0 ); __m256d c1 = gather_lanes< 1, 0 >( b1, a1 ); __m256d c2 = gather_lanes< 1, 0 >( b2, a2 ); _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); switch( r ) { case 1: goto high_end; case 2: { __m256d d0 = gather_lanes< clear_lane, 0 >( b0 ); __m256d d1 = gather_lanes< clear_lane, 0 >( b1 ); __m256d d2 = gather_lanes< clear_lane, 0 >( b2 ); _mm256_store_pd( reinterpret_cast< double* >( vp - 3 ), d0 ); _mm256_store_pd( reinterpret_cast< double* >( vp - 2 ), d1 ); _mm256_store_pd( reinterpret_cast< double* >( vp - 1 ), d2 ); goto high_end; } default: // case 3 { vp -= 3; a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) ); a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) ); a2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) ); __m256d d0 = gather_lanes< 1, 0 >( a0, b0 ); __m256d d1 = gather_lanes< 1, 0 >( a1, b1 ); __m256d d2 = gather_lanes< 1, 0 >( a2, b2 ); _mm256_store_pd( reinterpret_cast< double* >( vp ), d0 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 ); _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 ); } } } high_end: { ++end; return at; } } } void bounded_convex::push( __m256d* begin, __m256d* end ) { for( ; begin < end; ++begin ) { int sign = _mm256_movemask_pd( *begin ) & 0x7; switch( sign ) { case 0b000: { locator< 0, 0, 0 >::insert( *this, *begin ); break; } case 0b001: { locator< 1, 0, 0 >::insert( *this, *begin ); break; } case 0b010: { locator< 0, 1, 0 >::insert( *this, *begin ); break; } case 0b011: { locator< 1, 1, 0 >::insert( *this, *begin ); break; } case 0b100: { locator< 0, 0, 1 >::insert( *this, *begin ); break; } case 0b101: { locator< 1, 0, 1 >::insert( *this, *begin ); break; } case 0b110: { locator< 0, 1, 1 >::insert( *this, *begin ); break; } case 0b111: { locator< 1, 1, 1 >::insert( *this, *begin ); break; } } if( type == bounded_convex_type::empty ) return; } } int bounded_convex::remove_vertices( int begin_vertex, int end_vertex ) { int re = end_vertex & 0x1; int rb = begin_vertex & 0x1; int low = begin_vertex - begin; int high = end - end_vertex; if( low <= high ) { __m256d* e_dst = edge + end_vertex; __m256d* e_dst_min = e_dst - low; __m256d* e_src = edge + begin_vertex; while( --e_dst >= e_dst_min ) { __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( --e_src ) ); _mm256_store_pd( reinterpret_cast< double* >( e_dst ), tmp ); } __m256d* v_dst = vertex_pair( end_vertex ); __m256d* v_src = vertex_pair( begin_vertex ); if( 
re == rb ) { int q; if( re & 0x1 ) { __m256d s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); __m256d s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); __m256d s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) ); __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) ); __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) ); __m256d b0 = blend< 0, 0, 1, 1 >( s0, d0 ); __m256d b1 = blend< 0, 0, 1, 1 >( s1, d1 ); __m256d b2 = blend< 0, 0, 1, 1 >( s2, d2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), b0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), b1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), b2 ); q = low; } else q = ( low + 1 ); q /= 2; __m256d* v_src_min = v_src - 3 * q; while( v_src > v_src_min ) { --v_src; --v_dst; __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), tmp ); } } else { __m256d s0; __m256d s1; __m256d s2; int to_load; if( rb == 0 ) { v_src -= 3; s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); __m256d p0 = gather_lanes< 1, clear_lane >( s0 ); __m256d p1 = gather_lanes< 1, clear_lane >( s1 ); __m256d p2 = gather_lanes< 1, clear_lane >( s2 ); __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) ); __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) ); __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) ); __m256d b0 = blend< 0, 0, 1, 1 >( p0, d0 ); __m256d b1 = blend< 0, 0, 1, 1 >( p1, d1 ); __m256d b2 = blend< 0, 0, 1, 1 >( p2, d2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), b0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), b1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), b2 ); if( low == 1 ) goto low_end; to_load = low - 2; } else { to_load = low - 1; s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); } int q = to_load / 4; __m256d* v_src_min = v_src - 6 * q; while( v_src > v_src_min ) { v_src -= 3; v_dst -= 3; __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); __m256d c0 = gather_lanes< 1, 0 >( b0, s0 ); __m256d c1 = gather_lanes< 1, 0 >( b1, s1 ); __m256d c2 = gather_lanes< 1, 0 >( b2, s2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 ); v_dst -= 3; v_src -= 3; s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); __m256d d0 = gather_lanes< 1, 0 >( s0, b0 ); __m256d d1 = gather_lanes< 1, 0 >( s1, b1 ); __m256d d2 = gather_lanes< 1, 0 >( s2, b2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 ); } v_dst -= 3; int r = to_load % 4; if( r == 0 ) { __m256d c0 = gather_lanes< 
clear_lane, 0 >( s0 ); __m256d c1 = gather_lanes< clear_lane, 0 >( s1 ); __m256d c2 = gather_lanes< clear_lane, 0 >( s2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 ); } else { v_src -= 3; __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); __m256d c0 = gather_lanes< 1, 0 >( b0, s0 ); __m256d c1 = gather_lanes< 1, 0 >( b1, s1 ); __m256d c2 = gather_lanes< 1, 0 >( b2, s2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 ); switch( r ) { case 1: goto low_end; case 2: { v_dst -= 3; __m256d d0 = gather_lanes< clear_lane, 0 >( b0 ); __m256d d1 = gather_lanes< clear_lane, 0 >( b1 ); __m256d d2 = gather_lanes< clear_lane, 0 >( b2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 ); goto low_end; } default: { v_dst -= 3; v_src -= 3; s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); __m256d d0 = gather_lanes< 1, 0 >( s0, b0 ); __m256d d1 = gather_lanes< 1, 0 >( s1, b1 ); __m256d d2 = gather_lanes< 1, 0 >( s2, b2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 ); v_dst -= 3; s0 = gather_lanes< clear_lane, 0 >( s0, b0 ); s1 = gather_lanes< clear_lane, 0 >( s1, b1 ); s2 = gather_lanes< clear_lane, 0 >( s2, b2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), s0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), s1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), s2 ); } } } } low_end: { begin += ( end_vertex - begin_vertex ); return end_vertex - 1; } } else { __m256d* e_dst = edge + begin_vertex; __m256d* e_src = edge + end_vertex; __m256d* e_src_max = e_src + high + 1; for( ; e_src < e_src_max; ++e_src, ++e_dst ) { __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( e_src ) ); _mm256_store_pd( reinterpret_cast< double* >( e_dst ), tmp ); } __m256d s0; __m256d s1; __m256d s2; __m256d* v_dst = vertex_pair( begin_vertex ); __m256d* v_src = vertex_pair( end_vertex ); if( re == rb ) { if( re & 0x1 ) { s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) ); __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) ); __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) ); __m256d c0 = blend< 0, 0, 1, 1 >( d0, s0 ); __m256d c1 = blend< 0, 0, 1, 1 >( d1, s1 ); __m256d c2 = blend< 0, 0, 1, 1 >( d2, s2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 ); --high; v_src += 3; v_dst += 3; } int q = 3 * ( ( high + 1 ) / 2 ); __m256d* v_src_max = 
vertex_pair( end + 1 ); for( ; v_src < v_src_max; ++v_src, ++v_dst ) { __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), tmp ); } } else { int to_load; __m256d s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); __m256d s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); __m256d s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); if( rb ) // re = 0 =: { __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) ); __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) ); __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) ); __m256d g0 = gather_lanes< 0, 0 >( d0, s0 ); __m256d g1 = gather_lanes< 0, 0 >( d1, s1 ); __m256d g2 = gather_lanes< 0, 0 >( d2, s2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), g0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), g1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), g2 ); if( high == 1 ) goto high_end; v_dst += 3; to_load = high - 2; } else // re = 1 => only the second lane of the si are used to_load = high - 1; int q = to_load / 4; __m256d* v_src_max = v_src + 6 * q; while( v_src < v_src_max ) { v_src += 3; __m256d t0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); __m256d t1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); __m256d t2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); __m256d u0 = gather_lanes< 1, 0 >( s0, t0 ); __m256d u1 = gather_lanes< 1, 0 >( s1, t1 ); __m256d u2 = gather_lanes< 1, 0 >( s2, t2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 ); v_dst += 3; v_src += 3; s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); __m256d v0 = gather_lanes< 1, 0 >( t0, s0 ); __m256d v1 = gather_lanes< 1, 0 >( t1, s1 ); __m256d v2 = gather_lanes< 1, 0 >( t2, s2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 ); v_dst += 3; } int r = to_load % 4; if( r == 0 ) { __m256d u0 = gather_lanes< 1, clear_lane >( s0 ); __m256d u1 = gather_lanes< 1, clear_lane >( s1 ); __m256d u2 = gather_lanes< 1, clear_lane >( s2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 ); } else { v_src += 3; __m256d t0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); __m256d t1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); __m256d t2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); __m256d u0 = gather_lanes< 1, 0 >( s0, t0 ); __m256d u1 = gather_lanes< 1, 0 >( s1, t1 ); __m256d u2 = gather_lanes< 1, 0 >( s2, t2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 ); switch( r ) { case 1: goto high_end; case 2: { v_dst += 3; __m256d v0 = gather_lanes< 1, clear_lane >( t0 ); __m256d v1 = gather_lanes< 1, clear_lane >( t1 ); __m256d v2 = gather_lanes< 1, clear_lane >( t2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 ); 
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 ); goto high_end; } default: // case 3 { v_dst += 3; v_src += 3; s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); __m256d v0 = gather_lanes< 1, 0 >( t0, s0 ); __m256d v1 = gather_lanes< 1, 0 >( t1, s1 ); __m256d v2 = gather_lanes< 1, 0 >( t2, s2 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 ); _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 ); } } } } high_end: { end -= ( end_vertex - begin_vertex ); return begin_vertex - 1; } } } void bounded_convex::set_to_last_segment() { int last = end - 1; edge[ 0 ] = edge[ last ]; edge[ 1 ] = edge[ end ]; edge[ 2 ] = edge[ begin + 1 ]; __m256d* vl = vertex_pair( last ); __m256d* vb = vertex_pair( begin ); if( last & 0x1 ) { if( begin & 0x1 ) { vertex_p[ 0 ] = gather_lanes< 1, 1 >( vl[ 0 ], vb[ 0 ] ); vertex_p[ 1 ] = gather_lanes< 1, 1 >( vl[ 1 ], vb[ 1 ] ); vertex_p[ 2 ] = gather_lanes< 1, 1 >( vl[ 2 ], vb[ 2 ] ); } else { vertex_p[ 0 ] = gather_lanes< 1, 0 >( vl[ 0 ], vb[ 0 ] ); vertex_p[ 1 ] = gather_lanes< 1, 0 >( vl[ 1 ], vb[ 1 ] ); vertex_p[ 2 ] = gather_lanes< 1, 0 >( vl[ 2 ], vb[ 2 ] ); } } else { if( begin & 0x1 ) { vertex_p[ 0 ] = blend< 0, 0, 1, 1 >( vl[ 0 ], vb[ 0 ] ); vertex_p[ 1 ] = blend< 0, 0, 1, 1 >( vl[ 1 ], vb[ 1 ] ); vertex_p[ 2 ] = blend< 0, 0, 1, 1 >( vl[ 2 ], vb[ 2 ] ); } else { vertex_p[ 0 ] = gather_lanes< 0, 0 >( vl[ 0 ], vb[ 0 ] ); vertex_p[ 1 ] = gather_lanes< 0, 0 >( vl[ 1 ], vb[ 1 ] ); vertex_p[ 2 ] = gather_lanes< 0, 0 >( vl[ 2 ], vb[ 2 ] ); } } end = 2; begin = 0; type = bounded_convex_type::segment; } void bounded_convex::setup( __m256d box ) { type = bounded_convex_type::polygon; __m256d zero = _mm256_xor_pd( zero, zero ); __m256i ones = _mm256_cmpeq_epi64( ones, ones ); __m256i signi = _mm256_slli_epi64( ones, 63 ); __m256i signi_h = _mm256_slli_si256( signi, 8 ); __m256d sign = _mm256_castsi256_pd( signi ); __m256d sign_h = _mm256_castsi256_pd( signi_h ); __m256i onei = _mm256_srli_epi64( ones, 54 ); onei = _mm256_slli_epi64( onei, 52 ); // { 1.0, 1.0, 1.0, 1.0 } __m256d one = _mm256_castsi256_pd( onei ); box = _mm256_max_pd( box, zero ); begin = alloc / 2; end = begin + 4; __m256d* e = edge + begin; __m256d x_box = gather_lanes< 0, 0 >( box ); // { x_min, x_max, x_min, x_max } __m256d y_box = gather_lanes< 1, 1 >( box ); // { y_min, y_max, y_min, y_max } e[ 0 ] = blend< 0, 1, 0, 0 >( one, zero ); // { 1, 0, 0, 0}, e[ 0 ] = blend< 0, 0, 1, 1 >( e[ 0 ], x_box ); // { 1, 0, x_min, x_max }, for x >= x_min e[ 1 ] = permute< 1, 0, 2, 3 >( e[ 0 ] ); // { 0 , 1, x_min, x_max } // y >= y_min e[ 1 ] = blend< 0, 0, 1, 1 >( e[ 1 ], y_box ); // { 0 , 1, y_min, y_max } // y >= y_min e[ 2 ] = permute< 0, 1, 3, 2 >( e[ 0 ] ); // { 1 , 0, x_max, x_min } e[ 2 ] = _mm256_xor_pd( e[ 2 ], sign ); // { -1 , -, -x_max, -y_min } -x >= -x_max e[ 3 ] = permute< 0, 1, 3, 2 >( e[ 1 ] ); // { 0, 1, y_max, ? } e[ 3 ] = _mm256_xor_pd( e[ 3 ], sign ); // { 0, 1, -y_max, ? 
} -y >= -y_max e[ 4 ] = e[ 0 ]; // { 1, 0, x_min, } } // initial vertices __m256d* vp = vertex_pair( begin ); vp[ 0 ] = permute< 0, 0, 3, 3 >( x_box ); // { x_min, x_min, x_max, x_max } : vertices 0 and 1: vp[ 0 ] = _mm256_xor_pd( vp[ 0 ], sign_h ); // { x_min, -x_min, x_max, -x_max } : vertices 0 and 1: vp[ 1 ] = permute< 0, 0, 2, 2 >( y_box ); // { y_min, y_min, y_min, y_min } vp[ 1 ] = _mm256_xor_pd( vp[ 1 ], sign_h ); // { y_min, -y_min, y_min, -y_min } vp[ 2 ] = _mm256_xor_pd( one, sign_h ); // { 1.0, -1.0, 1.0, -1.0 } vp[ 3 ] = permute< 1, 1, 2, 2 >( x_box ); // { x_max, x_max, x_min, x_min } : vertices 2 and 3: vp[ 3 ] = _mm256_xor_pd( vp[ 3 ], sign_h ); // { x_max, -x_max, x_min, -x_min } vp[ 4 ] = permute< 1, 1, 3, 3 >( y_box ); // { y_max, y_max, y_max, y_max } vp[ 4 ] = _mm256_xor_pd( vp[ 4 ], sign_h ); // { y_max, -y_max, y_max, -y_max } vp[ 5 ] = vp[ 2 ]; } int normalize_equations( __m256d* quadrant[ 4 ], int n, __m256i tiny, __m256i huge ) { constexpr unsigned long long x_index_epi8 = 0x1 << 4; constexpr unsigned long long y_index_epi8 = 0x1 << 12; constexpr unsigned long long z_index_epi8 = 0x1 << 16; int ret = 0; __m256i ones = ones_i(); // { 0xFFFF'FFFF'FFFF'FFFF, ... } __m256i exp_mask = _mm256_srli_epi64( ones, 53 ); // { 0x0000'0000'0000'07FF, .... } __m256i one = _mm256_srli_epi64( ones, 54 ); // { 0x0000'0000'0000'03FF, .... } __m256i sign = _mm256_slli_epi64( ones, 63 ); // { 0x8000'0000'0000'0000, .... } exp_mask = _mm256_slli_epi64( exp_mask, 52 ); // { 0x7FF0'0000'0000'0000, .... } one = _mm256_slli_epi64( one, 52 ); // { 0x3FF0'0000'0000'0000, .... } sign = gather_lanes< 1, clear_lane >( sign ); // { sign_bit, sign_bit, 0, 0 } for( int iq = 0; iq < 4; ++iq ) { __m256d* end = quadrant[ iq ] + n; while( quadrant[ iq ] < end ) { __m256d* q = quadrant[ iq ]; __m256i exp = and_i( *q, exp_mask ); __m256i cmp_huge = _mm256_cmpgt_epi32( exp, huge ); if( _mm256_testz_si256( cmp_huge, cmp_huge ) ) // no huge entries { __m256i cmp_tiny = _mm256_cmpgt_epi32( tiny, exp ); if( _mm256_testz_si256( cmp_tiny, cmp_tiny ) ) // no entries are huge or tiny { ++quadrant[ iq ]; continue; } // no entries are huge, but aome are tiny. We scale by 2^(huge - max_exp[ i ] ) __m256i exp_b = permute< 1, 0, 2, 3 >( exp ); // { exp[ 1 ], exp[ 0 ], exp[ 2 ], ? } __m256i exp_c = gather_lanes< 1, 1 >( exp ); // { exp[ 2 ], exp[ 3 ], exp[ 2 ], ? } __m256i max = _mm256_max_epi32( exp, exp_b ); // { max( exp[ 0 ], exp[ 1 ] ), ?. ?. ? } max = _mm256_max_epi32( max, exp_c ); // { max{ exp[ 0 ], exp[ 1 ], exp[ 2 ] ), ....} int64_t int_max = _mm256_extract_epi64( max, 0 ); if( int_max == 0 ) // all exponents are zero. { __m256d cmp = _mm256_cmp_pd( *q, zero_d(), _CMP_EQ_OQ ); if( ( _mm256_movemask_pd( cmp ) & 0x7 ) == 0x7 ) // all coefficients are zero { // drop the constraint *q = *( --end ); continue; } } max = permute< 0, 0, 0, 0 >( max ); __m256i de = _mm256_sub_epi32( huge, max ); // de may be larger than 0x3ff and we must scale in two steps *q = xor_d( *q, sign ); __m256i hs = _mm256_srli_epi64( de, 1 ); // de/2 hs = _mm256_and_si256( hs, exp_mask ); // throwing away the remainder __m256d scale = cast_d( _mm256_add_epi64( hs, one ) ); *q = _mm256_mul_pd( scale, *q ); __m256i ls = _mm256_sub_epi64( de, hs ); // de - hs scale = cast_d( _mm256_add_epi64( ls, one ) ); *q = _mm256_mul_pd( scale, *q ); } else // some huge entries { __m256i exp_b = permute< 1, 0, 2, 3 >( exp ); // { exp[ 1 ], exp[ 0 ], exp[ 2 ], ? } __m256i exp_c = gather_lanes< 1, 1 >( exp ); // { exp[ 2 ], exp[ 3 ], exp[ 2 ], ? 
} __m256i max = _mm256_max_epi32( exp, exp_b ); // { max( exp[ 0 ], exp[ 1 ] ), ?. ?. ? } max = _mm256_max_epi32( max, exp_c ); // { max{ exp[ 0 ], exp[ 1 ], exp[ 2 ] ), ....} max = permute< 0, 0, 0, 0 >( max ); __m256i de = _mm256_sub_epi32( huge, max ); // de > -0x3ff __m256d scale = cast_d( _mm256_add_epi64( de, one ) ); // this does not underflow *q = xor_d( *q, sign ); *q = _mm256_mul_pd( scale, *q ); // this may be inexact, but the inexact entries will be dropped later } // cleaning up the remaining tiny entries __m256i exp_d = and_i( *q, exp_mask ); __m256i cmp_d = _mm256_cmpgt_epi32( exp_d, tiny ); // 8 x (4 byte masks) int cmp_d_mask = _mm256_movemask_epi8( cmp_d ); // 8 bits for x, 8 bits for y, 8 bits for z if( cmp_d_mask & ( x_index_epi8 | y_index_epi8 ) ) { // In order to preserve the solution set, we must round the right hand side down and // the left hand side up, so that every solution of the old problem satisfies the new contraints. // To do that, we want to round the left hand side up and the right hand side down. Since // the lhs were flipped, we simply round all entries down, that is: // // we replace x in ( -tiny, 0 ) by -tiny and x in [ 0, tiny ) by +0.0 // (and fix the zeros latter ) __m256d zero = zero_d(); __m256i exp_f = and_i( *q, exp_mask ); __m256i full_sign = gather_lanes< 0, 0 >( sign ); __m256i minus_tiny = _mm256_or_si256( tiny, full_sign ); __m256i is_tiny_f = _mm256_cmpgt_epi32( tiny, exp_f ); __m256d non_negative = _mm256_cmp_pd( *q, zero, _CMP_GE_OQ ); // tiny negatives should be replaced by -tiny, tiny non negatives are replace by 0. __m256d fix = blend( cast_d( minus_tiny ), zero, non_negative ); // fixing the tiny entries *q = blend( *q, fix, is_tiny_f ); // restoring the rhs sign *q = xor_d( *q, sign ); // cleaning up the zeros __m256i abs = _mm256_slli_epi64( cast_i( *q ), 1 ); // throwing away the sign bit __m256i is_zero = _mm256_cmpeq_epi64( abs, cast_i( zero ) ); *q = blend( *q, zero, is_zero ); ++quadrant[ iq ]; } else { // | a | <= tiny and | b | <= tiny: and |c| >= huge, // The constraint is either irrelevant or fatal. int pb_sign = _mm256_movemask_pd( *q ); if( !( pb_sign & 0x4 ) ) // drop the quadrant: the right hand side is too large { ret |= ( 0x1 << iq ); break; } // else, drop the constraint: the right hand side is too small *q = *( --end ); } } } return ret; } } // namespace wm::nlp::lp2d //---------------------------------------------------------------------------------------- extern "C" { int interval_system_2d( double* solution, double const* box, double const* eq, unsigned int n_equations ) { constexpr int max_quadrant_size = 32; using namespace wm::nlp::lp2d; double* solsPtr = solution; printf("box: x_min = %f, x_max = %f, y_min = %f, y_max = %f\n", box[0], box[1], box[2], box[3]); printf("n_equations = %u\n", n_equations); for (int i = 0; i < n_equations; i++) { printf("equation %u: a_min = %f, a_max = %f, b_min = %f, b_max = %f, c_min = %f, c_max = %f\n", i, eq[6*i], eq[6*i + 1], eq[6*i + 2], eq[6*i + 3], eq[6*i + 4], eq[6*i + 5]); } if( n_equations > ( max_quadrant_size / 2 ) ) return -1; if( std::bit_cast< uint64_t >( box ) & 0x1F ) return -2; if( std::bit_cast< uint64_t >( eq ) & 0x1F ) return -3; int ret = 0; int n_equations_q = n_equations / 2; int n_equations_r = n_equations % 2; double const* eq_max = eq + n_equations_q; __m256i huge; __m256i tiny; __m256i signi = bounds( &tiny, &huge ); __m256i exp_mask = _mm256_srli_epi64( ones_i(), 53 ); exp_mask = _mm256_slli_epi64( exp_mask, 52 ); // { 0x7FF0'0000'0000'0000... 
} __m256d sign = cast_d( signi ); __m256d sign_01 = blend< 0, 0, 1, 1 >( sign, zero_d() ); // { -1: -1, 0, 0 } int old_rounding_mode = std::fegetround(); if( std::fesetround( FE_DOWNWARD ) ) return -4; __m256d quadrants[ 4 * max_quadrant_size ]; __m256d* q[ 4 ] = { quadrants, quadrants + max_quadrant_size, quadrants + 2 * max_quadrant_size, quadrants + 3 * max_quadrant_size }; int empty_quadrant = 0; double const* p_eq = eq; int quadrant_is_not_empty = 0xF; for( ; p_eq < eq_max; p_eq += 12 ) { __m256d a0 = _mm256_load_pd( p_eq ); __m256d aux = _mm256_load_pd( p_eq + 4 ); __m256d bc1 = _mm256_load_pd( p_eq + 8 ); // the equations in this group are // // a0[ 0, 1 ] x + a0[ 2, 3 ] y = aux[ 0, 1 ] // aux[ 2, 3 ] x + bc1[ 0, 1 ] y = bc1[ 2, 3 ] __m256d bc0 = gather_lanes< 1, 0 >( a0, aux ); __m256d a1 = gather_lanes< 1, 0 >( aux ); // the equations in this group are now // // a0[ 0, 1 ] x + bc0[ 0, 1 ] y = bc0[ 2, 3 ] // a1[ 0, 1 ] x + bc1[ 0, 1 ] y = bc1[ 2, 3 ] __m256d max_c0 = permute< 0, 1, 3, 2 >( bc0 ); //{ ?, ?, bc0[ 3 ], ? } __m256d max_c1 = permute< 0, 1, 3, 2 >( bc1 ); //{ ?, ?, bc1[ 3 ], ? } for( int iq = 0; iq < 4; ++iq ) // iqth quadrant { q[ iq ][ 0 ] = gather_low( a0, bc0 ); // { a0[ 0 ], bc0[ 0 ], ?, ? } q[ iq ][ 1 ] = gather_high( a0, bc0 ); // { a0[ 1 ], bc0[ 1 ], ?, ? } q[ iq ][ 2 ] = gather_low( a1, bc1 ); // { a1[ 0 ], bc1[ 0 ], ?, ? } q[ iq ][ 3 ] = gather_high( a1, bc1 ); // { a1[ 1 ], bc1[ 1 ], ?, ? } q[ iq ][ 0 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 0 ], max_c0 ); // { a0[ 0 ], bc0[ 0 ], bc0[ 3 ], ? } q[ iq ][ 1 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 1 ], bc0 ); // { a0[ 1 ], bc0[ 1 ], bc0[ 2 ], ? } q[ iq ][ 2 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 2 ], max_c1 ); // { a1[ 0 ], bc1[ 0 ], bc1[ 3 ], ? } q[ iq ][ 3 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 3 ], bc1 ); // { a1[ 1 ], bc1[ 1 ], bc1[ 2 ], ? } q[ iq ][ 0 ] = xor_d( q[ iq ][ 0 ], sign ); // { -a0[ 0 ], -bc0[ 0 ], -bc0[ 3 ], ? } q[ iq ][ 2 ] = xor_d( q[ iq ][ 2 ], sign ); // { -a1[ 0 ], -bc1[ 0 ], -bc1[ 3 ], ? } // moving to the next quadrant if( iq & 0x1 ) // replacing y by - y { bc0 = xor_d( bc0, sign_01 ); bc1 = xor_d( bc1, sign_01 ); bc0 = permute< 1, 0, 2, 3 >( bc0 ); bc1 = permute< 1, 0, 2, 3 >( bc1 ); } else // replacing x by - x { a0 = xor_d( a0, sign_01 ); a1 = xor_d( a1, sign_01 ); a0 = permute< 1, 0, 2, 3 >( a0 ); a1 = permute< 1, 0, 2, 3 >( a1 ); } } //---------------------------------------------------------------------------------------- // Checking for overflow and underflow: to avoid headaches with underflow and overflow, we // normalize equation in which some elements are either +0.0 or have absolute value outside // the range [ tiny, huge ]. In this process we may detect that the intersection of the // feasible region with some quadrants is empty, or that some equations are irrelevant. 
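// Concretely, normalize_equations rescales such an equation by a power of two,
// 2^( huge_exp - max_exp ), applied in two steps when the shift is large. Multiplying both
// sides of a >= constraint by a positive power of two is exact here by construction and does
// not change its solution set; constraints whose coefficients are all zero are dropped, and
// a huge right hand side with tiny coefficients either makes the quadrant infeasible or
// makes the constraint irrelevant, depending on its sign.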
//---------------------------------------------------------------------------------------- __m256d ra = gather_lanes< 0, 0 >( a0, a1 ); // { row[ 0 ][ 0 ], row[ 0 ][ 1 ], row[ 3 ][ 0 ], row[ 3 ][ 1 ] } __m256i expa = and_i( ra, exp_mask ); __m256i exp0 = and_i( bc0, exp_mask ); __m256i exp1 = and_i( bc1, exp_mask ); __m256i min = _mm256_min_epi32( expa, exp0 ); __m256i max = _mm256_max_epi32( expa, exp0 ); min = _mm256_min_epi32( min, exp1 ); max = _mm256_max_epi32( max, exp1 ); __m256i c_min = _mm256_cmpgt_epi32( tiny, min ); __m256i c_max = _mm256_cmpgt_epi32( max, huge ); __m256i bad = or_i( c_min, c_max ); if( _mm256_testz_si256( bad, bad ) ) { q[ 0 ] += 4; q[ 1 ] += 4; q[ 2 ] += 4; q[ 3 ] += 4; } else empty_quadrant |= normalize_equations( q, 4, tiny, huge ); } if( n_equations_r ) { __m256d ab = _mm256_load_pd( p_eq ); __m128d aux = _mm_load_pd( p_eq + 4 ); __m256d sign_23 = gather_lanes< 1, 0 >( sign_01 ); // the equations in this group are // ab[ 0, 1 ] x + ab[ 2, 3 ] y = aux[ 0, 1 ] __m256d c = _mm256_castpd128_pd256( aux ); c = gather_lanes< 0, 0 >( c ); // the equations in this group are now // ab[ 0, 1 ] x + ab[ 2, 3 ] y = c[ 0, 1 ] __m256d max_c = permute< 0, 1, 3, 3 >( c ); //{ ?, ?, c[ 1 ], ? } for( int iq = 0; iq < 4; ++iq ) // iqth quadrant { q[ iq ][ 0 ] = permute< 0, 2, 2, 3 >( ab ); // { ab[ 0 ], ab[ 2 ], ?, ? } q[ iq ][ 1 ] = permute< 1, 3, 2, 3 >( ab ); // { ab[ 1 ], ab[ 3 ], ?, ? } q[ iq ][ 0 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 0 ], max_c ); // { ab[ 0 ], ab[ 2 ], c[ 1 ], ? } q[ iq ][ 1 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 1 ], c ); // { ab[ 1 ], ab[ 3 ], c[ 0 ], ? } q[ iq ][ 0 ] = xor_d( q[ iq ][ 0 ], sign ); // { -ab[ 0 ], -ab[ 2 ], -c[ 1 ], ? } // moving to the next quadrant if( iq & 0x1 ) // replacing y by - y { ab = xor_d( ab, sign_23 ); ab = permute< 0, 1, 3, 2 >( ab ); } else // replacing x by - x { ab = xor_d( ab, sign_01 ); ab = permute< 1, 0, 2, 3 >( ab ); } } //---------------------------------------------------------------------------------------- // Checking for overflow and underflow: to avoid headaches with underflow and overflow, we // normalize equation in which some elements are either +0.0 or have absolute value outside // the range [ tiny, huge ]. In this process we may detect that the intersection of the // feasible region with some quadrants is empty, or that some equations are irrelevant. 
//---------------------------------------------------------------------------------------- __m256i exp_ab = and_i( ab, exp_mask ); __m256i exp_c = and_i( c, exp_mask ); exp_c = blend< 0, 0, 1, 1 >( exp_c, exp_ab ); __m256i min = _mm256_min_epi32( exp_ab, exp_c ); __m256i max = _mm256_max_epi32( exp_ab, exp_c ); __m256i c_min = _mm256_cmpgt_epi32( tiny, min ); __m256i c_max = _mm256_cmpgt_epi32( max, huge ); __m256i bad = or_i( c_min, c_max ); if( _mm256_testz_si256( bad, bad ) ) { q[ 0 ] += 2; q[ 1 ] += 2; q[ 2 ] += 2; q[ 3 ] += 2; } else empty_quadrant |= normalize_equations( q, 2, tiny, huge ); } __m256d avx_box = _mm256_load_pd( box ); __m256d cmp = _mm256_cmp_pd( avx_box, zero_d(), _CMP_GE_OQ ); int positive_box = _mm256_movemask_pd( cmp ); __m256d edge[ 2 * ( max_quadrant_size + 2 ) ]; __m256d vertex_pair[ 3 * ( max_quadrant_size + 2 ) ]; bounded_convex bc( 2 * ( max_quadrant_size + 2 ), edge, vertex_pair ); constexpr uint x_min_field = 0x1; constexpr uint x_max_field = 0x2; constexpr uint y_min_field = 0x4; constexpr uint y_max_field = 0x8; constexpr uint first_quadrant_field = 0x1; constexpr uint second_quadrant_field = 0x2; constexpr uint third_quadrant_field = 0x4; constexpr uint fourth_quadrant_field = 0x8; if( positive_box & x_max_field ) // x_max >= 0 { if( !( empty_quadrant & first_quadrant_field ) && ( positive_box & y_max_field ) ) // y_max >= 0, first quadrant { bc.setup( avx_box ); bc.push( quadrants, q[ 0 ] ); if( bc.type != bounded_convex_type::empty ) { ++ret; __m256d bc_box = bc.box(); _mm256_store_pd( solution, bc_box ); solution += 4; } } if( !( empty_quadrant & fourth_quadrant_field ) && !( positive_box & y_min_field ) ) // y_min < 0, fourth quadrant { __m256i y_sign = _mm256_slli_epi64( ones_i(), 63 ); y_sign = gather_lanes< clear_lane, 1 >( y_sign ); __m256d q_box = permute< 0, 1, 3, 2 >( avx_box ); q_box = xor_d( q_box, y_sign ); bc.setup( q_box ); bc.push( quadrants + 3 * max_quadrant_size, q[ 3 ] ); if( bc.type != bounded_convex_type::empty ) { ++ret; __m256i y_sign = _mm256_slli_epi64( ones_i(), 63 ); y_sign = gather_lanes< clear_lane, 1 >( y_sign ); __m256d bc_box = bc.box(); bc_box = xor_d( bc_box, y_sign ); bc_box = permute< 0, 1, 3, 2 >( bc_box ); _mm256_store_pd( solution, bc_box ); solution += 4; } } } if( !( positive_box & x_min_field ) ) // x_min < 0 { if( !( empty_quadrant & second_quadrant_field) && ( positive_box & y_max_field ) ) // y_max >= 0, second quadrant { __m256i x_sign = _mm256_slli_epi64( ones_i(), 63 ); x_sign = gather_lanes< 0, clear_lane >( x_sign ); __m256d q_box = permute< 1, 0, 2, 3 >( avx_box ); q_box = xor_d( q_box, x_sign ); bc.setup( q_box ); bc.push( quadrants + max_quadrant_size, q[ 1 ] ); if( bc.type != bounded_convex_type::empty ) { ++ret; __m256d bc_box = bc.box(); __m256i x_sign = _mm256_slli_epi64( ones_i(), 63 ); x_sign = gather_lanes< 0, clear_lane >( x_sign ); bc_box = xor_d( bc_box, x_sign ); bc_box = permute< 1, 0, 2, 3 >( bc_box ); _mm256_store_pd( solution, bc_box ); solution += 4; } } if( !( empty_quadrant & third_quadrant_field ) && !( positive_box & y_min_field ) ) // y_min < 0, third quadrant { __m256i sign = _mm256_slli_epi64( ones_i(), 63 ); avx_box = permute< 1, 0, 3, 2 >( avx_box ); avx_box = xor_d( avx_box, sign ); bc.setup( avx_box ); bc.push( quadrants + 2 * max_quadrant_size, q[ 2 ] ); if( bc.type != bounded_convex_type::empty ) { ++ret; __m256d bc_box = bc.box(); __m256i sign = _mm256_slli_epi64( ones_i(), 63 ); bc_box = xor_d( bc_box, sign ); bc_box = permute< 1, 0, 3, 2 >( bc_box ); _mm256_store_pd( 
solution, bc_box ); } } } std::fesetround( old_rounding_mode ); for (int i = 0; i < ret; i++) { printf("solution %u: x_min = %f, x_max = %f, y_min = %f, y_max = %f\n", i, solsPtr[4*i], solsPtr[4*i + 1], solsPtr[4*i + 2], solsPtr[4*i + 3]); } return ret; } } // extern "C" //----------------------------------------------------------------------------------------
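//----------------------------------------------------------------------------------------
// Usage sketch (illustrative only; solve_example and its data are hypothetical and nothing
// above references them). interval_system_2d expects 32-byte aligned buffers: box holds
// { x_min, x_max, y_min, y_max }, each equation contributes six doubles
// { a_min, a_max, b_min, b_max, c_min, c_max } for an interval equation a x + b y = c, and
// solution receives up to four boxes of the form { x_min, x_max, y_min, y_max }, one per
// quadrant that intersects the feasible set. The return value is the number of boxes
// written, or a negative error code.
//----------------------------------------------------------------------------------------
namespace wm::nlp::lp2d::example
{
  [[maybe_unused]] inline int solve_example( double* solution ) // 16 doubles, 32-byte aligned
  {
    alignas( 32 ) double box[ 4 ] = { -10.0, 10.0, -10.0, 10.0 };
    alignas( 32 ) double eq[ 8 ]  = { 1.0, 2.0,    // a in [ 1, 2 ]
                                      1.0, 1.0,    // b in [ 1, 1 ]
                                      3.0, 4.0,    // c in [ 3, 4 ]
                                      0.0, 0.0 };  // unused padding
    return interval_system_2d( solution, box, eq, 1 );
  }
}
//----------------------------------------------------------------------------------------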