metabrush/brush-strokes/cbits/lp_2d.cpp

//----------------------------------------------------------------------------------------
// Copyright (c) 2024 Walter Mascarenhas
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// The MPL has a "Disclaimer of Warranty" and a "Limitation of Liability",
// and we provide no guarantee regarding the correctness of this source code.
//
//----------------------------------------------------------------------------------------
#include <algorithm>
#include <bit>
#include <cassert>
#include <cfenv>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <iostream>
#include "lp_2d.hpp"
//----------------------------------------------------------------------------------------
namespace wm::nlp::lp2d {
__m256i bounds( __m256i* tiny, __m256i* huge )
{
__m256i ones = ones_i();
__m256i sign = _mm256_slli_epi64( ones, 63 );
__m256i a = _mm256_srli_epi64( sign, 1 ); // 0b0100'0000'0000...
__m256i b = _mm256_srli_epi64( sign, 3 ); // 0b0001'0000'0000...
__m256i c = _mm256_or_si256( a, b ); // 0b0101'0000'0000...
__m256i d = _mm256_srli_epi64( c, 4 ); // 0b0000'0101'0000...
*huge = _mm256_or_si256( c, d ); // 0b0101'0101'0000... 0x5500...
__m256i e = _mm256_srli_epi64( *huge, 1 ); // 0b0010'1010'1000...
__m256i f = _mm256_srli_epi64( d, 2 ); // 0b0000'0001'0100...
__m256i g = _mm256_srli_epi64( d, 4 ); // 0b0000'0000'0101...
__m256i h = _mm256_srli_epi64( ones, 1 );
__m256i i = _mm256_srli_epi64( sign, 32 );
__m256i j = _mm256_or_si256( f, g ); // 0b0000'0001'0101...
*tiny = _mm256_or_si256( e, j ); // 0b0010'1011'1101... 0x2bd...
__m256i k = and_not_i( i, h );
*huge = blend< 0, 0, 0, 1 >( *huge, k );
*tiny = blend< 0, 0, 0, 1 >( *tiny, zero_i() );
return sign;
}
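//----------------------------------------------------------------------------------------
// Illustrative sketch (hypothetical constants, not used by the code above): the bit
// patterns that the shift/or sequence in bounds() builds in elements 0..2, matching the
// 0x5500... and 0x2bd... annotations. Read as doubles they are exact powers of two, and
// normalize_equations later compares exponent fields against them. Element 3 of each
// result is special-cased by the final blends (so that the unused last entry of a
// constraint row never flags as tiny or huge) and is not covered by this sketch.
//----------------------------------------------------------------------------------------
[[maybe_unused]] static constexpr uint64_t huge_pattern_sketch = 0x5500'0000'0000'0000ULL; // bit pattern of 2^337
[[maybe_unused]] static constexpr uint64_t tiny_pattern_sketch = 0x2BD0'0000'0000'0000ULL; // bit pattern of 2^-322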
//----------------------------------------------------------------------------------------
// exact_location returns the exact location of the intersection ( x, y ) of the lines
//
// b0[ 0 ] x + b0[ 1 ] y >= b0[ 2 ]
// b1[ 0 ] x + b1[ 1 ] y >= b1[ 2 ]
//
// with respect to the set
//
// e0 x + e1 y >= e2
//
// under the assumption that
//
// d = b0[ 0 ] b1[ 1 ] - b0[ 1 ] b1[ 0 ] > 0
//
// where ei = edge[ i ], b0 = border[ 0 ] and b1 = border[ 1 ]
//
// This amounts to evaluating the sign of
//
// p = e0 r + e1 s - e2 d
//
// r = b0[ 2 ] b1[ 1 ] - b0[ 1 ] b1[ 2 ]
// s = b0[ 0 ] b1[ 2 ] - b0[ 2 ] b1[ 0 ] ,
//
// which motivates the evaluation of the six products
//
// group 1 group 2
// e0 b0[ 2 ] b1[ 1 ], -e0 b0[ 1 ] b1[ 2 ]
// e1 b0[ 0 ] b1[ 2 ], -e1 b0[ 2 ] b1[ 0 ]
// e2 b0[ 1 ] b1[ 0 ], -e2 b0[ 0 ] b1[ 1 ]
//
// and we want to find the sign of their sum.
//
// The function assumes that the edge entries are either normal or zero, i.e.,
// there are no subnormal entries.
//----------------------------------------------------------------------------------------
location exact_location( __m256d e, __m256d b0, __m256d b1 )
{
//----------------------------------------------------------------------------------------
__m256d b0_a = permute< 1, 2, 0, 3 >( b0 ); // { b0[ 1 ], b0[ 2 ], b0[ 0 ], ? }
__m256d b0_b = permute< 2, 0, 1, 3 >( b0 ); // { b0[ 2 ], b0[ 0 ], b0[ 1 ], ? }
__m256d b1_a = permute< 1, 2, 0, 3 >( b1 ); // { b1[ 1 ], b1[ 2 ], b1[ 0 ], ? }
__m256d b1_b = permute< 2, 0, 1, 3 >( b1 ); // { b1[ 2 ], b1[ 0 ], b1[ 1 ], ? }
__m256d g1 = _mm256_mul_pd( b0_b, b1_a ); // { b0[ 2 ] b1[ 1 ], b0[ 0 ] b1[ 2 ], b0[ 1 ] b1[ 0 ], ? }
__m256d g2 = _mm256_mul_pd( b0_a, b1_b ); // { b0[ 1 ] b1[ 2 ], b0[ 2 ] b1[ 0 ], b0[ 0 ] b1[ 1 ], ? }
__m256d g1b = _mm256_fmsub_pd( b0_b, b1_a, g1 ); // exact rounding errors of the three products in g1
__m256d g2b = _mm256_fmsub_pd( b0_a, b1_b, g2 ); // exact rounding errors of the three products in g2
__m256i sign_i = _mm256_slli_epi64( ones_i(), 63 );
__m256d sign = cast_d( sign_i );
__m256d minus_e = xor_d( e, sign );
__m256d c[ 8 ];
c[ 0 ] = _mm256_mul_pd( e, g1 ); // e * g1, rounded (group 1)
c[ 1 ] = _mm256_mul_pd( minus_e, g2 ); // -e * g2, rounded (group 2)
c[ 2 ] = _mm256_mul_pd( e, g1b ); // e * ( errors of g1 ), rounded
c[ 3 ] = _mm256_mul_pd( minus_e, g2b ); // -e * ( errors of g2 ), rounded
c[ 4 ] = _mm256_fmsub_pd( e, g1, c[ 0 ] ); // exact rounding error of c[ 0 ]
c[ 5 ] = _mm256_fmsub_pd( minus_e, g2, c[ 1 ] ); // exact rounding error of c[ 1 ]
c[ 6 ] = _mm256_fmsub_pd( e, g1b, c[ 2 ] ); // exact rounding error of c[ 2 ]
c[ 7 ] = _mm256_fmsub_pd( minus_e, g2b, c[ 3 ] ); // exact rounding error of c[ 3 ]
__m256d a[ 6 ];
// transposing the products
__m256d tmp_0 = gather_low( c[ 0 ], c[ 1 ] );
__m256d tmp_1 = gather_high( c[ 0 ], c[ 1 ] );
__m256d tmp_2 = gather_low( c[ 2 ], c[ 3 ] );
__m256d tmp_3 = gather_high( c[ 2 ], c[ 3 ] );
a[ 0 ] = _mm256_permute2f128_pd( tmp_0, tmp_2, 0x20 );
a[ 1 ] = _mm256_permute2f128_pd( tmp_1, tmp_3, 0x20 );
a[ 2 ] = _mm256_permute2f128_pd( tmp_0, tmp_2, 0x31 );
__m256d tmp_4 = gather_low( c[ 4 ], c[ 5 ] );
__m256d tmp_5 = gather_high( c[ 4 ], c[ 5 ] );
__m256d tmp_6 = gather_low( c[ 6 ], c[ 7 ]);
__m256d tmp_7 = gather_high( c[ 6 ], c[ 7 ] );
a[ 3 ] = _mm256_permute2f128_pd( tmp_4, tmp_6, 0x20 );
a[ 4 ] = _mm256_permute2f128_pd( tmp_5, tmp_7, 0x20 );
a[ 5 ] = _mm256_permute2f128_pd( tmp_4, tmp_6, 0x31 );
__m256d gt_05 = is_greater( a[ 0 ], a[ 5 ] );
__m256d tmp = blend_max( a[ 0 ], a[ 5 ], gt_05 );
a[ 0 ] = blend_min( a[ 0 ], a[ 5 ], gt_05 );
a[ 5 ] = tmp;
//----------------------------------------------------------------------------------------
// In the loop below we assume that a0[ j ] <= a5[ j ], j = 0, 1, 2, 3
//----------------------------------------------------------------------------------------
while( true )
{
__m256d gt_12 = is_greater( a[ 1 ], a[ 2 ] );
__m256d gt_34 = is_greater( a[ 3 ], a[ 4 ] );
__m256d min_12 = blend_min( a[ 1 ], a[ 2 ], gt_12 );
__m256d max_12 = blend_max( a[ 1 ], a[ 2 ], gt_12 );
__m256d min_34 = blend_min( a[ 3 ], a[ 4 ], gt_34 );
__m256d max_34 = blend_max( a[ 3 ], a[ 4 ], gt_34 );
__m256d gt_min = is_greater( min_12, min_34 );
__m256d gt_max = is_greater( max_12, max_34 );
__m256d min_1234 = blend_min( min_12, min_34, gt_min );
a[ 2 ] = blend_max( min_12, min_34, gt_min );
a[ 3 ] = blend_min( max_12, max_34, gt_max );
__m256d max_1234 = blend_max( max_12, max_34, gt_max );
__m256d gt_0_min_1234 = is_greater( a[ 0 ], min_1234 );
__m256d gt_max_1234_5 = is_greater( max_1234, a[ 5 ] );
a[ 1 ] = blend_max( a[ 0 ], min_1234, gt_0_min_1234 );
a[ 0 ] = blend_min( a[ 0 ], min_1234, gt_0_min_1234 );
a[ 4 ] = blend_min( max_1234, a[ 5 ], gt_max_1234_5 );
a[ 5 ] = blend_max( max_1234, a[ 5 ], gt_max_1234_5 );
// 1) a0[ j ] = min { ai[ j ], i = 0, .... 5 }
// 2) a5[ j ] = max { ai[ j ], i = 0, .... 5 }
__m256d zero = zero_d();
__m256d cmp0 = _mm256_cmp_pd( a[ 0 ], zero, _CMP_LT_OQ );
if( _mm256_testz_pd( cmp0, cmp0 ) ) // all a0[ j ] are >= 0
{
__m256d cmp = is_greater( a[ 5 ], zero );
if( _mm256_testz_pd( cmp, cmp ) ) // no entry in a5 is positive
return location::border;
else
return location::in;
}
__m256d cmp5 = _mm256_cmp_pd( a[ 5 ], zero, _CMP_GT_OQ );
if( _mm256_testz_pd( cmp5, cmp5 ) ) // all a5[ j ] are <= 0
{
__m256d cmp = is_greater( zero, a[ 0 ] );
if( _mm256_testz_pd( cmp, cmp ) ) // no entry in a0 is negative
return location::border;
else
return location::out;
}
// now
// 1) min ai[j] has its sign bit set
// 2) max ai[j] does not have its sign bit set
__m256d lo = gather_low( a[ 0 ], a[ 5 ] ); // { a0[ 0 ], a5[ 0 ], a0[ 2 ], a5[ 2 ] }
__m256d hi = gather_high( a[ 0 ], a[ 5 ] ); // { a0[ 1 ], a5[ 1 ], a0[ 3 ], a5[ 3 ] }
__m256d gt_lo_hi = is_greater( lo, hi );
__m256d b0 = blend_min( lo, hi, gt_lo_hi );
__m256d b5 = blend_max( lo, hi, gt_lo_hi );
// 1) min{ ai[ j ], i,j = 0,...5 } is in { b0[ 0 ], b0[ 2 ] }
// 2) max{ ai[ j ], i,j = 0,...5 } is in { b5[ 1 ], b5[ 3 ] }
b0 = permute< 0, 2, 1, 3 >( b0 );
b5 = permute< 3, 1, 2, 0 >( b5 );
// 1) min{ ai[ j ], i,j = 0,...5 } is in { b0[ 0 ], b0[ 1 ] }
// 2) max{ ai[ j ], i,j = 0,...5 } is in { b5[ 0 ], b5[ 1 ] }
lo = gather_low( b0, b5 ); // { b0[ 0 ], b5[ 0 ], b0[ 2 ], b5[ 2 ] }
hi = gather_high( b0, b5 ); // { b0[ 1 ], b5[ 1 ], b0[ 3 ], b5[ 3 ] }
gt_lo_hi = is_greater( lo, hi );
b0 = blend_min( lo, hi, gt_lo_hi );
b5 = blend_max( lo, hi, gt_lo_hi );
// 1) min{ ai[ j ], i,j = 0,...5 } is b0[ 0 ]
// 2) max{ ai[ j ], i,j = 0,...5 } is b5[ 1 ]
b5 = permute< 1, 0, 3, 2 >( b5 );
// 1) min{ ai[ j ], i,j = 0,...5 } is b0[ 0 ] <= 0
// 2) max{ ai[ j ], i,j = 0,...5 } is b5[ 0 ] >= 0
// fixing possible break of order in positions 1,2,3
__m256d chk123 = is_greater( b0, b5 );
__m256d aux = blend_min( b0, b5, chk123 );
b5 = blend_max( b0, b5, chk123 );
b0 = aux;
__m256i b0i = cast_i( b0 );
__m256i b5i = cast_i( b5 );
uint64_t u0 = _mm256_extract_epi64( b0i, 0 ) & 0x7FFF'FFFF'FFFF'FFFFULL;
uint64_t u5 = _mm256_extract_epi64( b5i, 0 );
if( u0 >= u5 ) // | b0[ 0 ] | >= | b5[ 0 ] |
{
if( u0 > u5 + 0x0060'0000'0000'0000ULL ) // | b0[ 0 ] | > 2^5 | b5[ 0 ] | (we use 0x006... to handle subnormals properly)
return location::out;
__m256d sum = _mm256_add_pd( b0, b5 ); // b0 + b5, rounded down
__m256d cor = _mm256_sub_pd( sum, b0 ); // sum - b0, rounded down
__m256d delta = _mm256_sub_pd( b5, cor ); // b5 - cor, rounded down.
// now b0[ 0 ] + b5[ 0 ] = sum[ 0 ] + delta[ 0 ], exactly, with sum[ 0 ] <= 0 <= delta[ 0 ]
a[ 0 ] = blend< 1, 0, 0, 0 >( b0, sum );
a[ 5 ] = blend< 1, 0, 0, 0 >( b5, delta );
}
else
{
if( u5 > u0 + 0x0060'0000'0000'0000ULL ) // | b5[ 0 ] | > 2^5 | b0[ 0 ] |
return location::in;
__m256i sign_i = _mm256_slli_epi64( ones_i(), 63 );
__m256d sign = cast_d( sign_i );
__m256d minus_b0 = xor_d( b0, sign );
__m256d sum = _mm256_sub_pd( minus_b0, b5 ); // (-b5) + (-b0) rounded down
__m256d cor = _mm256_add_pd( sum, b5 ); // sum - (-b5) rounded down
__m256d delta = _mm256_sub_pd( minus_b0, cor ); // (-b0) - cor, rounded down.
// -( b0[ 0 ] + b5[ 0 ] ) = sum[ 0 ] + delta[ 0 ], exactly, with sum[ 0 ] <= 0 <= delta[ 0 ]
sum = xor_d( sum, sign );
delta = xor_d( delta, sign );
// b0[ 0 ] + b5[ 0 ] = sum[ 0 ] + delta[ 0 ], exactly, with delta[ 0 ] <= 0 <= sum[ 0 ]
a[ 0 ] = blend< 1, 0, 0, 0 >( b0, delta );
a[ 5 ] = blend< 1, 0, 0, 0 >( b5, sum );
}
}
}
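//----------------------------------------------------------------------------------------
// Illustrative sketch (hypothetical helper, not used above): the error-free product split
// behind the _mm256_fmsub_pd calls in exact_location. For finite a and b, and barring
// overflow and underflow, hi + lo equals a * b exactly; applying this split twice to each
// of the six triple products yields the 24 meaningful entries of c[ 0 ]..c[ 7 ] (three per
// vector), which sum exactly to p = e0 r + e1 s - e2 d, so the loop above can determine
// the sign of p by narrowing the sign of that exact sum.
//----------------------------------------------------------------------------------------
[[maybe_unused]] static inline void two_product_sketch( double a, double b, double& hi, double& lo )
{
hi = a * b; // rounded product, the role played by c[ 0 ]..c[ 3 ]
lo = std::fma( a, b, -hi ); // exact rounding error, the role played by c[ 4 ]..c[ 7 ]
}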
//----------------------------------------------------------------------------------------
// bounded_convex::box: finding the box of the bounded convex region, under the assumption
// that the normals are normalized, that is, for each edge e, either
// e[ i ] = +0.0 or e[ i ] >= tiny ~ 2^(-100), i.e. there are no signed zeros, no underflow,
// no overflow and no negative coordinates.
//----------------------------------------------------------------------------------------
__m256d bounded_convex::box() const
{
constexpr int non_0 = 0;
constexpr int non_1 = 2;
constexpr int inf_0 = 0;
constexpr int sup_0 = 1;
constexpr int inf_1 = 2;
constexpr int sup_1 = 3;
assert( type != bounded_convex_type::empty );
__m256d den;
__m256d num;
__m256i sign = _mm256_slli_epi64( ones_i(), 63 ); // { -1, -1, -1, -1 }
__m256i sign_h = _mm256_slli_si256( sign, 8 ); // { 0, -1, 0, -1 }
if( type != bounded_convex_type::polygon )
{
if( type == bounded_convex_type::point )
{
// vertex_pair[ 0 ] = { x_inf, -x_sup, ? , ? }
// vertex_pair[ 1 ] = { y_inf, -y_sup, ? , ? }
// vertex_pair[ 2 ] = { z_inf, -z_sup, ? , ? }
den = xor_d( vertex_p[ 2 ], sign_h );
den = permute< sup_0, inf_0, sup_0, inf_0 >( den );
num = gather_lanes< 0, 0 >( vertex_p[ 0 ], vertex_p[ 1 ] );
}
else
{
__m256d num_x;
__m256d num_y;
den = xor_d( vertex_p[ 2 ], sign_h );
int xy_signs = high_bits_64( edge[ 1 ] ) & 0x3;
switch( xy_signs )
{
case 0: // edge[ 0 ] >= 0, edge[ 1 ] >= 0: box = { x0, x1, y1, y0 }
{
num_x = permute< inf_0, sup_1, non_1, non_1 >( vertex_p[ 0 ] );
num_y = permute< non_0, non_0, inf_1, sup_0 >( vertex_p[ 1 ] );
den = permute< sup_0, inf_1, sup_1, inf_0 >( den );
break;
}
case 1: // edge[ 0 ] < 0, edge[ 1 ] >= 0: box = { x0, x1, y0, y1 }
{
num_x = permute< inf_0, sup_1, non_1, non_1 >( vertex_p[ 0 ] );
num_y = permute< non_0, non_0, inf_0, sup_1 >( vertex_p[ 1 ] );
den = permute< sup_0, inf_0, sup_0, inf_1 >( den );
break;
}
case 2: // edge[ 0 ] >= 0, edge[ 1 ] < 0: box = {x1, x0, y1, y0 }
{
num_x = permute< inf_1, sup_0, non_1, non_1 >( vertex_p[ 0 ] );
num_y = permute< non_0, non_0, inf_1, sup_0 >( vertex_p[ 1 ] );
den = permute< sup_1, inf_0, sup_1, inf_0 >( den );
break;
}
case 3: // edge[ 0 ] < 0, edge[ 1 ] < 0: box = { x1, x0, y0, y1 }
{
num_x = permute< inf_1, sup_0, non_1, non_1 >( vertex_p[ 0 ] );
num_y = permute< non_0, non_0, inf_0, sup_1 >( vertex_p[ 1 ] );
den = permute< sup_1, inf_0, sup_0, inf_1 >( den );
break;
}
}
num = gather_lanes< 0, 1 >( num_x, num_y );
}
}
else
{
int index = begin;
auto set_x_max = [ & ]()
{
int previous_vertex = ( ( index > 0 ) ? index : end );
int previous_pair = 3 * ( --previous_vertex / 2 );
__m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h );
if( previous_vertex & 0x1 )
{
z = permute< non_0, inf_1, non_1, non_1 >( z );
__m256d x = gather_lanes< 1, 1 >( vertex_p[ previous_pair ] );
num = blend< 0, 1, 0, 0 >( num, x );
}
else
{
z = permute< non_0, inf_0, non_1, non_1 >( z );
num = blend< 0, 1, 0, 0 >( num, vertex_p[ previous_pair ] );
}
den = blend< 0, 1, 0, 0 >( den, z );
};
auto set_x_min = [ & ]()
{
int previous_vertex = ( ( index > 0 ) ? index : end );
int previous_pair = 3 * ( --previous_vertex / 2 );
__m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h );
if( previous_vertex & 0x1 )
{
z = permute< sup_1, non_0, non_1, non_1 >( z );
__m256d x = gather_lanes< 1, 1 >( vertex_p[ previous_pair ] );
num = blend< 1, 0, 0, 0 >( num, x );
}
else
{
z = permute< sup_0, non_0, non_1, non_1 >( z );
num = blend< 1, 0, 0, 0 >( num, vertex_p[ previous_pair ] );
}
den = blend< 1, 0, 0, 0 >( den, z );
};
auto set_y_max = [ & ]()
{
int previous_vertex = ( ( index > 0 ) ? index : end );
int previous_pair = 3 * ( --previous_vertex / 2 );
__m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h );
if( previous_vertex & 0x1 )
{
z = permute< non_0, non_0, non_1, inf_1 >( z );
num = blend< 0, 0, 0, 1 >( num, vertex_p[ previous_pair + 1 ] );
}
else
{
z = permute< non_0, non_0, non_1, inf_0 >( z );
__m256d y = gather_lanes< 0, 0 >( vertex_p[ previous_pair + 1 ] );
num = blend< 0, 0, 0, 1 >( num, y );
}
den = blend< 0, 0, 0, 1 >( den, z );
};
auto set_y_min = [ & ]()
{
int previous_vertex = ( ( index > 0 ) ? index : end );
int previous_pair = 3 * ( --previous_vertex / 2 );
__m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h );
if( previous_vertex & 0x1 )
{
z = permute< non_1, non_1, sup_1, non_1 >( z );
num = blend< 0, 0, 1, 0 >( num, vertex_p[ previous_pair + 1 ] );
}
else
{
z = permute< non_0, non_0, sup_0, non_0 >( z );
__m256d y = gather_lanes< 0, 0 >( vertex_p[ previous_pair + 1 ] );
num = blend< 0, 0, 1, 0 >( num, y );
}
den = blend< 0, 0, 1, 0 >( den, z );
};
int xy_signs = high_bits_64( edge[ index ] ) & 0x3;
switch( xy_signs )
{
case 0: // normal[ 0 ] >= 0, normal[ 1 ] >= 0: first quadrant
{
if( index & 0x1 )
goto quadrant_1_odd;
else
goto quadrant_1_even;
}
case 1: // normal[ 0 ] < 0, normal[ 1 ] >= 0: second quadrant
{
if( index & 0x1 )
goto quadrant_2_odd;
else
goto quadrant_2_even;
}
case 2: // normal[ 0 ] >= 0, normal[ 1 ] < 0: fourth quadrant
{
if( index & 0x1 )
goto quadrant_4_odd;
else
goto quadrant_4_even;
}
case 3: // normal[ 0 ] < 0, normal[ 1 ] < 0: third quadrant
{
if( index & 0x1 )
goto quadrant_3_odd;
else
goto quadrant_3_even;
}
}
quadrant_1_even:
{
if( ++index > end )
goto done;
int xy_signs = high_bits_64( edge[ index ] ) & 0x3;
switch( xy_signs )
{
case 0: // quadrant_1
goto quadrant_1_odd;
case 1: // quadrant_2
{
set_y_min();
goto quadrant_2_odd;
}
case 3: // quadrant_3
{
set_x_max();
set_y_min();
goto quadrant_3_odd;
}
}
}
quadrant_1_odd:
{
if( ++index > end )
goto done;
int xy_signs = high_bits_64( edge[ index ] ) & 0x3;
switch( xy_signs )
{
case 0: // quadrant_1
goto quadrant_1_even;
case 1: // quadrant_2
{
set_y_min();
goto quadrant_2_even;
}
case 3: // quadrant_3
{
set_x_max();
set_y_min();
goto quadrant_3_even;
}
}
}
quadrant_2_even:
{
if( ++index > end )
goto done;
int xy_signs = high_bits_64( edge[ index ] ) & 0x3;
switch( xy_signs )
{
case 1: // quadrant_2
goto quadrant_2_odd;
case 2:// quadrant_4
{
set_x_max();
set_y_max();
goto quadrant_4_odd;
}
case 3: // quadrant_3
{
set_x_max();
goto quadrant_3_odd;
}
}
}
quadrant_2_odd:
{
if( ++index > end )
goto done;
int xy_signs = high_bits_64( edge[ index ] ) & 0x3;
switch( xy_signs )
{
case 1: // quadrant_2
goto quadrant_2_even;
case 2:// quadrant_4
{
set_x_max();
set_y_max();
goto quadrant_4_even;
}
case 3: // quadrant_3
{
set_x_max();
goto quadrant_3_even;
}
}
}
quadrant_3_even:
{
if( ++index > end )
goto done;
int xy_signs = high_bits_64( edge[ index ] ) & 0x3;
switch( xy_signs )
{
case 0: // quadrant_1
{
set_y_max();
set_x_min();
goto quadrant_1_odd;
}
case 2:// quadrant_4
{
set_y_max();
goto quadrant_4_odd;
}
case 3: // quadrant_3
goto quadrant_3_odd;
}
}
quadrant_3_odd:
{
if( ++index > end )
goto done;
int xy_signs = high_bits_64( edge[ index ] ) & 0x3;
switch( xy_signs )
{
case 0: // quadrant_1
{
set_x_min();
set_y_max();
goto quadrant_1_even;
}
case 2:// quadrant_4
{
set_y_max();
goto quadrant_4_even;
}
case 3: // quadrant_3
goto quadrant_3_even;
}
}
quadrant_4_even:
{
if( ++index > end )
goto done;
int xy_signs = high_bits_64( edge[ index ] ) & 0x3;
switch( xy_signs )
{
case 0: // quadrant_1
{
set_x_min();
goto quadrant_1_odd;
}
case 1: // quadrant_2
{
set_x_min();
set_y_min();
goto quadrant_2_odd;
}
case 2:// quadrant_4
goto quadrant_4_odd;
}
}
quadrant_4_odd:
{
if( ++index > end )
goto done;
int xy_signs = high_bits_64( edge[ index ] ) & 0x3;
switch( xy_signs )
{
case 0: // quadrant_1
{
set_x_min();
goto quadrant_1_even;
}
case 1: // quadrant_2
{
set_x_min();
set_y_min();
goto quadrant_2_even;
}
case 2:// quadrant_4
goto quadrant_4_even;
}
}
done: {}
}
num = _mm256_div_pd( num, den );
num = xor_d( num, sign_h );
return num;
}
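//----------------------------------------------------------------------------------------
// Illustrative sketch (hypothetical helper, assuming high_bits_64 is the sign-bit movemask
// of the four doubles, as its uses above suggest): the scalar form of the quadrant test
// driving the state machine in box(). Bit 0 is set when the x coefficient of the normal is
// negative and bit 1 when the y coefficient is negative, matching the case labels 0..3.
//----------------------------------------------------------------------------------------
[[maybe_unused]] static inline int edge_quadrant_sketch( double normal_x, double normal_y )
{
return ( std::signbit( normal_x ) ? 0x1 : 0 ) | ( std::signbit( normal_y ) ? 0x2 : 0 );
}
//----------------------------------------------------------------------------------------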
int bounded_convex::insert_vertex( int at )
{
int high = end - at;
int low = at - begin;
assert( ( low > 0 ) && ( high > 0 ) );
if( low <= high )
{
__m256d* ed = edge + ( begin - 1 );
__m256d* ed_max = ed + low;
for( ; ed < ed_max; ++ed )
{
__m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( ed + 1 ) );
_mm256_store_pd( reinterpret_cast< double* >( ed ), tmp );
}
int to_load;
__m256d* vp = vertex_pair( begin );
__m256d a0 = _mm256_load_pd( reinterpret_cast< double* >( vp ) );
__m256d a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 1 ) );
__m256d a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 2 ) );
if( ( begin & 0x1 ) == 0 )
{
__m256d c0 = gather_lanes< clear_lane, 0 >( a0 );
__m256d c1 = gather_lanes< clear_lane, 0 >( a1 );
__m256d c2 = gather_lanes< clear_lane, 0 >( a2 );
_mm256_store_pd( reinterpret_cast< double* >( vp - 3 ), c0 );
_mm256_store_pd( reinterpret_cast< double* >( vp - 2 ), c1 );
_mm256_store_pd( reinterpret_cast< double* >( vp - 1 ), c2 );
if( low == 1 )
{
--begin;
return at - 1;
}
to_load = low - 2;
}
else
to_load = low - 1;
int q = to_load / 4;
__m256d* vp_max = vp + 6 * q;
while( vp < vp_max )
{
__m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) );
__m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) );
__m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) );
__m256d c0 = gather_lanes< 1, 0 >( a0, b0 );
__m256d c1 = gather_lanes< 1, 0 >( a1, b1 );
__m256d c2 = gather_lanes< 1, 0 >( a2, b2 );
_mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );
vp += 3;
a0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) );
a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) );
a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) );
__m256d d0 = gather_lanes< 1, 0 >( b0, a0 );
__m256d d1 = gather_lanes< 1, 0 >( b1, a1 );
__m256d d2 = gather_lanes< 1, 0 >( b2, a2 );
_mm256_store_pd( reinterpret_cast< double* >( vp ), d0 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 );
vp += 3;
}
int r = to_load % 4;
if( r == 0 )
{
__m256d c0 = gather_lanes< 1, clear_lane >( a0 );
__m256d c1 = gather_lanes< 1, clear_lane >( a1 );
__m256d c2 = gather_lanes< 1, clear_lane >( a2 );
_mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );
}
else
{
__m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) );
__m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) );
__m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) );
__m256d c0 = gather_lanes< 1, 0 >( a0, b0 );
__m256d c1 = gather_lanes< 1, 0 >( a1, b1 );
__m256d c2 = gather_lanes< 1, 0 >( a2, b2 );
_mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );
switch( r )
{
case 1:
goto low_end;
case 2:
{
__m256d c0 = gather_lanes< 1, clear_lane >( b0 );
__m256d c1 = gather_lanes< 1, clear_lane >( b1 );
__m256d c2 = gather_lanes< 1, clear_lane >( b2 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 3 ), c0 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 4 ), c1 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 5 ), c2 );
goto low_end;
}
default:
{
vp += 3;
a0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) );
a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) );
a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) );
__m256d c3 = gather_lanes< 1, 0 >( b0, a0 );
__m256d c4 = gather_lanes< 1, 0 >( b1, a1 );
__m256d c5 = gather_lanes< 1, 0 >( b2, a2 );
_mm256_store_pd( reinterpret_cast< double* >( vp ), c3 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c4 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c5 );
}
}
}
low_end:
{
--begin;
return at - 1;
}
}
else
{
__m256d* ed = edge + end + 1;
__m256d* ed_min = ed - ( high + 1 );
for( ; ed > ed_min; --ed )
{
__m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( ed - 1 ) );
_mm256_store_pd( reinterpret_cast< double* >( ed ), tmp );
}
__m256d* vp = vertex_pair( end );
__m256d a0;
__m256d a1;
__m256d a2;
int to_load;
if( ( end & 0x1 ) == 0 )
{
a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) );
a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) );
a2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) );
__m256d c0 = gather_lanes< 1, clear_lane >( a0 );
__m256d c1 = gather_lanes< 1, clear_lane >( a1 );
__m256d c2 = gather_lanes< 1, clear_lane >( a2 );
_mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );
if( high == 1 )
{
++end;
return at;
}
vp -= 3;
to_load = high - 2;
}
else
{
to_load = high - 1;
a0 = _mm256_load_pd( reinterpret_cast< double* >( vp ) );
a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 1 ) );
a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 2 ) );
}
int q = to_load / 4;
__m256d* vp_min = vp - 6 * q;
while( vp > vp_min )
{
__m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) );
__m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) );
__m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) );
__m256d c0 = gather_lanes< 1, 0 >( b0, a0 );
__m256d c1 = gather_lanes< 1, 0 >( b1, a1 );
__m256d c2 = gather_lanes< 1, 0 >( b2, a2 );
_mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );
vp -= 3;
a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) );
a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) );
a2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) );
__m256d d0 = gather_lanes< 1, 0 >( a0, b0 );
__m256d d1 = gather_lanes< 1, 0 >( a1, b1 );
__m256d d2 = gather_lanes< 1, 0 >( a2, b2 );
_mm256_store_pd( reinterpret_cast< double* >( vp ), d0 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 );
vp -= 3;
}
int r = to_load % 4;
if( r == 0 )
{
__m256d c0 = gather_lanes< clear_lane, 0 >( a0 );
__m256d c1 = gather_lanes< clear_lane, 0 >( a1 );
__m256d c2 = gather_lanes< clear_lane, 0 >( a2 );
_mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );
}
else
{
__m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) );
__m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) );
__m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) );
__m256d c0 = gather_lanes< 1, 0 >( b0, a0 );
__m256d c1 = gather_lanes< 1, 0 >( b1, a1 );
__m256d c2 = gather_lanes< 1, 0 >( b2, a2 );
_mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );
switch( r )
{
case 1:
goto high_end;
case 2:
{
__m256d d0 = gather_lanes< clear_lane, 0 >( b0 );
__m256d d1 = gather_lanes< clear_lane, 0 >( b1 );
__m256d d2 = gather_lanes< clear_lane, 0 >( b2 );
_mm256_store_pd( reinterpret_cast< double* >( vp - 3 ), d0 );
_mm256_store_pd( reinterpret_cast< double* >( vp - 2 ), d1 );
_mm256_store_pd( reinterpret_cast< double* >( vp - 1 ), d2 );
goto high_end;
}
default: // case 3
{
vp -= 3;
a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) );
a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) );
a2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) );
__m256d d0 = gather_lanes< 1, 0 >( a0, b0 );
__m256d d1 = gather_lanes< 1, 0 >( a1, b1 );
__m256d d2 = gather_lanes< 1, 0 >( a2, b2 );
_mm256_store_pd( reinterpret_cast< double* >( vp ), d0 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 );
_mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 );
}
}
}
high_end:
{
++end;
return at;
}
}
}
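//----------------------------------------------------------------------------------------
// Illustrative sketch (hypothetical helpers): the index arithmetic used by box(),
// insert_vertex and remove_vertices for the vertex storage. Two consecutive vertices share
// a block of three __m256d ( the x pair, the y pair and the z / denominator pair described
// in box() ); the block starts at 3 * ( vertex / 2 ) and the vertex parity selects the
// 128-bit lane inside the block.
//----------------------------------------------------------------------------------------
[[maybe_unused]] static inline int vertex_pair_block_sketch( int vertex ) { return 3 * ( vertex / 2 ); }
[[maybe_unused]] static inline int vertex_pair_lane_sketch( int vertex ) { return vertex & 0x1; }
//----------------------------------------------------------------------------------------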
void bounded_convex::push( __m256d* begin, __m256d* end )
{
for( ; begin < end; ++begin )
{
int sign = _mm256_movemask_pd( *begin ) & 0x7;
switch( sign )
{
case 0b000: { locator< 0, 0, 0 >::insert( *this, *begin ); break; }
case 0b001: { locator< 1, 0, 0 >::insert( *this, *begin ); break; }
case 0b010: { locator< 0, 1, 0 >::insert( *this, *begin ); break; }
case 0b011: { locator< 1, 1, 0 >::insert( *this, *begin ); break; }
case 0b100: { locator< 0, 0, 1 >::insert( *this, *begin ); break; }
case 0b101: { locator< 1, 0, 1 >::insert( *this, *begin ); break; }
case 0b110: { locator< 0, 1, 1 >::insert( *this, *begin ); break; }
case 0b111: { locator< 1, 1, 1 >::insert( *this, *begin ); break; }
}
if( type == bounded_convex_type::empty )
return;
}
}
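//----------------------------------------------------------------------------------------
// Illustrative sketch (hypothetical helper): the scalar equivalent of the dispatch key in
// push(). The three low bits of the sign mask of { a, b, c, ? } pick one of the eight
// locator< s0, s1, s2 > specializations, with s0, s1, s2 the sign bits of a, b and c.
//----------------------------------------------------------------------------------------
[[maybe_unused]] static inline int constraint_sign_key_sketch( double a, double b, double c )
{
return ( std::signbit( a ) ? 0x1 : 0 ) | ( std::signbit( b ) ? 0x2 : 0 ) | ( std::signbit( c ) ? 0x4 : 0 );
}
//----------------------------------------------------------------------------------------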
int bounded_convex::remove_vertices( int begin_vertex, int end_vertex )
{
int re = end_vertex & 0x1;
int rb = begin_vertex & 0x1;
int low = begin_vertex - begin;
int high = end - end_vertex;
if( low <= high )
{
__m256d* e_dst = edge + end_vertex;
__m256d* e_dst_min = e_dst - low;
__m256d* e_src = edge + begin_vertex;
while( --e_dst >= e_dst_min )
{
__m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( --e_src ) );
_mm256_store_pd( reinterpret_cast< double* >( e_dst ), tmp );
}
__m256d* v_dst = vertex_pair( end_vertex );
__m256d* v_src = vertex_pair( begin_vertex );
if( re == rb )
{
int q;
if( re & 0x1 )
{
__m256d s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
__m256d s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
__m256d s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );
__m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) );
__m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) );
__m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) );
__m256d b0 = blend< 0, 0, 1, 1 >( s0, d0 );
__m256d b1 = blend< 0, 0, 1, 1 >( s1, d1 );
__m256d b2 = blend< 0, 0, 1, 1 >( s2, d2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), b0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), b1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), b2 );
q = low;
}
else
q = ( low + 1 );
q /= 2;
__m256d* v_src_min = v_src - 3 * q;
while( v_src > v_src_min )
{
--v_src;
--v_dst;
__m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), tmp );
}
}
else
{
__m256d s0;
__m256d s1;
__m256d s2;
int to_load;
if( rb == 0 )
{
v_src -= 3;
s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );
__m256d p0 = gather_lanes< 1, clear_lane >( s0 );
__m256d p1 = gather_lanes< 1, clear_lane >( s1 );
__m256d p2 = gather_lanes< 1, clear_lane >( s2 );
__m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) );
__m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) );
__m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) );
__m256d b0 = blend< 0, 0, 1, 1 >( p0, d0 );
__m256d b1 = blend< 0, 0, 1, 1 >( p1, d1 );
__m256d b2 = blend< 0, 0, 1, 1 >( p2, d2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), b0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), b1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), b2 );
if( low == 1 )
goto low_end;
to_load = low - 2;
}
else
{
to_load = low - 1;
s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );
}
int q = to_load / 4;
__m256d* v_src_min = v_src - 6 * q;
while( v_src > v_src_min )
{
v_src -= 3;
v_dst -= 3;
__m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
__m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
__m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );
__m256d c0 = gather_lanes< 1, 0 >( b0, s0 );
__m256d c1 = gather_lanes< 1, 0 >( b1, s1 );
__m256d c2 = gather_lanes< 1, 0 >( b2, s2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 );
v_dst -= 3;
v_src -= 3;
s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );
__m256d d0 = gather_lanes< 1, 0 >( s0, b0 );
__m256d d1 = gather_lanes< 1, 0 >( s1, b1 );
__m256d d2 = gather_lanes< 1, 0 >( s2, b2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 );
}
v_dst -= 3;
int r = to_load % 4;
if( r == 0 )
{
__m256d c0 = gather_lanes< clear_lane, 0 >( s0 );
__m256d c1 = gather_lanes< clear_lane, 0 >( s1 );
__m256d c2 = gather_lanes< clear_lane, 0 >( s2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 );
}
else
{
v_src -= 3;
__m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
__m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
__m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );
__m256d c0 = gather_lanes< 1, 0 >( b0, s0 );
__m256d c1 = gather_lanes< 1, 0 >( b1, s1 );
__m256d c2 = gather_lanes< 1, 0 >( b2, s2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 );
switch( r )
{
case 1:
goto low_end;
case 2:
{
v_dst -= 3;
__m256d d0 = gather_lanes< clear_lane, 0 >( b0 );
__m256d d1 = gather_lanes< clear_lane, 0 >( b1 );
__m256d d2 = gather_lanes< clear_lane, 0 >( b2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 );
goto low_end;
}
default:
{
v_dst -= 3;
v_src -= 3;
s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );
__m256d d0 = gather_lanes< 1, 0 >( s0, b0 );
__m256d d1 = gather_lanes< 1, 0 >( s1, b1 );
__m256d d2 = gather_lanes< 1, 0 >( s2, b2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 );
v_dst -= 3;
s0 = gather_lanes< clear_lane, 0 >( s0, b0 );
s1 = gather_lanes< clear_lane, 0 >( s1, b1 );
s2 = gather_lanes< clear_lane, 0 >( s2, b2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), s0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), s1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), s2 );
}
}
}
}
low_end:
{
begin += ( end_vertex - begin_vertex );
return end_vertex - 1;
}
}
else
{
__m256d* e_dst = edge + begin_vertex;
__m256d* e_src = edge + end_vertex;
__m256d* e_src_max = e_src + high + 1;
for( ; e_src < e_src_max; ++e_src, ++e_dst )
{
__m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( e_src ) );
_mm256_store_pd( reinterpret_cast< double* >( e_dst ), tmp );
}
__m256d s0;
__m256d s1;
__m256d s2;
__m256d* v_dst = vertex_pair( begin_vertex );
__m256d* v_src = vertex_pair( end_vertex );
if( re == rb )
{
if( re & 0x1 )
{
s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );
__m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) );
__m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) );
__m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) );
__m256d c0 = blend< 0, 0, 1, 1 >( d0, s0 );
__m256d c1 = blend< 0, 0, 1, 1 >( d1, s1 );
__m256d c2 = blend< 0, 0, 1, 1 >( d2, s2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 );
--high;
v_src += 3;
v_dst += 3;
}
int q = 3 * ( ( high + 1 ) / 2 );
__m256d* v_src_max = vertex_pair( end + 1 );
for( ; v_src < v_src_max; ++v_src, ++v_dst )
{
__m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), tmp );
}
}
else
{
int to_load;
__m256d s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
__m256d s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
__m256d s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );
if( rb ) // rb = 1 and re = 0
{
__m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) );
__m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) );
__m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) );
__m256d g0 = gather_lanes< 0, 0 >( d0, s0 );
__m256d g1 = gather_lanes< 0, 0 >( d1, s1 );
__m256d g2 = gather_lanes< 0, 0 >( d2, s2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), g0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), g1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), g2 );
if( high == 1 )
goto high_end;
v_dst += 3;
to_load = high - 2;
}
else // re = 1 => only the second lane of the si are used
to_load = high - 1;
int q = to_load / 4;
__m256d* v_src_max = v_src + 6 * q;
while( v_src < v_src_max )
{
v_src += 3;
__m256d t0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
__m256d t1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
__m256d t2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );
__m256d u0 = gather_lanes< 1, 0 >( s0, t0 );
__m256d u1 = gather_lanes< 1, 0 >( s1, t1 );
__m256d u2 = gather_lanes< 1, 0 >( s2, t2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 );
v_dst += 3;
v_src += 3;
s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );
__m256d v0 = gather_lanes< 1, 0 >( t0, s0 );
__m256d v1 = gather_lanes< 1, 0 >( t1, s1 );
__m256d v2 = gather_lanes< 1, 0 >( t2, s2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 );
v_dst += 3;
}
int r = to_load % 4;
if( r == 0 )
{
__m256d u0 = gather_lanes< 1, clear_lane >( s0 );
__m256d u1 = gather_lanes< 1, clear_lane >( s1 );
__m256d u2 = gather_lanes< 1, clear_lane >( s2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 );
}
else
{
v_src += 3;
__m256d t0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
__m256d t1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
__m256d t2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );
__m256d u0 = gather_lanes< 1, 0 >( s0, t0 );
__m256d u1 = gather_lanes< 1, 0 >( s1, t1 );
__m256d u2 = gather_lanes< 1, 0 >( s2, t2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 );
switch( r )
{
case 1:
goto high_end;
case 2:
{
v_dst += 3;
__m256d v0 = gather_lanes< 1, clear_lane >( t0 );
__m256d v1 = gather_lanes< 1, clear_lane >( t1 );
__m256d v2 = gather_lanes< 1, clear_lane >( t2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 );
goto high_end;
}
default: // case 3
{
v_dst += 3;
v_src += 3;
s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );
__m256d v0 = gather_lanes< 1, 0 >( t0, s0 );
__m256d v1 = gather_lanes< 1, 0 >( t1, s1 );
__m256d v2 = gather_lanes< 1, 0 >( t2, s2 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 );
_mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 );
}
}
}
}
high_end:
{
end -= ( end_vertex - begin_vertex );
return begin_vertex - 1;
}
}
}
void bounded_convex::set_to_last_segment()
{
int last = end - 1;
edge[ 0 ] = edge[ last ];
edge[ 1 ] = edge[ end ];
edge[ 2 ] = edge[ begin + 1 ];
__m256d* vl = vertex_pair( last );
__m256d* vb = vertex_pair( begin );
if( last & 0x1 )
{
if( begin & 0x1 )
{
vertex_p[ 0 ] = gather_lanes< 1, 1 >( vl[ 0 ], vb[ 0 ] );
vertex_p[ 1 ] = gather_lanes< 1, 1 >( vl[ 1 ], vb[ 1 ] );
vertex_p[ 2 ] = gather_lanes< 1, 1 >( vl[ 2 ], vb[ 2 ] );
}
else
{
vertex_p[ 0 ] = gather_lanes< 1, 0 >( vl[ 0 ], vb[ 0 ] );
vertex_p[ 1 ] = gather_lanes< 1, 0 >( vl[ 1 ], vb[ 1 ] );
vertex_p[ 2 ] = gather_lanes< 1, 0 >( vl[ 2 ], vb[ 2 ] );
}
}
else
{
if( begin & 0x1 )
{
vertex_p[ 0 ] = blend< 0, 0, 1, 1 >( vl[ 0 ], vb[ 0 ] );
vertex_p[ 1 ] = blend< 0, 0, 1, 1 >( vl[ 1 ], vb[ 1 ] );
vertex_p[ 2 ] = blend< 0, 0, 1, 1 >( vl[ 2 ], vb[ 2 ] );
}
else
{
vertex_p[ 0 ] = gather_lanes< 0, 0 >( vl[ 0 ], vb[ 0 ] );
vertex_p[ 1 ] = gather_lanes< 0, 0 >( vl[ 1 ], vb[ 1 ] );
vertex_p[ 2 ] = gather_lanes< 0, 0 >( vl[ 2 ], vb[ 2 ] );
}
}
end = 2;
begin = 0;
type = bounded_convex_type::segment;
}
void bounded_convex::setup( __m256d box )
{
type = bounded_convex_type::polygon;
__m256d zero = zero_d();
__m256i ones = ones_i();
__m256i signi = _mm256_slli_epi64( ones, 63 );
__m256i signi_h = _mm256_slli_si256( signi, 8 );
__m256d sign = _mm256_castsi256_pd( signi );
__m256d sign_h = _mm256_castsi256_pd( signi_h );
__m256i onei = _mm256_srli_epi64( ones, 54 );
onei = _mm256_slli_epi64( onei, 52 ); // { 1.0, 1.0, 1.0, 1.0 }
__m256d one = _mm256_castsi256_pd( onei );
box = _mm256_max_pd( box, zero );
begin = alloc / 2;
end = begin + 4;
__m256d* e = edge + begin;
__m256d x_box = gather_lanes< 0, 0 >( box ); // { x_min, x_max, x_min, x_max }
__m256d y_box = gather_lanes< 1, 1 >( box ); // { y_min, y_max, y_min, y_max }
e[ 0 ] = blend< 0, 1, 0, 0 >( one, zero ); // { 1, 0, 0, 0},
e[ 0 ] = blend< 0, 0, 1, 1 >( e[ 0 ], x_box ); // { 1, 0, x_min, x_max }, for x >= x_min
e[ 1 ] = permute< 1, 0, 2, 3 >( e[ 0 ] ); // { 0 , 1, x_min, x_max } // y >= y_min
e[ 1 ] = blend< 0, 0, 1, 1 >( e[ 1 ], y_box ); // { 0 , 1, y_min, y_max } // y >= y_min
e[ 2 ] = permute< 0, 1, 3, 2 >( e[ 0 ] ); // { 1 , 0, x_max, x_min }
e[ 2 ] = _mm256_xor_pd( e[ 2 ], sign ); // { -1, -0, -x_max, -x_min } -x >= -x_max
e[ 3 ] = permute< 0, 1, 3, 2 >( e[ 1 ] ); // { 0, 1, y_max, y_min }
e[ 3 ] = _mm256_xor_pd( e[ 3 ], sign ); // { -0, -1, -y_max, -y_min } -y >= -y_max
e[ 4 ] = e[ 0 ]; // { 1, 0, x_min, x_max }
// initial vertices
__m256d* vp = vertex_pair( begin );
vp[ 0 ] = permute< 0, 0, 3, 3 >( x_box ); // { x_min, x_min, x_max, x_max } : vertices 0 and 1:
vp[ 0 ] = _mm256_xor_pd( vp[ 0 ], sign_h ); // { x_min, -x_min, x_max, -x_max } : vertices 0 and 1:
vp[ 1 ] = permute< 0, 0, 2, 2 >( y_box ); // { y_min, y_min, y_min, y_min }
vp[ 1 ] = _mm256_xor_pd( vp[ 1 ], sign_h ); // { y_min, -y_min, y_min, -y_min }
vp[ 2 ] = _mm256_xor_pd( one, sign_h ); // { 1.0, -1.0, 1.0, -1.0 }
vp[ 3 ] = permute< 1, 1, 2, 2 >( x_box ); // { x_max, x_max, x_min, x_min } : vertices 2 and 3:
vp[ 3 ] = _mm256_xor_pd( vp[ 3 ], sign_h ); // { x_max, -x_max, x_min, -x_min }
vp[ 4 ] = permute< 1, 1, 3, 3 >( y_box ); // { y_max, y_max, y_max, y_max }
vp[ 4 ] = _mm256_xor_pd( vp[ 4 ], sign_h ); // { y_max, -y_max, y_max, -y_max }
vp[ 5 ] = vp[ 2 ];
}
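//----------------------------------------------------------------------------------------
// Illustrative sketch (hypothetical helper, not used by setup()): the four half-planes
// installed above, written in the plain form a x + b y >= c used by the comments, after
// the box has been clamped to the first quadrant; e[ 4 ] simply repeats e[ 0 ]. Each
// stored edge also carries the other bound of c in element 3.
//----------------------------------------------------------------------------------------
[[maybe_unused]] static inline bool inside_setup_box_sketch( double x, double y, double x_min, double x_max, double y_min, double y_max )
{
return ( 1.0 * x + 0.0 * y >= x_min ) // e[ 0 ]: x >= x_min
&& ( 0.0 * x + 1.0 * y >= y_min ) // e[ 1 ]: y >= y_min
&& ( -1.0 * x - 0.0 * y >= -x_max ) // e[ 2 ]: -x >= -x_max
&& ( -0.0 * x - 1.0 * y >= -y_max ); // e[ 3 ]: -y >= -y_max
}
//----------------------------------------------------------------------------------------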
int normalize_equations( __m256d* quadrant[ 4 ], int n, __m256i tiny, __m256i huge )
{
constexpr unsigned long long x_index_epi8 = 0x1 << 4;
constexpr unsigned long long y_index_epi8 = 0x1 << 12;
constexpr unsigned long long z_index_epi8 = 0x1 << 16;
int ret = 0;
__m256i ones = ones_i(); // { 0xFFFF'FFFF'FFFF'FFFF, ... }
__m256i exp_mask = _mm256_srli_epi64( ones, 53 ); // { 0x0000'0000'0000'07FF, .... }
__m256i one = _mm256_srli_epi64( ones, 54 ); // { 0x0000'0000'0000'03FF, .... }
__m256i sign = _mm256_slli_epi64( ones, 63 ); // { 0x8000'0000'0000'0000, .... }
exp_mask = _mm256_slli_epi64( exp_mask, 52 ); // { 0x7FF0'0000'0000'0000, .... }
one = _mm256_slli_epi64( one, 52 ); // { 0x3FF0'0000'0000'0000, .... }
sign = gather_lanes< 1, clear_lane >( sign ); // { sign_bit, sign_bit, 0, 0 }
for( int iq = 0; iq < 4; ++iq )
{
__m256d* end = quadrant[ iq ] + n;
while( quadrant[ iq ] < end )
{
__m256d* q = quadrant[ iq ];
__m256i exp = and_i( *q, exp_mask );
__m256i cmp_huge = _mm256_cmpgt_epi32( exp, huge );
if( _mm256_testz_si256( cmp_huge, cmp_huge ) ) // no huge entries
{
__m256i cmp_tiny = _mm256_cmpgt_epi32( tiny, exp );
if( _mm256_testz_si256( cmp_tiny, cmp_tiny ) ) // no entries are huge or tiny
{
++quadrant[ iq ];
continue;
}
// no entries are huge, but some are tiny. We scale by 2^( huge - max_exp[ i ] )
__m256i exp_b = permute< 1, 0, 2, 3 >( exp ); // { exp[ 1 ], exp[ 0 ], exp[ 2 ], ? }
__m256i exp_c = gather_lanes< 1, 1 >( exp ); // { exp[ 2 ], exp[ 3 ], exp[ 2 ], ? }
__m256i max = _mm256_max_epi32( exp, exp_b ); // { max( exp[ 0 ], exp[ 1 ] ), ?, ?, ? }
max = _mm256_max_epi32( max, exp_c ); // { max( exp[ 0 ], exp[ 1 ], exp[ 2 ] ), ... }
int64_t int_max = _mm256_extract_epi64( max, 0 );
if( int_max == 0 ) // all exponents are zero.
{
__m256d cmp = _mm256_cmp_pd( *q, zero_d(), _CMP_EQ_OQ );
if( ( _mm256_movemask_pd( cmp ) & 0x7 ) == 0x7 ) // all coefficients are zero
{ // drop the constraint
*q = *( --end );
continue;
}
}
max = permute< 0, 0, 0, 0 >( max );
__m256i de = _mm256_sub_epi32( huge, max );
// de may be larger than 0x3ff and we must scale in two steps
*q = xor_d( *q, sign );
__m256i hs = _mm256_srli_epi64( de, 1 ); // de/2
hs = _mm256_and_si256( hs, exp_mask ); // throwing away the remainder
__m256d scale = cast_d( _mm256_add_epi64( hs, one ) );
*q = _mm256_mul_pd( scale, *q );
__m256i ls = _mm256_sub_epi64( de, hs ); // de - hs
scale = cast_d( _mm256_add_epi64( ls, one ) );
*q = _mm256_mul_pd( scale, *q );
}
else // some huge entries
{
__m256i exp_b = permute< 1, 0, 2, 3 >( exp ); // { exp[ 1 ], exp[ 0 ], exp[ 2 ], ? }
__m256i exp_c = gather_lanes< 1, 1 >( exp ); // { exp[ 2 ], exp[ 3 ], exp[ 2 ], ? }
__m256i max = _mm256_max_epi32( exp, exp_b ); // { max( exp[ 0 ], exp[ 1 ] ), ?, ?, ? }
max = _mm256_max_epi32( max, exp_c ); // { max( exp[ 0 ], exp[ 1 ], exp[ 2 ] ), ... }
max = permute< 0, 0, 0, 0 >( max );
__m256i de = _mm256_sub_epi32( huge, max ); // de > -0x3ff
__m256d scale = cast_d( _mm256_add_epi64( de, one ) ); // this does not underflow
*q = xor_d( *q, sign );
*q = _mm256_mul_pd( scale, *q ); // this may be inexact, but the inexact entries will be dropped later
}
// cleaning up the remaining tiny entries
__m256i exp_d = and_i( *q, exp_mask );
__m256i cmp_d = _mm256_cmpgt_epi32( exp_d, tiny ); // 8 x (4 byte masks)
int cmp_d_mask = _mm256_movemask_epi8( cmp_d ); // 8 bits for x, 8 bits for y, 8 bits for z
if( cmp_d_mask & ( x_index_epi8 | y_index_epi8 ) )
{
// In order to preserve the solution set, we must round the right hand side down and
// the left hand side up, so that every solution of the old problem still satisfies the
// new constraints. Since the signs of the lhs entries were flipped above, this amounts
// to rounding every entry down, that is:
//
// we replace x in ( -tiny, 0 ) by -tiny and x in [ 0, tiny ) by +0.0
// (and fix the zeros later)
__m256d zero = zero_d();
__m256i exp_f = and_i( *q, exp_mask );
__m256i full_sign = gather_lanes< 0, 0 >( sign );
__m256i minus_tiny = _mm256_or_si256( tiny, full_sign );
__m256i is_tiny_f = _mm256_cmpgt_epi32( tiny, exp_f );
__m256d non_negative = _mm256_cmp_pd( *q, zero, _CMP_GE_OQ );
// tiny negatives are replaced by -tiny, tiny non-negatives are replaced by +0.0.
__m256d fix = blend( cast_d( minus_tiny ), zero, non_negative );
// fixing the tiny entries
*q = blend( *q, fix, is_tiny_f );
// restoring the rhs sign
*q = xor_d( *q, sign );
// cleaning up the zeros
__m256i abs = _mm256_slli_epi64( cast_i( *q ), 1 ); // throwing away the sign bit
__m256i is_zero = _mm256_cmpeq_epi64( abs, cast_i( zero ) );
*q = blend( *q, zero, is_zero );
++quadrant[ iq ];
}
else
{
// | a | <= tiny and | b | <= tiny, and | c | >= huge:
// the constraint is either irrelevant or fatal.
int pb_sign = _mm256_movemask_pd( *q );
if( !( pb_sign & 0x4 ) ) // drop the quadrant: the right hand side is too large
{
ret |= ( 0x1 << iq );
break;
}
// else, drop the constraint: the right hand side is too small
*q = *( --end );
}
}
}
return ret;
}
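//----------------------------------------------------------------------------------------
// Illustrative sketch (hypothetical helper): the power-of-two scaling trick used above,
// where an exponent difference is added directly to the biased exponent field of
// 1.0 = 0x3FF0'0000'0000'0000. Valid for de in [ -1022, 1023 ], where it equals
// ldexp( 1.0, de ); normalize_equations keeps the difference in field position and splits
// oversized differences into two factors instead.
//----------------------------------------------------------------------------------------
[[maybe_unused]] static inline double pow2_from_exponent_field_sketch( int de )
{
uint64_t shifted = static_cast< uint64_t >( static_cast< int64_t >( de ) ) << 52;
return std::bit_cast< double >( 0x3FF0'0000'0000'0000ULL + shifted );
}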
} // namespace wm::nlp::lp2d
//----------------------------------------------------------------------------------------
extern "C"
{
int interval_system_2d( double* solution, double const* box, double const* eq,
unsigned int n_equations )
{
constexpr int max_quadrant_size = 32;
using namespace wm::nlp::lp2d;
double* solsPtr = solution;
printf("box: x_min = %f, x_max = %f, y_min = %f, y_max = %f\n", box[0], box[1], box[2], box[3]);
printf("n_equations = %u\n", n_equations);
for (unsigned int i = 0; i < n_equations; i++) {
printf("equation %u: a_min = %f, a_max = %f, b_min = %f, b_max = %f, c_min = %f, c_max = %f\n", i, eq[6*i], eq[6*i + 1], eq[6*i + 2], eq[6*i + 3], eq[6*i + 4], eq[6*i + 5]);
}
if( n_equations > ( max_quadrant_size / 2 ) )
return -1;
if( std::bit_cast< uint64_t >( box ) & 0x1F )
return -2;
if( std::bit_cast< uint64_t >( eq ) & 0x1F )
return -3;
int ret = 0;
int n_equations_q = n_equations / 2;
int n_equations_r = n_equations % 2;
double const* eq_max = eq + 12 * n_equations_q; // each pair of equations occupies 12 doubles
__m256i huge;
__m256i tiny;
__m256i signi = bounds( &tiny, &huge );
__m256i exp_mask = _mm256_srli_epi64( ones_i(), 53 );
exp_mask = _mm256_slli_epi64( exp_mask, 52 ); // { 0x7FF0'0000'0000'0000... }
__m256d sign = cast_d( signi );
__m256d sign_01 = blend< 0, 0, 1, 1 >( sign, zero_d() ); // { -1, -1, 0, 0 }
int old_rounding_mode = std::fegetround();
if( std::fesetround( FE_DOWNWARD ) )
return -4;
__m256d quadrants[ 4 * max_quadrant_size ];
__m256d* q[ 4 ] = { quadrants,
quadrants + max_quadrant_size,
quadrants + 2 * max_quadrant_size,
quadrants + 3 * max_quadrant_size };
int empty_quadrant = 0;
double const* p_eq = eq;
int quadrant_is_not_empty = 0xF;
for( ; p_eq < eq_max; p_eq += 12 )
{
__m256d a0 = _mm256_load_pd( p_eq );
__m256d aux = _mm256_load_pd( p_eq + 4 );
__m256d bc1 = _mm256_load_pd( p_eq + 8 );
// the equations in this group are
//
// a0[ 0, 1 ] x + a0[ 2, 3 ] y = aux[ 0, 1 ]
// aux[ 2, 3 ] x + bc1[ 0, 1 ] y = bc1[ 2, 3 ]
__m256d bc0 = gather_lanes< 1, 0 >( a0, aux );
__m256d a1 = gather_lanes< 1, 0 >( aux );
// the equations in this group are now
//
// a0[ 0, 1 ] x + bc0[ 0, 1 ] y = bc0[ 2, 3 ]
// a1[ 0, 1 ] x + bc1[ 0, 1 ] y = bc1[ 2, 3 ]
__m256d max_c0 = permute< 0, 1, 3, 2 >( bc0 ); //{ ?, ?, bc0[ 3 ], ? }
__m256d max_c1 = permute< 0, 1, 3, 2 >( bc1 ); //{ ?, ?, bc1[ 3 ], ? }
for( int iq = 0; iq < 4; ++iq ) // iqth quadrant
{
q[ iq ][ 0 ] = gather_low( a0, bc0 ); // { a0[ 0 ], bc0[ 0 ], ?, ? }
q[ iq ][ 1 ] = gather_high( a0, bc0 ); // { a0[ 1 ], bc0[ 1 ], ?, ? }
q[ iq ][ 2 ] = gather_low( a1, bc1 ); // { a1[ 0 ], bc1[ 0 ], ?, ? }
q[ iq ][ 3 ] = gather_high( a1, bc1 ); // { a1[ 1 ], bc1[ 1 ], ?, ? }
q[ iq ][ 0 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 0 ], max_c0 ); // { a0[ 0 ], bc0[ 0 ], bc0[ 3 ], ? }
q[ iq ][ 1 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 1 ], bc0 ); // { a0[ 1 ], bc0[ 1 ], bc0[ 2 ], ? }
q[ iq ][ 2 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 2 ], max_c1 ); // { a1[ 0 ], bc1[ 0 ], bc1[ 3 ], ? }
q[ iq ][ 3 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 3 ], bc1 ); // { a1[ 1 ], bc1[ 1 ], bc1[ 2 ], ? }
q[ iq ][ 0 ] = xor_d( q[ iq ][ 0 ], sign ); // { -a0[ 0 ], -bc0[ 0 ], -bc0[ 3 ], ? }
q[ iq ][ 2 ] = xor_d( q[ iq ][ 2 ], sign ); // { -a1[ 0 ], -bc1[ 0 ], -bc1[ 3 ], ? }
// moving to the next quadrant
if( iq & 0x1 ) // replacing y by - y
{
bc0 = xor_d( bc0, sign_01 );
bc1 = xor_d( bc1, sign_01 );
bc0 = permute< 1, 0, 2, 3 >( bc0 );
bc1 = permute< 1, 0, 2, 3 >( bc1 );
}
else // replacing x by - x
{
a0 = xor_d( a0, sign_01 );
a1 = xor_d( a1, sign_01 );
a0 = permute< 1, 0, 2, 3 >( a0 );
a1 = permute< 1, 0, 2, 3 >( a1 );
}
}
//----------------------------------------------------------------------------------------
// Checking for overflow and underflow: to avoid headaches with underflow and overflow, we
// normalize equation in which some elements are either +0.0 or have absolute value outside
// the range [ tiny, huge ]. In this process we may detect that the intersection of the
// feasible region with some quadrants is empty, or that some equations are irrelevant.
//----------------------------------------------------------------------------------------
__m256d ra = gather_lanes< 0, 0 >( a0, a1 ); // { row[ 0 ][ 0 ], row[ 0 ][ 1 ], row[ 3 ][ 0 ], row[ 3 ][ 1 ] }
__m256i expa = and_i( ra, exp_mask );
__m256i exp0 = and_i( bc0, exp_mask );
__m256i exp1 = and_i( bc1, exp_mask );
__m256i min = _mm256_min_epi32( expa, exp0 );
__m256i max = _mm256_max_epi32( expa, exp0 );
min = _mm256_min_epi32( min, exp1 );
max = _mm256_max_epi32( max, exp1 );
__m256i c_min = _mm256_cmpgt_epi32( tiny, min );
__m256i c_max = _mm256_cmpgt_epi32( max, huge );
__m256i bad = or_i( c_min, c_max );
if( _mm256_testz_si256( bad, bad ) )
{
q[ 0 ] += 4;
q[ 1 ] += 4;
q[ 2 ] += 4;
q[ 3 ] += 4;
}
else
empty_quadrant |= normalize_equations( q, 4, tiny, huge );
}
if( n_equations_r )
{
__m256d ab = _mm256_load_pd( p_eq );
__m128d aux = _mm_load_pd( p_eq + 4 );
__m256d sign_23 = gather_lanes< 1, 0 >( sign_01 );
// the equations in this group are
// ab[ 0, 1 ] x + ab[ 2, 3 ] y = aux[ 0, 1 ]
__m256d c = _mm256_castpd128_pd256( aux );
c = gather_lanes< 0, 0 >( c );
// the equations in this group are now
// ab[ 0, 1 ] x + ab[ 2, 3 ] y = c[ 0, 1 ]
__m256d max_c = permute< 0, 1, 3, 3 >( c ); //{ ?, ?, c[ 1 ], ? }
for( int iq = 0; iq < 4; ++iq ) // iqth quadrant
{
q[ iq ][ 0 ] = permute< 0, 2, 2, 3 >( ab ); // { ab[ 0 ], ab[ 2 ], ?, ? }
q[ iq ][ 1 ] = permute< 1, 3, 2, 3 >( ab ); // { ab[ 1 ], ab[ 3 ], ?, ? }
q[ iq ][ 0 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 0 ], max_c ); // { ab[ 0 ], ab[ 2 ], c[ 1 ], ? }
q[ iq ][ 1 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 1 ], c ); // { ab[ 1 ], ab[ 3 ], c[ 0 ], ? }
q[ iq ][ 0 ] = xor_d( q[ iq ][ 0 ], sign ); // { -ab[ 0 ], -ab[ 2 ], -c[ 1 ], ? }
// moving to the next quadrant
if( iq & 0x1 ) // replacing y by - y
{
ab = xor_d( ab, sign_23 );
ab = permute< 0, 1, 3, 2 >( ab );
}
else // replacing x by - x
{
ab = xor_d( ab, sign_01 );
ab = permute< 1, 0, 2, 3 >( ab );
}
}
//----------------------------------------------------------------------------------------
// Checking for overflow and underflow: to avoid headaches with underflow and overflow, we
// normalize equation in which some elements are either +0.0 or have absolute value outside
// the range [ tiny, huge ]. In this process we may detect that the intersection of the
// feasible region with some quadrants is empty, or that some equations are irrelevant.
//----------------------------------------------------------------------------------------
__m256i exp_ab = and_i( ab, exp_mask );
__m256i exp_c = and_i( c, exp_mask );
exp_c = blend< 0, 0, 1, 1 >( exp_c, exp_ab );
__m256i min = _mm256_min_epi32( exp_ab, exp_c );
__m256i max = _mm256_max_epi32( exp_ab, exp_c );
__m256i c_min = _mm256_cmpgt_epi32( tiny, min );
__m256i c_max = _mm256_cmpgt_epi32( max, huge );
__m256i bad = or_i( c_min, c_max );
if( _mm256_testz_si256( bad, bad ) )
{
q[ 0 ] += 2;
q[ 1 ] += 2;
q[ 2 ] += 2;
q[ 3 ] += 2;
}
else
empty_quadrant |= normalize_equations( q, 2, tiny, huge );
}
__m256d avx_box = _mm256_load_pd( box );
__m256d cmp = _mm256_cmp_pd( avx_box, zero_d(), _CMP_GE_OQ );
int positive_box = _mm256_movemask_pd( cmp );
__m256d edge[ 2 * ( max_quadrant_size + 2 ) ];
__m256d vertex_pair[ 3 * ( max_quadrant_size + 2 ) ];
bounded_convex bc( 2 * ( max_quadrant_size + 2 ), edge, vertex_pair );
constexpr unsigned int x_min_field = 0x1;
constexpr unsigned int x_max_field = 0x2;
constexpr unsigned int y_min_field = 0x4;
constexpr unsigned int y_max_field = 0x8;
constexpr unsigned int first_quadrant_field = 0x1;
constexpr unsigned int second_quadrant_field = 0x2;
constexpr unsigned int third_quadrant_field = 0x4;
constexpr unsigned int fourth_quadrant_field = 0x8;
if( positive_box & x_max_field ) // x_max >= 0
{
if( !( empty_quadrant & first_quadrant_field ) &&
( positive_box & y_max_field ) ) // y_max >= 0, first quadrant
{
bc.setup( avx_box );
bc.push( quadrants, q[ 0 ] );
if( bc.type != bounded_convex_type::empty )
{
++ret;
__m256d bc_box = bc.box();
_mm256_store_pd( solution, bc_box );
solution += 4;
}
}
if( !( empty_quadrant & fourth_quadrant_field ) &&
!( positive_box & y_min_field ) ) // y_min < 0, fourth quadrant
{
__m256i y_sign = _mm256_slli_epi64( ones_i(), 63 );
y_sign = gather_lanes< clear_lane, 1 >( y_sign );
__m256d q_box = permute< 0, 1, 3, 2 >( avx_box );
q_box = xor_d( q_box, y_sign );
bc.setup( q_box );
bc.push( quadrants + 3 * max_quadrant_size, q[ 3 ] );
if( bc.type != bounded_convex_type::empty )
{
++ret;
__m256i y_sign = _mm256_slli_epi64( ones_i(), 63 );
y_sign = gather_lanes< clear_lane, 1 >( y_sign );
__m256d bc_box = bc.box();
bc_box = xor_d( bc_box, y_sign );
bc_box = permute< 0, 1, 3, 2 >( bc_box );
_mm256_store_pd( solution, bc_box );
solution += 4;
}
}
}
if( !( positive_box & x_min_field ) ) // x_min < 0
{
if( !( empty_quadrant & second_quadrant_field) &&
( positive_box & y_max_field ) ) // y_max >= 0, second quadrant
{
__m256i x_sign = _mm256_slli_epi64( ones_i(), 63 );
x_sign = gather_lanes< 0, clear_lane >( x_sign );
__m256d q_box = permute< 1, 0, 2, 3 >( avx_box );
q_box = xor_d( q_box, x_sign );
bc.setup( q_box );
bc.push( quadrants + max_quadrant_size, q[ 1 ] );
if( bc.type != bounded_convex_type::empty )
{
++ret;
__m256d bc_box = bc.box();
__m256i x_sign = _mm256_slli_epi64( ones_i(), 63 );
x_sign = gather_lanes< 0, clear_lane >( x_sign );
bc_box = xor_d( bc_box, x_sign );
bc_box = permute< 1, 0, 2, 3 >( bc_box );
_mm256_store_pd( solution, bc_box );
solution += 4;
}
}
if( !( empty_quadrant & third_quadrant_field ) &&
!( positive_box & y_min_field ) ) // y_min < 0, third quadrant
{
__m256i sign = _mm256_slli_epi64( ones_i(), 63 );
avx_box = permute< 1, 0, 3, 2 >( avx_box );
avx_box = xor_d( avx_box, sign );
bc.setup( avx_box );
bc.push( quadrants + 2 * max_quadrant_size, q[ 2 ] );
if( bc.type != bounded_convex_type::empty )
{
++ret;
__m256d bc_box = bc.box();
__m256i sign = _mm256_slli_epi64( ones_i(), 63 );
bc_box = xor_d( bc_box, sign );
bc_box = permute< 1, 0, 3, 2 >( bc_box );
_mm256_store_pd( solution, bc_box );
}
}
}
std::fesetround( old_rounding_mode );
for (int i = 0; i < ret; i++) {
printf("solution %u: x_min = %f, x_max = %f, y_min = %f, y_max = %f\n", i, solsPtr[4*i], solsPtr[4*i + 1], solsPtr[4*i + 2], solsPtr[4*i + 3]);
}
return ret;
}
} // extern "C"
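//----------------------------------------------------------------------------------------
// Illustrative usage sketch (hypothetical, not part of the exported interface): one call
// to interval_system_2d with a single interval equation. box and eq must be 32-byte
// aligned (unaligned pointers are rejected with -2 / -3) and solution is written with
// aligned stores, so it should be 32-byte aligned as well, with room for up to four boxes
// of four doubles. The layouts { x_min, x_max, y_min, y_max } and
// { a_min, a_max, b_min, b_max, c_min, c_max } match the debug prints above; the return
// value is the number of per-quadrant solution boxes written, or a negative error code.
//----------------------------------------------------------------------------------------
[[maybe_unused]] static int interval_system_2d_usage_sketch()
{
alignas( 32 ) double box[ 4 ] = { -1.0, 1.0, -1.0, 1.0 };
alignas( 32 ) double eq[ 6 ] = { 1.0, 1.0, 1.0, 1.0, 0.0, 0.0 }; // a, b in [ 1, 1 ], c in [ 0, 0 ]
alignas( 32 ) double solution[ 16 ] = {};
return interval_system_2d( solution, box, eq, 1 );
}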
//----------------------------------------------------------------------------------------