//----------------------------------------------------------------------------------------
// Copyright (c) 2024 Walter Mascarenhas
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// The MPL has a "Disclaimer of Warranty" and a "Limitation of Liability",
// and we provide no guarantee regarding the correctness of this source code.
//
//----------------------------------------------------------------------------------------

#include <algorithm>
#include <bit>
#include <cassert>
#include <cfenv>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <iostream>

#include "lp_2d.hpp"

//----------------------------------------------------------------------------------------
namespace wm::nlp::lp2d {

__m256i bounds( __m256i* tiny, __m256i* huge )
{
  __m256i ones = ones_i();
  __m256i sign = _mm256_slli_epi64( ones, 63 );
  __m256i a = _mm256_srli_epi64( sign, 1 ); // 0b0100'0000'0000...
  __m256i b = _mm256_srli_epi64( sign, 3 ); // 0b0001'0000'0000...
  __m256i c = _mm256_or_si256( a, b );      // 0b0101'0000'0000...
  __m256i d = _mm256_srli_epi64( c, 4 );    // 0b0000'0101'0000...

  *huge = _mm256_or_si256( c, d ); // 0b0101'0101'0000... 0x5500...

  __m256i e = _mm256_srli_epi64( *huge, 1 ); // 0b0010'1010'1000...
  __m256i f = _mm256_srli_epi64( d, 2 );     // 0b0000'0001'0100...
  __m256i g = _mm256_srli_epi64( d, 4 );     // 0b0000'0000'0101...
  __m256i h = _mm256_srli_epi64( ones, 1 );
  __m256i i = _mm256_srli_epi64( sign, 32 );

  __m256i j = _mm256_or_si256( f, g ); // 0b0000'0001'0101...

  *tiny = _mm256_or_si256( e, j ); // 0b0010'1011'1101... 0x2bd...
  __m256i k = and_not_i( i, h );
  *huge = blend< 0, 0, 0, 1 >( *huge, k );
  *tiny = blend< 0, 0, 0, 1 >( *tiny, zero_i() );
  return sign;
}
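
// Editor's sketch (illustrative, not called by the library): according to the
// lane comments above, bounds() leaves the bit patterns 0x5500'0000'0000'0000
// ( *huge ) and 0x2BD0'0000'0000'0000 ( *tiny ) in the first three lanes. Read
// as double exponent fields these are 2^( 0x550 - 0x3FF ) = 2^337 and
// 2^( 0x2BD - 0x3FF ) = 2^( -322 ); normalize_equations below compares exponent
// fields against them with integer comparisons. The function name is ours.
namespace
{
  [[maybe_unused]] void bounds_thresholds_sketch()
  {
    double huge = std::bit_cast< double >( 0x5500'0000'0000'0000ULL ); // 2^337
    double tiny = std::bit_cast< double >( 0x2BD0'0000'0000'0000ULL ); // 2^( -322 )
    std::printf( "huge = %g, tiny = %g\n", huge, tiny );
  }
}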

//----------------------------------------------------------------------------------------
// exact_location returns the exact location of the intersection ( x, y ) of the lines
//
// b0[ 0 ] x + b0[ 1 ] y >= b0[ 2 ]
// b1[ 0 ] x + b1[ 1 ] y >= b1[ 2 ]
//
// with respect to the set
//
// e0 x + e1 y >= e2
//
// under the assumption that
//
// d = b0[ 0 ] b1[ 1 ] - b0[ 1 ] b1[ 0 ] > 0
//
// where ei = edge[ i ], b0 = border[ 0 ] and b1 = border[ 1 ]
//
// This amounts to evaluating the sign of
//
// p = e0 r + e1 s - e2 d
//
// r = b0[ 2 ] b1[ 1 ] - b0[ 1 ] b1[ 2 ]
// s = b0[ 0 ] b1[ 2 ] - b0[ 2 ] b1[ 0 ] ,
//
// which motivates the evaluation of the six products
//
//       group 1                 group 2
//   e0 b0[ 2 ] b1[ 1 ],     -e0 b0[ 1 ] b1[ 2 ]
//   e1 b0[ 0 ] b1[ 2 ],     -e1 b0[ 2 ] b1[ 0 ]
//   e2 b0[ 1 ] b1[ 0 ],     -e2 b0[ 0 ] b1[ 1 ]
//
// and we want to find the sign of their sum.
//
// The function assumes that the edge entries are either normal or zero, i.e.,
// there are no subnormal entries.
//----------------------------------------------------------------------------------------
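
// Editor's sketch (illustrative, not called by the library): the quantity whose
// sign is sought, in plain double arithmetic and with none of the control of
// rounding errors that the SIMD code below provides. naive_location_sign is a
// hypothetical name used only for exposition.
namespace
{
  [[maybe_unused]] int naive_location_sign( double const e[ 3 ], double const b0[ 3 ], double const b1[ 3 ] )
  {
    double r = b0[ 2 ] * b1[ 1 ] - b0[ 1 ] * b1[ 2 ];
    double s = b0[ 0 ] * b1[ 2 ] - b0[ 2 ] * b1[ 0 ];
    double d = b0[ 0 ] * b1[ 1 ] - b0[ 1 ] * b1[ 0 ];
    double p = e[ 0 ] * r + e[ 1 ] * s - e[ 2 ] * d;
    return ( p > 0.0 ) - ( p < 0.0 ); // > 0: in, < 0: out, 0: border ( up to rounding )
  }
}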

location exact_location( __m256d e, __m256d b0, __m256d b1 )
{
  __m256d b0_a = permute< 1, 2, 0, 3 >( b0 ); // { b0[ 1 ], b0[ 2 ], b0[ 0 ], ? }
  __m256d b0_b = permute< 2, 0, 1, 3 >( b0 ); // { b0[ 2 ], b0[ 0 ], b0[ 1 ], ? }
  __m256d b1_a = permute< 1, 2, 0, 3 >( b1 ); // { b1[ 1 ], b1[ 2 ], b1[ 0 ], ? }
  __m256d b1_b = permute< 2, 0, 1, 3 >( b1 ); // { b1[ 2 ], b1[ 0 ], b1[ 1 ], ? }

  __m256d g1 = _mm256_mul_pd( b0_b, b1_a ); // { b0[ 2 ] b1[ 1 ], b0[ 0 ] b1[ 2 ], b0[ 1 ] b1[ 0 ], ? }
  __m256d g2 = _mm256_mul_pd( b0_a, b1_b ); // { b0[ 1 ] b1[ 2 ], b0[ 2 ] b1[ 0 ], b0[ 0 ] b1[ 1 ], ? }

  __m256d g1b = _mm256_fmsub_pd( b0_b, b1_a, g1 ); // rounding errors of the products in g1
  __m256d g2b = _mm256_fmsub_pd( b0_a, b1_b, g2 ); // rounding errors of the products in g2

  __m256i sign_i = _mm256_slli_epi64( ones_i(), 63 );
  __m256d sign = cast_d( sign_i );
  __m256d minus_e = xor_d( e, sign );

  __m256d c[ 8 ];

  c[ 0 ] = _mm256_mul_pd( e, g1 );
  c[ 1 ] = _mm256_mul_pd( minus_e, g2 );
  c[ 2 ] = _mm256_mul_pd( e, g1b );
  c[ 3 ] = _mm256_mul_pd( minus_e, g2b );

  c[ 4 ] = _mm256_fmsub_pd( e, g1, c[ 0 ] );
  c[ 5 ] = _mm256_fmsub_pd( minus_e, g2, c[ 1 ] );
  c[ 6 ] = _mm256_fmsub_pd( e, g1b, c[ 2 ] );
  c[ 7 ] = _mm256_fmsub_pd( minus_e, g2b, c[ 3 ] );

  __m256d a[ 6 ];

  // transposing the products

  __m256d tmp_0 = gather_low( c[ 0 ], c[ 1 ] );
  __m256d tmp_1 = gather_high( c[ 0 ], c[ 1 ] );
  __m256d tmp_2 = gather_low( c[ 2 ], c[ 3 ] );
  __m256d tmp_3 = gather_high( c[ 2 ], c[ 3 ] );

  a[ 0 ] = _mm256_permute2f128_pd( tmp_0, tmp_2, 0x20 );
  a[ 1 ] = _mm256_permute2f128_pd( tmp_1, tmp_3, 0x20 );
  a[ 2 ] = _mm256_permute2f128_pd( tmp_0, tmp_2, 0x31 );

  __m256d tmp_4 = gather_low( c[ 4 ], c[ 5 ] );
  __m256d tmp_5 = gather_high( c[ 4 ], c[ 5 ] );
  __m256d tmp_6 = gather_low( c[ 6 ], c[ 7 ] );
  __m256d tmp_7 = gather_high( c[ 6 ], c[ 7 ] );

  a[ 3 ] = _mm256_permute2f128_pd( tmp_4, tmp_6, 0x20 );
  a[ 4 ] = _mm256_permute2f128_pd( tmp_5, tmp_7, 0x20 );
  a[ 5 ] = _mm256_permute2f128_pd( tmp_4, tmp_6, 0x31 );
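
  // Editor's note: gather_low / gather_high presumably wrap
  // _mm256_unpacklo_pd / _mm256_unpackhi_pd, so that, for example,
  // tmp_0 = { c0[ 0 ], c1[ 0 ], c0[ 2 ], c1[ 2 ] }; after the
  // _mm256_permute2f128_pd steps, a[ j ][ k ] = c[ k ][ j ] and
  // a[ 3 + j ][ k ] = c[ 4 + k ][ j ] for j = 0, 1, 2. The 24 entries of
  // a[ 0..5 ] therefore add up exactly to p, and the loop below distills them,
  // keeping the running minima in a[ 0 ] and the running maxima in a[ 5 ],
  // until the sign of the total sum can be decided.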

  __m256d gt_05 = is_greater( a[ 0 ], a[ 5 ] );
  __m256d tmp = blend_max( a[ 0 ], a[ 5 ], gt_05 );
  a[ 0 ] = blend_min( a[ 0 ], a[ 5 ], gt_05 );
  a[ 5 ] = tmp;

  //----------------------------------------------------------------------------------------
  // In the loop below we assume that a0[ j ] <= a5[ j ], j = 0, 1, 2, 3
  //----------------------------------------------------------------------------------------

  while( true )
  {
    __m256d gt_12 = is_greater( a[ 1 ], a[ 2 ] );
    __m256d gt_34 = is_greater( a[ 3 ], a[ 4 ] );

    __m256d min_12 = blend_min( a[ 1 ], a[ 2 ], gt_12 );
    __m256d max_12 = blend_max( a[ 1 ], a[ 2 ], gt_12 );

    __m256d min_34 = blend_min( a[ 3 ], a[ 4 ], gt_34 );
    __m256d max_34 = blend_max( a[ 3 ], a[ 4 ], gt_34 );

    __m256d gt_min = is_greater( min_12, min_34 );
    __m256d gt_max = is_greater( max_12, max_34 );

    __m256d min_1234 = blend_min( min_12, min_34, gt_min );
    a[ 2 ] = blend_max( min_12, min_34, gt_min );

    a[ 3 ] = blend_min( max_12, max_34, gt_max );
    __m256d max_1234 = blend_max( max_12, max_34, gt_max );

    __m256d gt_0_min_1234 = is_greater( a[ 0 ], min_1234 );
    __m256d gt_max_1234_5 = is_greater( max_1234, a[ 5 ] );

    a[ 1 ] = blend_max( a[ 0 ], min_1234, gt_0_min_1234 );
    a[ 0 ] = blend_min( a[ 0 ], min_1234, gt_0_min_1234 );

    a[ 4 ] = blend_min( max_1234, a[ 5 ], gt_max_1234_5 );
    a[ 5 ] = blend_max( max_1234, a[ 5 ], gt_max_1234_5 );

    // 1) a0[ j ] = min { ai[ j ], i = 0, ..., 5 }
    // 2) a5[ j ] = max { ai[ j ], i = 0, ..., 5 }

    __m256d zero = zero_d();
    __m256d cmp0 = _mm256_cmp_pd( a[ 0 ], zero, _CMP_LT_OQ );

    if( _mm256_testz_pd( cmp0, cmp0 ) ) // all a0[ j ] are >= 0
    {
      __m256d cmp = is_greater( a[ 5 ], zero );
      if( _mm256_testz_pd( cmp, cmp ) ) // no entry in a5 is positive
        return location::border;
      else
        return location::in;
    }

    __m256d cmp5 = _mm256_cmp_pd( a[ 5 ], zero, _CMP_GT_OQ );

    if( _mm256_testz_pd( cmp5, cmp5 ) ) // all a5[ j ] are <= 0
    {
      __m256d cmp = is_greater( zero, a[ 0 ] );
      if( _mm256_testz_pd( cmp, cmp ) ) // no entry in a0 is negative
        return location::border;
      else
        return location::out;
    }

    // now
    // 1) min ai[ j ] has its sign bit set
    // 2) max ai[ j ] does not have its sign bit set

    __m256d lo = gather_low( a[ 0 ], a[ 5 ] );  // { a0[ 0 ], a5[ 0 ], a0[ 2 ], a5[ 2 ] }
    __m256d hi = gather_high( a[ 0 ], a[ 5 ] ); // { a0[ 1 ], a5[ 1 ], a0[ 3 ], a5[ 3 ] }

    __m256d gt_lo_hi = is_greater( lo, hi );
    __m256d b0 = blend_min( lo, hi, gt_lo_hi );
    __m256d b5 = blend_max( lo, hi, gt_lo_hi );

    // 1) min{ ai[ j ], i, j = 0, ..., 5 } is in { b0[ 0 ], b0[ 2 ] }
    // 2) max{ ai[ j ], i, j = 0, ..., 5 } is in { b5[ 1 ], b5[ 3 ] }

    b0 = permute< 0, 2, 1, 3 >( b0 );
    b5 = permute< 3, 1, 2, 0 >( b5 );

    // 1) min{ ai[ j ], i, j = 0, ..., 5 } is in { b0[ 0 ], b0[ 1 ] }
    // 2) max{ ai[ j ], i, j = 0, ..., 5 } is in { b5[ 0 ], b5[ 1 ] }

    lo = gather_low( b0, b5 );  // { b0[ 0 ], b5[ 0 ], b0[ 2 ], b5[ 2 ] }
    hi = gather_high( b0, b5 ); // { b0[ 1 ], b5[ 1 ], b0[ 3 ], b5[ 3 ] }

    gt_lo_hi = is_greater( lo, hi );
    b0 = blend_min( lo, hi, gt_lo_hi );
    b5 = blend_max( lo, hi, gt_lo_hi );

    // 1) min{ ai[ j ], i, j = 0, ..., 5 } is b0[ 0 ]
    // 2) max{ ai[ j ], i, j = 0, ..., 5 } is b5[ 1 ]

    b5 = permute< 1, 0, 3, 2 >( b5 );

    // 1) min{ ai[ j ], i, j = 0, ..., 5 } is b0[ 0 ] <= 0
    // 2) max{ ai[ j ], i, j = 0, ..., 5 } is b5[ 0 ] >= 0

    // fixing a possible break of order in positions 1, 2, 3

    __m256d chk123 = is_greater( b0, b5 );
    __m256d aux = blend_min( b0, b5, chk123 );
    b5 = blend_max( b0, b5, chk123 );
    b0 = aux;

    __m256i b0i = cast_i( b0 );
    __m256i b5i = cast_i( b5 );

    uint64_t u0 = _mm256_extract_epi64( b0i, 0 ) & 0x7FFF'FFFF'FFFF'FFFFULL;
    uint64_t u5 = _mm256_extract_epi64( b5i, 0 );

    if( u0 >= u5 ) // | b0[ 0 ] | >= | b5[ 0 ] |
    {
      if( u0 > u5 + 0x0060'0000'0000'0000ULL ) // | b0 | > 2^5 | b5 | ( we use 0x006... to handle subnormals properly )
        return location::out;
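
      // Editor's note: the three operations below are the Fast2Sum error-free
      // transformation. In scalar form, with | x | >= | y |,
      //
      //   s = x + y;  t = y - ( s - x );
      //
      // recovers x + y = s + t exactly in round-to-nearest; here the same steps
      // are applied to x = b0[ 0 ], y = b5[ 0 ] under the prevailing downward
      // rounding, and the "rounded down" comments below record the author's
      // claim that the decomposition is still exact.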
      __m256d sum = _mm256_add_pd( b0, b5 );    // b0 + b5, rounded down
      __m256d cor = _mm256_sub_pd( sum, b0 );   // sum - b0, rounded down
      __m256d delta = _mm256_sub_pd( b5, cor ); // b5 - cor, rounded down.
      // now b0[ 0 ] + b5[ 0 ] = sum[ 0 ] + delta[ 0 ], exactly, with sum[ 0 ] <= 0 <= delta[ 0 ]
      a[ 0 ] = blend< 1, 0, 0, 0 >( b0, sum );
      a[ 5 ] = blend< 1, 0, 0, 0 >( b5, delta );
    }
    else
    {
      if( u5 > u0 + 0x0060'0000'0000'0000ULL )
        return location::in;

      __m256i sign_i = _mm256_slli_epi64( ones_i(), 63 );
      __m256d sign = cast_d( sign_i );
      __m256d minus_b0 = xor_d( b0, sign );

      __m256d sum = _mm256_sub_pd( minus_b0, b5 );    // ( -b5 ) + ( -b0 ), rounded down
      __m256d cor = _mm256_add_pd( sum, b5 );         // sum - ( -b5 ), rounded down
      __m256d delta = _mm256_sub_pd( minus_b0, cor ); // ( -b0 ) - cor, rounded down.

      // -( b0[ 0 ] + b5[ 0 ] ) = sum[ 0 ] + delta[ 0 ], exactly, with sum[ 0 ] <= 0 <= delta[ 0 ]

      sum = xor_d( sum, sign );
      delta = xor_d( delta, sign );

      // b0[ 0 ] + b5[ 0 ] = sum[ 0 ] + delta[ 0 ], exactly, with delta[ 0 ] <= 0 <= sum[ 0 ]

      a[ 0 ] = blend< 1, 0, 0, 0 >( b0, delta );
      a[ 5 ] = blend< 1, 0, 0, 0 >( b5, sum );
    }
  }
}

//----------------------------------------------------------------------------------------
// bounded_convex::box: finding the box of the bounded convex region, under the assumption
// that the normals are normalized, that is, for each edge e, either e[ i ] = +0.0 or
// e[ i ] >= tiny ~ 2^(-100), i.e., there are no signed zeros, no underflow or overflow,
// and no negative coordinates.
//----------------------------------------------------------------------------------------
__m256d bounded_convex::box() const
{
  constexpr int non_0 = 0;
  constexpr int non_1 = 2;
  constexpr int inf_0 = 0;
  constexpr int sup_0 = 1;
  constexpr int inf_1 = 2;
  constexpr int sup_1 = 3;

  assert( type != bounded_convex_type::empty );

  __m256d den;
  __m256d num;
  __m256i sign = _mm256_slli_epi64( ones_i(), 63 ); // { sign_bit, sign_bit, sign_bit, sign_bit }
  __m256i sign_h = _mm256_slli_si256( sign, 8 );    // { 0, sign_bit, 0, sign_bit }

  if( type != bounded_convex_type::polygon )
  {
    if( type == bounded_convex_type::point )
    {
      // vertex_pair[ 0 ] = { x_inf, -x_sup, ?, ? }
      // vertex_pair[ 1 ] = { y_inf, -y_sup, ?, ? }
      // vertex_pair[ 2 ] = { z_inf, -z_sup, ?, ? }
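
      // Editor's note: storing { v_inf, -v_sup } lets the code bound both
      // endpoints of an interval while rounding in a single (downward)
      // direction: operations on the negated upper bound round it up in
      // absolute terms, and the xor with sign_h at the end of this function
      // restores the true signs of the negated lanes.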
      den = xor_d( vertex_p[ 2 ], sign_h );
      den = permute< sup_0, inf_0, sup_0, inf_0 >( den );
      num = gather_lanes< 0, 0 >( vertex_p[ 0 ], vertex_p[ 1 ] );
    }
    else
    {
      __m256d num_x;
      __m256d num_y;

      den = xor_d( vertex_p[ 2 ], sign_h );

      int xy_signs = high_bits_64( edge[ 1 ] ) & 0x3;

      switch( xy_signs )
      {
        case 0: // edge[ 0 ] >= 0, edge[ 1 ] >= 0: box = { x0, x1, y1, y0 }
        {
          num_x = permute< inf_0, sup_1, non_1, non_1 >( vertex_p[ 0 ] );
          num_y = permute< non_0, non_0, inf_1, sup_0 >( vertex_p[ 1 ] );

          den = permute< sup_0, inf_1, sup_1, inf_0 >( den );
          break;
        }
        case 1: // edge[ 0 ] < 0, edge[ 1 ] >= 0: box = { x0, x1, y0, y1 }
        {
          num_x = permute< inf_0, sup_1, non_1, non_1 >( vertex_p[ 0 ] );
          num_y = permute< non_0, non_0, inf_0, sup_1 >( vertex_p[ 1 ] );

          den = permute< sup_0, inf_0, sup_0, inf_1 >( den );
          break;
        }
        case 2: // edge[ 0 ] >= 0, edge[ 1 ] < 0: box = { x1, x0, y1, y0 }
        {
          num_x = permute< inf_1, sup_0, non_1, non_1 >( vertex_p[ 0 ] );
          num_y = permute< non_0, non_0, inf_1, sup_0 >( vertex_p[ 1 ] );

          den = permute< sup_1, inf_0, sup_1, inf_0 >( den );

          break;
        }
        case 3: // edge[ 0 ] < 0, edge[ 1 ] < 0: box = { x1, x0, y0, y1 }
        {
          num_x = permute< inf_1, sup_0, non_1, non_1 >( vertex_p[ 0 ] );
          num_y = permute< non_0, non_0, inf_0, sup_1 >( vertex_p[ 1 ] );

          den = permute< sup_1, inf_0, sup_0, inf_1 >( den );
          break;
        }
      }

      num = gather_lanes< 0, 1 >( num_x, num_y );
    }
  }
  else
  {
    int index = begin;

    auto set_x_max = [ & ]()
    {
      int previous_vertex = ( ( index > 0 ) ? index : end );
      int previous_pair = 3 * ( --previous_vertex / 2 );

      __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h );

      if( previous_vertex & 0x1 )
      {
        z = permute< non_0, inf_1, non_1, non_1 >( z );
        __m256d x = gather_lanes< 1, 1 >( vertex_p[ previous_pair ] );
        num = blend< 0, 1, 0, 0 >( num, x );
      }
      else
      {
        z = permute< non_0, inf_0, non_1, non_1 >( z );
        num = blend< 0, 1, 0, 0 >( num, vertex_p[ previous_pair ] );
      }

      den = blend< 0, 1, 0, 0 >( den, z );
    };

    auto set_x_min = [ & ]()
    {
      int previous_vertex = ( ( index > 0 ) ? index : end );
      int previous_pair = 3 * ( --previous_vertex / 2 );

      __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h );

      if( previous_vertex & 0x1 )
      {
        z = permute< sup_1, non_0, non_1, non_1 >( z );
        __m256d x = gather_lanes< 1, 1 >( vertex_p[ previous_pair ] );
        num = blend< 1, 0, 0, 0 >( num, x );
      }
      else
      {
        z = permute< sup_0, non_0, non_1, non_1 >( z );
        num = blend< 1, 0, 0, 0 >( num, vertex_p[ previous_pair ] );
      }

      den = blend< 1, 0, 0, 0 >( den, z );
    };

    auto set_y_max = [ & ]()
    {
      int previous_vertex = ( ( index > 0 ) ? index : end );
      int previous_pair = 3 * ( --previous_vertex / 2 );

      __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h );

      if( previous_vertex & 0x1 )
      {
        z = permute< non_0, non_0, non_1, inf_1 >( z );
        num = blend< 0, 0, 0, 1 >( num, vertex_p[ previous_pair + 1 ] );
      }
      else
      {
        z = permute< non_0, non_0, non_1, inf_0 >( z );
        __m256d y = gather_lanes< 0, 0 >( vertex_p[ previous_pair + 1 ] );
        num = blend< 0, 0, 0, 1 >( num, y );
      }

      den = blend< 0, 0, 0, 1 >( den, z );
    };

    auto set_y_min = [ & ]()
    {
      int previous_vertex = ( ( index > 0 ) ? index : end );
      int previous_pair = 3 * ( --previous_vertex / 2 );

      __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h );

      if( previous_vertex & 0x1 )
      {
        z = permute< non_1, non_1, sup_1, non_1 >( z );
        num = blend< 0, 0, 1, 0 >( num, vertex_p[ previous_pair + 1 ] );
      }
      else
      {
        z = permute< non_0, non_0, sup_0, non_0 >( z );
        __m256d y = gather_lanes< 0, 0 >( vertex_p[ previous_pair + 1 ] );
        num = blend< 0, 0, 1, 0 >( num, y );
      }

      den = blend< 0, 0, 1, 0 >( den, z );
    };
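
    // Editor's note on the state machine below: the two sign bits of the edge
    // normal ( xy_signs ) identify the quadrant the normal lies in, and the
    // transitions suggest the edges are visited in order of increasing normal
    // angle, so the quadrant only advances (possibly skipping one). Each
    // crossing from one quadrant to the next makes the vertex between the two
    // edges extremal in x or in y, and the corresponding set_* lambda pins that
    // coordinate of the box. The _even / _odd label pairs track which 128-bit
    // lane of its vertex pair the current vertex occupies.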
    int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

    switch( xy_signs )
    {
      case 0: // normal[ 0 ] >= 0, normal[ 1 ] >= 0: first quadrant
      {
        if( index & 0x1 )
          goto quadrant_1_odd;
        else
          goto quadrant_1_even;
      }
      case 1: // normal[ 0 ] < 0, normal[ 1 ] >= 0: second quadrant
      {
        if( index & 0x1 )
          goto quadrant_2_odd;
        else
          goto quadrant_2_even;
      }
      case 2: // normal[ 0 ] >= 0, normal[ 1 ] < 0: fourth quadrant
      {
        if( index & 0x1 )
          goto quadrant_4_odd;
        else
          goto quadrant_4_even;
      }
      case 3: // normal[ 0 ] < 0, normal[ 1 ] < 0: third quadrant
      {
        if( index & 0x1 )
          goto quadrant_3_odd;
        else
          goto quadrant_3_even;
      }
    }

    quadrant_1_even:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 0: // quadrant_1
          goto quadrant_1_odd;
        case 1: // quadrant_2
        {
          set_y_min();
          goto quadrant_2_odd;
        }
        case 3: // quadrant_3
        {
          set_x_max();
          set_y_min();
          goto quadrant_3_odd;
        }
      }
    }

    quadrant_1_odd:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 0: // quadrant_1
          goto quadrant_1_even;
        case 1: // quadrant_2
        {
          set_y_min();
          goto quadrant_2_even;
        }
        case 3: // quadrant_3
        {
          set_x_max();
          set_y_min();
          goto quadrant_3_even;
        }
      }
    }

    quadrant_2_even:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 1: // quadrant_2
          goto quadrant_2_odd;
        case 2: // quadrant_4
        {
          set_x_max();
          set_y_max();
          goto quadrant_4_odd;
        }
        case 3: // quadrant_3
        {
          set_x_max();
          goto quadrant_3_odd;
        }
      }
    }

    quadrant_2_odd:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 1: // quadrant_2
          goto quadrant_2_even;
        case 2: // quadrant_4
        {
          set_x_max();
          set_y_max();
          goto quadrant_4_even;
        }
        case 3: // quadrant_3
        {
          set_x_max();
          goto quadrant_3_even;
        }
      }
    }

    quadrant_3_even:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 0: // quadrant_1
        {
          set_y_max();
          set_x_min();
          goto quadrant_1_odd;
        }
        case 2: // quadrant_4
        {
          set_y_max();
          goto quadrant_4_odd;
        }
        case 3: // quadrant_3
          goto quadrant_3_odd;
      }
    }

    quadrant_3_odd:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 0: // quadrant_1
        {
          set_x_min();
          set_y_max();
          goto quadrant_1_even;
        }
        case 2: // quadrant_4
        {
          set_y_max();
          goto quadrant_4_even;
        }
        case 3: // quadrant_3
          goto quadrant_3_even;
      }
    }

    quadrant_4_even:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 0: // quadrant_1
        {
          set_x_min();
          goto quadrant_1_odd;
        }
        case 1: // quadrant_2
        {
          set_x_min();
          set_y_min();
          goto quadrant_2_odd;
        }
        case 2: // quadrant_4
          goto quadrant_4_odd;
      }
    }

    quadrant_4_odd:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 0: // quadrant_1
        {
          set_x_min();
          goto quadrant_1_even;
        }
        case 1: // quadrant_2
        {
          set_x_min();
          set_y_min();
          goto quadrant_2_even;
        }
        case 2: // quadrant_4
          goto quadrant_4_even;
      }
    }

    done: {}
  }

  num = _mm256_div_pd( num, den );
  num = xor_d( num, sign_h );
  return num;
}
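
// Editor's note: vertices are stored two per "vertex pair", a pair being three
// consecutive __m256d ( x, y and z components ) with one vertex per 128-bit
// lane. Shifting the vertices by one position therefore moves data by half a
// register, which is why insert_vertex and remove_vertices splice lanes with
// gather_lanes / blend instead of doing plain block copies, and why the even /
// odd parity of begin, end and the insertion point matters throughout.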
int bounded_convex::insert_vertex( int at )
{
  int high = end - at;
  int low = at - begin;
  assert( ( low > 0 ) && ( high > 0 ) );

  if( low <= high )
  {
    __m256d* ed = edge + ( begin - 1 );
    __m256d* ed_max = ed + low;

    for( ; ed < ed_max; ++ed )
    {
      __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( ed + 1 ) );
      _mm256_store_pd( reinterpret_cast< double* >( ed ), tmp );
    }

    int to_load;

    __m256d* vp = vertex_pair( begin );

    __m256d a0 = _mm256_load_pd( reinterpret_cast< double* >( vp ) );
    __m256d a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 1 ) );
    __m256d a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 2 ) );

    if( ( begin & 0x1 ) == 0 )
    {
      __m256d c0 = gather_lanes< clear_lane, 0 >( a0 );
      __m256d c1 = gather_lanes< clear_lane, 0 >( a1 );
      __m256d c2 = gather_lanes< clear_lane, 0 >( a2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp - 3 ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp - 2 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp - 1 ), c2 );

      if( low == 1 )
      {
        --begin;
        return at - 1;
      }
      to_load = low - 2;
    }
    else
      to_load = low - 1;

    int q = to_load / 4;
    __m256d* vp_max = vp + 6 * q;

    while( vp < vp_max )
    {
      __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) );
      __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) );
      __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) );

      __m256d c0 = gather_lanes< 1, 0 >( a0, b0 );
      __m256d c1 = gather_lanes< 1, 0 >( a1, b1 );
      __m256d c2 = gather_lanes< 1, 0 >( a2, b2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );

      vp += 3;

      a0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) );
      a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) );
      a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) );

      __m256d d0 = gather_lanes< 1, 0 >( b0, a0 );
      __m256d d1 = gather_lanes< 1, 0 >( b1, a1 );
      __m256d d2 = gather_lanes< 1, 0 >( b2, a2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), d0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 );

      vp += 3;
    }

    int r = to_load % 4;

    if( r == 0 )
    {
      __m256d c0 = gather_lanes< 1, clear_lane >( a0 );
      __m256d c1 = gather_lanes< 1, clear_lane >( a1 );
      __m256d c2 = gather_lanes< 1, clear_lane >( a2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );
    }
    else
    {
      __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) );
      __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) );
      __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) );

      __m256d c0 = gather_lanes< 1, 0 >( a0, b0 );
      __m256d c1 = gather_lanes< 1, 0 >( a1, b1 );
      __m256d c2 = gather_lanes< 1, 0 >( a2, b2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );

      switch( r )
      {
        case 1:
          goto low_end;
        case 2:
        {
          __m256d c0 = gather_lanes< 1, clear_lane >( b0 );
          __m256d c1 = gather_lanes< 1, clear_lane >( b1 );
          __m256d c2 = gather_lanes< 1, clear_lane >( b2 );

          _mm256_store_pd( reinterpret_cast< double* >( vp + 3 ), c0 );
          _mm256_store_pd( reinterpret_cast< double* >( vp + 4 ), c1 );
          _mm256_store_pd( reinterpret_cast< double* >( vp + 5 ), c2 );
          goto low_end;
        }
        default: // case 3
        {
          vp += 3;
          a0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) );
          a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) );
          a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) );

          __m256d c3 = gather_lanes< 1, 0 >( b0, a0 );
          __m256d c4 = gather_lanes< 1, 0 >( b1, a1 );
          __m256d c5 = gather_lanes< 1, 0 >( b2, a2 );

          _mm256_store_pd( reinterpret_cast< double* >( vp ), c3 );
          _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c4 );
          _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c5 );
        }
      }
    }

    low_end:
    {
      --begin;
      return at - 1;
    }
  }
  else
  {
    __m256d* ed = edge + end + 1;
    __m256d* ed_min = ed - ( high + 1 );

    for( ; ed > ed_min; --ed )
    {
      __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( ed - 1 ) );
      _mm256_store_pd( reinterpret_cast< double* >( ed ), tmp );
    }

    __m256d* vp = vertex_pair( end );

    __m256d a0;
    __m256d a1;
    __m256d a2;

    int to_load;

    if( ( end & 0x1 ) == 0 )
    {
      a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) );
      a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) );
      a2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) );

      __m256d c0 = gather_lanes< 1, clear_lane >( a0 );
      __m256d c1 = gather_lanes< 1, clear_lane >( a1 );
      __m256d c2 = gather_lanes< 1, clear_lane >( a2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );

      if( high == 1 )
      {
        ++end;
        return at;
      }

      vp -= 3;
      to_load = high - 2;
    }
    else
    {
      to_load = high - 1;
      a0 = _mm256_load_pd( reinterpret_cast< double* >( vp ) );
      a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 1 ) );
      a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 2 ) );
    }

    int q = to_load / 4;

    __m256d* vp_min = vp - 6 * q;

    while( vp > vp_min )
    {
      __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) );
      __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) );
      __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) );

      __m256d c0 = gather_lanes< 1, 0 >( b0, a0 );
      __m256d c1 = gather_lanes< 1, 0 >( b1, a1 );
      __m256d c2 = gather_lanes< 1, 0 >( b2, a2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );

      vp -= 3;

      a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) );
      a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) );
      a2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) );

      __m256d d0 = gather_lanes< 1, 0 >( a0, b0 );
      __m256d d1 = gather_lanes< 1, 0 >( a1, b1 );
      __m256d d2 = gather_lanes< 1, 0 >( a2, b2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), d0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 );

      vp -= 3;
    }

    int r = to_load % 4;

    if( r == 0 )
    {
      __m256d c0 = gather_lanes< clear_lane, 0 >( a0 );
      __m256d c1 = gather_lanes< clear_lane, 0 >( a1 );
      __m256d c2 = gather_lanes< clear_lane, 0 >( a2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );
    }
    else
    {
      __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) );
      __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) );
      __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) );

      __m256d c0 = gather_lanes< 1, 0 >( b0, a0 );
      __m256d c1 = gather_lanes< 1, 0 >( b1, a1 );
      __m256d c2 = gather_lanes< 1, 0 >( b2, a2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );

      switch( r )
      {
        case 1:
          goto high_end;
        case 2:
        {
          __m256d d0 = gather_lanes< clear_lane, 0 >( b0 );
          __m256d d1 = gather_lanes< clear_lane, 0 >( b1 );
          __m256d d2 = gather_lanes< clear_lane, 0 >( b2 );

          _mm256_store_pd( reinterpret_cast< double* >( vp - 3 ), d0 );
          _mm256_store_pd( reinterpret_cast< double* >( vp - 2 ), d1 );
          _mm256_store_pd( reinterpret_cast< double* >( vp - 1 ), d2 );
          goto high_end;
        }
        default: // case 3
        {
          vp -= 3;

          a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) );
          a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) );
          a2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) );

          __m256d d0 = gather_lanes< 1, 0 >( a0, b0 );
          __m256d d1 = gather_lanes< 1, 0 >( a1, b1 );
          __m256d d2 = gather_lanes< 1, 0 >( a2, b2 );

          _mm256_store_pd( reinterpret_cast< double* >( vp ), d0 );
          _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 );
          _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 );
        }
      }
    }

    high_end:
    {
      ++end;
      return at;
    }
  }
}
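
// Editor's note: _mm256_movemask_pd collects the sign bits of the constraint
// { a, b, c, ? } for a x + b y >= c; the low three bits select a locator
// specialization, so the sign tests are presumably resolved at compile time
// inside locator< sa, sb, sc >::insert.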
void bounded_convex::push( __m256d* begin, __m256d* end )
{
  for( ; begin < end; ++begin )
  {
    int sign = _mm256_movemask_pd( *begin ) & 0x7;

    switch( sign )
    {
      case 0b000: { locator< 0, 0, 0 >::insert( *this, *begin ); break; }
      case 0b001: { locator< 1, 0, 0 >::insert( *this, *begin ); break; }
      case 0b010: { locator< 0, 1, 0 >::insert( *this, *begin ); break; }
      case 0b011: { locator< 1, 1, 0 >::insert( *this, *begin ); break; }
      case 0b100: { locator< 0, 0, 1 >::insert( *this, *begin ); break; }
      case 0b101: { locator< 1, 0, 1 >::insert( *this, *begin ); break; }
      case 0b110: { locator< 0, 1, 1 >::insert( *this, *begin ); break; }
      case 0b111: { locator< 1, 1, 1 >::insert( *this, *begin ); break; }
    }

    if( type == bounded_convex_type::empty )
      return;
  }
}
int bounded_convex::remove_vertices( int begin_vertex, int end_vertex )
{
  int re = end_vertex & 0x1;
  int rb = begin_vertex & 0x1;
  int low = begin_vertex - begin;
  int high = end - end_vertex;

  if( low <= high )
  {
    __m256d* e_dst = edge + end_vertex;
    __m256d* e_dst_min = e_dst - low;
    __m256d* e_src = edge + begin_vertex;

    while( --e_dst >= e_dst_min )
    {
      __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( --e_src ) );
      _mm256_store_pd( reinterpret_cast< double* >( e_dst ), tmp );
    }

    __m256d* v_dst = vertex_pair( end_vertex );
    __m256d* v_src = vertex_pair( begin_vertex );

    if( re == rb )
    {
      int q;

      if( re & 0x1 )
      {
        __m256d s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        __m256d s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        __m256d s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) );
        __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) );
        __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) );

        __m256d b0 = blend< 0, 0, 1, 1 >( s0, d0 );
        __m256d b1 = blend< 0, 0, 1, 1 >( s1, d1 );
        __m256d b2 = blend< 0, 0, 1, 1 >( s2, d2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), b0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), b1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), b2 );

        q = low;
      }
      else
        q = ( low + 1 );

      q /= 2;
      __m256d* v_src_min = v_src - 3 * q;

      while( v_src > v_src_min )
      {
        --v_src;
        --v_dst;
        __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), tmp );
      }
    }
    else
    {
      __m256d s0;
      __m256d s1;
      __m256d s2;

      int to_load;

      if( rb == 0 )
      {
        v_src -= 3;

        s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d p0 = gather_lanes< 1, clear_lane >( s0 );
        __m256d p1 = gather_lanes< 1, clear_lane >( s1 );
        __m256d p2 = gather_lanes< 1, clear_lane >( s2 );

        __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) );
        __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) );
        __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) );

        __m256d b0 = blend< 0, 0, 1, 1 >( p0, d0 );
        __m256d b1 = blend< 0, 0, 1, 1 >( p1, d1 );
        __m256d b2 = blend< 0, 0, 1, 1 >( p2, d2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), b0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), b1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), b2 );

        if( low == 1 )
          goto low_end;

        to_load = low - 2;
      }
      else
      {
        to_load = low - 1;

        s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );
      }

      int q = to_load / 4;
      __m256d* v_src_min = v_src - 6 * q;

      while( v_src > v_src_min )
      {
        v_src -= 3;
        v_dst -= 3;

        __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d c0 = gather_lanes< 1, 0 >( b0, s0 );
        __m256d c1 = gather_lanes< 1, 0 >( b1, s1 );
        __m256d c2 = gather_lanes< 1, 0 >( b2, s2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 );

        v_dst -= 3;
        v_src -= 3;

        s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d d0 = gather_lanes< 1, 0 >( s0, b0 );
        __m256d d1 = gather_lanes< 1, 0 >( s1, b1 );
        __m256d d2 = gather_lanes< 1, 0 >( s2, b2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 );
      }

      v_dst -= 3;
      int r = to_load % 4;

      if( r == 0 )
      {
        __m256d c0 = gather_lanes< clear_lane, 0 >( s0 );
        __m256d c1 = gather_lanes< clear_lane, 0 >( s1 );
        __m256d c2 = gather_lanes< clear_lane, 0 >( s2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 );
      }
      else
      {
        v_src -= 3;
        __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d c0 = gather_lanes< 1, 0 >( b0, s0 );
        __m256d c1 = gather_lanes< 1, 0 >( b1, s1 );
        __m256d c2 = gather_lanes< 1, 0 >( b2, s2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 );

        switch( r )
        {
          case 1:
            goto low_end;
          case 2:
          {
            v_dst -= 3;

            __m256d d0 = gather_lanes< clear_lane, 0 >( b0 );
            __m256d d1 = gather_lanes< clear_lane, 0 >( b1 );
            __m256d d2 = gather_lanes< clear_lane, 0 >( b2 );

            _mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 );
            goto low_end;
          }
          default: // case 3
          {
            v_dst -= 3;
            v_src -= 3;

            s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
            s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
            s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

            __m256d d0 = gather_lanes< 1, 0 >( s0, b0 );
            __m256d d1 = gather_lanes< 1, 0 >( s1, b1 );
            __m256d d2 = gather_lanes< 1, 0 >( s2, b2 );

            _mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 );

            v_dst -= 3;

            s0 = gather_lanes< clear_lane, 0 >( s0, b0 );
            s1 = gather_lanes< clear_lane, 0 >( s1, b1 );
            s2 = gather_lanes< clear_lane, 0 >( s2, b2 );

            _mm256_store_pd( reinterpret_cast< double* >( v_dst ), s0 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), s1 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), s2 );
          }
        }
      }
    }

    low_end:
    {
      begin += ( end_vertex - begin_vertex );
      return end_vertex - 1;
    }
  }
  else
  {
    __m256d* e_dst = edge + begin_vertex;
    __m256d* e_src = edge + end_vertex;
    __m256d* e_src_max = e_src + high + 1;

    for( ; e_src < e_src_max; ++e_src, ++e_dst )
    {
      __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( e_src ) );
      _mm256_store_pd( reinterpret_cast< double* >( e_dst ), tmp );
    }

    __m256d s0;
    __m256d s1;
    __m256d s2;

    __m256d* v_dst = vertex_pair( begin_vertex );
    __m256d* v_src = vertex_pair( end_vertex );

    if( re == rb )
    {
      if( re & 0x1 )
      {
        s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) );
        __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) );
        __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) );

        __m256d c0 = blend< 0, 0, 1, 1 >( d0, s0 );
        __m256d c1 = blend< 0, 0, 1, 1 >( d1, s1 );
        __m256d c2 = blend< 0, 0, 1, 1 >( d2, s2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 );

        --high;
        v_src += 3;
        v_dst += 3;
      }

      int q = 3 * ( ( high + 1 ) / 2 );

      __m256d* v_src_max = vertex_pair( end + 1 );
      for( ; v_src < v_src_max; ++v_src, ++v_dst )
      {
        __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), tmp );
      }
    }
    else
    {
      int to_load;

      __m256d s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
      __m256d s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
      __m256d s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

      if( rb ) // re = 0
      {
        __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) );
        __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) );
        __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) );

        __m256d g0 = gather_lanes< 0, 0 >( d0, s0 );
        __m256d g1 = gather_lanes< 0, 0 >( d1, s1 );
        __m256d g2 = gather_lanes< 0, 0 >( d2, s2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), g0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), g1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), g2 );

        if( high == 1 )
          goto high_end;

        v_dst += 3;
        to_load = high - 2;
      }
      else // re = 1 => only the second lane of the si are used
        to_load = high - 1;

      int q = to_load / 4;
      __m256d* v_src_max = v_src + 6 * q;

      while( v_src < v_src_max )
      {
        v_src += 3;

        __m256d t0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        __m256d t1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        __m256d t2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d u0 = gather_lanes< 1, 0 >( s0, t0 );
        __m256d u1 = gather_lanes< 1, 0 >( s1, t1 );
        __m256d u2 = gather_lanes< 1, 0 >( s2, t2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 );

        v_dst += 3;
        v_src += 3;

        s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d v0 = gather_lanes< 1, 0 >( t0, s0 );
        __m256d v1 = gather_lanes< 1, 0 >( t1, s1 );
        __m256d v2 = gather_lanes< 1, 0 >( t2, s2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 );

        v_dst += 3;
      }

      int r = to_load % 4;

      if( r == 0 )
      {
        __m256d u0 = gather_lanes< 1, clear_lane >( s0 );
        __m256d u1 = gather_lanes< 1, clear_lane >( s1 );
        __m256d u2 = gather_lanes< 1, clear_lane >( s2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 );
      }
      else
      {
        v_src += 3;

        __m256d t0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        __m256d t1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        __m256d t2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d u0 = gather_lanes< 1, 0 >( s0, t0 );
        __m256d u1 = gather_lanes< 1, 0 >( s1, t1 );
        __m256d u2 = gather_lanes< 1, 0 >( s2, t2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 );

        switch( r )
        {
          case 1:
            goto high_end;
          case 2:
          {
            v_dst += 3;
            __m256d v0 = gather_lanes< 1, clear_lane >( t0 );
            __m256d v1 = gather_lanes< 1, clear_lane >( t1 );
            __m256d v2 = gather_lanes< 1, clear_lane >( t2 );

            _mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 );

            goto high_end;
          }
          default: // case 3
          {
            v_dst += 3;
            v_src += 3;

            s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
            s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
            s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

            __m256d v0 = gather_lanes< 1, 0 >( t0, s0 );
            __m256d v1 = gather_lanes< 1, 0 >( t1, s1 );
            __m256d v2 = gather_lanes< 1, 0 >( t2, s2 );

            _mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 );
          }
        }
      }
    }

    high_end:
    {
      end -= ( end_vertex - begin_vertex );
      return begin_vertex - 1;
    }
  }
}
void bounded_convex::set_to_last_segment()
{
  int last = end - 1;
  edge[ 0 ] = edge[ last ];
  edge[ 1 ] = edge[ end ];
  edge[ 2 ] = edge[ begin + 1 ];

  __m256d* vl = vertex_pair( last );
  __m256d* vb = vertex_pair( begin );

  if( last & 0x1 )
  {
    if( begin & 0x1 )
    {
      vertex_p[ 0 ] = gather_lanes< 1, 1 >( vl[ 0 ], vb[ 0 ] );
      vertex_p[ 1 ] = gather_lanes< 1, 1 >( vl[ 1 ], vb[ 1 ] );
      vertex_p[ 2 ] = gather_lanes< 1, 1 >( vl[ 2 ], vb[ 2 ] );
    }
    else
    {
      vertex_p[ 0 ] = gather_lanes< 1, 0 >( vl[ 0 ], vb[ 0 ] );
      vertex_p[ 1 ] = gather_lanes< 1, 0 >( vl[ 1 ], vb[ 1 ] );
      vertex_p[ 2 ] = gather_lanes< 1, 0 >( vl[ 2 ], vb[ 2 ] );
    }
  }
  else
  {
    if( begin & 0x1 )
    {
      vertex_p[ 0 ] = blend< 0, 0, 1, 1 >( vl[ 0 ], vb[ 0 ] );
      vertex_p[ 1 ] = blend< 0, 0, 1, 1 >( vl[ 1 ], vb[ 1 ] );
      vertex_p[ 2 ] = blend< 0, 0, 1, 1 >( vl[ 2 ], vb[ 2 ] );
    }
    else
    {
      vertex_p[ 0 ] = gather_lanes< 0, 0 >( vl[ 0 ], vb[ 0 ] );
      vertex_p[ 1 ] = gather_lanes< 0, 0 >( vl[ 1 ], vb[ 1 ] );
      vertex_p[ 2 ] = gather_lanes< 0, 0 >( vl[ 2 ], vb[ 2 ] );
    }
  }

  end = 2;
  begin = 0;
  type = bounded_convex_type::segment;
}
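
// Editor's note: like bounds() above, setup() builds its floating point
// constants with integer bit tricks: ones >> 54 leaves 0x3FF in each 64-bit
// element, and shifting that left by 52 produces 0x3FF0'0000'0000'0000, the bit
// pattern of 1.0; ones << 63 isolates the sign bit, and _mm256_slli_si256 by 8
// bytes keeps it only in the upper double of each 128-bit lane.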
void bounded_convex::setup( __m256d box )
{
  type = bounded_convex_type::polygon;

  __m256d zero = zero_d();
  __m256i ones = ones_i();
  __m256i signi = _mm256_slli_epi64( ones, 63 );
  __m256i signi_h = _mm256_slli_si256( signi, 8 );
  __m256d sign = _mm256_castsi256_pd( signi );
  __m256d sign_h = _mm256_castsi256_pd( signi_h );
  __m256i onei = _mm256_srli_epi64( ones, 54 );
  onei = _mm256_slli_epi64( onei, 52 ); // { 1.0, 1.0, 1.0, 1.0 }
  __m256d one = _mm256_castsi256_pd( onei );

  box = _mm256_max_pd( box, zero );

  begin = alloc / 2;
  end = begin + 4;
  __m256d* e = edge + begin;
  __m256d x_box = gather_lanes< 0, 0 >( box ); // { x_min, x_max, x_min, x_max }
  __m256d y_box = gather_lanes< 1, 1 >( box ); // { y_min, y_max, y_min, y_max }

  e[ 0 ] = blend< 0, 1, 0, 0 >( one, zero );     // { 1, 0, 0, 0 }
  e[ 0 ] = blend< 0, 0, 1, 1 >( e[ 0 ], x_box ); // { 1, 0, x_min, x_max }, for x >= x_min

  e[ 1 ] = permute< 1, 0, 2, 3 >( e[ 0 ] );      // { 0, 1, x_min, x_max }
  e[ 1 ] = blend< 0, 0, 1, 1 >( e[ 1 ], y_box ); // { 0, 1, y_min, y_max }, for y >= y_min

  e[ 2 ] = permute< 0, 1, 3, 2 >( e[ 0 ] ); // { 1, 0, x_max, x_min }
  e[ 2 ] = _mm256_xor_pd( e[ 2 ], sign );   // { -1, -0, -x_max, -x_min }, for -x >= -x_max

  e[ 3 ] = permute< 0, 1, 3, 2 >( e[ 1 ] ); // { 0, 1, y_max, y_min }
  e[ 3 ] = _mm256_xor_pd( e[ 3 ], sign );   // { -0, -1, -y_max, -y_min }, for -y >= -y_max

  e[ 4 ] = e[ 0 ]; // { 1, 0, x_min, x_max }

  // initial vertices

  __m256d* vp = vertex_pair( begin );

  vp[ 0 ] = permute< 0, 0, 3, 3 >( x_box );   // { x_min, x_min, x_max, x_max }: vertices 0 and 1
  vp[ 0 ] = _mm256_xor_pd( vp[ 0 ], sign_h ); // { x_min, -x_min, x_max, -x_max }

  vp[ 1 ] = permute< 0, 0, 2, 2 >( y_box );   // { y_min, y_min, y_min, y_min }
  vp[ 1 ] = _mm256_xor_pd( vp[ 1 ], sign_h ); // { y_min, -y_min, y_min, -y_min }

  vp[ 2 ] = _mm256_xor_pd( one, sign_h ); // { 1.0, -1.0, 1.0, -1.0 }

  vp[ 3 ] = permute< 1, 1, 2, 2 >( x_box );   // { x_max, x_max, x_min, x_min }: vertices 2 and 3
  vp[ 3 ] = _mm256_xor_pd( vp[ 3 ], sign_h ); // { x_max, -x_max, x_min, -x_min }

  vp[ 4 ] = permute< 1, 1, 3, 3 >( y_box );   // { y_max, y_max, y_max, y_max }
  vp[ 4 ] = _mm256_xor_pd( vp[ 4 ], sign_h ); // { y_max, -y_max, y_max, -y_max }

  vp[ 5 ] = vp[ 2 ];
}
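
//----------------------------------------------------------------------------------------
// normalize_equations (editor's summary, from the code below): scales each
// constraint of each quadrant by a power of two so that its largest coefficient
// exponent lands at "huge", drops constraints that are identically zero or whose
// right hand side is negligible, and replaces coefficients below "tiny" by 0 or
// -tiny in a way that only enlarges the solution set. The returned bitmask has
// bit iq set when quadrant iq is dropped because its right hand side is too
// large to satisfy.
//----------------------------------------------------------------------------------------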
int normalize_equations( __m256d* quadrant[ 4 ], int n, __m256i tiny, __m256i huge )
{
  constexpr unsigned long long x_index_epi8 = 0x1 << 4;
  constexpr unsigned long long y_index_epi8 = 0x1 << 12;
  constexpr unsigned long long z_index_epi8 = 0x1 << 16;

  int ret = 0;
  __m256i ones = ones_i();                           // { 0xFFFF'FFFF'FFFF'FFFF, ... }
  __m256i exp_mask = _mm256_srli_epi64( ones, 53 );  // { 0x0000'0000'0000'07FF, ... }
  __m256i one = _mm256_srli_epi64( ones, 54 );       // { 0x0000'0000'0000'03FF, ... }
  __m256i sign = _mm256_slli_epi64( ones, 63 );      // { 0x8000'0000'0000'0000, ... }
  exp_mask = _mm256_slli_epi64( exp_mask, 52 );      // { 0x7FF0'0000'0000'0000, ... }
  one = _mm256_slli_epi64( one, 52 );                // { 0x3FF0'0000'0000'0000, ... }
  sign = gather_lanes< 1, clear_lane >( sign );      // { sign_bit, sign_bit, 0, 0 }

  for( int iq = 0; iq < 4; ++iq )
  {
    __m256d* end = quadrant[ iq ] + n;

    while( quadrant[ iq ] < end )
    {
      __m256d* q = quadrant[ iq ];
      __m256i exp = and_i( *q, exp_mask );
      __m256i cmp_huge = _mm256_cmpgt_epi32( exp, huge );

      if( _mm256_testz_si256( cmp_huge, cmp_huge ) ) // no huge entries
      {
        __m256i cmp_tiny = _mm256_cmpgt_epi32( tiny, exp );

        if( _mm256_testz_si256( cmp_tiny, cmp_tiny ) ) // no entries are huge or tiny
        {
          ++quadrant[ iq ];
          continue;
        }

        // no entries are huge, but some are tiny. We scale by 2^( huge - max_exp[ i ] )

        __m256i exp_b = permute< 1, 0, 2, 3 >( exp );  // { exp[ 1 ], exp[ 0 ], exp[ 2 ], ? }
        __m256i exp_c = gather_lanes< 1, 1 >( exp );   // { exp[ 2 ], exp[ 3 ], exp[ 2 ], ? }
        __m256i max = _mm256_max_epi32( exp, exp_b );  // { max( exp[ 0 ], exp[ 1 ] ), ?, ?, ? }
        max = _mm256_max_epi32( max, exp_c );          // { max( exp[ 0 ], exp[ 1 ], exp[ 2 ] ), ... }

        int64_t int_max = _mm256_extract_epi64( max, 0 );
        if( int_max == 0 ) // all exponents are zero.
        {
          __m256d cmp = _mm256_cmp_pd( *q, zero_d(), _CMP_EQ_OQ );
          if( ( _mm256_movemask_pd( cmp ) & 0x7 ) == 0x7 ) // all coefficients are zero
          { // drop the constraint
            *q = *( --end );
            continue;
          }
        }

        max = permute< 0, 0, 0, 0 >( max );
        __m256i de = _mm256_sub_epi32( huge, max );

        // de may be larger than 0x3ff and we must scale in two steps

        *q = xor_d( *q, sign );
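
        // Editor's note: de holds ( huge_exp - max_exp ) << 52, a scaling
        // exponent expressed directly in the exponent field. Adding half of it
        // to the bit pattern of 1.0 yields the double 2^floor( k / 2 ) for
        // k = huge_exp - max_exp, so the two multiplications below scale by
        // 2^floor( k / 2 ) * 2^ceil( k / 2 ) = 2^k; a single factor 2^k need
        // not be representable, since k may exceed the largest double exponent,
        // 0x3FF.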
__m256i hs = _mm256_srli_epi64( de, 1 ); // de/2
|
||
|
hs = _mm256_and_si256( hs, exp_mask ); // throwing away the remainder
|
||
|
__m256d scale = cast_d( _mm256_add_epi64( hs, one ) );
|
||
|
*q = _mm256_mul_pd( scale, *q );
|
||
|
|
||
|
__m256i ls = _mm256_sub_epi64( de, hs ); // de - hs
|
||
|
scale = cast_d( _mm256_add_epi64( ls, one ) );
|
||
|
*q = _mm256_mul_pd( scale, *q );
|
||
|
}
|
||
|
else // some huge entries
|
||
|
{
|
||
|
__m256i exp_b = permute< 1, 0, 2, 3 >( exp ); // { exp[ 1 ], exp[ 0 ], exp[ 2 ], ? }
|
||
|
__m256i exp_c = gather_lanes< 1, 1 >( exp ); // { exp[ 2 ], exp[ 3 ], exp[ 2 ], ? }
|
||
|
__m256i max = _mm256_max_epi32( exp, exp_b ); // { max( exp[ 0 ], exp[ 1 ] ), ?. ?. ? }
|
||
|
max = _mm256_max_epi32( max, exp_c ); // { max{ exp[ 0 ], exp[ 1 ], exp[ 2 ] ), ....}
|
||
|
|
||
|
max = permute< 0, 0, 0, 0 >( max );
|
||
|
__m256i de = _mm256_sub_epi32( huge, max ); // de > -0x3ff
|
||
|
__m256d scale = cast_d( _mm256_add_epi64( de, one ) ); // this does not underflow
|
||
|
|
||
|
*q = xor_d( *q, sign );
|
||
|
*q = _mm256_mul_pd( scale, *q ); // this may be inexact, but the inexact entries will be dropped later
|
||
|
}
|
||
|
|
||
|
// cleaning up the remaining tiny entries
|
||
|
|
||
|
__m256i exp_d = and_i( *q, exp_mask );
|
||
|
__m256i cmp_d = _mm256_cmpgt_epi32( exp_d, tiny ); // 8 x (4 byte masks)
|
||
|
int cmp_d_mask = _mm256_movemask_epi8( cmp_d ); // 8 bits for x, 8 bits for y, 8 bits for z
|
||
|
|
||
|
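      // movemask_epi8 yields one bit per byte, so each double lane contributes 8 bits
      // to cmp_d_mask. x_index_epi8 and y_index_epi8 (defined elsewhere in this
      // library) are assumed here to select the bits of lanes 0 and 1, e.g. 0x00FF and
      // 0xFF00; if any of those bits is set, the x or y coefficient is still above tiny.
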
      if( cmp_d_mask & ( x_index_epi8 | y_index_epi8 ) )
      {
        // In order to preserve the solution set, we must round the right hand side down
        // and the left hand side up, so that every solution of the old problem satisfies
        // the new constraints. Since the signs of the lhs entries were flipped above,
        // rounding every entry down achieves both at once, that is:
        //
        // we replace x in ( -tiny, 0 ) by -tiny and x in [ 0, tiny ) by +0.0
        // ( and fix the zeros later )

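        // A scalar sketch of this rounding rule (an illustration only; tiny stands for
        // the smallest magnitude this code keeps as a normal number):
        //
        //   double round_down_tiny( double x, double tiny )
        //   {
        //     if( x > -tiny && x < 0.0 )  return -tiny; // round down to -tiny
        //     if( x >= 0.0 && x < tiny )  return  0.0;  // round down to +0.0
        //     return x;                                 // already representable
        //   }
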
        __m256d zero = zero_d();
        __m256i exp_f = and_i( *q, exp_mask );
        __m256i full_sign = gather_lanes< 0, 0 >( sign );
        __m256i minus_tiny = _mm256_or_si256( tiny, full_sign );
        __m256i is_tiny_f = _mm256_cmpgt_epi32( tiny, exp_f );
        __m256d non_negative = _mm256_cmp_pd( *q, zero, _CMP_GE_OQ );

        // tiny negatives are replaced by -tiny, tiny non-negatives are replaced by 0.
        __m256d fix = blend( cast_d( minus_tiny ), zero, non_negative );

        // fixing the tiny entries
        *q = blend( *q, fix, is_tiny_f );

        // restoring the rhs sign
        *q = xor_d( *q, sign );

        // cleaning up the zeros
        __m256i abs = _mm256_slli_epi64( cast_i( *q ), 1 ); // throwing away the sign bit
        __m256i is_zero = _mm256_cmpeq_epi64( abs, cast_i( zero ) );

        *q = blend( *q, zero, is_zero );

        ++quadrant[ iq ];
      }
      else
      {
        // | a | <= tiny and | b | <= tiny, and | c | >= huge:
        // the constraint is either irrelevant or fatal.

        int pb_sign = _mm256_movemask_pd( *q );
        if( !( pb_sign & 0x4 ) ) // drop the quadrant: the right hand side is too large
        {
          ret |= ( 0x1 << iq );
          break;
        }

        // else, drop the constraint: the right hand side is too small

        *q = *( --end );
      }
    }
  }

  return ret;
}

} // namespace wm::nlp::lp2d
//----------------------------------------------------------------------------------------

extern "C"
{

int interval_system_2d( double* solution, double const* box, double const* eq,
                        unsigned int n_equations )
{
  constexpr int max_quadrant_size = 32;

  using namespace wm::nlp::lp2d;

  double* solsPtr = solution;

  printf( "box: x_min = %f, x_max = %f, y_min = %f, y_max = %f\n",
          box[ 0 ], box[ 1 ], box[ 2 ], box[ 3 ] );
  printf( "n_equations = %u\n", n_equations );
  for( unsigned int i = 0; i < n_equations; ++i )
  {
    printf( "equation %u: a_min = %f, a_max = %f, b_min = %f, b_max = %f, c_min = %f, c_max = %f\n",
            i, eq[ 6*i ], eq[ 6*i + 1 ], eq[ 6*i + 2 ], eq[ 6*i + 3 ], eq[ 6*i + 4 ], eq[ 6*i + 5 ] );
  }

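  // Expected memory layout (inferred from the debug output above): box holds
  // { x_min, x_max, y_min, y_max } and eq holds six doubles per interval equation,
  // { a_min, a_max, b_min, b_max, c_min, c_max }, for the constraint a x + b y = c
  // with interval coefficients. Both pointers must be 32-byte aligned (checked below).
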
  if( n_equations > ( max_quadrant_size / 2 ) )
    return -1;

  if( std::bit_cast< uint64_t >( box ) & 0x1F )
    return -2;

  if( std::bit_cast< uint64_t >( eq ) & 0x1F )
    return -3;

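  // A minimal caller-side sketch of how to satisfy the checks above (an illustration
  // only; alignas( 32 ) is one portable way to get 32-byte storage, and the solution
  // buffer should be aligned as well, since it is written with aligned AVX stores):
  //
  //   alignas( 32 ) double box[ 4 ]  = { x_min, x_max, y_min, y_max };
  //   alignas( 32 ) double eq[ 6*N ] = { /* a_min, a_max, b_min, b_max, c_min, c_max, ... */ };
  //   alignas( 32 ) double sol[ 16 ]; // at most 4 boxes of 4 doubles
  //   int n = interval_system_2d( sol, box, eq, N ); // N <= max_quadrant_size / 2
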
  int ret = 0;
  int n_equations_q = n_equations / 2;
  int n_equations_r = n_equations % 2;

  // each iteration of the main loop below consumes a pair of equations, i.e., 12 doubles
  double const* eq_max = eq + 12 * n_equations_q;

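  // The equations are processed two at a time: each iteration of the main loop loads
  // 12 doubles ( two equations of 6 doubles each ) and mirrors them into the four
  // quadrants. An odd trailing equation ( n_equations_r == 1 ) is handled separately
  // after the loop.
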
  __m256i huge;
  __m256i tiny;
  __m256i signi = bounds( &tiny, &huge );
  __m256i exp_mask = _mm256_srli_epi64( ones_i(), 53 );
  exp_mask = _mm256_slli_epi64( exp_mask, 52 ); // { 0x7FF0'0000'0000'0000... }

  __m256d sign = cast_d( signi );
  __m256d sign_01 = blend< 0, 0, 1, 1 >( sign, zero_d() ); // { sign_bit, sign_bit, 0, 0 }

  int old_rounding_mode = std::fegetround();

  if( std::fesetround( FE_DOWNWARD ) )
    return -4;

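  // The idea behind FE_DOWNWARD: all arithmetic below rounds toward -infinity, and the
  // lhs coefficients are stored with flipped signs, so rounding a flipped product down
  // is equivalent to rounding the original product up. Every computed constraint is
  // therefore relaxed outward, and no feasible point of the exact problem is cut off.
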
  __m256d quadrants[ 4 * max_quadrant_size ];

  __m256d* q[ 4 ] = { quadrants,
                      quadrants + max_quadrant_size,
                      quadrants + 2 * max_quadrant_size,
                      quadrants + 3 * max_quadrant_size };

  int empty_quadrant = 0;
  double const* p_eq = eq;
  int quadrant_is_not_empty = 0xF;

  for( ; p_eq < eq_max; p_eq += 12 )
  {
    __m256d a0 = _mm256_load_pd( p_eq );
    __m256d aux = _mm256_load_pd( p_eq + 4 );
    __m256d bc1 = _mm256_load_pd( p_eq + 8 );

    // the equations in this group are
    //
    // a0[ 0, 1 ] x + a0[ 2, 3 ] y = aux[ 0, 1 ]
    // aux[ 2, 3 ] x + bc1[ 0, 1 ] y = bc1[ 2, 3 ]

    __m256d bc0 = gather_lanes< 1, 0 >( a0, aux );
    __m256d a1 = gather_lanes< 1, 0 >( aux );

    // the equations in this group are now
    //
    // a0[ 0, 1 ] x + bc0[ 0, 1 ] y = bc0[ 2, 3 ]
    // a1[ 0, 1 ] x + bc1[ 0, 1 ] y = bc1[ 2, 3 ]

    __m256d max_c0 = permute< 0, 1, 3, 2 >( bc0 ); // { ?, ?, bc0[ 3 ], ? }
    __m256d max_c1 = permute< 0, 1, 3, 2 >( bc1 ); // { ?, ?, bc1[ 3 ], ? }

    for( int iq = 0; iq < 4; ++iq ) // iqth quadrant
    {
      q[ iq ][ 0 ] = gather_low( a0, bc0 );  // { a0[ 0 ], bc0[ 0 ], ?, ? }
      q[ iq ][ 1 ] = gather_high( a0, bc0 ); // { a0[ 1 ], bc0[ 1 ], ?, ? }
      q[ iq ][ 2 ] = gather_low( a1, bc1 );  // { a1[ 0 ], bc1[ 0 ], ?, ? }
      q[ iq ][ 3 ] = gather_high( a1, bc1 ); // { a1[ 1 ], bc1[ 1 ], ?, ? }

      q[ iq ][ 0 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 0 ], max_c0 ); // { a0[ 0 ], bc0[ 0 ], bc0[ 3 ], ? }
      q[ iq ][ 1 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 1 ], bc0 );    // { a0[ 1 ], bc0[ 1 ], bc0[ 2 ], ? }
      q[ iq ][ 2 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 2 ], max_c1 ); // { a1[ 0 ], bc1[ 0 ], bc1[ 3 ], ? }
      q[ iq ][ 3 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 3 ], bc1 );    // { a1[ 1 ], bc1[ 1 ], bc1[ 2 ], ? }

      q[ iq ][ 0 ] = xor_d( q[ iq ][ 0 ], sign ); // { -a0[ 0 ], -bc0[ 0 ], -bc0[ 3 ], ? }
      q[ iq ][ 2 ] = xor_d( q[ iq ][ 2 ], sign ); // { -a1[ 0 ], -bc1[ 0 ], -bc1[ 3 ], ? }

      // moving to the next quadrant

      if( iq & 0x1 ) // replacing y by - y
      {
        bc0 = xor_d( bc0, sign_01 );
        bc1 = xor_d( bc1, sign_01 );
        bc0 = permute< 1, 0, 2, 3 >( bc0 );
        bc1 = permute< 1, 0, 2, 3 >( bc1 );
      }
      else // replacing x by - x
      {
        a0 = xor_d( a0, sign_01 );
        a1 = xor_d( a1, sign_01 );
        a0 = permute< 1, 0, 2, 3 >( a0 );
        a1 = permute< 1, 0, 2, 3 >( a1 );
      }
    }

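    //--------------------------------------------------------------------------------------
    // Quadrant bookkeeping: quadrant iq solves the problem after the substitutions
    //
    //   iq = 0 : ( x, y ) -> (  x,  y )    iq = 1 : ( x, y ) -> ( -x,  y )
    //   iq = 2 : ( x, y ) -> ( -x, -y )    iq = 3 : ( x, y ) -> (  x, -y )
    //
    // so each quadrant sees a problem with x >= 0 and y >= 0. Flipping the sign of an
    // interval coefficient also swaps its endpoints, which is why each xor above is
    // paired with a permute that exchanges the min/max lanes.
    //--------------------------------------------------------------------------------------
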
    //----------------------------------------------------------------------------------------
    // Checking for overflow and underflow: to avoid headaches with underflow and overflow,
    // we normalize equations in which some elements are either +0.0 or have absolute value
    // outside the range [ tiny, huge ]. In this process we may detect that the intersection
    // of the feasible region with some quadrants is empty, or that some equations are
    // irrelevant.
    //----------------------------------------------------------------------------------------

    __m256d ra = gather_lanes< 0, 0 >( a0, a1 ); // { row[ 0 ][ 0 ], row[ 0 ][ 1 ], row[ 3 ][ 0 ], row[ 3 ][ 1 ] }
    __m256i expa = and_i( ra, exp_mask );
    __m256i exp0 = and_i( bc0, exp_mask );
    __m256i exp1 = and_i( bc1, exp_mask );

    __m256i min = _mm256_min_epi32( expa, exp0 );
    __m256i max = _mm256_max_epi32( expa, exp0 );
    min = _mm256_min_epi32( min, exp1 );
    max = _mm256_max_epi32( max, exp1 );

    __m256i c_min = _mm256_cmpgt_epi32( tiny, min );
    __m256i c_max = _mm256_cmpgt_epi32( max, huge );
    __m256i bad = or_i( c_min, c_max );

    if( _mm256_testz_si256( bad, bad ) )
    {
      q[ 0 ] += 4;
      q[ 1 ] += 4;
      q[ 2 ] += 4;
      q[ 3 ] += 4;
    }
    else
      empty_quadrant |= normalize_equations( q, 4, tiny, huge );
  }

  if( n_equations_r )
  {
    __m256d ab = _mm256_load_pd( p_eq );
    __m128d aux = _mm_load_pd( p_eq + 4 );

    __m256d sign_23 = gather_lanes< 1, 0 >( sign_01 );

    // the equations in this group are
    // ab[ 0, 1 ] x + ab[ 2, 3 ] y = aux[ 0, 1 ]

    __m256d c = _mm256_castpd128_pd256( aux );
    c = gather_lanes< 0, 0 >( c );
    // the equations in this group are now
    // ab[ 0, 1 ] x + ab[ 2, 3 ] y = c[ 0, 1 ]

    __m256d max_c = permute< 0, 1, 3, 3 >( c ); // { ?, ?, c[ 1 ], ? }

    for( int iq = 0; iq < 4; ++iq ) // iqth quadrant
    {
      q[ iq ][ 0 ] = permute< 0, 2, 2, 3 >( ab ); // { ab[ 0 ], ab[ 2 ], ?, ? }
      q[ iq ][ 1 ] = permute< 1, 3, 2, 3 >( ab ); // { ab[ 1 ], ab[ 3 ], ?, ? }

      q[ iq ][ 0 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 0 ], max_c ); // { ab[ 0 ], ab[ 2 ], c[ 1 ], ? }
      q[ iq ][ 1 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 1 ], c );     // { ab[ 1 ], ab[ 3 ], c[ 0 ], ? }

      q[ iq ][ 0 ] = xor_d( q[ iq ][ 0 ], sign ); // { -ab[ 0 ], -ab[ 2 ], -c[ 1 ], ? }

      // moving to the next quadrant

      if( iq & 0x1 ) // replacing y by - y
      {
        ab = xor_d( ab, sign_23 );
        ab = permute< 0, 1, 3, 2 >( ab );
      }
      else // replacing x by - x
      {
        ab = xor_d( ab, sign_01 );
        ab = permute< 1, 0, 2, 3 >( ab );
      }
    }

    //----------------------------------------------------------------------------------------
    // Checking for overflow and underflow: to avoid headaches with underflow and overflow,
    // we normalize equations in which some elements are either +0.0 or have absolute value
    // outside the range [ tiny, huge ]. In this process we may detect that the intersection
    // of the feasible region with some quadrants is empty, or that some equations are
    // irrelevant.
    //----------------------------------------------------------------------------------------

    __m256i exp_ab = and_i( ab, exp_mask );
    __m256i exp_c = and_i( c, exp_mask );
    exp_c = blend< 0, 0, 1, 1 >( exp_c, exp_ab );

    __m256i min = _mm256_min_epi32( exp_ab, exp_c );
    __m256i max = _mm256_max_epi32( exp_ab, exp_c );

    __m256i c_min = _mm256_cmpgt_epi32( tiny, min );
    __m256i c_max = _mm256_cmpgt_epi32( max, huge );
    __m256i bad = or_i( c_min, c_max );

    if( _mm256_testz_si256( bad, bad ) )
    {
      q[ 0 ] += 2;
      q[ 1 ] += 2;
      q[ 2 ] += 2;
      q[ 3 ] += 2;
    }
    else
      empty_quadrant |= normalize_equations( q, 2, tiny, huge );
  }

  __m256d avx_box = _mm256_load_pd( box );
  __m256d cmp = _mm256_cmp_pd( avx_box, zero_d(), _CMP_GE_OQ );
  int positive_box = _mm256_movemask_pd( cmp );

  __m256d edge[ 2 * ( max_quadrant_size + 2 ) ];
  __m256d vertex_pair[ 3 * ( max_quadrant_size + 2 ) ];
  bounded_convex bc( 2 * ( max_quadrant_size + 2 ), edge, vertex_pair );

  constexpr uint x_min_field = 0x1;
  constexpr uint x_max_field = 0x2;
  constexpr uint y_min_field = 0x4;
  constexpr uint y_max_field = 0x8;

  constexpr uint first_quadrant_field  = 0x1;
  constexpr uint second_quadrant_field = 0x2;
  constexpr uint third_quadrant_field  = 0x4;
  constexpr uint fourth_quadrant_field = 0x8;

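  // positive_box packs the sign tests of { x_min, x_max, y_min, y_max } into bits 0..3,
  // and empty_quadrant carries one bit per quadrant from normalize_equations. For
  // instance, a box that straddles the y axis has x_min < 0 and x_max >= 0, so both
  // the x_max_field branch below and the x_min branch further down run, and up to four
  // quadrant sub-problems may be solved in one call.
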
  if( positive_box & x_max_field ) // x_max >= 0
  {
    if( !( empty_quadrant & first_quadrant_field ) &&
        ( positive_box & y_max_field ) ) // y_max >= 0, first quadrant
    {
      bc.setup( avx_box );
      bc.push( quadrants, q[ 0 ] );
      if( bc.type != bounded_convex_type::empty )
      {
        ++ret;
        __m256d bc_box = bc.box();
        _mm256_store_pd( solution, bc_box );
        solution += 4;
      }
    }

    if( !( empty_quadrant & fourth_quadrant_field ) &&
        !( positive_box & y_min_field ) ) // y_min < 0, fourth quadrant
    {
      __m256i y_sign = _mm256_slli_epi64( ones_i(), 63 );
      y_sign = gather_lanes< clear_lane, 1 >( y_sign );

      __m256d q_box = permute< 0, 1, 3, 2 >( avx_box );
      q_box = xor_d( q_box, y_sign );

      bc.setup( q_box );
      bc.push( quadrants + 3 * max_quadrant_size, q[ 3 ] );

      if( bc.type != bounded_convex_type::empty )
      {
        ++ret;
        __m256i y_sign = _mm256_slli_epi64( ones_i(), 63 );
        y_sign = gather_lanes< clear_lane, 1 >( y_sign );

        __m256d bc_box = bc.box();
        bc_box = xor_d( bc_box, y_sign );
        bc_box = permute< 0, 1, 3, 2 >( bc_box );
        _mm256_store_pd( solution, bc_box );
        solution += 4;
      }
    }
  }

  if( !( positive_box & x_min_field ) ) // x_min < 0
  {
    if( !( empty_quadrant & second_quadrant_field ) &&
        ( positive_box & y_max_field ) ) // y_max >= 0, second quadrant
    {
      __m256i x_sign = _mm256_slli_epi64( ones_i(), 63 );
      x_sign = gather_lanes< 0, clear_lane >( x_sign );

      __m256d q_box = permute< 1, 0, 2, 3 >( avx_box );
      q_box = xor_d( q_box, x_sign );

      bc.setup( q_box );
      bc.push( quadrants + max_quadrant_size, q[ 1 ] );

      if( bc.type != bounded_convex_type::empty )
      {
        ++ret;
        __m256d bc_box = bc.box();
        __m256i x_sign = _mm256_slli_epi64( ones_i(), 63 );
        x_sign = gather_lanes< 0, clear_lane >( x_sign );
        bc_box = xor_d( bc_box, x_sign );
        bc_box = permute< 1, 0, 2, 3 >( bc_box );
        _mm256_store_pd( solution, bc_box );
        solution += 4;
      }
    }

    if( !( empty_quadrant & third_quadrant_field ) &&
        !( positive_box & y_min_field ) ) // y_min < 0, third quadrant
    {
      __m256i sign = _mm256_slli_epi64( ones_i(), 63 );
      avx_box = permute< 1, 0, 3, 2 >( avx_box );
      avx_box = xor_d( avx_box, sign );
      bc.setup( avx_box );
      bc.push( quadrants + 2 * max_quadrant_size, q[ 2 ] );

      if( bc.type != bounded_convex_type::empty )
      {
        ++ret;
        __m256d bc_box = bc.box();
        __m256i sign = _mm256_slli_epi64( ones_i(), 63 );
        bc_box = xor_d( bc_box, sign );
        bc_box = permute< 1, 0, 3, 2 >( bc_box );
        _mm256_store_pd( solution, bc_box );
        solution += 4; // kept consistent with the other quadrants
      }
    }
  }

  std::fesetround( old_rounding_mode );

  for( int i = 0; i < ret; ++i )
  {
    printf( "solution %d: x_min = %f, x_max = %f, y_min = %f, y_max = %f\n",
            i, solsPtr[ 4*i ], solsPtr[ 4*i + 1 ], solsPtr[ 4*i + 2 ], solsPtr[ 4*i + 3 ] );
  }

  return ret;
}

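//----------------------------------------------------------------------------------------
// Calling convention for this C entry point: a negative return reports a precondition
// failure ( -1: too many equations, -2 / -3: box or eq not 32-byte aligned, -4: the
// rounding mode could not be set ), and a non-negative return is the number of
// per-quadrant bounding boxes written to solution, four doubles each.
//----------------------------------------------------------------------------------------
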
} // extern "C"
//----------------------------------------------------------------------------------------