From 91ac61e3cd0d7850ef9c1af5a9d6a7b784e5fdc3 Mon Sep 17 00:00:00 2001
From: sheaf
Date: Wed, 22 May 2024 16:32:36 +0200
Subject: [PATCH] WIP: add Walter's LP approach to interval Newton

---
 brush-strokes/brush-strokes.cabal             |   24 +-
 brush-strokes/cbits/lp_2d.cpp                 | 2035 +++++++++++++++++
 brush-strokes/cbits/lp_2d.hpp                 | 1265 ++++++++++
 brush-strokes/src/lib/Math/Root/Isolation.hs  |   21 +-
 .../src/lib/Math/Root/Isolation/Core.hs       |    5 +-
 .../src/lib/Math/Root/Isolation/Newton.hs     |  171 ++
 .../Isolation/{ => Newton}/GaussSeidel.hs     |  158 +-
 .../src/lib/Math/Root/Isolation/Newton/LP.hs  |  132 ++
 .../src/lib/Math/Root/Isolation/Utils.hs      |   48 +
 9 files changed, 3703 insertions(+), 156 deletions(-)
 create mode 100644 brush-strokes/cbits/lp_2d.cpp
 create mode 100644 brush-strokes/cbits/lp_2d.hpp
 create mode 100644 brush-strokes/src/lib/Math/Root/Isolation/Newton.hs
 rename brush-strokes/src/lib/Math/Root/Isolation/{ => Newton}/GaussSeidel.hs (58%)
 create mode 100644 brush-strokes/src/lib/Math/Root/Isolation/Newton/LP.hs
 create mode 100644 brush-strokes/src/lib/Math/Root/Isolation/Utils.hs

diff --git a/brush-strokes/brush-strokes.cabal b/brush-strokes/brush-strokes.cabal
index f38f9a6..816458c 100644
--- a/brush-strokes/brush-strokes.cabal
+++ b/brush-strokes/brush-strokes.cabal
@@ -9,6 +9,10 @@ build-type: Simple
 description:
   Computing brush strokes using Bézier curves.
 
+extra-source-files:
+  cbits/**/*.cpp
+  cbits/**/*.hpp
+
 flag use-fma
   description:
     Use fused-multiply add instructions to implement interval arithmetic.
   default:
     True
@@ -143,8 +147,11 @@ library
     , Math.Root.Isolation.Bisection
     , Math.Root.Isolation.Core
     , Math.Root.Isolation.Degree
-    , Math.Root.Isolation.GaussSeidel
     , Math.Root.Isolation.Narrowing
+    , Math.Root.Isolation.Newton
+    , Math.Root.Isolation.Newton.GaussSeidel
+    , Math.Root.Isolation.Newton.LP
+    , Math.Root.Isolation.Utils
     , Debug.Utils
 
   other-modules:
@@ -186,6 +193,22 @@ library
     , transformers
         >= 0.5.6.2 && < 0.7
 
+  -- Extra C++ code for 2D linear systems of interval equations
+  include-dirs:
+    cbits
+  cxx-sources:
+    cbits/lp_2d.cpp
+  cxx-options:
+    -std=c++20
+    -mavx2 -mfma
+    -frounding-math -fno-math-errno -fno-signed-zeros
+    -fno-trapping-math -fno-signaling-nans
+    -Wno-unused-result -Wsign-compare -Wno-switch
+    -march=native -mtune=native
+    -O3 -DNDEBUG
+  build-depends:
+    system-cxx-std-lib
+
 --executable convert-metafont
 --
 --  import:
diff --git a/brush-strokes/cbits/lp_2d.cpp b/brush-strokes/cbits/lp_2d.cpp
new file mode 100644
index 0000000..18d713c
--- /dev/null
+++ b/brush-strokes/cbits/lp_2d.cpp
@@ -0,0 +1,2035 @@
+//----------------------------------------------------------------------------------------
+// Copyright (c) 2024 Walter Mascarenhas
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The MPL has a "Disclaimer of Warranty" and a "Limitation of Liability",
+// and we provide no guarantee regarding the correctness of this source code.
+//
+//----------------------------------------------------------------------------------------

+#include <bit>
+#include <cassert>
+#include <cfenv>
+#include <cstdio>
+#include <immintrin.h>
+
+#include "lp_2d.hpp"
+
+//----------------------------------------------------------------------------------------
+namespace wm::nlp::lp2d {
+
+__m256i bounds( __m256i* tiny, __m256i* huge )
+{
+  __m256i ones = ones_i();
+  __m256i sign = _mm256_slli_epi64( ones, 63 );
+  __m256i a = _mm256_srli_epi64( sign, 1 );  // 0b0100'0000'0000...
+  __m256i b = _mm256_srli_epi64( sign, 3 );  // 0b0001'0000'0000...
+  __m256i c = _mm256_or_si256( a, b );       // 0b0101'0000'0000...
+  __m256i d = _mm256_srli_epi64( c, 4 );     // 0b0000'0101'0000...
+
+  *huge = _mm256_or_si256( c, d );           // 0b0101'0101'0000... 0x5500...
+
+  __m256i e = _mm256_srli_epi64( *huge, 1 ); // 0b0010'1010'1000...
+  __m256i f = _mm256_srli_epi64( d, 2 );     // 0b0000'0001'0100...
+  __m256i g = _mm256_srli_epi64( d, 4 );     // 0b0000'0000'0101...
+  __m256i h = _mm256_srli_epi64( ones, 1 );
+  __m256i i = _mm256_srli_epi64( sign, 32 );
+
+  __m256i j = _mm256_or_si256( f, g );       // 0b0000'0001'0101...
+
+  *tiny = _mm256_or_si256( e, j );           // 0b0010'1011'1101... 0x2bd...
+  __m256i k = and_not_i( i, h );
+  *huge = blend< 0, 0, 0, 1 >( *huge, k );
+  *tiny = blend< 0, 0, 0, 1 >( *tiny, zero_i() );
+  return sign;
+}
+
+//----------------------------------------------------------------------------------------
+// exact_location returns the exact location of the intersection ( x, y ) of the lines
+//
+//   b0[ 0 ] x + b0[ 1 ] y >= b0[ 2 ]
+//   b1[ 0 ] x + b1[ 1 ] y >= b1[ 2 ]
+//
+// with respect to the set
+//
+//   e0 x + e1 y >= e2
+//
+// under the assumption that
+//
+//   d = b0[ 0 ] b1[ 1 ] - b0[ 1 ] b1[ 0 ] > 0
+//
+// where ei = edge[ i ], b0 = border[ 0 ] and b1 = border[ 1 ].
+//
+// This amounts to evaluating the sign of
+//
+//   p = e0 r + e1 s - e2 d
+//
+//   r = b0[ 2 ] b1[ 1 ] - b0[ 1 ] b1[ 2 ]
+//   s = b0[ 0 ] b1[ 2 ] - b0[ 2 ] b1[ 0 ],
+//
+// which motivates the evaluation of the six products
+//
+//      group 1              group 2
+//   e0 b0[ 2 ] b1[ 1 ],  -e0 b0[ 1 ] b1[ 2 ]
+//   e1 b0[ 0 ] b1[ 2 ],  -e1 b0[ 2 ] b1[ 0 ]
+//   e2 b0[ 1 ] b1[ 0 ],  -e2 b0[ 0 ] b1[ 1 ]
+//
+// and we want to find the sign of their sum.
+//
+// The function assumes that the edge entries are either normal or zero, i.e.,
+// there are no subnormal entries.
+//----------------------------------------------------------------------------------------
+
+location exact_location( __m256d e, __m256d b0, __m256d b1 )
+{
+  //----------------------------------------------------------------------------------------
+  //----------------------------------------------------------------------------------------
+
+  __m256d b0_a = permute< 1, 2, 0, 3 >( b0 ); // { b0[ 1 ], b0[ 2 ], b0[ 0 ], ? }
+  __m256d b0_b = permute< 2, 0, 1, 3 >( b0 ); // { b0[ 2 ], b0[ 0 ], b0[ 1 ], ? }
+  __m256d b1_a = permute< 1, 2, 0, 3 >( b1 ); // { b1[ 1 ], b1[ 2 ], b1[ 0 ], ? }
+  __m256d b1_b = permute< 2, 0, 1, 3 >( b1 ); // { b1[ 2 ], b1[ 0 ], b1[ 1 ], ? }
+
+  __m256d g1 = _mm256_mul_pd( b0_b, b1_a ); // { b0[ 2 ] b1[ 1 ], b0[ 0 ] b1[ 2 ], b0[ 1 ] b1[ 0 ], ? }
+  __m256d g2 = _mm256_mul_pd( b0_a, b1_b ); // { b0[ 1 ] b1[ 2 ], b0[ 2 ] b1[ 0 ], b0[ 0 ] b1[ 1 ], ? }
+
+  __m256d g1b = _mm256_fmsub_pd( b0_b, b1_a, g1 ); // error terms of { b0[ 2 ] b1[ 1 ], b0[ 0 ] b1[ 2 ], b0[ 1 ] b1[ 0 ], ? }
+  __m256d g2b = _mm256_fmsub_pd( b0_a, b1_b, g2 ); // error terms of { b0[ 1 ] b1[ 2 ], b0[ 2 ] b1[ 0 ], b0[ 0 ] b1[ 1 ], ?
} + + __m256i sign_i = _mm256_slli_epi64( ones_i(), 63 ); + __m256d sign = cast_d( sign_i ); + __m256d minus_e = xor_d( e, sign ); + + __m256d c[ 8 ]; + + c[ 0 ] = _mm256_mul_pd( e, g1 ); + c[ 1 ] = _mm256_mul_pd( minus_e, g2 ); + c[ 2 ] = _mm256_mul_pd( e, g1b ); + c[ 3 ] = _mm256_mul_pd( minus_e, g2b ); + + c[ 4 ] = _mm256_fmsub_pd( e, g1, c[ 0 ] ); + c[ 5 ] = _mm256_fmsub_pd( minus_e, g2, c[ 1 ] ); + c[ 6 ] = _mm256_fmsub_pd( e, g1b, c[ 2 ] ); + c[ 7 ] = _mm256_fmsub_pd( minus_e, g2b, c[ 3 ] ); + + __m256d a[ 6 ]; + + // transposing the products + + __m256d tmp_0 = gather_low( c[ 0 ], c[ 1 ] ); + __m256d tmp_1 = gather_high( c[ 0 ], c[ 1 ] ); + __m256d tmp_2 = gather_low( c[ 2 ], c[ 3 ] ); + __m256d tmp_3 = gather_high( c[ 2 ], c[ 3 ] ); + + a[ 0 ] = _mm256_permute2f128_pd( tmp_0, tmp_2, 0x20 ); + a[ 1 ] = _mm256_permute2f128_pd( tmp_1, tmp_3, 0x20 ); + a[ 2 ] = _mm256_permute2f128_pd( tmp_0, tmp_2, 0x31 ); + + __m256d tmp_4 = gather_low( c[ 4 ], c[ 5 ] ); + __m256d tmp_5 = gather_high( c[ 4 ], c[ 5 ] ); + __m256d tmp_6 = gather_low( c[ 6 ], c[ 7 ]); + __m256d tmp_7 = gather_high( c[ 6 ], c[ 7 ] ); + + a[ 3 ] = _mm256_permute2f128_pd( tmp_4, tmp_6, 0x20 ); + a[ 4 ] = _mm256_permute2f128_pd( tmp_5, tmp_7, 0x20 ); + a[ 5 ] = _mm256_permute2f128_pd( tmp_4, tmp_6, 0x31 ); + + __m256d gt_05 = is_greater( a[ 0 ], a[ 5 ] ); + __m256d tmp = blend_max( a[ 0 ], a[ 5 ], gt_05 ); + a[ 0 ] = blend_min( a[ 0 ], a[ 5 ], gt_05 ); + a[ 5 ] = tmp; + + //---------------------------------------------------------------------------------------- + // In the loop below we assume that a0[ j ] <= a5[ j ], i = 0,1,2,3 + //---------------------------------------------------------------------------------------- + + while( true ) + { + __m256d gt_12 = is_greater( a[ 1 ], a[ 2 ] ); + __m256d gt_34 = is_greater( a[ 3 ], a[ 4 ] ); + + __m256d min_12 = blend_min( a[ 1 ], a[ 2 ], gt_12 ); + __m256d max_12 = blend_max( a[ 1 ], a[ 2 ], gt_12 ); + + __m256d min_34 = blend_min( a[ 3 ], a[ 4 ], gt_34 ); + __m256d max_34 = blend_max( a[ 3 ], a[ 4 ], gt_34 ); + + __m256d gt_min = is_greater( min_12, min_34 ); + __m256d gt_max = is_greater( max_12, max_34 ); + + __m256d min_1234 = blend_min( min_12, min_34, gt_min ); + a[ 2 ] = blend_max( min_12, min_34, gt_min ); + + a[ 3 ] = blend_min( max_12, max_34, gt_max ); + __m256d max_1234 = blend_max( max_12, max_34, gt_max ); + + __m256d gt_0_min_1234 = is_greater( a[ 0 ], min_1234 ); + __m256d gt_max_1234_5 = is_greater( max_1234, a[ 5 ] ); + + a[ 1 ] = blend_max( a[ 0 ], min_1234, gt_0_min_1234 ); + a[ 0 ] = blend_min( a[ 0 ], min_1234, gt_0_min_1234 ); + + a[ 4 ] = blend_min( max_1234, a[ 5 ], gt_max_1234_5 ); + a[ 5 ] = blend_max( max_1234, a[ 5 ], gt_max_1234_5 ); + + // 1) a0[ j ] = min { ai[ j ], i = 0, .... 5 } + // 2) a5[ j ] = max { ai[ j ], i = 0, .... 
5 }
+
+    __m256d zero = zero_d();
+    __m256d cmp0 = _mm256_cmp_pd( a[ 0 ], zero, _CMP_LT_OQ );
+
+    if( _mm256_testz_pd( cmp0, cmp0 ) ) // all a0[ j ] are >= 0
+    {
+      __m256d cmp = is_greater( a[ 5 ], zero );
+      if( _mm256_testz_pd( cmp, cmp ) ) // no entry in a5 is positive
+        return location::border;
+      else
+        return location::in;
+    }
+
+    __m256d cmp5 = _mm256_cmp_pd( a[ 5 ], zero, _CMP_GT_OQ );
+
+    if( _mm256_testz_pd( cmp5, cmp5 ) ) // all a5[ j ] are <= 0
+    {
+      __m256d cmp = is_greater( zero, a[ 0 ] );
+      if( _mm256_testz_pd( cmp, cmp ) ) // no entry in a0 is negative
+        return location::border;
+      else
+        return location::out;
+    }
+
+    // now
+    // 1) min ai[ j ] has its sign bit set
+    // 2) max ai[ j ] does not have its sign bit set
+
+    __m256d lo = gather_low( a[ 0 ], a[ 5 ] );  // { a0[ 0 ], a5[ 0 ], a0[ 2 ], a5[ 2 ] }
+    __m256d hi = gather_high( a[ 0 ], a[ 5 ] ); // { a0[ 1 ], a5[ 1 ], a0[ 3 ], a5[ 3 ] }
+
+    __m256d gt_lo_hi = is_greater( lo, hi );
+    __m256d b0 = blend_min( lo, hi, gt_lo_hi );
+    __m256d b5 = blend_max( lo, hi, gt_lo_hi );
+
+    // 1) min{ ai[ j ], i,j = 0,...5 } is in { b0[ 0 ], b0[ 2 ] }
+    // 2) max{ ai[ j ], i,j = 0,...5 } is in { b5[ 1 ], b5[ 3 ] }
+
+    b0 = permute< 0, 2, 1, 3 >( b0 );
+    b5 = permute< 3, 1, 2, 0 >( b5 );
+
+    // 1) min{ ai[ j ], i,j = 0,...5 } is in { b0[ 0 ], b0[ 1 ] }
+    // 2) max{ ai[ j ], i,j = 0,...5 } is in { b5[ 0 ], b5[ 1 ] }
+
+    lo = gather_low( b0, b5 );  // { b0[ 0 ], b5[ 0 ], b0[ 2 ], b5[ 2 ] }
+    hi = gather_high( b0, b5 ); // { b0[ 1 ], b5[ 1 ], b0[ 3 ], b5[ 3 ] }
+
+    gt_lo_hi = is_greater( lo, hi );
+    b0 = blend_min( lo, hi, gt_lo_hi );
+    b5 = blend_max( lo, hi, gt_lo_hi );
+
+    // 1) min{ ai[ j ], i,j = 0,...5 } is b0[ 0 ]
+    // 2) max{ ai[ j ], i,j = 0,...5 } is b5[ 1 ]
+
+    b5 = permute< 1, 0, 3, 2 >( b5 );
+
+    // 1) min{ ai[ j ], i,j = 0,...5 } is b0[ 0 ] <= 0
+    // 2) max{ ai[ j ], i,j = 0,...5 } is b5[ 0 ] >= 0
+
+    // fixing possible break of order in positions 1,2,3
+
+    __m256d chk123 = is_greater( b0, b5 );
+    __m256d aux = blend_min( b0, b5, chk123 );
+    b5 = blend_max( b0, b5, chk123 );
+    b0 = aux;
+
+    __m256i b0i = cast_i( b0 );
+    __m256i b5i = cast_i( b5 );
+
+    uint64_t u0 = _mm256_extract_epi64( b0i, 0 ) & 0x7FFF'FFFF'FFFF'FFFFULL;
+    uint64_t u5 = _mm256_extract_epi64( b5i, 0 );
+
+    if( u0 >= u5 ) // | b0[ 0 ] | >= | b5[ 0 ] |
+    {
+      if( u0 > u5 + 0x0060'0000'0000'0000ULL ) // | b0[ 0 ] | > 2^5 | b5[ 0 ] | (we use 0x006... to handle subnormals properly)
+        return location::out;
+
+      __m256d sum = _mm256_add_pd( b0, b5 );    // b0 + b5, rounded down
+      __m256d cor = _mm256_sub_pd( sum, b0 );   // sum - b0, rounded down
+      __m256d delta = _mm256_sub_pd( b5, cor ); // b5 - cor, rounded down.
+      // now b0[ 0 ] + b5[ 0 ] = sum[ 0 ] + delta[ 0 ], exactly, with sum[ 0 ] <= 0 <= delta[ 0 ]
+      a[ 0 ] = blend< 1, 0, 0, 0 >( b0, sum );
+      a[ 5 ] = blend< 1, 0, 0, 0 >( b5, delta );
+    }
+    else
+    {
+      if( u5 > u0 + 0x0060'0000'0000'0000ULL )
+        return location::in;
+
+      __m256i sign_i = _mm256_slli_epi64( ones_i(), 63 );
+      __m256d sign = cast_d( sign_i );
+      __m256d minus_b0 = xor_d( b0, sign );
+
+      __m256d sum = _mm256_sub_pd( minus_b0, b5 );    // (-b5) + (-b0) rounded down
+      __m256d cor = _mm256_add_pd( sum, b5 );         // sum - (-b5) rounded down
+      __m256d delta = _mm256_sub_pd( minus_b0, cor ); // (-b0) - cor, rounded down.
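+
+      // The three operations above are Dekker's Fast2Sum applied to ( -b5[ 0 ], -b0[ 0 ] ),
+      // which is valid here because this branch guarantees | b5[ 0 ] | >= | b0[ 0 ] |: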
+ + // -( b0[ 0 ] + b5[ 0 ] ) = sum[ 0 ] + delta[ 0 ], exactly, with sum[ 0 ] <= 0 <= delta[ 0 ] + + sum = xor_d( sum, sign ); + delta = xor_d( delta, sign ); + + // b0[ 0 ] + b5[ 0 ] ) = sum[ 0 ] + delta[ 0 ], exactly, with delta[ 0 ] <= 0 <= sum[ 0 ] + + a[ 0 ] = blend< 1, 0, 0, 0 >( b0, delta ); + a[ 5 ] = blend< 1, 0, 0, 0 >( b5, sum ); + } + } +} + +//---------------------------------------------------------------------------------------- +// bounded_convex::box: finding the box of the bounded convex region, under the assumption +// that the normals are normalized, that is, for each edge e, either +// e[ i ] = +0.0 or e[ i ] >= tiny ~ 2^(-100), i.e. no signed zeros, underflow +// or overflow or negative coordinates. +//---------------------------------------------------------------------------------------- + +__m256d bounded_convex::box() const +{ + constexpr int non_0 = 0; + constexpr int non_1 = 2; + constexpr int inf_0 = 0; + constexpr int sup_0 = 1; + constexpr int inf_1 = 2; + constexpr int sup_1 = 3; + + assert( type != bounded_convex_type::empty ); + + __m256d den; + __m256d num; + __m256i sign = _mm256_slli_epi64( ones_i(), 63 ); // { -1, -1, -1, -1 } + __m256i sign_h = _mm256_slli_si256( sign, 8 ); // { 0, -1, 0, -1 } + + if( type != bounded_convex_type::polygon ) + { + if( type == bounded_convex_type::point ) + { + // { vertex_pair[ 0 ] = { x_inf, -x_sup, ? , ? } + // { vertex_pair[ 1 ] = { y_inf, -y_sup, ? , ? } + // { vertex_pair[ 2 ] = { z_inf, -z_sup, ? , ? } + + den = xor_d( vertex_p[ 2 ], sign_h ); + den = permute< sup_0, inf_0, sup_0, inf_0 >( den ); + num = gather_lanes< 0, 0 >( vertex_p[ 0 ], vertex_p[ 1 ] ); + } + else + { + __m256d num_x; + __m256d num_y; + + den = xor_d( vertex_p[ 2 ], sign_h ); + + int xy_signs = high_bits_64( edge[ 1 ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // edge[ 0 ] >= 0, edge[ 1 ] >= 0: box = { x0, x1, y1, y0 } + { + num_x = permute< inf_0, sup_1, non_1, non_1 >( vertex_p[ 0 ] ); + num_y = permute< non_0, non_0, inf_1, sup_0 >( vertex_p[ 1 ] ); + + den = permute< sup_0, inf_1, sup_1, inf_0 >( den ); + break; + } + case 1: // edge[ 0 ] < 0, edge[ 1 ] >= 0: box = { x0, x1, y0, y1 } + { + num_x = permute< inf_0, sup_1, non_1, non_1 >( vertex_p[ 0 ] ); + num_y = permute< non_0, non_0, inf_0, sup_1 >( vertex_p[ 1 ] ); + + den = permute< sup_0, inf_0, sup_0, inf_1 >( den ); + break; + } + case 2: // edge[ 0 ] >= 0, edge[ 1 ] < 0: box = {x1, x0, y1, y0 } + { + num_x = permute< inf_1, sup_0, non_1, non_1 >( vertex_p[ 0 ] ); + num_y = permute< non_0, non_0, inf_1, sup_0 >( vertex_p[ 1 ] ); + + den = permute< sup_1, inf_0, sup_1, inf_0 >( den ); + + break; + } + case 3: // edge[ 0 ] < 0, edge[ 1 ] < 0: box = { x1, x0, y0, y1 } + { + num_x = permute< inf_1, sup_0, non_1, non_1 >( vertex_p[ 0 ] ); + num_y = permute< non_0, non_0, inf_0, sup_1 >( vertex_p[ 1 ] ); + + den = permute< sup_1, inf_0, sup_0, inf_1 >( den ); + break; + } + } + + num = gather_lanes< 0, 1 >( num_x, num_y ); + } + } + else + { + int index = begin; + + auto set_x_max = [ & ]() + { + int previous_vertex = ( ( index > 0 ) ? 
index : end ); + int previous_pair = 3 * ( --previous_vertex / 2 ); + + __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h ); + + if( previous_vertex & 0x1 ) + { + z = permute< non_0, inf_1, non_1, non_1 >( z ); + __m256d x = gather_lanes< 1, 1 >( vertex_p[ previous_pair ] ); + num = blend< 0, 1, 0, 0 >( num, x ); + } + else + { + z = permute< non_0, inf_0, non_1, non_1 >( z ); + num = blend< 0, 1, 0, 0 >( num, vertex_p[ previous_pair ] ); + } + + den = blend< 0, 1, 0, 0 >( den, z ); + }; + + auto set_x_min = [ & ]() + { + int previous_vertex = ( ( index > 0 ) ? index : end ); + int previous_pair = 3 * ( --previous_vertex / 2 ); + + __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h ); + + if( previous_vertex & 0x1 ) + { + z = permute< sup_1, non_0, non_1, non_1 >( z ); + __m256d x = gather_lanes< 1, 1 >( vertex_p[ previous_pair ] ); + num = blend< 1, 0, 0, 0 >( num, x ); + } + else + { + z = permute< sup_0, non_0, non_1, non_1 >( z ); + num = blend< 1, 0, 0, 0 >( num, vertex_p[ previous_pair ] ); + } + + den = blend< 1, 0, 0, 0 >( den, z ); + }; + + auto set_y_max = [ & ]() + { + int previous_vertex = ( ( index > 0 ) ? index : end ); + int previous_pair = 3 * ( --previous_vertex / 2 ); + + __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h ); + + if( previous_vertex & 0x1 ) + { + z = permute< non_0, non_0, non_1, inf_1 >( z ); + num = blend< 0, 0, 0, 1 >( num, vertex_p[ previous_pair + 1 ] ); + } + else + { + z = permute< non_0, non_0, non_1, inf_0 >( z ); + __m256d y = gather_lanes< 0, 0 >( vertex_p[ previous_pair + 1 ] ); + num = blend< 0, 0, 0, 1 >( num, y ); + } + + den = blend< 0, 0, 0, 1 >( den, z ); + }; + + auto set_y_min = [ & ]() + { + int previous_vertex = ( ( index > 0 ) ? index : end ); + int previous_pair = 3 * ( --previous_vertex / 2 ); + + __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h ); + + if( previous_vertex & 0x1 ) + { + z = permute< non_1, non_1, sup_1, non_1 >( z ); + num = blend< 0, 0, 1, 0 >( num, vertex_p[ previous_pair + 1 ] ); + } + else + { + z = permute< non_0, non_0, sup_0, non_0 >( z ); + __m256d y = gather_lanes< 0, 0 >( vertex_p[ previous_pair + 1 ] ); + num = blend< 0, 0, 1, 0 >( num, y ); + } + + den = blend< 0, 0, 1, 0 >( den, z ); + }; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // normal[ 0 ] >= 0, normal[ 1 ] >= 0: first quadrant + { + if( index & 0x1 ) + goto quadrant_1_odd; + else + goto quadrant_1_even; + } + case 1: // normal[ 0 ] < 0, normal[ 1 ] >= 0: second quadrant + { + if( index & 0x1 ) + goto quadrant_2_odd; + else + goto quadrant_2_even; + } + case 2: // normal[ 0 ] >= 0, normal[ 1 ] < 0: fourth quadrant + { + if( index & 0x1 ) + goto quadrant_4_odd; + else + goto quadrant_4_even; + } + case 3: // normal[ 0 ] < 0, normal[ 1 ] < 0: third quadrant + { + if( index & 0x1 ) + goto quadrant_3_odd; + else + goto quadrant_3_even; + } + } + + quadrant_1_even: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // quadrant_1 + goto quadrant_1_odd; + case 1: // quadrant_2 + { + set_y_min(); + goto quadrant_2_odd; + } + case 3: // quadrant_3 + { + set_x_max(); + set_y_min(); + goto quadrant_3_odd; + } + } + } + + quadrant_1_odd: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // quadrant_1 + goto quadrant_1_even; + case 1: // quadrant_2 + { + set_y_min(); + goto quadrant_2_even; + } + case 3: // quadrant_3 + { + 
set_x_max(); + set_y_min(); + goto quadrant_3_even; + } + } + } + + quadrant_2_even: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 1: // quadrant_2 + goto quadrant_2_odd; + case 2:// quadrant_4 + { + set_x_max(); + set_y_max(); + goto quadrant_4_odd; + } + case 3: // quadrant_3 + { + set_x_max(); + goto quadrant_3_odd; + } + } + } + + quadrant_2_odd: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 1: // quadrant_2 + goto quadrant_2_even; + case 2:// quadrant_4 + { + set_x_max(); + set_y_max(); + goto quadrant_4_even; + } + case 3: // quadrant_3 + { + set_x_max(); + goto quadrant_3_even; + } + } + } + + quadrant_3_even: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // quadrant_1 + { + set_y_max(); + set_x_min(); + goto quadrant_1_odd; + } + case 2:// quadrant_4 + { + set_y_max(); + goto quadrant_4_odd; + } + case 3: // quadrant_3 + goto quadrant_3_odd; + } + } + + quadrant_3_odd: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // quadrant_1 + { + set_x_min(); + set_y_max(); + goto quadrant_1_even; + } + case 2:// quadrant_4 + { + set_y_max(); + goto quadrant_4_even; + } + case 3: // quadrant_3 + goto quadrant_3_even; + } + } + + quadrant_4_even: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // quadrant_1 + { + set_x_min(); + goto quadrant_1_odd; + } + case 1: // quadrant_2 + { + set_x_min(); + set_y_min(); + goto quadrant_2_odd; + } + case 2:// quadrant_4 + goto quadrant_4_odd; + } + } + + quadrant_4_odd: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // quadrant_1 + { + set_x_min(); + goto quadrant_1_even; + } + case 1: // quadrant_2 + { + set_x_min(); + set_y_min(); + goto quadrant_2_even; + } + case 2:// quadrant_4 + goto quadrant_4_even; + } + } + + done: {} + } + + num = _mm256_div_pd( num, den ); + num = xor_d( num, sign_h ); + return num; +} + +int bounded_convex::insert_vertex( int at ) +{ + int high = end - at; + int low = at - begin; + assert( ( low > 0 ) && ( high > 0 ) ); + + if( low <= high ) + { + __m256d* ed = edge + ( begin - 1 ); + __m256d* ed_max = ed + low; + + for( ; ed < ed_max; ++ed ) + { + __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( ed + 1 ) ); + _mm256_store_pd( reinterpret_cast< double* >( ed ), tmp ); + } + + int to_load; + + __m256d* vp = vertex_pair( begin ); + + __m256d a0 = _mm256_load_pd( reinterpret_cast< double* >( vp ) ); + __m256d a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 1 ) ); + __m256d a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 2 ) ); + + if( ( begin & 0x1 ) == 0 ) + { + __m256d c0 = gather_lanes< clear_lane, 0 >( a0 ); + __m256d c1 = gather_lanes< clear_lane, 0 >( a1 ); + __m256d c2 = gather_lanes< clear_lane, 0 >( a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp - 3 ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp - 2 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp - 1 ), c2 ); + + if( low == 1 ) + { + --begin; + return at - 1; + } + to_load = low - 2; + } + else + to_load = low - 1; + + int q = to_load / 4; + __m256d* vp_max = vp + 6 * q; + + while( vp < vp_max ) + { + __m256d b0 = 
_mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) ); + __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) ); + __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) ); + + __m256d c0 = gather_lanes< 1, 0 >( a0, b0 ); + __m256d c1 = gather_lanes< 1, 0 >( a1, b1 ); + __m256d c2 = gather_lanes< 1, 0 >( a2, b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); + + vp += 3; + + a0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) ); + a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) ); + a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) ); + + __m256d d0 = gather_lanes< 1, 0 >( b0, a0 ); + __m256d d1 = gather_lanes< 1, 0 >( b1, a1 ); + __m256d d2 = gather_lanes< 1, 0 >( b2, a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), d0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 ); + + vp += 3; + } + + int r = to_load % 4; + + if( r == 0 ) + { + __m256d c0 = gather_lanes< 1, clear_lane >( a0 ); + __m256d c1 = gather_lanes< 1, clear_lane >( a1 ); + __m256d c2 = gather_lanes< 1, clear_lane >( a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); + } + else + { + __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) ); + __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) ); + __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) ); + + __m256d c0 = gather_lanes< 1, 0 >( a0, b0 ); + __m256d c1 = gather_lanes< 1, 0 >( a1, b1 ); + __m256d c2 = gather_lanes< 1, 0 >( a2, b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); + + switch( r ) + { + case 1: + goto low_end; + case 2: + { + __m256d c0 = gather_lanes< 1, clear_lane >( b0 ); + __m256d c1 = gather_lanes< 1, clear_lane >( b1 ); + __m256d c2 = gather_lanes< 1, clear_lane >( b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp + 3 ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 4 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 5 ), c2 ); + goto low_end; + } + default: + { + vp += 3; + a0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) ); + a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) ); + a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) ); + + __m256d c3 = gather_lanes< 1, 0 >( b0, a0 ); + __m256d c4 = gather_lanes< 1, 0 >( b1, a1 ); + __m256d c5 = gather_lanes< 1, 0 >( b2, a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c3 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c4 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c5 ); + } + } + } + + low_end: + { + --begin; + return at - 1; + } + } + else + { + __m256d* ed = edge + end + 1; + __m256d* ed_min = ed - ( high + 1 ); + + for( ; ed > ed_min; --ed ) + { + __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( ed - 1 ) ); + _mm256_store_pd( reinterpret_cast< double* >( ed ), tmp ); + } + + __m256d* vp = vertex_pair( end ); + + __m256d a0; + __m256d a1; + __m256d a2; + + int to_load; + + if( ( end & 0x1 ) == 0 ) + { + a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) ); + a1 = 
_mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) ); + a2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) ); + + __m256d c0 = gather_lanes< 1, clear_lane >( a0 ); + __m256d c1 = gather_lanes< 1, clear_lane >( a1 ); + __m256d c2 = gather_lanes< 1, clear_lane >( a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); + + if( high == 1 ) + { + ++end; + return at; + } + + vp -= 3; + to_load = high - 2; + } + else + { + to_load = high - 1; + a0 = _mm256_load_pd( reinterpret_cast< double* >( vp ) ); + a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 1 ) ); + a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 2 ) ); + } + + int q = to_load / 4; + + __m256d* vp_min = vp - 6 * q; + + while( vp > vp_min ) + { + __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) ); + __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) ); + __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) ); + + __m256d c0 = gather_lanes< 1, 0 >( b0, a0 ); + __m256d c1 = gather_lanes< 1, 0 >( b1, a1 ); + __m256d c2 = gather_lanes< 1, 0 >( b2, a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); + + vp -= 3; + + a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) ); + a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) ); + a2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) ); + + __m256d d0 = gather_lanes< 1, 0 >( a0, b0 ); + __m256d d1 = gather_lanes< 1, 0 >( a1, b1 ); + __m256d d2 = gather_lanes< 1, 0 >( a2, b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), d0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 ); + + vp -= 3; + } + + int r = to_load % 4; + + if( r == 0 ) + { + __m256d c0 = gather_lanes< clear_lane, 0 >( a0 ); + __m256d c1 = gather_lanes< clear_lane, 0 >( a1 ); + __m256d c2 = gather_lanes< clear_lane, 0 >( a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); + } + else + { + __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) ); + __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) ); + __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) ); + + __m256d c0 = gather_lanes< 1, 0 >( b0, a0 ); + __m256d c1 = gather_lanes< 1, 0 >( b1, a1 ); + __m256d c2 = gather_lanes< 1, 0 >( b2, a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); + + switch( r ) + { + case 1: + goto high_end; + case 2: + { + __m256d d0 = gather_lanes< clear_lane, 0 >( b0 ); + __m256d d1 = gather_lanes< clear_lane, 0 >( b1 ); + __m256d d2 = gather_lanes< clear_lane, 0 >( b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp - 3 ), d0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp - 2 ), d1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp - 1 ), d2 ); + goto high_end; + } + default: // case 3 + { + vp -= 3; + + a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) ); + a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) ); + a2 = _mm256_load_pd( 
reinterpret_cast< double* >( vp - 1 ) ); + + __m256d d0 = gather_lanes< 1, 0 >( a0, b0 ); + __m256d d1 = gather_lanes< 1, 0 >( a1, b1 ); + __m256d d2 = gather_lanes< 1, 0 >( a2, b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), d0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 ); + } + } + } + + high_end: + { + ++end; + return at; + } + } +} + +void bounded_convex::push( __m256d* begin, __m256d* end ) +{ + for( ; begin < end; ++begin ) + { + int sign = _mm256_movemask_pd( *begin ) & 0x7; + + switch( sign ) + { + case 0b000: { locator< 0, 0, 0 >::insert( *this, *begin ); break; } + case 0b001: { locator< 1, 0, 0 >::insert( *this, *begin ); break; } + case 0b010: { locator< 0, 1, 0 >::insert( *this, *begin ); break; } + case 0b011: { locator< 1, 1, 0 >::insert( *this, *begin ); break; } + case 0b100: { locator< 0, 0, 1 >::insert( *this, *begin ); break; } + case 0b101: { locator< 1, 0, 1 >::insert( *this, *begin ); break; } + case 0b110: { locator< 0, 1, 1 >::insert( *this, *begin ); break; } + case 0b111: { locator< 1, 1, 1 >::insert( *this, *begin ); break; } + } + + if( type == bounded_convex_type::empty ) + return; + } +} + +int bounded_convex::remove_vertices( int begin_vertex, int end_vertex ) +{ + int re = end_vertex & 0x1; + int rb = begin_vertex & 0x1; + int low = begin_vertex - begin; + int high = end - end_vertex; + + if( low <= high ) + { + __m256d* e_dst = edge + end_vertex; + __m256d* e_dst_min = e_dst - low; + __m256d* e_src = edge + begin_vertex; + + while( --e_dst >= e_dst_min ) + { + __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( --e_src ) ); + _mm256_store_pd( reinterpret_cast< double* >( e_dst ), tmp ); + } + + __m256d* v_dst = vertex_pair( end_vertex ); + __m256d* v_src = vertex_pair( begin_vertex ); + + if( re == rb ) + { + int q; + + if( re & 0x1 ) + { + __m256d s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + __m256d s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + __m256d s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) ); + __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) ); + __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) ); + + __m256d b0 = blend< 0, 0, 1, 1 >( s0, d0 ); + __m256d b1 = blend< 0, 0, 1, 1 >( s1, d1 ); + __m256d b2 = blend< 0, 0, 1, 1 >( s2, d2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), b0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), b1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), b2 ); + + q = low; + } + else + q = ( low + 1 ); + + q /= 2; + __m256d* v_src_min = v_src - 3 * q; + + while( v_src > v_src_min ) + { + --v_src; + --v_dst; + __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), tmp ); + } + } + else + { + __m256d s0; + __m256d s1; + __m256d s2; + + int to_load; + + if( rb == 0 ) + { + v_src -= 3; + + s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d p0 = gather_lanes< 1, clear_lane >( s0 ); + __m256d p1 = gather_lanes< 1, clear_lane >( s1 ); + __m256d p2 = gather_lanes< 1, clear_lane >( s2 ); + + __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) ); + __m256d d1 = _mm256_load_pd( 
reinterpret_cast< double* >( v_dst + 1 ) ); + __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) ); + + __m256d b0 = blend< 0, 0, 1, 1 >( p0, d0 ); + __m256d b1 = blend< 0, 0, 1, 1 >( p1, d1 ); + __m256d b2 = blend< 0, 0, 1, 1 >( p2, d2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), b0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), b1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), b2 ); + + if( low == 1 ) + goto low_end; + + to_load = low - 2; + } + else + { + to_load = low - 1; + + s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + } + + int q = to_load / 4; + __m256d* v_src_min = v_src - 6 * q; + + while( v_src > v_src_min ) + { + v_src -= 3; + v_dst -= 3; + + __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d c0 = gather_lanes< 1, 0 >( b0, s0 ); + __m256d c1 = gather_lanes< 1, 0 >( b1, s1 ); + __m256d c2 = gather_lanes< 1, 0 >( b2, s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 ); + + v_dst -= 3; + v_src -= 3; + + s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d d0 = gather_lanes< 1, 0 >( s0, b0 ); + __m256d d1 = gather_lanes< 1, 0 >( s1, b1 ); + __m256d d2 = gather_lanes< 1, 0 >( s2, b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 ); + } + + v_dst -= 3; + int r = to_load % 4; + + if( r == 0 ) + { + __m256d c0 = gather_lanes< clear_lane, 0 >( s0 ); + __m256d c1 = gather_lanes< clear_lane, 0 >( s1 ); + __m256d c2 = gather_lanes< clear_lane, 0 >( s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 ); + } + else + { + v_src -= 3; + __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d c0 = gather_lanes< 1, 0 >( b0, s0 ); + __m256d c1 = gather_lanes< 1, 0 >( b1, s1 ); + __m256d c2 = gather_lanes< 1, 0 >( b2, s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 ); + + switch( r ) + { + case 1: + goto low_end; + case 2: + { + v_dst -= 3; + + __m256d d0 = gather_lanes< clear_lane, 0 >( b0 ); + __m256d d1 = gather_lanes< clear_lane, 0 >( b1 ); + __m256d d2 = gather_lanes< clear_lane, 0 >( b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 ); + goto low_end; + } + default: + { + v_dst -= 3; + v_src -= 3; + + s0 = _mm256_load_pd( 
reinterpret_cast< double* >( v_src ) ); + s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d d0 = gather_lanes< 1, 0 >( s0, b0 ); + __m256d d1 = gather_lanes< 1, 0 >( s1, b1 ); + __m256d d2 = gather_lanes< 1, 0 >( s2, b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 ); + + v_dst -= 3; + + s0 = gather_lanes< clear_lane, 0 >( s0, b0 ); + s1 = gather_lanes< clear_lane, 0 >( s1, b1 ); + s2 = gather_lanes< clear_lane, 0 >( s2, b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), s0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), s1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), s2 ); + } + } + } + } + + low_end: + { + begin += ( end_vertex - begin_vertex ); + return end_vertex - 1; + } + } + else + { + __m256d* e_dst = edge + begin_vertex; + __m256d* e_src = edge + end_vertex; + __m256d* e_src_max = e_src + high + 1; + + for( ; e_src < e_src_max; ++e_src, ++e_dst ) + { + __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( e_src ) ); + _mm256_store_pd( reinterpret_cast< double* >( e_dst ), tmp ); + } + + __m256d s0; + __m256d s1; + __m256d s2; + + __m256d* v_dst = vertex_pair( begin_vertex ); + __m256d* v_src = vertex_pair( end_vertex ); + + if( re == rb ) + { + if( re & 0x1 ) + { + s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) ); + __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) ); + __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) ); + + __m256d c0 = blend< 0, 0, 1, 1 >( d0, s0 ); + __m256d c1 = blend< 0, 0, 1, 1 >( d1, s1 ); + __m256d c2 = blend< 0, 0, 1, 1 >( d2, s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 ); + + --high; + v_src += 3; + v_dst += 3; + } + + int q = 3 * ( ( high + 1 ) / 2 ); + + __m256d* v_src_max = vertex_pair( end + 1 ); + for( ; v_src < v_src_max; ++v_src, ++v_dst ) + { + __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), tmp ); + } + } + else + { + int to_load; + + __m256d s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + __m256d s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + __m256d s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + if( rb ) // re = 0 =: + { + __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) ); + __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) ); + __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) ); + + __m256d g0 = gather_lanes< 0, 0 >( d0, s0 ); + __m256d g1 = gather_lanes< 0, 0 >( d1, s1 ); + __m256d g2 = gather_lanes< 0, 0 >( d2, s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), g0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), g1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), g2 ); + + if( high == 1 ) + goto high_end; + + v_dst += 3; + to_load = high - 2; + } + else // re = 1 => only the second lane of the si are 
used + to_load = high - 1; + + int q = to_load / 4; + __m256d* v_src_max = v_src + 6 * q; + + while( v_src < v_src_max ) + { + v_src += 3; + + __m256d t0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + __m256d t1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + __m256d t2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d u0 = gather_lanes< 1, 0 >( s0, t0 ); + __m256d u1 = gather_lanes< 1, 0 >( s1, t1 ); + __m256d u2 = gather_lanes< 1, 0 >( s2, t2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 ); + + v_dst += 3; + v_src += 3; + + s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d v0 = gather_lanes< 1, 0 >( t0, s0 ); + __m256d v1 = gather_lanes< 1, 0 >( t1, s1 ); + __m256d v2 = gather_lanes< 1, 0 >( t2, s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 ); + + v_dst += 3; + } + + int r = to_load % 4; + + if( r == 0 ) + { + __m256d u0 = gather_lanes< 1, clear_lane >( s0 ); + __m256d u1 = gather_lanes< 1, clear_lane >( s1 ); + __m256d u2 = gather_lanes< 1, clear_lane >( s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 ); + } + else + { + v_src += 3; + + __m256d t0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + __m256d t1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + __m256d t2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d u0 = gather_lanes< 1, 0 >( s0, t0 ); + __m256d u1 = gather_lanes< 1, 0 >( s1, t1 ); + __m256d u2 = gather_lanes< 1, 0 >( s2, t2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 ); + + switch( r ) + { + case 1: + goto high_end; + case 2: + { + v_dst += 3; + __m256d v0 = gather_lanes< 1, clear_lane >( t0 ); + __m256d v1 = gather_lanes< 1, clear_lane >( t1 ); + __m256d v2 = gather_lanes< 1, clear_lane >( t2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 ); + + goto high_end; + } + default: // case 3 + { + v_dst += 3; + v_src += 3; + + s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d v0 = gather_lanes< 1, 0 >( t0, s0 ); + __m256d v1 = gather_lanes< 1, 0 >( t1, s1 ); + __m256d v2 = gather_lanes< 1, 0 >( t2, s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 ); + } + } + } + } + + high_end: + { + end -= ( end_vertex - begin_vertex ); + return begin_vertex - 1; + } + } +} + +void bounded_convex::set_to_last_segment() +{ + int last = end - 1; + edge[ 0 ] = edge[ last ]; + 
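+  // The feasible set has been reduced to the segment joining the last and the first
+  // vertex: the lines below compact the edges that cut out this segment and pack its
+  // two endpoints, stored as { v, -v } coordinate pairs, into vertex_p[ 0..2 ].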
edge[ 1 ] = edge[ end ];
+  edge[ 2 ] = edge[ begin + 1 ];
+
+  __m256d* vl = vertex_pair( last );
+  __m256d* vb = vertex_pair( begin );
+
+  if( last & 0x1 )
+  {
+    if( begin & 0x1 )
+    {
+      vertex_p[ 0 ] = gather_lanes< 1, 1 >( vl[ 0 ], vb[ 0 ] );
+      vertex_p[ 1 ] = gather_lanes< 1, 1 >( vl[ 1 ], vb[ 1 ] );
+      vertex_p[ 2 ] = gather_lanes< 1, 1 >( vl[ 2 ], vb[ 2 ] );
+    }
+    else
+    {
+      vertex_p[ 0 ] = gather_lanes< 1, 0 >( vl[ 0 ], vb[ 0 ] );
+      vertex_p[ 1 ] = gather_lanes< 1, 0 >( vl[ 1 ], vb[ 1 ] );
+      vertex_p[ 2 ] = gather_lanes< 1, 0 >( vl[ 2 ], vb[ 2 ] );
+    }
+  }
+  else
+  {
+    if( begin & 0x1 )
+    {
+      vertex_p[ 0 ] = blend< 0, 0, 1, 1 >( vl[ 0 ], vb[ 0 ] );
+      vertex_p[ 1 ] = blend< 0, 0, 1, 1 >( vl[ 1 ], vb[ 1 ] );
+      vertex_p[ 2 ] = blend< 0, 0, 1, 1 >( vl[ 2 ], vb[ 2 ] );
+    }
+    else
+    {
+      vertex_p[ 0 ] = gather_lanes< 0, 0 >( vl[ 0 ], vb[ 0 ] );
+      vertex_p[ 1 ] = gather_lanes< 0, 0 >( vl[ 1 ], vb[ 1 ] );
+      vertex_p[ 2 ] = gather_lanes< 0, 0 >( vl[ 2 ], vb[ 2 ] );
+    }
+  }
+
+  end = 2;
+  begin = 0;
+  type = bounded_convex_type::segment;
+}
+
+void bounded_convex::setup( __m256d box )
+{
+  type = bounded_convex_type::polygon;
+
+  __m256d zero = zero_d();
+  __m256i ones = ones_i();
+  __m256i signi = _mm256_slli_epi64( ones, 63 );
+  __m256i signi_h = _mm256_slli_si256( signi, 8 );
+  __m256d sign = _mm256_castsi256_pd( signi );
+  __m256d sign_h = _mm256_castsi256_pd( signi_h );
+  __m256i onei = _mm256_srli_epi64( ones, 54 );
+  onei = _mm256_slli_epi64( onei, 52 );          // { 1.0, 1.0, 1.0, 1.0 }
+  __m256d one = _mm256_castsi256_pd( onei );
+
+  box = _mm256_max_pd( box, zero );
+
+  begin = alloc / 2;
+  end = begin + 4;
+  __m256d* e = edge + begin;
+  __m256d x_box = gather_lanes< 0, 0 >( box );   // { x_min, x_max, x_min, x_max }
+  __m256d y_box = gather_lanes< 1, 1 >( box );   // { y_min, y_max, y_min, y_max }
+
+  e[ 0 ] = blend< 0, 1, 0, 0 >( one, zero );     // { 1, 0, 0, 0 }
+  e[ 0 ] = blend< 0, 0, 1, 1 >( e[ 0 ], x_box ); // { 1, 0, x_min, x_max }, for x >= x_min
+
+  e[ 1 ] = permute< 1, 0, 2, 3 >( e[ 0 ] );      // { 0, 1, x_min, x_max }
+  e[ 1 ] = blend< 0, 0, 1, 1 >( e[ 1 ], y_box ); // { 0, 1, y_min, y_max }, for y >= y_min
+
+  e[ 2 ] = permute< 0, 1, 3, 2 >( e[ 0 ] );      // { 1, 0, x_max, x_min }
+  e[ 2 ] = _mm256_xor_pd( e[ 2 ], sign );        // { -1, -0, -x_max, -x_min }, for -x >= -x_max
+
+  e[ 3 ] = permute< 0, 1, 3, 2 >( e[ 1 ] );      // { 0, 1, y_max, y_min }
+  e[ 3 ] = _mm256_xor_pd( e[ 3 ], sign );        // { -0, -1, -y_max, -y_min }, for -y >= -y_max
+
+  e[ 4 ] = e[ 0 ];                               // { 1, 0, x_min, x_max }
+
+  // initial vertices
+
+  __m256d* vp = vertex_pair( begin );
+
+  vp[ 0 ] = permute< 0, 0, 3, 3 >( x_box );   // { x_min, x_min, x_max, x_max } : vertices 0 and 1:
+  vp[ 0 ] = _mm256_xor_pd( vp[ 0 ], sign_h ); // { x_min, -x_min, x_max, -x_max } : vertices 0 and 1:
+
+  vp[ 1 ] = permute< 0, 0, 2, 2 >( y_box );   // { y_min, y_min, y_min, y_min }
+  vp[ 1 ] = _mm256_xor_pd( vp[ 1 ], sign_h ); // { y_min, -y_min, y_min, -y_min }
+
+  vp[ 2 ] = _mm256_xor_pd( one, sign_h );     // { 1.0, -1.0, 1.0, -1.0 }
+
+  vp[ 3 ] = permute< 1, 1, 2, 2 >( x_box );   // { x_max, x_max, x_min, x_min } : vertices 2 and 3:
+  vp[ 3 ] = _mm256_xor_pd( vp[ 3 ], sign_h ); // { x_max, -x_max, x_min, -x_min }
+
+  vp[ 4 ] = permute< 1, 1, 3, 3 >( y_box );   // { y_max, y_max, y_max, y_max }
+  vp[ 4 ] = _mm256_xor_pd( vp[ 4 ], sign_h ); // { y_max, -y_max, y_max, -y_max }
+
+  vp[ 5 ] = vp[ 2 ];
+}
+
+int normalize_equations( __m256d* quadrant[ 4 ], int n, __m256i tiny, __m256i huge )
+{
+  constexpr unsigned long long x_index_epi8 = 0x1 << 4;
+  constexpr unsigned long long y_index_epi8 = 0x1 << 12;
+  constexpr unsigned long long z_index_epi8 = 0x1 << 16;
+
+  int ret = 0;
+  __m256i ones = ones_i();                          // { 0xFFFF'FFFF'FFFF'FFFF, ... }
+  __m256i exp_mask = _mm256_srli_epi64( ones, 53 ); // { 0x0000'0000'0000'07FF, ... }
+  __m256i one = _mm256_srli_epi64( ones, 54 );      // { 0x0000'0000'0000'03FF, ... }
+  __m256i sign = _mm256_slli_epi64( ones, 63 );     // { 0x8000'0000'0000'0000, ... }
+  exp_mask = _mm256_slli_epi64( exp_mask, 52 );     // { 0x7FF0'0000'0000'0000, ... }
+  one = _mm256_slli_epi64( one, 52 );               // { 0x3FF0'0000'0000'0000, ... }
+  sign = gather_lanes< 1, clear_lane >( sign );     // { sign_bit, sign_bit, 0, 0 }
+
+  for( int iq = 0; iq < 4; ++iq )
+  {
+    __m256d* end = quadrant[ iq ] + n;
+
+    while( quadrant[ iq ] < end )
+    {
+      __m256d* q = quadrant[ iq ];
+      __m256i exp = and_i( *q, exp_mask );
+      __m256i cmp_huge = _mm256_cmpgt_epi32( exp, huge );
+
+      if( _mm256_testz_si256( cmp_huge, cmp_huge ) ) // no huge entries
+      {
+        __m256i cmp_tiny = _mm256_cmpgt_epi32( tiny, exp );
+
+        if( _mm256_testz_si256( cmp_tiny, cmp_tiny ) ) // no entries are huge or tiny
+        {
+          ++quadrant[ iq ];
+          continue;
+        }
+
+        // no entries are huge, but some are tiny. We scale by 2^( huge - max_exp[ i ] )
+
+        __m256i exp_b = permute< 1, 0, 2, 3 >( exp ); // { exp[ 1 ], exp[ 0 ], exp[ 2 ], ? }
+        __m256i exp_c = gather_lanes< 1, 1 >( exp );  // { exp[ 2 ], exp[ 3 ], exp[ 2 ], ? }
+        __m256i max = _mm256_max_epi32( exp, exp_b ); // { max( exp[ 0 ], exp[ 1 ] ), ?, ?, ? }
+        max = _mm256_max_epi32( max, exp_c );         // { max( exp[ 0 ], exp[ 1 ], exp[ 2 ] ), ... }
+
+        int64_t int_max = _mm256_extract_epi64( max, 0 );
+        if( int_max == 0 ) // all exponents are zero.
+        {
+          __m256d cmp = _mm256_cmp_pd( *q, zero_d(), _CMP_EQ_OQ );
+          if( ( _mm256_movemask_pd( cmp ) & 0x7 ) == 0x7 ) // all coefficients are zero
+          { // drop the constraint
+            *q = *( --end );
+            continue;
+          }
+        }
+
+        max = permute< 0, 0, 0, 0 >( max );
+        __m256i de = _mm256_sub_epi32( huge, max );
+
+        // de may be larger than 0x3ff and we must scale in two steps
+
+        *q = xor_d( *q, sign );
+
+        __m256i hs = _mm256_srli_epi64( de, 1 ); // de/2
+        hs = _mm256_and_si256( hs, exp_mask );   // throwing away the remainder
+        __m256d scale = cast_d( _mm256_add_epi64( hs, one ) );
+        *q = _mm256_mul_pd( scale, *q );
+
+        __m256i ls = _mm256_sub_epi64( de, hs ); // de - hs
+        scale = cast_d( _mm256_add_epi64( ls, one ) );
+        *q = _mm256_mul_pd( scale, *q );
+      }
+      else // some huge entries
+      {
+        __m256i exp_b = permute< 1, 0, 2, 3 >( exp ); // { exp[ 1 ], exp[ 0 ], exp[ 2 ], ? }
+        __m256i exp_c = gather_lanes< 1, 1 >( exp );  // { exp[ 2 ], exp[ 3 ], exp[ 2 ], ? }
+        __m256i max = _mm256_max_epi32( exp, exp_b ); // { max( exp[ 0 ], exp[ 1 ] ), ?, ?, ? }
+        max = _mm256_max_epi32( max, exp_c );         // { max( exp[ 0 ], exp[ 1 ], exp[ 2 ] ), ... }
+
+        max = permute< 0, 0, 0, 0 >( max );
+        __m256i de = _mm256_sub_epi32( huge, max );   // de > -0x3ff
+        __m256d scale = cast_d( _mm256_add_epi64( de, one ) ); // this does not underflow
+
+        *q = xor_d( *q, sign );
+        *q = _mm256_mul_pd( scale, *q ); // this may be inexact, but the inexact entries will be dropped later
+      }
+
+      // cleaning up the remaining tiny entries
+
+      __m256i exp_d = and_i( *q, exp_mask );
+      __m256i cmp_d = _mm256_cmpgt_epi32( exp_d, tiny ); // 8 x ( 4 byte masks )
+      int cmp_d_mask = _mm256_movemask_epi8( cmp_d );    // 8 bits for x, 8 bits for y, 8 bits for z
+
+      if( cmp_d_mask & ( x_index_epi8 | y_index_epi8 ) )
+      {
+        // To preserve the solution set we must relax the constraint, rounding the left
+        // hand side up and the right hand side down, so that every solution of the old
+        // problem satisfies the new constraints. Since the lhs entries were flipped,
+        // we simply round all entries down, that is:
+        //
+        //   we replace x in ( -tiny, 0 ) by -tiny and x in [ 0, tiny ) by +0.0
+        //   ( and fix the zeros later )
+
+        __m256d zero = zero_d();
+        __m256i exp_f = and_i( *q, exp_mask );
+        __m256i full_sign = gather_lanes< 0, 0 >( sign );
+        __m256i minus_tiny = _mm256_or_si256( tiny, full_sign );
+        __m256i is_tiny_f = _mm256_cmpgt_epi32( tiny, exp_f );
+        __m256d non_negative = _mm256_cmp_pd( *q, zero, _CMP_GE_OQ );
+
+        // tiny negatives should be replaced by -tiny, tiny non-negatives are replaced by 0.
+        __m256d fix = blend( cast_d( minus_tiny ), zero, non_negative );
+
+        // fixing the tiny entries
+        *q = blend( *q, fix, is_tiny_f );
+
+        // restoring the rhs sign
+        *q = xor_d( *q, sign );
+
+        // cleaning up the zeros
+        __m256i abs = _mm256_slli_epi64( cast_i( *q ), 1 ); // throwing away the sign bit
+        __m256i is_zero = _mm256_cmpeq_epi64( abs, cast_i( zero ) );
+
+        *q = blend( *q, zero, is_zero );
+
+        ++quadrant[ iq ];
+      }
+      else
+      {
+        // | a | <= tiny and | b | <= tiny and | c | >= huge:
+        // The constraint is either irrelevant or fatal.
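+        // With the left hand side effectively zero, the row reads 0 >= huge rhs: a
+        // positive right hand side makes the whole quadrant infeasible, a negative
+        // one makes the row vacuous, so we either flag the quadrant or drop the row.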
+ + int pb_sign = _mm256_movemask_pd( *q ); + if( !( pb_sign & 0x4 ) ) // drop the quadrant: the right hand side is too large + { + ret |= ( 0x1 << iq ); + break; + } + + // else, drop the constraint: the right hand side is too small + + *q = *( --end ); + } + } + } + + return ret; +} + +} // namespace wm::nlp::lp2d +//---------------------------------------------------------------------------------------- + +extern "C" +{ + +int interval_system_2d( double* solution, double const* box, double const* eq, + unsigned int n_equations ) +{ + constexpr int max_quadrant_size = 32; + + using namespace wm::nlp::lp2d; + + double* solsPtr = solution; + + printf("box: x_min = %f, x_max = %f, y_min = %f, y_max = %f\n", box[0], box[1], box[2], box[3]); + printf("n_equations = %u\n", n_equations); + for (int i = 0; i < n_equations; i++) { + printf("equation %u: a_min = %f, a_max = %f, b_min = %f, b_max = %f, c_min = %f, c_max = %f\n", i, eq[6*i], eq[6*i + 1], eq[6*i + 2], eq[6*i + 3], eq[6*i + 4], eq[6*i + 5]); + } + + if( n_equations > ( max_quadrant_size / 2 ) ) + return -1; + + if( std::bit_cast< uint64_t >( box ) & 0x1F ) + return -2; + + if( std::bit_cast< uint64_t >( eq ) & 0x1F ) + return -3; + + + + int ret = 0; + int n_equations_q = n_equations / 2; + int n_equations_r = n_equations % 2; + + double const* eq_max = eq + n_equations_q; + + __m256i huge; + __m256i tiny; + __m256i signi = bounds( &tiny, &huge ); + __m256i exp_mask = _mm256_srli_epi64( ones_i(), 53 ); + exp_mask = _mm256_slli_epi64( exp_mask, 52 ); // { 0x7FF0'0000'0000'0000... } + + __m256d sign = cast_d( signi ); + __m256d sign_01 = blend< 0, 0, 1, 1 >( sign, zero_d() ); // { -1: -1, 0, 0 } + + int old_rounding_mode = std::fegetround(); + + if( std::fesetround( FE_DOWNWARD ) ) + return -4; + + __m256d quadrants[ 4 * max_quadrant_size ]; + + __m256d* q[ 4 ] = { quadrants, + quadrants + max_quadrant_size, + quadrants + 2 * max_quadrant_size, + quadrants + 3 * max_quadrant_size }; + + int empty_quadrant = 0; + double const* p_eq = eq; + int quadrant_is_not_empty = 0xF; + + for( ; p_eq < eq_max; p_eq += 12 ) + { + __m256d a0 = _mm256_load_pd( p_eq ); + __m256d aux = _mm256_load_pd( p_eq + 4 ); + __m256d bc1 = _mm256_load_pd( p_eq + 8 ); + + // the equations in this group are + // + // a0[ 0, 1 ] x + a0[ 2, 3 ] y = aux[ 0, 1 ] + // aux[ 2, 3 ] x + bc1[ 0, 1 ] y = bc1[ 2, 3 ] + + __m256d bc0 = gather_lanes< 1, 0 >( a0, aux ); + __m256d a1 = gather_lanes< 1, 0 >( aux ); + + // the equations in this group are now + // + // a0[ 0, 1 ] x + bc0[ 0, 1 ] y = bc0[ 2, 3 ] + // a1[ 0, 1 ] x + bc1[ 0, 1 ] y = bc1[ 2, 3 ] + + __m256d max_c0 = permute< 0, 1, 3, 2 >( bc0 ); //{ ?, ?, bc0[ 3 ], ? } + __m256d max_c1 = permute< 0, 1, 3, 2 >( bc1 ); //{ ?, ?, bc1[ 3 ], ? } + + for( int iq = 0; iq < 4; ++iq ) // iqth quadrant + { + q[ iq ][ 0 ] = gather_low( a0, bc0 ); // { a0[ 0 ], bc0[ 0 ], ?, ? } + q[ iq ][ 1 ] = gather_high( a0, bc0 ); // { a0[ 1 ], bc0[ 1 ], ?, ? } + q[ iq ][ 2 ] = gather_low( a1, bc1 ); // { a1[ 0 ], bc1[ 0 ], ?, ? } + q[ iq ][ 3 ] = gather_high( a1, bc1 ); // { a1[ 1 ], bc1[ 1 ], ?, ? } + + q[ iq ][ 0 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 0 ], max_c0 ); // { a0[ 0 ], bc0[ 0 ], bc0[ 3 ], ? } + q[ iq ][ 1 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 1 ], bc0 ); // { a0[ 1 ], bc0[ 1 ], bc0[ 2 ], ? } + q[ iq ][ 2 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 2 ], max_c1 ); // { a1[ 0 ], bc1[ 0 ], bc1[ 3 ], ? } + q[ iq ][ 3 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 3 ], bc1 ); // { a1[ 1 ], bc1[ 1 ], bc1[ 2 ], ? 
} + + q[ iq ][ 0 ] = xor_d( q[ iq ][ 0 ], sign ); // { -a0[ 0 ], -bc0[ 0 ], -bc0[ 3 ], ? } + q[ iq ][ 2 ] = xor_d( q[ iq ][ 2 ], sign ); // { -a1[ 0 ], -bc1[ 0 ], -bc1[ 3 ], ? } + + // moving to the next quadrant + + if( iq & 0x1 ) // replacing y by - y + { + bc0 = xor_d( bc0, sign_01 ); + bc1 = xor_d( bc1, sign_01 ); + bc0 = permute< 1, 0, 2, 3 >( bc0 ); + bc1 = permute< 1, 0, 2, 3 >( bc1 ); + } + else // replacing x by - x + { + a0 = xor_d( a0, sign_01 ); + a1 = xor_d( a1, sign_01 ); + a0 = permute< 1, 0, 2, 3 >( a0 ); + a1 = permute< 1, 0, 2, 3 >( a1 ); + } + } + + //---------------------------------------------------------------------------------------- + // Checking for overflow and underflow: to avoid headaches with underflow and overflow, we + // normalize equation in which some elements are either +0.0 or have absolute value outside + // the range [ tiny, huge ]. In this process we may detect that the intersection of the + // feasible region with some quadrants is empty, or that some equations are irrelevant. + //---------------------------------------------------------------------------------------- + + __m256d ra = gather_lanes< 0, 0 >( a0, a1 ); // { row[ 0 ][ 0 ], row[ 0 ][ 1 ], row[ 3 ][ 0 ], row[ 3 ][ 1 ] } + __m256i expa = and_i( ra, exp_mask ); + __m256i exp0 = and_i( bc0, exp_mask ); + __m256i exp1 = and_i( bc1, exp_mask ); + + __m256i min = _mm256_min_epi32( expa, exp0 ); + __m256i max = _mm256_max_epi32( expa, exp0 ); + min = _mm256_min_epi32( min, exp1 ); + max = _mm256_max_epi32( max, exp1 ); + + __m256i c_min = _mm256_cmpgt_epi32( tiny, min ); + __m256i c_max = _mm256_cmpgt_epi32( max, huge ); + __m256i bad = or_i( c_min, c_max ); + + if( _mm256_testz_si256( bad, bad ) ) + { + q[ 0 ] += 4; + q[ 1 ] += 4; + q[ 2 ] += 4; + q[ 3 ] += 4; + } + else + empty_quadrant |= normalize_equations( q, 4, tiny, huge ); + } + + if( n_equations_r ) + { + __m256d ab = _mm256_load_pd( p_eq ); + __m128d aux = _mm_load_pd( p_eq + 4 ); + + __m256d sign_23 = gather_lanes< 1, 0 >( sign_01 ); + + // the equations in this group are + // ab[ 0, 1 ] x + ab[ 2, 3 ] y = aux[ 0, 1 ] + + __m256d c = _mm256_castpd128_pd256( aux ); + c = gather_lanes< 0, 0 >( c ); + // the equations in this group are now + // ab[ 0, 1 ] x + ab[ 2, 3 ] y = c[ 0, 1 ] + + __m256d max_c = permute< 0, 1, 3, 3 >( c ); //{ ?, ?, c[ 1 ], ? } + + for( int iq = 0; iq < 4; ++iq ) // iqth quadrant + { + q[ iq ][ 0 ] = permute< 0, 2, 2, 3 >( ab ); // { ab[ 0 ], ab[ 2 ], ?, ? } + q[ iq ][ 1 ] = permute< 1, 3, 2, 3 >( ab ); // { ab[ 1 ], ab[ 3 ], ?, ? } + + q[ iq ][ 0 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 0 ], max_c ); // { ab[ 0 ], ab[ 2 ], c[ 1 ], ? } + q[ iq ][ 1 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 1 ], c ); // { ab[ 1 ], ab[ 3 ], c[ 0 ], ? } + + q[ iq ][ 0 ] = xor_d( q[ iq ][ 0 ], sign ); // { -ab[ 0 ], -ab[ 2 ], -c[ 1 ], ? } + + // moving to the next quadrant + + if( iq & 0x1 ) // replacing y by - y + { + ab = xor_d( ab, sign_23 ); + ab = permute< 0, 1, 3, 2 >( ab ); + } + else // replacing x by - x + { + ab = xor_d( ab, sign_01 ); + ab = permute< 1, 0, 2, 3 >( ab ); + } + } + + //---------------------------------------------------------------------------------------- + // Checking for overflow and underflow: to avoid headaches with underflow and overflow, we + // normalize equation in which some elements are either +0.0 or have absolute value outside + // the range [ tiny, huge ]. In this process we may detect that the intersection of the + // feasible region with some quadrants is empty, or that some equations are irrelevant. 
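+  //
+  // ( Reminder: after reflecting a quadrant onto x, y >= 0, each interval equation
+  //   [ aL, aH ] x + [ bL, bH ] y = [ cL, cH ] contributes the two linear rows
+  //   aH x + bH y >= cL and -aL x - bL y >= -cH per quadrant, which together keep
+  //   exactly the points that at least one choice of coefficients can satisfy. )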
+ //---------------------------------------------------------------------------------------- + + __m256i exp_ab = and_i( ab, exp_mask ); + __m256i exp_c = and_i( c, exp_mask ); + exp_c = blend< 0, 0, 1, 1 >( exp_c, exp_ab ); + + __m256i min = _mm256_min_epi32( exp_ab, exp_c ); + __m256i max = _mm256_max_epi32( exp_ab, exp_c ); + + __m256i c_min = _mm256_cmpgt_epi32( tiny, min ); + __m256i c_max = _mm256_cmpgt_epi32( max, huge ); + __m256i bad = or_i( c_min, c_max ); + + if( _mm256_testz_si256( bad, bad ) ) + { + q[ 0 ] += 2; + q[ 1 ] += 2; + q[ 2 ] += 2; + q[ 3 ] += 2; + } + else + empty_quadrant |= normalize_equations( q, 2, tiny, huge ); + } + + __m256d avx_box = _mm256_load_pd( box ); + __m256d cmp = _mm256_cmp_pd( avx_box, zero_d(), _CMP_GE_OQ ); + int positive_box = _mm256_movemask_pd( cmp ); + + __m256d edge[ 2 * ( max_quadrant_size + 2 ) ]; + __m256d vertex_pair[ 3 * ( max_quadrant_size + 2 ) ]; + bounded_convex bc( 2 * ( max_quadrant_size + 2 ), edge, vertex_pair ); + + constexpr uint x_min_field = 0x1; + constexpr uint x_max_field = 0x2; + constexpr uint y_min_field = 0x4; + constexpr uint y_max_field = 0x8; + + constexpr uint first_quadrant_field = 0x1; + constexpr uint second_quadrant_field = 0x2; + constexpr uint third_quadrant_field = 0x4; + constexpr uint fourth_quadrant_field = 0x8; + + if( positive_box & x_max_field ) // x_max >= 0 + { + if( !( empty_quadrant & first_quadrant_field ) && + ( positive_box & y_max_field ) ) // y_max >= 0, first quadrant + { + bc.setup( avx_box ); + bc.push( quadrants, q[ 0 ] ); + if( bc.type != bounded_convex_type::empty ) + { + ++ret; + __m256d bc_box = bc.box(); + _mm256_store_pd( solution, bc_box ); + solution += 4; + } + } + + if( !( empty_quadrant & fourth_quadrant_field ) && + !( positive_box & y_min_field ) ) // y_min < 0, fourth quadrant + { + __m256i y_sign = _mm256_slli_epi64( ones_i(), 63 ); + y_sign = gather_lanes< clear_lane, 1 >( y_sign ); + + __m256d q_box = permute< 0, 1, 3, 2 >( avx_box ); + q_box = xor_d( q_box, y_sign ); + + bc.setup( q_box ); + bc.push( quadrants + 3 * max_quadrant_size, q[ 3 ] ); + + if( bc.type != bounded_convex_type::empty ) + { + ++ret; + __m256i y_sign = _mm256_slli_epi64( ones_i(), 63 ); + y_sign = gather_lanes< clear_lane, 1 >( y_sign ); + + __m256d bc_box = bc.box(); + bc_box = xor_d( bc_box, y_sign ); + bc_box = permute< 0, 1, 3, 2 >( bc_box ); + _mm256_store_pd( solution, bc_box ); + solution += 4; + } + } + } + + if( !( positive_box & x_min_field ) ) // x_min < 0 + { + if( !( empty_quadrant & second_quadrant_field) && + ( positive_box & y_max_field ) ) // y_max >= 0, second quadrant + { + __m256i x_sign = _mm256_slli_epi64( ones_i(), 63 ); + x_sign = gather_lanes< 0, clear_lane >( x_sign ); + + __m256d q_box = permute< 1, 0, 2, 3 >( avx_box ); + q_box = xor_d( q_box, x_sign ); + + bc.setup( q_box ); + bc.push( quadrants + max_quadrant_size, q[ 1 ] ); + + if( bc.type != bounded_convex_type::empty ) + { + ++ret; + __m256d bc_box = bc.box(); + __m256i x_sign = _mm256_slli_epi64( ones_i(), 63 ); + x_sign = gather_lanes< 0, clear_lane >( x_sign ); + bc_box = xor_d( bc_box, x_sign ); + bc_box = permute< 1, 0, 2, 3 >( bc_box ); + _mm256_store_pd( solution, bc_box ); + solution += 4; + } + } + + if( !( empty_quadrant & third_quadrant_field ) && + !( positive_box & y_min_field ) ) // y_min < 0, third quadrant + { + __m256i sign = _mm256_slli_epi64( ones_i(), 63 ); + avx_box = permute< 1, 0, 3, 2 >( avx_box ); + avx_box = xor_d( avx_box, sign ); + bc.setup( avx_box ); + bc.push( quadrants + 2 * 
max_quadrant_size, q[ 2 ] );
+
+      if( bc.type != bounded_convex_type::empty )
+      {
+        ++ret;
+        __m256d bc_box = bc.box();
+        __m256i sign = _mm256_slli_epi64( ones_i(), 63 );
+        bc_box = xor_d( bc_box, sign );
+        bc_box = permute< 1, 0, 3, 2 >( bc_box );
+        _mm256_store_pd( solution, bc_box );
+      }
+    }
+  }
+
+  std::fesetround( old_rounding_mode );
+
+  for (int i = 0; i < ret; i++) {
+    printf("solution %d: x_min = %f, x_max = %f, y_min = %f, y_max = %f\n", i, solsPtr[4*i], solsPtr[4*i + 1], solsPtr[4*i + 2], solsPtr[4*i + 3]);
+  }
+
+  return ret;
+}
+
+} // extern "C"
+//----------------------------------------------------------------------------------------
\ No newline at end of file
diff --git a/brush-strokes/cbits/lp_2d.hpp b/brush-strokes/cbits/lp_2d.hpp
new file mode 100644
index 0000000..32ffa0b
--- /dev/null
+++ b/brush-strokes/cbits/lp_2d.hpp
@@ -0,0 +1,1265 @@
+#pragma once
+
+//----------------------------------------------------------------------------------------
+// Copyright (c) 2024 Walter Mascarenhas
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The MPL has a "Disclaimer of Warranty" and a "Limitation of Liability",
+// and we provide no guarantee regarding the correctness of this source code.
+//
+//----------------------------------------------------------------------------------------
+
+#include <immintrin.h>
+#include <cassert>
+#include <cstdint>
+
+typedef unsigned int uint;
+
+//----------------------------------------------------------------------------------------
+namespace wm::nlp::lp2d {
+
+//----------------------------------------------------------------------------------------
+// avx2 wrappers, to simplify the notation a bit and clarify the use of immediates
+//----------------------------------------------------------------------------------------
+
+inline __m256d and_d( __m256d x, __m256d y )
+{
+  return _mm256_and_pd( x, y );
+}
+
+inline __m256i and_i( __m256i x, __m256d y )
+{
+  __m256i yi = _mm256_castpd_si256( y );
+  return _mm256_and_si256( x, yi );
+}
+
+inline __m256i and_i( __m256d x, __m256i y )
+{
+  __m256i xi = _mm256_castpd_si256( x );
+  __m256i r  = _mm256_and_si256( xi, y );
+  return r;
+}
+
+inline __m256i and_i( __m256i x, __m256i y )
+{
+  return _mm256_and_si256( x, y );
+}
+
+inline __m256d and_not_d( __m256i x, __m256i y )
+{
+  __m256d xd = _mm256_castsi256_pd( x );
+  __m256d yd = _mm256_castsi256_pd( y );
+  return _mm256_andnot_pd( xd, yd );
+}
+
+inline __m256d and_not_d( __m256d x, __m256d y )
+{
+  return _mm256_andnot_pd( x, y );
+}
+
+inline __m256i and_not_i( __m256i x, __m256i y )
+{
+  __m256d xd = _mm256_castsi256_pd( x );
+  __m256d yd = _mm256_castsi256_pd( y );
+  __m256d rd = _mm256_andnot_pd( xd, yd );
+  __m256i r  = _mm256_castpd_si256( rd );
+  return r;
+}
+
+template < uint I0, uint I1, uint I2, uint I3 >
+requires ( ( I0 < 2 ) && ( I1 < 2 ) && ( I2 < 2 ) && ( I3 < 2 ) )
+__m256d blend( __m256d x, __m256d y )
+{
+  __m256d p = _mm256_blend_pd( x, y, ( I0 | ( I1 << 1 ) | ( I2 << 2 ) | ( I3 << 3 ) ) );
+  return p;
+}
+
+template < uint I0, uint I1, uint I2, uint I3 >
+requires ( ( I0 < 2 ) && ( I1 < 2 ) && ( I2 < 2 ) && ( I3 < 2 ) )
+__m256i blend( __m256i x, __m256i y )
+{
+  __m256d xd = _mm256_castsi256_pd( x );
+  __m256d yd = _mm256_castsi256_pd( y );
+  __m256d pd = _mm256_blend_pd( xd, yd, ( I0 | ( I1 << 1 ) | ( I2 << 2 ) | ( I3 << 3 ) ) );
+  __m256i p  = _mm256_castpd_si256( pd );
+  return p;
+}
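+
+// Usage sketch ( illustrative only, not used by the solver ): the immediate
+// template arguments select, for each 64-bit lane, whether the result comes
+// from x ( 0 ) or from y ( 1 ):
+//
+//   __m256d x = _mm256_setr_pd( 1.0, 2.0, 3.0, 4.0 );
+//   __m256d y = _mm256_setr_pd( 5.0, 6.0, 7.0, 8.0 );
+//   __m256d z = blend< 0, 0, 1, 1 >( x, y ); // z = { 1.0, 2.0, 7.0, 8.0 }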
+
+inline __m256d blend( __m256d a, __m256d b, __m256d mask )
+{
+  __m256d blended = _mm256_blendv_pd( a, b, mask );
+  return blended;
+}
+
+inline __m256d blend( __m256d a, __m256d b, __m256i mask )
+{
+  __m256d mask_d  = _mm256_castsi256_pd( mask );
+  __m256d blended = _mm256_blendv_pd( a, b, mask_d );
+  return blended;
+}
+
+inline __m256i blend( __m256i a, __m256i b, __m256i mask )
+{
+  __m256d a_d       = _mm256_castsi256_pd( a );
+  __m256d b_d       = _mm256_castsi256_pd( b );
+  __m256d mask_d    = _mm256_castsi256_pd( mask );
+  __m256d blended_d = _mm256_blendv_pd( a_d, b_d, mask_d );
+  __m256i blended   = _mm256_castpd_si256( blended_d );
+  return blended;
+}
+
+inline __m256d blend_max( __m256d x, __m256d y, __m256d greater_mask )
+{
+  __m256d max = blend( y, x, greater_mask );
+  return max;
+}
+
+inline __m256d blend_min( __m256d x, __m256d y, __m256d greater_mask )
+{
+  __m256d min = blend( x, y, greater_mask );
+  return min;
+}
+
+inline __m256d cast_d( __m256i x )
+{
+  return _mm256_castsi256_pd( x );
+}
+
+inline __m256i cast_i( __m256d x )
+{
+  return _mm256_castpd_si256( x );
+}
+
+//----------------------------------------------------------------------------------------
+//
+// gather_lanes< a, b >( x ), with a, b in { 0, 1, clear_lane }, returns
+// { x.lane( a ), x.lane( b ) }, with x.lane( clear_lane ) = 0
+//
+//----------------------------------------------------------------------------------------
+
+constexpr int clear_lane = 8;
+
+template < uint I0, uint I1 >
+requires ( ( ( I0 < 2 ) || ( I0 == clear_lane ) ) && ( ( I1 < 2 ) || ( I1 == clear_lane ) ) )
+__m256i gather_lanes( __m256i x )
+{
+  constexpr int lane_0 = I0;
+  constexpr int lane_1 = I1 << 4;
+  __m256d xd = _mm256_castsi256_pd( x );
+  __m256d gd = _mm256_permute2f128_pd( xd, xd, lane_0 | lane_1 );
+  __m256i g  = _mm256_castpd_si256( gd );
+  return g;
+}
+
+template < uint I0, uint I1 >
+requires ( ( ( I0 < 2 ) || ( I0 == clear_lane ) ) && ( ( I1 < 2 ) || ( I1 == clear_lane ) ) )
+__m256d gather_lanes( __m256d x )
+{
+  constexpr int lane_0 = I0;
+  constexpr int lane_1 = I1 << 4;
+
+  __m256d p;
+  if constexpr ( ( I0 == 0 ) && ( I1 == 1 ) )
+    p = x;
+  else
+    p = _mm256_permute2f128_pd( x, x, lane_0 | lane_1 );
+  return p;
+}
+
+//----------------------------------------------------------------------------------------
+//
+// gather_lanes< a, b >( x, y ), with a, b in { 0, 1, clear_lane }, returns
+// { x.lane( a ), y.lane( b ) }, with x.lane( clear_lane ) = 0
+//
+//----------------------------------------------------------------------------------------
+
+template < uint I0, uint I1 >
+requires ( ( ( I0 < 2 ) || ( I0 == clear_lane ) ) && ( ( I1 < 2 ) || ( I1 == clear_lane ) ) )
+__m256d gather_lanes( __m256d x, __m256d y )
+{
+  constexpr int lane_0 = I0;
+  constexpr int lane_1 = ( ( I1 == clear_lane ) ? clear_lane : ( I1 + 2 ) ) << 4;
+
+  __m256d p;
+  if constexpr ( ( I0 == 0 ) && ( I1 == 1 ) )
+    p = blend< 0, 0, 1, 1 >( x, y );
+  else
+    p = _mm256_permute2f128_pd( x, y, lane_0 | lane_1 );
+  return p;
+}
+
+//----------------------------------------------------------------------------------------
+// gather_high returns { x[ 1 ], y[ 1 ], x[ 3 ], y[ 3 ] }
+//----------------------------------------------------------------------------------------
+
+inline __m256d gather_high( __m256d x, __m256d y )
+{
+  return _mm256_unpackhi_pd( x, y );
+}
+
+//----------------------------------------------------------------------------------------
+// gather_low returns { x[ 0 ], y[ 0 ], x[ 2 ], y[ 2 ] }
+//----------------------------------------------------------------------------------------
+
+inline __m256d gather_low( __m256d x, __m256d y )
+{
+  return _mm256_unpacklo_pd( x, y );
+}
+
+//----------------------------------------------------------------------------------------
+// high_bits_64 returns the bits 63, 64 + 63, 128 + 63 and 192 + 63 of x
+//----------------------------------------------------------------------------------------
+
+inline int high_bits_64( __m256d x )
+{
+  return _mm256_movemask_pd( x );
+}
+
+inline int high_bits_64( __m256i x )
+{
+  __m256d xd = cast_d( x );
+  return _mm256_movemask_pd( xd );
+}
+
+//----------------------------------------------------------------------------------------
+// is_greater returns, in each lane, the full mask of x[ i ] > y[ i ]
+//----------------------------------------------------------------------------------------
+
+inline __m256d is_greater( __m256d x, __m256d y )
+{
+  __m256d gt = _mm256_cmp_pd( x, y, _CMP_GT_OQ );
+  return gt;
+}
+
+inline __m256i ones_i()
+{
+  __m256i r = _mm256_undefined_si256();
+  return _mm256_cmpeq_epi64( r, r );
+}
+
+inline __m256d or_d( __m256d x, __m256d y )
+{
+  return _mm256_or_pd( x, y );
+}
+
+inline __m256i or_i( __m256i x, __m256i y )
+{
+  return _mm256_or_si256( x, y );
+}
+
+template < uint I0, uint I1, uint I2, uint I3 >
+requires ( ( I0 < 4 ) && ( I1 < 4 ) && ( I2 < 4 ) && ( I3 < 4 ) )
+__m256d permute( __m256d x )
+{
+  __m256d p;
+  if constexpr ( ( I0 < 2 ) && ( I1 < 2 ) && ( I2 > 1 ) && ( I3 > 1 ) )
+    p = _mm256_permute_pd( x, ( I0 | ( I1 << 1 ) | ( ( I2 - 2 ) << 2 ) | ( ( I3 - 2 ) << 3 ) ) );
+  else
+    p = _mm256_permute4x64_pd( x, ( I0 | ( I1 << 2 ) | ( I2 << 4 ) | ( I3 << 6 ) ) );
+  return p;
+}
+
+template < uint I0, uint I1, uint I2, uint I3 >
+requires ( ( I0 < 4 ) && ( I1 < 4 ) && ( I2 < 4 ) && ( I3 < 4 ) )
+__m256i permute( __m256i x )
+{
+  __m256d xd = _mm256_castsi256_pd( x );
+  __m256d pd = permute< I0, I1, I2, I3 >( xd );
+  __m256i p  = _mm256_castpd_si256( pd );
+  return p;
+}
+
+inline __m256d xor_d( __m256d x, __m256d y )
+{
+  return _mm256_xor_pd( x, y );
+}
+
+inline __m256d xor_d( __m256d x, __m256i y )
+{
+  __m256d yd = cast_d( y );
+  return _mm256_xor_pd( x, yd );
+}
+
+inline __m256d xor_d( __m256i x, __m256d y )
+{
+  __m256d xd = cast_d( x );
+  return _mm256_xor_pd( xd, y );
+}
+
+inline __m256i xor_i( __m256i x, __m256i y )
+{
+  return _mm256_xor_si256( x, y );
+}
+
+inline __m256d zero_d()
+{
+  return _mm256_setzero_pd();
+}
+
+inline __m256i zero_i()
+{
+  return _mm256_setzero_si256();
+}
+
+//----------------------------------------------------------------------------------------
+// types describing locations and bounded convex sets
+//----------------------------------------------------------------------------------------
+
+enum class location
+{
+  null = -1024,
+  out = -1,
+  border = 0,
+  in = 1
+};
+
+enum class bounded_convex_type
+{
+  empty = 0,
+  point = 1,
+  segment = 2,
+  polygon = 3
+};
+
+//----------------------------------------------------------------------------------------
+// vertex_0 and vertex_1 scatter the arrays
+//
+// { inf_y, inf_z, inf_x, ? } and
+// { m_sup_y, m_sup_z, m_sup_x, ? }
+//
+// to vertex_pair, in the x, y, z order
+//----------------------------------------------------------------------------------------
+
+struct vertex_0
+{
+  static void scatter( __m256d* vertex_pair, __m256d inf, __m256d m_sup )
+  {
+    __m256d y = gather_low( inf, m_sup );  // { inf( y ), m_sup( y ), inf( x ), m_sup( x ) }
+    __m256d z = gather_high( inf, m_sup ); // { inf( z ), m_sup( z ), ?, ? }
+    __m256d x = gather_lanes< 1, 1 >( y ); // { inf( x ), m_sup( x ), ?, ? }
+
+    vertex_pair[ 0 ] = blend< 0, 0, 1, 1 >( x, vertex_pair[ 0 ] );
+    vertex_pair[ 1 ] = blend< 0, 0, 1, 1 >( y, vertex_pair[ 1 ] );
+    vertex_pair[ 2 ] = blend< 0, 0, 1, 1 >( z, vertex_pair[ 2 ] );
+  }
+};
+
+struct vertex_1
+{
+  static void scatter( __m256d* vertex_pair, __m256d inf, __m256d m_sup )
+  {
+    __m256d x = gather_low( inf, m_sup );  // { inf( y ), m_sup( y ), inf( x ), m_sup( x ) }
+    __m256d z = gather_high( inf, m_sup ); // { inf( z ), m_sup( z ), ?, ? }
+    __m256d y = gather_lanes< 0, 0 >( x ); // { ?, ?, inf( y ), m_sup( y ) }
+    z = gather_lanes< 0, 0 >( z );         // { ?, ?, inf( z ), m_sup( z ) }
+
+    vertex_pair[ 0 ] = blend< 0, 0, 1, 1 >( vertex_pair[ 0 ], x );
+    vertex_pair[ 1 ] = blend< 0, 0, 1, 1 >( vertex_pair[ 1 ], y );
+    vertex_pair[ 2 ] = blend< 0, 0, 1, 1 >( vertex_pair[ 2 ], z );
+  }
+};
+
+//----------------------------------------------------------------------------------------
+// cross computes the solution ( x / z, y / z ) of the equations
+//
+// a[ 0 ] x + a[ 1 ] y = a[ 2 ]
+// b[ 0 ] x + b[ 1 ] y = b[ 2 ]
+//
+// which is
+//
+// x = a[ 2 ] b[ 1 ] - a[ 1 ] b[ 2 ]
+// y = a[ 0 ] b[ 2 ] - a[ 2 ] b[ 0 ]
+// z = a[ 0 ] b[ 1 ] - a[ 1 ] b[ 0 ]
+//
+// which is convenient to order as
+//
+// y = a[ 0 ] b[ 2 ] - a[ 2 ] b[ 0 ]
+// z = a[ 0 ] b[ 1 ] - a[ 1 ] b[ 0 ]
+// x = a[ 2 ] b[ 1 ] - a[ 1 ] b[ 2 ]
+//
+// It assumes that x, y >= 0 and z > 0 and makes sure that the rounded values of
+// x, y and z also satisfy these lower bounds.
+//----------------------------------------------------------------------------------------
+
+template < class Vertex >
+inline void cross( __m256d* vertex_pair, __m256d a, __m256d b )
+{
+  __m256d inf_a   = _mm256_movedup_pd( a );     // { a[ 0 ], a[ 0 ], a[ 2 ], ? }
+  __m256d m_sup_b = _mm256_movedup_pd( b );     // { b[ 0 ], b[ 0 ], b[ 2 ], ? }
+
+  __m256d inf_b   = permute< 2, 1, 1, 3 >( b ); // { b[ 2 ], b[ 1 ], b[ 1 ], ? }
+  __m256d m_sup_a = permute< 2, 1, 1, 3 >( a ); // { a[ 2 ], a[ 1 ], a[ 1 ], ? }
+
+  __m256d inf_h   = _mm256_mul_pd( inf_a, inf_b );
+  __m256d m_sup_h = _mm256_mul_pd( m_sup_a, m_sup_b );
+
+  __m256d inf   = _mm256_fnmadd_pd( m_sup_a, m_sup_b, inf_h ); // inf(x)  = rounded_down( x )
+  __m256d m_sup = _mm256_fnmadd_pd( inf_a, inf_b, m_sup_h );   // -sup(x) = rounded_down( -x )
+
+  // x, y must be >= 0 and z must be positive, and the computed values may
+  // violate this due to rounding. In the rare case in which inf[ i ] <= 0,
+  // I can prove that the exact value of inf[ i ] is m_sup[ i ] - inf[ i ].
+
+  __m256d zero = zero_d();
+  __m256d cmp  = _mm256_cmp_pd( inf, zero, _CMP_LE_OQ ); // rounded_down( x ) <= 0 ?
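+  // When the comparison fires, inf is replaced by ( m_sup - inf ), which, by
+  // the remark above, is the exact value in that case; this restores the
+  // invariants x, y >= 0 and z > 0 on the rounded results.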
+  __m256d ds = _mm256_sub_pd( m_sup, inf );
+  inf = blend( inf, ds, cmp );
+
+  Vertex::scatter( vertex_pair, inf, m_sup );
+}
+
+//----------------------------------------------------------------------------------------
+// exact_location returns the exact location of the intersection ( x, y ) of the lines
+//
+// b0[ 0 ] x + b0[ 1 ] y >= b0[ 2 ]
+// b1[ 0 ] x + b1[ 1 ] y >= b1[ 2 ]
+//
+// with respect to the set
+//
+// e0 x + e1 y >= e2
+//
+// under the assumption that
+//
+// d = b0[ 0 ] b1[ 1 ] - b0[ 1 ] b1[ 0 ] > 0
+//
+// where ei = edge[ i ], b0 = border[ 0 ] and b1 = border[ 1 ]
+//
+// This amounts to evaluating the sign of
+//
+// p = e0 r + e1 s - e2 d
+//
+// r = b0[ 2 ] b1[ 1 ] - b0[ 1 ] b1[ 2 ]
+// s = b0[ 0 ] b1[ 2 ] - b0[ 2 ] b1[ 0 ] ,
+//
+// which motivates the evaluation of the six products
+//
+// group 1                group 2
+// e0 b0[ 2 ] b1[ 1 ],    -e0 b0[ 1 ] b1[ 2 ]
+// e1 b0[ 0 ] b1[ 2 ],    -e1 b0[ 2 ] b1[ 0 ]
+// e2 b0[ 1 ] b1[ 0 ],    -e2 b0[ 0 ] b1[ 1 ]
+//
+// and we want to find the sign of their sum.
+//
+// The function assumes that the edge entries are either normal or zero, i.e.,
+// there are no subnormal entries.
+//----------------------------------------------------------------------------------------
+
+location exact_location( __m256d e, __m256d b0, __m256d b1 );
+
+//----------------------------------------------------------------------------------------
+// The struct bounded_convex represents a bounded convex set on the plane, which can
+// have one of the following types:
+//
+// a) empty
+// b) point
+// c) segment
+// d) polygon
+//
+//----------------------------------------------------------------------------------------
+
+struct bounded_convex final
+{
+  constexpr bounded_convex( int alloc, __m256d* edge, __m256d* vertex_p )
+    : alloc( alloc ), edge( edge ), vertex_p( vertex_p ){}
+
+  void setup( __m256d box );
+
+  int insert_vertex( int at );
+  int remove_vertices( int begin_vertex, int end_vertex );
+
+  void set_empty()
+  {
+    end = 0;
+    begin = 0;
+    type = bounded_convex_type::empty;
+  }
+
+  void set_point( int first_vertex )
+  {
+    end = 1;
+    begin = 0;
+    type = bounded_convex_type::point;
+    edge[ 0 ] = edge[ first_vertex ];
+    edge[ 1 ] = edge[ first_vertex + 1 ];
+
+    __m256d* src = vertex_pair( first_vertex );
+
+    if( first_vertex & 0x1 )
+    {
+      vertex_p[ 0 ] = gather_lanes< 1, 1 >( src[ 0 ] );
+      vertex_p[ 1 ] = gather_lanes< 1, 1 >( src[ 1 ] );
+      vertex_p[ 2 ] = gather_lanes< 1, 1 >( src[ 2 ] );
+    }
+    else
+    {
+      vertex_p[ 0 ] = src[ 0 ];
+      vertex_p[ 1 ] = src[ 1 ];
+      vertex_p[ 2 ] = src[ 2 ];
+    }
+  }
+
+  void set_to_last_segment();
+
+  void set_segment( int first_vertex )
+  {
+    end = 2;
+    begin = 0;
+    type = bounded_convex_type::segment;
+    edge[ 0 ] = edge[ first_vertex ];
+    edge[ 1 ] = edge[ first_vertex + 1 ];
+    edge[ 2 ] = edge[ first_vertex + 2 ];
+
+    __m256d* src = vertex_pair( first_vertex );
+
+    if( first_vertex & 0x1 )
+    {
+      vertex_p[ 0 ] = gather_lanes< 1, 0 >( src[ 0 ], src[ 3 ] );
+      vertex_p[ 1 ] = gather_lanes< 1, 0 >( src[ 1 ], src[ 4 ] );
+      vertex_p[ 2 ] = gather_lanes< 1, 0 >( src[ 2 ], src[ 5 ] );
+    }
+    else
+    {
+      vertex_p[ 0 ] = *src;
+      vertex_p[ 1 ] = *( ++src );
+      vertex_p[ 2 ] = *( ++src );
+    }
+  }
+
+  __m256d box() const;
+  void push( __m256d* begin, __m256d* end );
+
+  __m256d plain_edge( int i ) const
+  {
+    return edge[ begin + i ];
+  }
+
+  void plain_vertex( __m256d* inf_msup, int i ) const
+  {
+    int ri = ( begin + i );
+    __m256d const* vp = vertex_pair( ri );
+    int di = 2 * ( ( begin + i ) & 0x1 );
+    inf_msup[ 0 ] = _mm256_setr_pd( vp[ 0 ][ di ], vp[ 1 ][ di ], vp[ 2 ][ di ], 0.0 );
+    inf_msup[ 1 ] = _mm256_setr_pd( vp[ 0 ][ di + 1 ], vp[ 1 ][ di + 1 ], vp[ 2 ][ di + 1 ], 0.0 );
+  }
+
+  __m256d* vertex_pair( int index )
+  {
+    __m256d* p = vertex_p + 3 * ( index / 2 );
+    return p;
+  }
+
+  __m256d const* vertex_pair( int index ) const
+  {
+    __m256d* p = vertex_p + 3 * ( index / 2 );
+    return p;
+  }
+
+  int end;
+  int begin;
+  int const alloc;
+  __m256d* const edge;
+  __m256d* const vertex_p;
+  bounded_convex_type type;
+};
+
+// huge = { 0x5500'0000'0000'0000, 0x5500'0000'0000'0000, 0x5500'0000'0000'0000, 0x7FFF'FFFF'7FFF'FFFF }
+// tiny = { 0x2BD0'0000'0000'0000, 0x2BD0'0000'0000'0000, 0x2BD0'0000'0000'0000, 0x0000'0000'0000'0000 }
+
+__m256i bounds( __m256i* tiny, __m256i* huge );
+
+int normalize_equations( __m256d* quadrant[ 4 ], int n, __m256i tiny, __m256i huge );
+
+//----------------------------------------------------------------------------------------
+//
+// The class locator is used to find the location of a point with respect to the half
+// plane
+//
+// a x + b y >= c z
+//
+// The template parameters indicate the signs of the coefficients a, b and c.
+//
+//----------------------------------------------------------------------------------------
+
+template < int flip_x, int flip_y, int flip_z >
+struct locator final
+{
+  //----------------------------------------------------------------------------------------
+  // The raw method assumes that
+  //
+  // edges[ 0 ] = { a, a, a, a }
+  // edges[ 1 ] = { b, b, b, b }
+  // edges[ 2 ] = { c, c, c, c }
+  //
+  // vertex_pair[ 0 ] = { x0_inf, x0_msup, x1_inf, x1_msup }
+  // vertex_pair[ 1 ] = { y0_inf, y0_msup, y1_inf, y1_msup }
+  // vertex_pair[ 2 ] = { z0_inf, z0_msup, z1_inf, z1_msup }
+  //
+  // and checks the location of the vertices ( x0, y0 ) and ( x1, y1 ) with respect to
+  // the half plane
+  //
+  // (-1)^flip_x a x + (-1)^flip_y b y >= (-1)^flip_z c z
+  //
+  // This is done by computing a x + b y - c z rounded up and down.
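+  //
+  // Reading the returned mask ( as locate_vertex_0 / locate_vertex_1 below do ):
+  // bit 0 set means the rounded-down value at vertex 0 is positive, so the
+  // vertex is strictly inside the half plane; bit 1 set means the rounded-down
+  // value of the negated form is positive, so the vertex is strictly outside;
+  // neither bit set means the sign is undecided and exact_location must be
+  // consulted. Bits 2 and 3 encode the same information for vertex 1.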
+  //
+  //----------------------------------------------------------------------------------------
+
+  static int raw( __m256d const* edge, __m256d const* vertex_pair )
+  {
+    __m256d chk;
+    __m256d zero = zero_d();
+
+    if constexpr ( flip_x )
+      chk = _mm256_mul_pd( edge[ 0 ], permute< 1, 0, 3, 2 >( vertex_pair[ 0 ] ) );
+    else
+      chk = _mm256_mul_pd( edge[ 0 ], vertex_pair[ 0 ] );
+
+    if constexpr ( flip_y )
+      chk = _mm256_fmadd_pd( edge[ 1 ], permute< 1, 0, 3, 2 >( vertex_pair[ 1 ] ), chk );
+    else
+      chk = _mm256_fmadd_pd( edge[ 1 ], vertex_pair[ 1 ], chk );
+
+    if constexpr ( flip_z )
+      chk = _mm256_fmadd_pd( edge[ 2 ], vertex_pair[ 2 ], chk );
+    else
+      chk = _mm256_fmadd_pd( edge[ 2 ], permute< 1, 0, 3, 2 >( vertex_pair[ 2 ] ), chk );
+
+    __m256d cmp = _mm256_cmp_pd( chk, zero, _CMP_GT_OQ );
+
+    int cm = _mm256_movemask_pd( cmp );
+
+    return cm;
+  }
+
+  static location locate_vertex_0( __m256d const* edge, __m256d const* vertex_pair )
+  {
+    location loc;
+    int cm = raw( edge, vertex_pair );
+
+    if( cm & 0x1 ) // inf( x[ 0 ] ) > 0
+      loc = location::in;
+    else
+    {
+      if( cm & 0x2 )
+        loc = location::out;
+      else
+        loc = location::null;
+    }
+
+    return loc;
+  }
+
+  static location locate_vertex_1( __m256d const* edge, __m256d const* vertex_pair )
+  {
+    location loc;
+    int cm = raw( edge, vertex_pair );
+
+    if( cm & 0x4 ) // the second vertex is in
+      loc = location::in;
+    else
+    {
+      if( cm & 0x8 ) // the second vertex is out
+        loc = location::out;
+      else
+        loc = location::null;
+    }
+
+    return loc;
+  }
+
+  static void locate_pair( location loc[ 2 ], __m256d const* edge, __m256d const* vertex_pair )
+  {
+    // computing a vertex[ 0 ] + b vertex[ 1 ] + c vertex[ 2 ]
+
+    int cm = raw( edge, vertex_pair );
+    if( cm & 0x1 ) // the first vertex is in
+      loc[ 0 ] = location::in;
+    else
+    {
+      if( cm & 0x2 ) // the first vertex is out
+        loc[ 0 ] = location::out;
+      else
+        loc[ 0 ] = location::null;
+    }
+
+    if( cm & 0x4 ) // the second vertex is in
+      loc[ 1 ] = location::in;
+    else
+    {
+      if( cm & 0x8 ) // the second vertex is out
+        loc[ 1 ] = location::out;
+      else
+        loc[ 1 ] = location::null;
+    }
+  }
+
+  static void insert( bounded_convex& r, __m256d new_edge )
+  {
+    __m256i ones = ones_i();
+    ones = _mm256_slli_epi64( ones, 63 );
+    __m256d sign = _mm256_castsi256_pd( ones );
+
+    __m256d coefficients[ 3 ] = { permute< 0, 0, 0, 0 >( new_edge ),
+                                  permute< 1, 1, 1, 1 >( new_edge ),
+                                  permute< 2, 2, 2, 2 >( new_edge ) };
+
+    coefficients[ 0 ] = _mm256_andnot_pd( sign, coefficients[ 0 ] );
+    coefficients[ 1 ] = _mm256_andnot_pd( sign, coefficients[ 1 ] );
+    coefficients[ 2 ] = _mm256_andnot_pd( sign, coefficients[ 2 ] );
+
+    if( r.type != bounded_convex_type::polygon )
+    {
+      if( r.type == bounded_convex_type::point )
+      {
+        location loc = locate_vertex_0( coefficients, r.vertex_p );
+
+        if( loc == location::null )
+          loc = exact_location( new_edge, r.edge[ 0 ], r.edge[ 1 ] );
+
+        if( loc == location::out )
+          r.set_empty();
+
+        return;
+      }
+
+      location loc[ 2 ];
+      locate_pair( loc, coefficients, r.vertex_p );
+
+      if( loc[ 0 ] == location::null )
+        loc[ 0 ] = exact_location( new_edge, r.edge[ 0 ], r.edge[ 1 ] );
+
+      if( loc[ 1 ] == location::null )
+        loc[ 1 ] = exact_location( new_edge, r.edge[ 1 ], r.edge[ 2 ] );
+
+      switch( loc[ 0 ] )
+      {
+        case location::in:
+        {
+          if( loc[ 1 ] == location::out )
+          {
+            r.edge[ 2 ] = new_edge;
+            cross< vertex_1 >( r.vertex_p, r.edge[ 1 ], new_edge );
+          }
+          return;
+        }
+        case location::border:
+        {
+          if( loc[ 1 ] == location::out )
+          {
+ r.end = 1; + r.type = bounded_convex_type::point; + } + return; + } + default: // location::out + { + switch( loc[ 1 ] ) + { + case location::in: + { + r.edge[ 0 ] = new_edge; + cross< vertex_0 >( r.vertex_p, new_edge, r.edge[ 1 ] ); + return; + } + case location::border: + { + r.end = 1; + r.edge[ 0 ] = r.edge[ 1 ]; + r.edge[ 1 ] = r.edge[ 2 ]; + r.type = bounded_convex_type::point; + r.vertex_p[ 0 ] = gather_lanes< 1, 1 >( r.vertex_p[ 0 ] ); + r.vertex_p[ 1 ] = gather_lanes< 1, 1 >( r.vertex_p[ 1 ] ); + r.vertex_p[ 2 ] = gather_lanes< 1, 1 >( r.vertex_p[ 2 ] ); + return; + } + default: + { + r.set_empty(); + return; + } + } + } + } + } + + location loc[ 2 ]; + + int index = r.begin; + int intersection = -1; + bool intersection_at_vertex; + + __m256d* vertex_pair_at_index = r.vertex_pair( index ); + + auto scan = [ & ]() + { + if( ++index >= r.end ) + return location::null; + + if( index & 0x1 ) + return loc[ 1 ]; + + vertex_pair_at_index += 3; + + if( index + 1 >= r.end ) + { + loc[ 0 ] = locate_vertex_0( coefficients, vertex_pair_at_index ); + + if( loc[ 0 ] == location::null ) + loc[ 0 ] = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] ); + } + else + { + locate_pair( loc, coefficients, vertex_pair_at_index ); + + if( loc[ 0 ] == location::null ) + loc[ 0 ] = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] ); + + if( loc[ 1 ] == location::null ) + loc[ 1 ] = exact_location( new_edge, r.edge[ index + 1 ], r.edge[ index + 2 ] ); + } + + return loc[ 0 ]; + }; + + if( index & 0x1 ) + { + location begin = locate_vertex_1( coefficients, vertex_pair_at_index ); + + if( begin == location::null ) + begin = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] ); + + switch( begin ) + { + case location::in: + goto cursor_in; + case location::border: + { + intersection = index; + intersection_at_vertex = true; + + ++index; + vertex_pair_at_index += 3; + + locate_pair( loc, coefficients, vertex_pair_at_index ); + + if( loc[ 0 ] == location::null ) + loc[ 0 ] = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] ); + + if( loc[ 1 ] == location::null ) + loc[ 1 ] = exact_location( new_edge, r.edge[ index + 1 ], r.edge[ index + 2 ] ); + + switch( loc[ 0 ] ) + { + case location::in: + goto cursor_in; + case location::border: + { + if( loc[ 1 ] == location::out ) + r.set_segment( r.begin ); + return; + } + case location::out: + goto cursor_out; + } + } + default: + goto cursor_out; + } + } + + locate_pair( loc, coefficients, vertex_pair_at_index ); + + if( loc[ 0 ] == location::null ) + loc[ 0 ] = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] ); + + if( loc[ 1 ] == location::null ) + loc[ 1 ] = exact_location( new_edge, r.edge[ index + 1 ], r.edge[ index + 2 ] ); + + switch( loc[ 0 ] ) + { + case location::in: + goto cursor_in; + case location::border: + { + intersection = index; + intersection_at_vertex = true; + + ++index; + + switch( loc[ 1 ] ) + { + case location::in: + goto cursor_in; + case location::border: + { + location next = locate_vertex_0( coefficients, vertex_pair_at_index + 3 ); + + if( next == location::null ) + next = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] ); + + if( next == location::out ) + r.set_segment( r.begin ); + + return; + } + default: + goto cursor_out; + } + } + default: + goto cursor_out; + } + + cursor_in: + { + location next_loc = scan(); + + switch( next_loc ) + { + case location::in: + goto cursor_in; + case location::border: + { + if( intersection >= 0 ) + { + if( 
!intersection_at_vertex ) + { + __m256d* vp = r.vertex_pair( intersection - 1 ); + if( intersection & 0x1 ) + cross< vertex_0 >( vp, new_edge, r.edge[ intersection ] ); + else + cross< vertex_1 >( vp, new_edge, r.edge[ intersection ] ); + + --intersection; + } + + r.end = index + 1; + r.begin = intersection; + r.edge[ r.end ] = new_edge; + r.edge[ r.begin ] = new_edge; + } + else + { + next_loc = scan(); + + if( next_loc == location::out ) + { + intersection = index - 1; + intersection_at_vertex = true; + goto cursor_out; + } + } + return; + } + case location::out: + { + if( intersection < 0 ) + { + intersection = index; + intersection_at_vertex = false; + goto cursor_out; + } + + if( intersection_at_vertex ) + r.begin = intersection; + else + { + r.begin = intersection - 1; + __m256d* vp = r.vertex_pair( r.begin ); + + if( r.begin & 0x1 ) + cross< vertex_1 >( vp, new_edge, r.edge[ intersection ] ); + else + cross< vertex_0 >( vp, new_edge, r.edge[ intersection ] ); + } + + __m256d* vp = r.vertex_pair( index ); + if( index & 0x1 ) + cross< vertex_1 >( vp, r.edge[ index ], new_edge ); + else + cross< vertex_0 >( vp, r.edge[ index ], new_edge ); + + r.end = index + 1; + r.edge[ r.end ] = new_edge; + r.edge[ r.begin ] = new_edge; + return; + } + case location::null: + { + assert( index == r.end ); + + if( intersection >= 0 ) + { + if( intersection_at_vertex ) + { + if( intersection == r.begin ) + return; + } + else + { + __m256d* first_pair = r.vertex_pair( intersection - 1 ); + + if( intersection & 0x1 ) + cross< vertex_0 >( first_pair, new_edge, r.edge[ intersection ] ); + else + cross< vertex_1 >( first_pair, new_edge, r.edge[ intersection ] ); + + --intersection; + } + + __m256d* last_pair = r.vertex_pair( index ); + + if( index & 0x1 ) + cross< vertex_1 >( last_pair, r.edge[ index ], new_edge ); + else + cross< vertex_0 >( last_pair, r.edge[ index ], new_edge ); + + ++r.end; + r.begin = intersection; + r.edge[ r.end ] = new_edge; + r.edge[ r.begin ] = new_edge; + } + + return; + } + } + } + + cursor_out: + { + location new_loc = scan(); + + switch( new_loc ) + { + case location::in: + { + if( intersection < 0 ) + { + intersection = index; + intersection_at_vertex = false; + goto cursor_in; + } + + if( intersection + 1 == index ) + { + assert( !intersection_at_vertex ); + int new_vertex = r.insert_vertex( intersection ); + + if( new_vertex == intersection ) + { + __m256d* vp = r.vertex_pair( intersection ); + if( intersection & 0x1 ) + { + cross< vertex_1 >( vp, r.edge[ intersection ], new_edge ); + cross< vertex_0 >( vp + 3, new_edge, r.edge[ intersection + 2 ] ); + } + else + { + cross< vertex_0 >( vp, r.edge[ intersection ], new_edge ); + cross< vertex_1 >( vp, new_edge, r.edge[ intersection + 2 ] ); + } + + r.edge[ intersection + 1 ] = new_edge; + } + else + { + assert( new_vertex == intersection - 1 ); + + __m256d* vp = r.vertex_pair( intersection ); + + if( intersection & 0x1 ) + { + cross< vertex_1 >( vp, new_edge, r.edge[ index ] ); + cross< vertex_0 >( vp, r.edge[ intersection ], new_edge) ; + } + else + { + cross< vertex_0 >( vp, new_edge, r.edge[ index ] ); + cross< vertex_1 >( vp - 3, r.edge[ intersection ], new_edge) ; + } + + r.edge[ new_vertex ] = r.edge[ intersection ]; + r.edge[ intersection ] = new_edge; + } + return; + } + + if( !intersection_at_vertex ) + { + __m256d* vp = r.vertex_pair( intersection ); + + if( intersection & 0x1 ) + cross< vertex_1 >( vp, r.edge[ intersection ], new_edge ); + else + cross< vertex_0 >( vp, r.edge[ intersection ], new_edge ); + } + + 
if( intersection + 2 < index ) + index = r.remove_vertices( intersection + 1, index - 1 ) + 2; + + __m256d* vp = r.vertex_pair( index - 1 ); + if( index & 0x1 ) + cross< vertex_0 >( vp, new_edge, r.edge[ index ] ); + else + cross< vertex_1 >( vp, new_edge, r.edge[ index ] ); + + r.edge[ index - 1 ] = new_edge; + return; + } + case location::border: + { + if( intersection >= 0 ) + { + if( intersection_at_vertex ) + { + if( ( intersection == r.begin ) && ( index + 1 == r.end ) ) + { + r.set_to_last_segment(); + return; + } + } + else + { + __m256d* vp = r.vertex_pair( intersection ); + if( intersection & 0x1 ) + cross< vertex_1 >( vp, r.edge[ intersection ], new_edge ); + else + cross< vertex_0 >( vp, r.edge[ intersection ], new_edge ); + } + + if( intersection + 1 < index ) + intersection = r.remove_vertices( intersection + 1, index ); + + r.edge[ intersection + 1 ] = new_edge; + return; + } + + new_loc = scan(); + switch( new_loc ) + { + case location::in: + { + intersection = index - 1; + intersection_at_vertex = true; + goto cursor_in; + } + case location::border: + { + r.set_segment( index - 1 ); + return; + } + default: // location::out or location::null: + { + r.set_point( index - 1 ); + return; + } + } + } + case location::out: + goto cursor_out; + case location::null: + { + assert( index == r.end ); + + if( intersection < 0 ) + { + r.set_empty(); + return; + } + + if( intersection_at_vertex ) + { + if( intersection == r.begin ) + { + r.set_point( r.begin ); + return; + } + } + else + { + __m256d* vp = r.vertex_pair( intersection ); + + if( intersection & 0x1 ) + cross< vertex_1 >( vp, r.edge[ intersection ], new_edge ); + else + cross< vertex_0 >( vp, r.edge[ intersection ], new_edge ); + } + + ++intersection; + __m256d* vp = r.vertex_pair( intersection ); + + if( intersection & 0x1 ) + cross< vertex_1 >( vp, new_edge, r.edge[ index ] ); + else + cross< vertex_0 >( vp, new_edge, r.edge[ index ] ); + + r.end = intersection + 1; + r.edge[ intersection ] = new_edge; + r.edge[ r.end ] = r.edge[ r.begin ]; + } + + return; + } + } + } +}; + +} // namespace wm::nlp::lp2d + +//---------------------------------------------------------------------------------------- +// +// The function interval_system_2d solves the system of interval equations +// +// ai x + bi y = ci +// +// for 0 <= i < n_equations and +// +// ai = [ eq[ 6 * i ], eq[ 6 * i + 1 ] ] with eq[ 6 * i ] <= eq[ 6 * i + 1 ] +// bi = [ eq[ 6 * i + 2 ], eq[ 6 * i + 3 ] ] with eq[ 6 * i + 2 ] <= eq[ 6 * i + 3 ] +// ci = [ eq[ 6 * i + 4 ], eq[ 6 * i + 5 ] ] with eq[ 6 * i + 4 ] <= eq[ 6 * i + 5 ] +// +// The function finds the solutions inside the box +// +// box[ 0 ] <= x <= box[ 1 ] +// box[ 2 ] <= y <= box[ 3 ] +// +// The solution consists of up to 4 boxes bx[ 0 ], bx[ 1 ], bx[ 2 ] and bx[ 3 ], with +// +// bx[ i ] = inf_x[ i ] <= x <= sup_x[ i ] +// inf_y[ i ] <= y <= sup_y[ i ] +// for +// +// inf_x[ i ] = solution[ 4 * i ] +// sup_x[ i ] = solution[ 4 * i + 1 ] +// inf_y[ i ] = solution[ 4 * i + 2 ] +// sup_y[ i ] = solution[ 4 * i + 3 ] +// +// The function returns the number of boxes in the solutions, which can be 0,1,2,3 or 4, +// or a negative number if the input is invalid or if it cannot change the rounding mode. +// +// It assumes that: +// +// a) n_equations <= 16 +// b) all doubles are finite, ie., not nan or +/-inf +// +// c) box and eq are such that +// +// box[ 0 ] <= box[ 1 ] +// box[ 2 ] <= box[ 3] +// eq[ 2 * i ] <= eq[ 2 i + 1 ] for i = 0,..., 3 * n_equations. 
+// +// d) box and eq are aligned at a 32 byte boundary +// +// It does not verify the input and its results should not be trusted if the inputs are +// invalid. +// +//---------------------------------------------------------------------------------------- + +extern "C" +{ +int interval_system_2d( double* solution, double const* box, double const* eq, unsigned int n_equations ); +} diff --git a/brush-strokes/src/lib/Math/Root/Isolation.hs b/brush-strokes/src/lib/Math/Root/Isolation.hs index 55f8e07..9fc4d9f 100644 --- a/brush-strokes/src/lib/Math/Root/Isolation.hs +++ b/brush-strokes/src/lib/Math/Root/Isolation.hs @@ -28,10 +28,12 @@ module Math.Root.Isolation , BisectionOptions(..), BisectionCoordPicker , defaultBisectionOptions - -- * The interval Newton method with Gauss–Seidel step - , GaussSeidel + -- * The interval Newton method + , NewtonOptions(..) + , defaultNewtonOptions + -- ** Options for the Gauss–Seidel step , GaussSeidelOptions(..), Preconditioner(..) - , defaultGaussSeidelOptions + , GaussSeidelUpdateMethod(..) -- * Box-consistency methods @@ -79,8 +81,9 @@ import Math.Monomial import Math.Root.Isolation.Bisection import Math.Root.Isolation.Core -import Math.Root.Isolation.GaussSeidel import Math.Root.Isolation.Narrowing +import Math.Root.Isolation.Newton +import Math.Root.Isolation.Newton.GaussSeidel -------------------------------------------------------------------------------- @@ -134,16 +137,16 @@ defaultRootIsolationAlgorithms minWidth ε_eq history box = -- Currently: we try an interval Gauss–Seidel. -- (box(1)- and box(2)-consistency don't seem to help when using -- the complete interval union Gauss–Seidel step) - _ -> Right $ AlgoWithOptions @GaussSeidel _gsOptions + _ -> Right $ AlgoWithOptions @Newton _newtonOptions NE.:| [] where verySmall = and $ ( \ cd -> width cd <= minWidth ) <$> coordinates box - _bisOptions = defaultBisectionOptions @n @d minWidth ε_eq box - _gsOptions = defaultGaussSeidelOptions @n @d history - _box1Options = defaultBox1Options @n @d ( minWidth * 100 ) ε_eq - _box2Options = ( defaultBox2Options @n @d minWidth ε_eq ) { box2LambdaMin = 0.001 } + _bisOptions = defaultBisectionOptions @n @d minWidth ε_eq box + _newtonOptions = NewtonLP -- defaultNewtonOptions @n @d history + _box1Options = defaultBox1Options @n @d ( minWidth * 100 ) ε_eq + _box2Options = ( defaultBox2Options @n @d minWidth ε_eq ) { box2LambdaMin = 0.001 } -- Did we reduce the box width by at least ε_eq -- in at least one of the coordinates? diff --git a/brush-strokes/src/lib/Math/Root/Isolation/Core.hs b/brush-strokes/src/lib/Math/Root/Isolation/Core.hs index e6bcb29..901443b 100644 --- a/brush-strokes/src/lib/Math/Root/Isolation/Core.hs +++ b/brush-strokes/src/lib/Math/Root/Isolation/Core.hs @@ -78,7 +78,8 @@ type Box n = 𝕀ℝ n -- NB: we require n <= d (no support for under-constrained systems). -- -- NB: in practice, this constraint should specialise away. -type BoxCt n d = +type BoxCt n d = ( n ~ 2, d ~ 3 ) +{- ( KnownNat n, KnownNat d , 1 <= n, 1 <= d, n <= d @@ -95,7 +96,7 @@ type BoxCt n d = , Module Double ( T ( ℝ d ) ) , Representable Double ( ℝ d ) ) - +-} -- | Boxes we are done with and will not continue processing. 
data DoneBoxes n = DoneBoxes
diff --git a/brush-strokes/src/lib/Math/Root/Isolation/Newton.hs b/brush-strokes/src/lib/Math/Root/Isolation/Newton.hs
new file mode 100644
index 0000000..a6a4072
--- /dev/null
+++ b/brush-strokes/src/lib/Math/Root/Isolation/Newton.hs
@@ -0,0 +1,171 @@
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE UndecidableInstances #-}
+
+module Math.Root.Isolation.Newton
+  ( -- * The interval Newton method,
+    -- with Gauss–Seidel step or explicit linear programming
+    Newton
+  , intervalNewton
+
+    -- ** Configuration options
+  , NewtonOptions(..)
+  , defaultNewtonOptions
+  )
+  where
+
+-- base
+import Prelude hiding ( unzip )
+import Control.Arrow
+  ( first )
+import Data.Bifunctor
+  ( Bifunctor(bimap) )
+import Data.Kind
+  ( Type )
+import Data.List
+  ( partition )
+import GHC.TypeNats
+  ( Nat, KnownNat, type (<=) )
+
+-- transformers
+import Control.Monad.Trans.Writer.CPS
+  ( Writer, tell )
+
+-- MetaBrush
+import Math.Algebra.Dual
+  ( D )
+import Math.Interval
+import Math.Linear
+import Math.Module
+  ( Module(..) )
+import Math.Monomial
+  ( MonomialBasis(..), linearMonomial, zeroMonomial )
+import Math.Root.Isolation.Core
+import Math.Root.Isolation.Newton.GaussSeidel
+import Math.Root.Isolation.Newton.LP
+import Math.Root.Isolation.Utils
+
+--------------------------------------------------------------------------------
+-- Interval Newton
+
+-- | The interval Newton method; see 'intervalNewton'.
+data Newton
+instance BoxCt n d => RootIsolationAlgorithm Newton n d where
+  type instance StepDescription Newton = ()
+  type instance RootIsolationAlgorithmOptions Newton n d = NewtonOptions n d
+  rootIsolationAlgorithm opts _thisRoundHist _prevRoundsHist eqs box = do
+    res <- intervalNewton @n @d opts eqs box
+    return ( (), res )
+  {-# INLINEABLE rootIsolationAlgorithm #-}
+  {-# SPECIALISE rootIsolationAlgorithm
+        :: RootIsolationAlgorithmOptions Newton 2 3
+        -> [ ( RootIsolationStep, Box 2 ) ]
+        -> BoxHistory 2
+        -> ( 𝕀ℝ 2 -> D 1 ( 𝕀ℝ 2 ) ( 𝕀ℝ 3 ) )
+        -> Box 2
+        -> Writer ( DoneBoxes 2 ) ( StepDescription Newton, [ Box 2 ] ) #-}
+    -- NB: including this to be safe. The specialiser seems to sometimes
+    -- be able to generate this specialisation on its own, and sometimes not.
+
+-- | Options for the interval Newton method.
+type NewtonOptions :: Nat -> Nat -> Type
+data NewtonOptions n d where
+  -- | Use the Gauss–Seidel method to solve linear systems.
+  NewtonGaussSeidel
+    :: GaussSeidelOptions n d -> NewtonOptions n d
+  -- | Use linear programming to solve linear systems (2 dimensions only).
+  NewtonLP
+    :: NewtonOptions 2 d
+
+-- | Default options for the interval Newton method.
+defaultNewtonOptions
+  :: forall n d
+  .  ( KnownNat n, KnownNat d
+     , 1 <= n, 1 <= d, n <= d
+     , Representable Double ( ℝ n )
+     , Representable Double ( ℝ d )
+     )
+  => BoxHistory n
+  -> NewtonOptions n d
+defaultNewtonOptions history =
+  NewtonGaussSeidel $ defaultGaussSeidelOptions history
+{-# INLINEABLE defaultNewtonOptions #-}
+
+-- | The interval Newton method, taking either a Gauss–Seidel
+-- or a linear-programming step.
+intervalNewton
+  :: forall n d
+  . 
BoxCt n d + => NewtonOptions n d + -> ( 𝕀ℝ n -> D 1 ( 𝕀ℝ n ) ( 𝕀ℝ d ) ) + -- ^ equations + -> 𝕀ℝ n + -- ^ box + -> Writer ( DoneBoxes n ) [ 𝕀ℝ n ] +intervalNewton opts eqs x = case opts of + NewtonGaussSeidel + ( GaussSeidelOptions + { gsPreconditioner = precondMeth + , gsPickEqs = pickEqs + , gsUpdate + } ) -> + let x_mid = singleton $ boxMidpoint x + f :: 𝕀ℝ n -> 𝕀ℝ n + f = \ x_0 -> pickEqs $ eqs x_0 `monIndex` zeroMonomial + f'_x :: Vec n ( 𝕀ℝ n ) + f'_x = fmap ( \ i -> pickEqs $ eqs x `monIndex` linearMonomial i ) ( universe @n ) + + -- Interval Newton method: take one Gauss–Seidel step + -- for the system of equations f'(x) ( x - x_mid ) = - f(x_mid). + minus_f_x_mid = unT $ -1 *^ T ( boxMidpoint $ f x_mid ) + + -- Precondition the above linear system into A ( x - x_mid ) = B. + ( a, b ) = precondition precondMeth + f'_x ( singleton minus_f_x_mid ) + + -- NB: we have to change coordinates, putting the midpoint of the box + -- at the origin, in order to take a Gauss–Seidel step. + gsGuesses = map ( first ( \ x' -> unT $ x' ^+^ T x_mid ) ) + $ gaussSeidelUpdate gsUpdate a b ( T x ^-^ T x_mid ) + ( done, todo ) = bimap ( map fst ) ( map fst ) + $ partition snd gsGuesses + in -- If the Gauss–Seidel step was a contraction, then the box + -- contains a unique solution (by the Banach fixed point theorem). + -- + -- These boxes can thus be directly added to the solution set: + -- Newton's method is guaranteed to converge to the unique solution. + do tell $ noDoneBoxes { doneSolBoxes = done } + return todo + NewtonLP -> + -- TODO: reduce duplication with the above. + let x_mid = singleton $ boxMidpoint x + f :: 𝕀ℝ 2 -> 𝕀ℝ d + f = \ x_0 -> eqs x_0 `monIndex` zeroMonomial + f'_x :: Vec 2 ( 𝕀ℝ d ) + f'_x = fmap ( \ i -> eqs x `monIndex` linearMonomial i ) ( universe @2 ) + + minus_f_x_mid = unT $ -1 *^ T ( boxMidpoint $ f x_mid ) + ( a, b ) = ( f'_x, singleton minus_f_x_mid ) + lpGuesses = map ( first ( \ x' -> unT $ x' ^+^ T x_mid ) ) + $ solveIntervalLinearEquations a b ( T x ^-^ T x_mid ) + ( done, todo ) = bimap ( map fst ) ( map fst ) + $ partition snd lpGuesses + in do tell $ noDoneBoxes { doneSolBoxes = done } + return todo +{-# INLINEABLE intervalNewton #-} +{- + +mbDeg = topologicalDegree 0.005 f x +det = case f'_x of + Vec [ c1, c2 ] -> + let a_11 = c1 `index` Fin 1 + a_12 = c2 `index` Fin 1 + a_21 = c1 `index` Fin 2 + a_22 = c2 `index` Fin 2 + in a_11 * a_22 - a_12 * a_21 + _ -> error "TODO: just testing n=2 here" + +if | not $ 0 ∈ det + , mbDeg == Just 0 + -> return [] + -- If the Jacobian is invertible over the box, then the topological + -- degree tells us exactly how many solutions there are in the box. +-} diff --git a/brush-strokes/src/lib/Math/Root/Isolation/GaussSeidel.hs b/brush-strokes/src/lib/Math/Root/Isolation/Newton/GaussSeidel.hs similarity index 58% rename from brush-strokes/src/lib/Math/Root/Isolation/GaussSeidel.hs rename to brush-strokes/src/lib/Math/Root/Isolation/Newton/GaussSeidel.hs index b1b5ff1..99e8dd3 100644 --- a/brush-strokes/src/lib/Math/Root/Isolation/GaussSeidel.hs +++ b/brush-strokes/src/lib/Math/Root/Isolation/Newton/GaussSeidel.hs @@ -1,33 +1,26 @@ -{-# LANGUAGE ScopedTypeVariables #-} -{-# LANGUAGE UndecidableInstances #-} +{-# LANGUAGE ScopedTypeVariables #-} -module Math.Root.Isolation.GaussSeidel - ( -- * The interval Newton method with Gauss–Seidel step - GaussSeidel - , intervalGaussSeidel - - -- ** Configuration options - , GaussSeidelOptions(..), Preconditioner(..) 
+-- | The Gauss–Seidel method for solving systems of +-- interval linear equations +module Math.Root.Isolation.Newton.GaussSeidel + ( -- * Gauss–Seidel step + gaussSeidelUpdate + -- ** Options for the Gauss–Seidel method + , GaussSeidelOptions(..) + , Preconditioner(..), GaussSeidelUpdateMethod(..) , defaultGaussSeidelOptions + , precondition ) where -- base import Prelude hiding ( unzip ) -import Control.Arrow - ( first ) -import Data.Bifunctor - ( Bifunctor(bimap) ) import Data.Coerce ( coerce ) import Data.Kind ( Type ) import Data.Foldable - ( toList ) -import Data.List - ( partition ) -import Data.List.NonEmpty - ( unzip ) + ( toList ) import Data.Proxy ( Proxy(..) ) import Data.Type.Ord @@ -41,44 +34,16 @@ import qualified Eigen.Matrix as Eigen , generate, inverse, unsafeCoeff ) --- transformers -import Control.Monad.Trans.Writer.CPS - ( Writer, tell ) - -- MetaBrush -import Math.Algebra.Dual - ( D ) import Math.Interval import Math.Linear -import Math.Module - ( Module(..) ) -import Math.Monomial - ( MonomialBasis(..), linearMonomial, zeroMonomial ) import Math.Root.Isolation.Core +import Math.Root.Isolation.Utils -------------------------------------------------------------------------------- -- Gauss–Seidel --- | The interval Newton method with a Gauss–Seidel step; see 'intervalGaussSeidel'. -data GaussSeidel -instance BoxCt n d => RootIsolationAlgorithm GaussSeidel n d where - type instance StepDescription GaussSeidel = () - type instance RootIsolationAlgorithmOptions GaussSeidel n d = GaussSeidelOptions n d - rootIsolationAlgorithm opts _thisRoundHist _prevRoundsHist eqs box = do - res <- intervalGaussSeidel @n @d opts eqs box - return ( (), res ) - {-# INLINEABLE rootIsolationAlgorithm #-} - {-# SPECIALISE rootIsolationAlgorithm - :: RootIsolationAlgorithmOptions GaussSeidel 2 3 - -> [ ( RootIsolationStep, Box 2 ) ] - -> BoxHistory 2 - -> ( 𝕀ℝ 2 -> D 1 ( 𝕀ℝ 2 ) ( 𝕀ℝ 3 ) ) - -> Box 2 - -> Writer ( DoneBoxes 2 ) ( StepDescription GaussSeidel, [ Box 2 ] ) #-} - -- NB: including this to be safe. The specialiser seems to sometimes - -- be able to generate this specialisation on its own, and sometimes not. - --- | Options for the interval Gauss–Seidel method. +-- | Options for the Gauss–Seidel method. type GaussSeidelOptions :: Nat -> Nat -> Type data GaussSeidelOptions n d = GaussSeidelOptions @@ -98,7 +63,7 @@ data GaussSeidelUpdateMethod | GS_Complete deriving stock Show --- | Default options for the interval Gauss–Seidel method. +-- | Default options for the Gauss–Seidel step. defaultGaussSeidelOptions :: forall n d . ( KnownNat n, KnownNat d @@ -133,72 +98,6 @@ data Preconditioner | InverseMidpoint deriving stock Show --- | Interval Newton method with Gauss–Seidel step. -intervalGaussSeidel - :: forall n d - . BoxCt n d - => GaussSeidelOptions n d - -> ( 𝕀ℝ n -> D 1 ( 𝕀ℝ n ) ( 𝕀ℝ d ) ) - -- ^ equations - -> 𝕀ℝ n - -- ^ box - -> Writer ( DoneBoxes n ) [ 𝕀ℝ n ] -intervalGaussSeidel - ( GaussSeidelOptions - { gsPreconditioner = precondMeth - , gsPickEqs = pickEqs - , gsUpdate - } ) - eqs - x - | let x_mid = singleton $ boxMidpoint x - f :: 𝕀ℝ n -> 𝕀ℝ n - f = \ x_0 -> pickEqs $ eqs x_0 `monIndex` zeroMonomial - - f'_x :: Vec n ( 𝕀ℝ n ) - f'_x = fmap ( \ i -> pickEqs $ eqs x `monIndex` linearMonomial i ) ( universe @n ) - - = let -- Interval Newton method: take one Gauss–Seidel step - -- for the system of equations f'(x) ( x - x_mid ) = - f(x_mid). - minus_f_x_mid = unT $ -1 *^ T ( boxMidpoint $ f x_mid ) - - -- Precondition the above linear system into A ( x - x_mid ) = B. 
- ( a, b ) = precondition precondMeth - f'_x ( singleton minus_f_x_mid ) - - -- NB: we have to change coordinates, putting the midpoint of the box - -- at the origin, in order to take a Gauss–Seidel step. - gsGuesses = map ( first ( \ x' -> unT $ x' ^+^ T x_mid ) ) - $ gaussSeidelUpdate gsUpdate a b ( T x ^-^ T x_mid ) - ( done, todo ) = bimap ( map fst ) ( map fst ) - $ partition snd gsGuesses - in -- If the Gauss–Seidel step was a contraction, then the box - -- contains a unique solution (by the Banach fixed point theorem). - -- - -- These boxes can thus be directly added to the solution set: - -- Newton's method is guaranteed to converge to the unique solution. - do tell $ noDoneBoxes { doneSolBoxes = done } - return todo -{-# INLINEABLE intervalGaussSeidel #-} -{- - -mbDeg = topologicalDegree 0.005 f x -det = case f'_x of - Vec [ c1, c2 ] -> - let a_11 = c1 `index` Fin 1 - a_12 = c2 `index` Fin 1 - a_21 = c1 `index` Fin 2 - a_22 = c2 `index` Fin 2 - in a_11 * a_22 - a_12 * a_21 - _ -> error "TODO: just testing n=2 here" - -if | not $ 0 ∈ det - , mbDeg == Just 0 - -> return [] - -- If the Jacobian is invertible over the box, then the topological - -- degree tells us exactly how many solutions there are in the box. --} - -- | A partial or complete Gauss–Seidel step for the equation \( A X = B \), -- refining the initial guess box for \( X \) into up to \( 2^n \) (disjoint) new boxes. gaussSeidelUpdate @@ -291,22 +190,6 @@ gaussSeidelStep_Complete as b ( T x0 ) = coerce $ do return ( x', and subs ) {-# INLINEABLE gaussSeidelStep_Complete #-} -fromComponents - :: forall n - . ( Representable Double ( ℝ n ), n ~ RepDim ( ℝ n ) ) - => ( Fin n -> [ ( 𝕀 Double, Bool ) ] ) -> [ ( 𝕀ℝ n, Vec n Bool ) ] -fromComponents f = do - ( xs, bs ) <- unzip <$> traverse f ( universe @n ) - return $ ( tabulate $ \ i -> xs ! i, bs ) - -- TODO: this could be more efficient. -{-# INLINEABLE fromComponents #-} - --- | The midpoint of a box. -boxMidpoint :: Representable Double ( ℝ n ) => 𝕀ℝ n -> ℝ n -boxMidpoint box = - tabulate $ \ i -> midpoint ( box `index` i ) -{-# INLINEABLE boxMidpoint #-} - -- | Pre-condition the system \( AX = B \). precondition :: forall n @@ -340,16 +223,3 @@ precondition meth as b = ) ( universe @n ) {-# INLINEABLE precondition #-} - --- | Matrix multiplication \( A v \). -matMulVec - :: forall n m - . ( Representable Double ( ℝ n ), Representable Double ( ℝ m ) ) - => Vec m ( ℝ n ) -- ^ columns of the matrix \( A ) - -> 𝕀ℝ m -- ^ vector \( v \) - -> 𝕀ℝ n -matMulVec as v = tabulate $ \ r -> - sum [ scaleInterval ( a `index` r ) ( index v c ) - | ( c, a ) <- toList ( (,) <$> universe @m <*> as ) - ] -{-# INLINEABLE matMulVec #-} diff --git a/brush-strokes/src/lib/Math/Root/Isolation/Newton/LP.hs b/brush-strokes/src/lib/Math/Root/Isolation/Newton/LP.hs new file mode 100644 index 0000000..fe2802d --- /dev/null +++ b/brush-strokes/src/lib/Math/Root/Isolation/Newton/LP.hs @@ -0,0 +1,132 @@ +{-# LANGUAGE CApiFFI #-} +{-# LANGUAGE ForeignFunctionInterface #-} +{-# LANGUAGE ParallelListComp #-} +{-# LANGUAGE ScopedTypeVariables #-} + +-- | A linear programming approach for solving systems of +-- interval linear equations. +module Math.Root.Isolation.Newton.LP + ( solveIntervalLinearEquations ) + where + +-- base +import Control.Arrow + ( (&&&) ) +import Data.Coerce + ( coerce ) +import Data.Foldable + ( toList ) +import Foreign.C.Types + ( CDouble(..), CInt(..), CUInt(..) 
)
+import Foreign.Marshal
+  ( allocaArray, peekArray, pokeArray, with, withArray )
+import Foreign.Ptr
+  ( Ptr, castPtr )
+import Foreign.Storable
+  ( Storable(..) )
+import GHC.TypeNats
+  ( KnownNat, natVal' )
+import GHC.Exts
+  ( proxy# )
+import System.IO.Unsafe
+  ( unsafePerformIO )
+
+-- MetaBrush
+import Math.Interval
+import Math.Linear
+
+--------------------------------------------------------------------------------
+-- Linear programming approach to solving systems of linear equations
+-- (in two variables).
+
+-- | Solve the system of interval linear equations \( A X = B \)
+-- using linear programming.
+solveIntervalLinearEquations
+  :: ( KnownNat d, Representable Double ( ℝ d ), Show ( ℝ d ) )
+  => Vec 2 ( 𝕀ℝ d ) -- ^ columns of \( A \)
+  -> 𝕀ℝ d           -- ^ \( B \)
+  -> T ( 𝕀ℝ 2 )     -- ^ initial box \( X \)
+  -> [ ( T ( 𝕀ℝ 2 ), Bool ) ]
+solveIntervalLinearEquations a b x =
+  let !sols = unsafePerformIO $ intervalSystem2D_LP a b x
+  in if
+      | any hasNaNs sols
+      -> [ ( x, False ) ]
+      | length sols <= 1
+      -> map ( id &&& isSubBox x ) sols
+      | otherwise
+      -> map ( , False ) sols
+
+-- Assuming the second box is a subset of the first box, returns whether it
+-- is in fact a strict subset.
+isSubBox :: T ( 𝕀ℝ 2 ) -> T ( 𝕀ℝ 2 ) -> Bool
+isSubBox
+  ( T ( 𝕀 ( ℝ2 x1_min y1_min ) ( ℝ2 x1_max y1_max ) ) )
+  ( T ( 𝕀 ( ℝ2 x2_min y2_min ) ( ℝ2 x2_max y2_max ) ) )
+  =  ( ( x2_min > x1_min && x2_max < x1_max ) || x2_min == x2_max )
+  && ( ( y2_min > y1_min && y2_max < y1_max ) || y2_min == y2_max )
+
+hasNaNs :: T ( 𝕀ℝ 2 ) -> Bool
+hasNaNs ( T ( 𝕀 ( ℝ2 x_min y_min ) ( ℝ2 x_max y_max ) ) ) =
+  any ( \ x -> isNaN x || isInfinite x ) [ x_min, y_min, x_max, y_max ]
+
+intervalSystem2D_LP
+  :: forall d
+  .  ( KnownNat d, Representable Double ( ℝ d ) )
+  => Vec 2 ( 𝕀ℝ d )
+  -> 𝕀ℝ d
+  -> T ( 𝕀ℝ 2 )
+  -> IO [ T ( 𝕀ℝ 2 ) ]
+intervalSystem2D_LP a b x =
+  allocaArray 4 \ ptrSolutions ->
+  with ( CBox $ unT x ) \ ptrBox ->
+  withArray ( mkEquationArray a b ) \ ptrEqs -> do
+    CInt nbSols <-
+      interval_system_2d ptrSolutions ptrBox ptrEqs ( fromIntegral d )
+    if nbSols < 0
+    then
+      error $ unlines
+        [ "interval_system_2d returned with exit code " ++ show nbSols
+        , "This probably means it was given invalid input."
+        ]
+    else
+      coerce <$> peekArray ( fromIntegral nbSols ) ptrSolutions
+  where
+    d = natVal' @d proxy#
+
+mkEquationArray
+  :: ( KnownNat d, Representable Double ( ℝ d ) )
+  => Vec 2 ( 𝕀ℝ d ) -> 𝕀ℝ d -> [ CEqn ]
+mkEquationArray ( Vec [ a_x, a_y ] ) b =
+  [ CEqn a_x_i a_y_i b_i
+  | a_x_i <- toList $ coordinates a_x
+  | a_y_i <- toList $ coordinates a_y
+  | b_i   <- toList $ coordinates b
+  ]
+mkEquationArray _ _ = error "impossible"
+
+foreign import ccall "interval_system_2d"
+  interval_system_2d :: Ptr CBox -> Ptr CBox -> Ptr CEqn -> CUInt -> IO CInt
+
+data CEqn = CEqn !( 𝕀 Double ) !( 𝕀 Double ) !( 𝕀 Double )
+instance Storable CEqn where
+  sizeOf    _ = 6 * sizeOf    @Double undefined
+  alignment _ = 4 * alignment @Double undefined
+  peek ptr = do
+    [ CDouble a_min, CDouble a_max, CDouble b_min, CDouble b_max, CDouble c_min, CDouble c_max ]
+      <- peekArray 6 ( castPtr ptr :: Ptr CDouble )
+    return $
+      CEqn ( 𝕀 a_min a_max ) ( 𝕀 b_min b_max ) ( 𝕀 c_min c_max )
+  poke ptr ( CEqn ( 𝕀 a_min a_max ) ( 𝕀 b_min b_max ) ( 𝕀 c_min c_max ) )
+    = pokeArray ( castPtr ptr ) [ CDouble a_min, CDouble a_max, CDouble b_min, CDouble b_max, CDouble c_min, CDouble c_max ]
+
+newtype CBox = CBox ( 𝕀ℝ 2 )
+instance Storable CBox where
+  sizeOf    _ = 4 * sizeOf    @Double undefined
+  alignment _ = 4 * alignment @Double undefined
+  peek ptr = do
+    [ CDouble x_min, CDouble x_max, CDouble y_min, CDouble y_max ] <- peekArray 4 ( castPtr ptr :: Ptr CDouble )
+    return $
+      CBox ( 𝕀 ( ℝ2 x_min y_min ) ( ℝ2 x_max y_max ) )
+  poke ptr ( CBox ( 𝕀 ( ℝ2 x_min y_min ) ( ℝ2 x_max y_max ) ) ) =
+    pokeArray ( castPtr ptr ) [ CDouble x_min, CDouble x_max, CDouble y_min, CDouble y_max ]
diff --git a/brush-strokes/src/lib/Math/Root/Isolation/Utils.hs b/brush-strokes/src/lib/Math/Root/Isolation/Utils.hs
new file mode 100644
index 0000000..f273bcf
--- /dev/null
+++ b/brush-strokes/src/lib/Math/Root/Isolation/Utils.hs
@@ -0,0 +1,48 @@
+{-# LANGUAGE ScopedTypeVariables #-}
+
+-- | Utilities for root isolation
+module Math.Root.Isolation.Utils
+  ( fromComponents, matMulVec, boxMidpoint )
+  where
+
+-- base
+import Prelude hiding ( unzip )
+import Data.Foldable
+  ( toList )
+import Data.List.NonEmpty
+  ( unzip )
+
+-- MetaBrush
+import Math.Interval
+import Math.Linear
+
+--------------------------------------------------------------------------------
+
+fromComponents
+  :: forall n
+  .  ( Representable Double ( ℝ n ), n ~ RepDim ( ℝ n ) )
+  => ( Fin n -> [ ( 𝕀 Double, Bool ) ] ) -> [ ( 𝕀ℝ n, Vec n Bool ) ]
+fromComponents f = do
+  ( xs, bs ) <- unzip <$> traverse f ( universe @n )
+  return $ ( tabulate $ \ i -> xs ! i, bs )
+  -- TODO: this could be more efficient.
+{-# INLINEABLE fromComponents #-}
+
+-- | The midpoint of a box.
+boxMidpoint :: Representable Double ( ℝ n ) => 𝕀ℝ n -> ℝ n
+boxMidpoint box =
+  tabulate $ \ i -> midpoint ( box `index` i )
+{-# INLINEABLE boxMidpoint #-}
+
+-- | Matrix multiplication \( A v \).
+matMulVec
+  :: forall n m
+  .  ( Representable Double ( ℝ n ), Representable Double ( ℝ m ) )
+  => Vec m ( ℝ n ) -- ^ columns of the matrix \( A \)
+  -> 𝕀ℝ m          -- ^ vector \( v \)
+  -> 𝕀ℝ n
+matMulVec as v = tabulate $ \ r ->
+  sum [ scaleInterval ( a `index` r ) ( index v c )
+      | ( c, a ) <- toList ( (,) <$> universe @m <*> as )
+      ]
+{-# INLINEABLE matMulVec #-}
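+
+-- A usage sketch ( illustrative only, not part of the library; the printed
+-- forms below assume the obvious Show instances ): with the 2x2 identity
+-- matrix given column-wise, 'matMulVec' maps a box to itself, and
+-- 'boxMidpoint' returns its centre:
+--
+-- >>> matMulVec ( Vec [ ℝ2 1 0, ℝ2 0 1 ] ) ( 𝕀 ( ℝ2 0 0 ) ( ℝ2 1 1 ) )
+-- 𝕀 ( ℝ2 0.0 0.0 ) ( ℝ2 1.0 1.0 )
+-- >>> boxMidpoint ( 𝕀 ( ℝ2 0 0 ) ( ℝ2 1 1 ) )
+-- ℝ2 0.5 0.5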