From 91ac61e3cd0d7850ef9c1af5a9d6a7b784e5fdc3 Mon Sep 17 00:00:00 2001
From: sheaf
Date: Wed, 22 May 2024 16:32:36 +0200
Subject: [PATCH] WIP: add Walter's LP approach to interval Newton

---
 brush-strokes/brush-strokes.cabal             |   24 +-
 brush-strokes/cbits/lp_2d.cpp                 | 2035 +++++++++++++++++
 brush-strokes/cbits/lp_2d.hpp                 | 1265 ++++++++++
 brush-strokes/src/lib/Math/Root/Isolation.hs  |   21 +-
 .../src/lib/Math/Root/Isolation/Core.hs       |    5 +-
 .../src/lib/Math/Root/Isolation/Newton.hs     |  171 ++
 .../Isolation/{ => Newton}/GaussSeidel.hs     |  158 +-
 .../src/lib/Math/Root/Isolation/Newton/LP.hs  |  132 ++
 .../src/lib/Math/Root/Isolation/Utils.hs      |   48 +
 9 files changed, 3703 insertions(+), 156 deletions(-)
 create mode 100644 brush-strokes/cbits/lp_2d.cpp
 create mode 100644 brush-strokes/cbits/lp_2d.hpp
 create mode 100644 brush-strokes/src/lib/Math/Root/Isolation/Newton.hs
 rename brush-strokes/src/lib/Math/Root/Isolation/{ => Newton}/GaussSeidel.hs (58%)
 create mode 100644 brush-strokes/src/lib/Math/Root/Isolation/Newton/LP.hs
 create mode 100644 brush-strokes/src/lib/Math/Root/Isolation/Utils.hs

diff --git a/brush-strokes/brush-strokes.cabal b/brush-strokes/brush-strokes.cabal
index f38f9a6..816458c 100644
--- a/brush-strokes/brush-strokes.cabal
+++ b/brush-strokes/brush-strokes.cabal
@@ -9,6 +9,10 @@ build-type: Simple
 description:
   Computing brush strokes using Bézier curves.
 
+extra-source-files:
+  cbits/**/*.cpp
+  cbits/**/*.hpp
+
 flag use-fma
   description:
     Use fused-multiply add instructions to implement interval arithmetic.
   default:
     True
@@ -143,8 +147,11 @@ library
     , Math.Root.Isolation.Bisection
     , Math.Root.Isolation.Core
     , Math.Root.Isolation.Degree
-    , Math.Root.Isolation.GaussSeidel
     , Math.Root.Isolation.Narrowing
+    , Math.Root.Isolation.Newton
+    , Math.Root.Isolation.Newton.GaussSeidel
+    , Math.Root.Isolation.Newton.LP
+    , Math.Root.Isolation.Utils
     , Debug.Utils
 
   other-modules:
@@ -186,6 +193,22 @@ library
     , transformers
         >= 0.5.6.2 && < 0.7
 
+  -- Extra C++ code for 2D linear systems of interval equations
+  include-dirs:
+    cbits
+  cxx-sources:
+    cbits/lp_2d.cpp
+  cxx-options:
+    -std=c++20
+    -mavx2 -mfma
+    -frounding-math -fno-math-errno -fno-signed-zeros
+    -fno-trapping-math -fno-signaling-nans
+    -Wno-unused-result -Wsign-compare -Wno-switch
+    -march=native -mtune=native
+    -O3 -DNDEBUG
+  build-depends:
+    system-cxx-std-lib
+
 --executable convert-metafont
 --
 --  import:
diff --git a/brush-strokes/cbits/lp_2d.cpp b/brush-strokes/cbits/lp_2d.cpp
new file mode 100644
index 0000000..18d713c
--- /dev/null
+++ b/brush-strokes/cbits/lp_2d.cpp
@@ -0,0 +1,2035 @@
+//----------------------------------------------------------------------------------------
+// Copyright (c) 2024 Walter Mascarenhas
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The MPL has a "Disclaimer of Warranty" and a "Limitation of Liability",
+// and we provide no guarantee regarding the correctness of this source code.
+//
+//----------------------------------------------------------------------------------------

+#include <bit>
+#include <cassert>
+#include <cfenv>
+#include <cstdio>
+#include <immintrin.h>
+
+#include "lp_2d.hpp"
+
+//----------------------------------------------------------------------------------------
+namespace wm::nlp::lp2d {
+
+__m256i bounds( __m256i* tiny, __m256i* huge )
+{
+  __m256i ones = ones_i();
+  __m256i sign = _mm256_slli_epi64( ones, 63 );
+  __m256i a = _mm256_srli_epi64( sign, 1 );  // 0b0100'0000'0000...
+  __m256i b = _mm256_srli_epi64( sign, 3 );  // 0b0001'0000'0000...
+  __m256i c = _mm256_or_si256( a, b );       // 0b0101'0000'0000...
+  __m256i d = _mm256_srli_epi64( c, 4 );     // 0b0000'0101'0000...
+
+  *huge = _mm256_or_si256( c, d );           // 0b0101'0101'0000... 0x5500...
+
+  __m256i e = _mm256_srli_epi64( *huge, 1 ); // 0b0010'1010'1000...
+  __m256i f = _mm256_srli_epi64( d, 2 );     // 0b0000'0001'0100...
+  __m256i g = _mm256_srli_epi64( d, 4 );     // 0b0000'0000'0101...
+  __m256i h = _mm256_srli_epi64( ones, 1 );
+  __m256i i = _mm256_srli_epi64( sign, 32 );
+
+  __m256i j = _mm256_or_si256( f, g );       // 0b0000'0001'0101...
+
+  *tiny = _mm256_or_si256( e, j );           // 0b0010'1011'1101... 0x2bd...
+  __m256i k = and_not_i( i, h );
+  *huge = blend< 0, 0, 0, 1 >( *huge, k );
+  *tiny = blend< 0, 0, 0, 1 >( *tiny, zero_i() );
+  return sign;
+}
+
+//----------------------------------------------------------------------------------------
+// exact_location returns the exact location of the intersection ( x, y ) of the lines
+//
+//   b0[ 0 ] x + b0[ 1 ] y >= b0[ 2 ]
+//   b1[ 0 ] x + b1[ 1 ] y >= b1[ 2 ]
+//
+// with respect to the set
+//
+//   e0 x + e1 y >= e2
+//
+// under the assumption that
+//
+//   d = b0[ 0 ] b1[ 1 ] - b0[ 1 ] b1[ 0 ] > 0
+//
+// where ei = edge[ i ], b0 = border[ 0 ] and b1 = border[ 1 ].
+//
+// This amounts to evaluating the sign of
+//
+//   p = e0 r + e1 s - e2 d
+//
+//   r = b0[ 2 ] b1[ 1 ] - b0[ 1 ] b1[ 2 ]
+//   s = b0[ 0 ] b1[ 2 ] - b0[ 2 ] b1[ 0 ],
+//
+// which motivates the evaluation of the six products
+//
+//      group 1              group 2
+//   e0 b0[ 2 ] b1[ 1 ],  -e0 b0[ 1 ] b1[ 2 ]
+//   e1 b0[ 0 ] b1[ 2 ],  -e1 b0[ 2 ] b1[ 0 ]
+//   e2 b0[ 1 ] b1[ 0 ],  -e2 b0[ 0 ] b1[ 1 ]
+//
+// and we want to find the sign of their sum.
+//
+// The function assumes that the edge entries are either normal or zero, i.e.,
+// there are no subnormal entries.
+//----------------------------------------------------------------------------------------
+
+location exact_location( __m256d e, __m256d b0, __m256d b1 )
+{
+  //----------------------------------------------------------------------------------------
+  //----------------------------------------------------------------------------------------
+
+  __m256d b0_a = permute< 1, 2, 0, 3 >( b0 ); // { b0[ 1 ], b0[ 2 ], b0[ 0 ], ? }
+  __m256d b0_b = permute< 2, 0, 1, 3 >( b0 ); // { b0[ 2 ], b0[ 0 ], b0[ 1 ], ? }
+  __m256d b1_a = permute< 1, 2, 0, 3 >( b1 ); // { b1[ 1 ], b1[ 2 ], b1[ 0 ], ? }
+  __m256d b1_b = permute< 2, 0, 1, 3 >( b1 ); // { b1[ 2 ], b1[ 0 ], b1[ 1 ], ? }
+
+  __m256d g1 = _mm256_mul_pd( b0_b, b1_a ); // { b0[ 2 ] b1[ 1 ], b0[ 0 ] b1[ 2 ], b0[ 1 ] b1[ 0 ], ? }
+  __m256d g2 = _mm256_mul_pd( b0_a, b1_b ); // { b0[ 1 ] b1[ 2 ], b0[ 2 ] b1[ 0 ], b0[ 0 ] b1[ 1 ], ? }
+
+  __m256d g1b = _mm256_fmsub_pd( b0_b, b1_a, g1 ); // error terms of { b0[ 2 ] b1[ 1 ], b0[ 0 ] b1[ 2 ], b0[ 1 ] b1[ 0 ], ? }
+  __m256d g2b = _mm256_fmsub_pd( b0_a, b1_b, g2 ); // error terms of { b0[ 1 ] b1[ 2 ], b0[ 2 ] b1[ 0 ], b0[ 0 ] b1[ 1 ], ?
} + + __m256i sign_i = _mm256_slli_epi64( ones_i(), 63 ); + __m256d sign = cast_d( sign_i ); + __m256d minus_e = xor_d( e, sign ); + + __m256d c[ 8 ]; + + c[ 0 ] = _mm256_mul_pd( e, g1 ); + c[ 1 ] = _mm256_mul_pd( minus_e, g2 ); + c[ 2 ] = _mm256_mul_pd( e, g1b ); + c[ 3 ] = _mm256_mul_pd( minus_e, g2b ); + + c[ 4 ] = _mm256_fmsub_pd( e, g1, c[ 0 ] ); + c[ 5 ] = _mm256_fmsub_pd( minus_e, g2, c[ 1 ] ); + c[ 6 ] = _mm256_fmsub_pd( e, g1b, c[ 2 ] ); + c[ 7 ] = _mm256_fmsub_pd( minus_e, g2b, c[ 3 ] ); + + __m256d a[ 6 ]; + + // transposing the products + + __m256d tmp_0 = gather_low( c[ 0 ], c[ 1 ] ); + __m256d tmp_1 = gather_high( c[ 0 ], c[ 1 ] ); + __m256d tmp_2 = gather_low( c[ 2 ], c[ 3 ] ); + __m256d tmp_3 = gather_high( c[ 2 ], c[ 3 ] ); + + a[ 0 ] = _mm256_permute2f128_pd( tmp_0, tmp_2, 0x20 ); + a[ 1 ] = _mm256_permute2f128_pd( tmp_1, tmp_3, 0x20 ); + a[ 2 ] = _mm256_permute2f128_pd( tmp_0, tmp_2, 0x31 ); + + __m256d tmp_4 = gather_low( c[ 4 ], c[ 5 ] ); + __m256d tmp_5 = gather_high( c[ 4 ], c[ 5 ] ); + __m256d tmp_6 = gather_low( c[ 6 ], c[ 7 ]); + __m256d tmp_7 = gather_high( c[ 6 ], c[ 7 ] ); + + a[ 3 ] = _mm256_permute2f128_pd( tmp_4, tmp_6, 0x20 ); + a[ 4 ] = _mm256_permute2f128_pd( tmp_5, tmp_7, 0x20 ); + a[ 5 ] = _mm256_permute2f128_pd( tmp_4, tmp_6, 0x31 ); + + __m256d gt_05 = is_greater( a[ 0 ], a[ 5 ] ); + __m256d tmp = blend_max( a[ 0 ], a[ 5 ], gt_05 ); + a[ 0 ] = blend_min( a[ 0 ], a[ 5 ], gt_05 ); + a[ 5 ] = tmp; + + //---------------------------------------------------------------------------------------- + // In the loop below we assume that a0[ j ] <= a5[ j ], i = 0,1,2,3 + //---------------------------------------------------------------------------------------- + + while( true ) + { + __m256d gt_12 = is_greater( a[ 1 ], a[ 2 ] ); + __m256d gt_34 = is_greater( a[ 3 ], a[ 4 ] ); + + __m256d min_12 = blend_min( a[ 1 ], a[ 2 ], gt_12 ); + __m256d max_12 = blend_max( a[ 1 ], a[ 2 ], gt_12 ); + + __m256d min_34 = blend_min( a[ 3 ], a[ 4 ], gt_34 ); + __m256d max_34 = blend_max( a[ 3 ], a[ 4 ], gt_34 ); + + __m256d gt_min = is_greater( min_12, min_34 ); + __m256d gt_max = is_greater( max_12, max_34 ); + + __m256d min_1234 = blend_min( min_12, min_34, gt_min ); + a[ 2 ] = blend_max( min_12, min_34, gt_min ); + + a[ 3 ] = blend_min( max_12, max_34, gt_max ); + __m256d max_1234 = blend_max( max_12, max_34, gt_max ); + + __m256d gt_0_min_1234 = is_greater( a[ 0 ], min_1234 ); + __m256d gt_max_1234_5 = is_greater( max_1234, a[ 5 ] ); + + a[ 1 ] = blend_max( a[ 0 ], min_1234, gt_0_min_1234 ); + a[ 0 ] = blend_min( a[ 0 ], min_1234, gt_0_min_1234 ); + + a[ 4 ] = blend_min( max_1234, a[ 5 ], gt_max_1234_5 ); + a[ 5 ] = blend_max( max_1234, a[ 5 ], gt_max_1234_5 ); + + // 1) a0[ j ] = min { ai[ j ], i = 0, .... 5 } + // 2) a5[ j ] = max { ai[ j ], i = 0, .... 
5 }
+
+    __m256d zero = zero_d();
+    __m256d cmp0 = _mm256_cmp_pd( a[ 0 ], zero, _CMP_LT_OQ );
+
+    if( _mm256_testz_pd( cmp0, cmp0 ) ) // all a0[ j ] are >= 0
+    {
+      __m256d cmp = is_greater( a[ 5 ], zero );
+      if( _mm256_testz_pd( cmp, cmp ) ) // no entry in a5 is positive
+        return location::border;
+      else
+        return location::in;
+    }
+
+    __m256d cmp5 = _mm256_cmp_pd( a[ 5 ], zero, _CMP_GT_OQ );
+
+    if( _mm256_testz_pd( cmp5, cmp5 ) ) // all a5[ j ] are <= 0
+    {
+      __m256d cmp = is_greater( zero, a[ 0 ] );
+      if( _mm256_testz_pd( cmp, cmp ) ) // no entry in a0 is negative
+        return location::border;
+      else
+        return location::out;
+    }
+
+    // now
+    // 1) min ai[ j ] has its sign bit set
+    // 2) max ai[ j ] does not have its sign bit set
+
+    __m256d lo = gather_low( a[ 0 ], a[ 5 ] );  // { a0[ 0 ], a5[ 0 ], a0[ 2 ], a5[ 2 ] }
+    __m256d hi = gather_high( a[ 0 ], a[ 5 ] ); // { a0[ 1 ], a5[ 1 ], a0[ 3 ], a5[ 3 ] }
+
+    __m256d gt_lo_hi = is_greater( lo, hi );
+    __m256d b0 = blend_min( lo, hi, gt_lo_hi );
+    __m256d b5 = blend_max( lo, hi, gt_lo_hi );
+
+    // 1) min{ ai[ j ], i,j = 0,...5 } is in { b0[ 0 ], b0[ 2 ] }
+    // 2) max{ ai[ j ], i,j = 0,...5 } is in { b5[ 1 ], b5[ 3 ] }
+
+    b0 = permute< 0, 2, 1, 3 >( b0 );
+    b5 = permute< 3, 1, 2, 0 >( b5 );
+
+    // 1) min{ ai[ j ], i,j = 0,...5 } is in { b0[ 0 ], b0[ 1 ] }
+    // 2) max{ ai[ j ], i,j = 0,...5 } is in { b5[ 0 ], b5[ 1 ] }
+
+    lo = gather_low( b0, b5 );  // { b0[ 0 ], b5[ 0 ], b0[ 2 ], b5[ 2 ] }
+    hi = gather_high( b0, b5 ); // { b0[ 1 ], b5[ 1 ], b0[ 3 ], b5[ 3 ] }
+
+    gt_lo_hi = is_greater( lo, hi );
+    b0 = blend_min( lo, hi, gt_lo_hi );
+    b5 = blend_max( lo, hi, gt_lo_hi );
+
+    // 1) min{ ai[ j ], i,j = 0,...5 } is b0[ 0 ]
+    // 2) max{ ai[ j ], i,j = 0,...5 } is b5[ 1 ]
+
+    b5 = permute< 1, 0, 3, 2 >( b5 );
+
+    // 1) min{ ai[ j ], i,j = 0,...5 } is b0[ 0 ] <= 0
+    // 2) max{ ai[ j ], i,j = 0,...5 } is b5[ 0 ] >= 0
+
+    // fixing possible break of order in positions 1,2,3
+
+    __m256d chk123 = is_greater( b0, b5 );
+    __m256d aux = blend_min( b0, b5, chk123 );
+    b5 = blend_max( b0, b5, chk123 );
+    b0 = aux;
+
+    __m256i b0i = cast_i( b0 );
+    __m256i b5i = cast_i( b5 );
+
+    uint64_t u0 = _mm256_extract_epi64( b0i, 0 ) & 0x7FFF'FFFF'FFFF'FFFFULL;
+    uint64_t u5 = _mm256_extract_epi64( b5i, 0 );
+
+    if( u0 >= u5 ) // | b0[ 0 ] | >= | b5[ 0 ] |
+    {
+      if( u0 > u5 + 0x0060'0000'0000'0000ULL ) // | b0[ 0 ] | > 2^5 | b5[ 0 ] | (we use 0x006... to handle subnormals properly)
+        return location::out;
+
+      __m256d sum = _mm256_add_pd( b0, b5 );    // b0 + b5, rounded down
+      __m256d cor = _mm256_sub_pd( sum, b0 );   // sum - b0, rounded down
+      __m256d delta = _mm256_sub_pd( b5, cor ); // b5 - cor, rounded down.
+      // now b0[ 0 ] + b5[ 0 ] = sum[ 0 ] + delta[ 0 ], exactly, with sum[ 0 ] <= 0 <= delta[ 0 ]
+      a[ 0 ] = blend< 1, 0, 0, 0 >( b0, sum );
+      a[ 5 ] = blend< 1, 0, 0, 0 >( b5, delta );
+    }
+    else
+    {
+      if( u5 > u0 + 0x0060'0000'0000'0000ULL )
+        return location::in;
+
+      __m256i sign_i = _mm256_slli_epi64( ones_i(), 63 );
+      __m256d sign = cast_d( sign_i );
+      __m256d minus_b0 = xor_d( b0, sign );
+
+      __m256d sum = _mm256_sub_pd( minus_b0, b5 );    // (-b5) + (-b0) rounded down
+      __m256d cor = _mm256_add_pd( sum, b5 );         // sum - (-b5) rounded down
+      __m256d delta = _mm256_sub_pd( minus_b0, cor ); // (-b0) - cor, rounded down.
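+
+      // The three operations above are Dekker's Fast2Sum applied to ( -b5[ 0 ], -b0[ 0 ] ),
+      // which is valid here because this branch guarantees | b5[ 0 ] | >= | b0[ 0 ] |: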
+ + // -( b0[ 0 ] + b5[ 0 ] ) = sum[ 0 ] + delta[ 0 ], exactly, with sum[ 0 ] <= 0 <= delta[ 0 ] + + sum = xor_d( sum, sign ); + delta = xor_d( delta, sign ); + + // b0[ 0 ] + b5[ 0 ] ) = sum[ 0 ] + delta[ 0 ], exactly, with delta[ 0 ] <= 0 <= sum[ 0 ] + + a[ 0 ] = blend< 1, 0, 0, 0 >( b0, delta ); + a[ 5 ] = blend< 1, 0, 0, 0 >( b5, sum ); + } + } +} + +//---------------------------------------------------------------------------------------- +// bounded_convex::box: finding the box of the bounded convex region, under the assumption +// that the normals are normalized, that is, for each edge e, either +// e[ i ] = +0.0 or e[ i ] >= tiny ~ 2^(-100), i.e. no signed zeros, underflow +// or overflow or negative coordinates. +//---------------------------------------------------------------------------------------- + +__m256d bounded_convex::box() const +{ + constexpr int non_0 = 0; + constexpr int non_1 = 2; + constexpr int inf_0 = 0; + constexpr int sup_0 = 1; + constexpr int inf_1 = 2; + constexpr int sup_1 = 3; + + assert( type != bounded_convex_type::empty ); + + __m256d den; + __m256d num; + __m256i sign = _mm256_slli_epi64( ones_i(), 63 ); // { -1, -1, -1, -1 } + __m256i sign_h = _mm256_slli_si256( sign, 8 ); // { 0, -1, 0, -1 } + + if( type != bounded_convex_type::polygon ) + { + if( type == bounded_convex_type::point ) + { + // { vertex_pair[ 0 ] = { x_inf, -x_sup, ? , ? } + // { vertex_pair[ 1 ] = { y_inf, -y_sup, ? , ? } + // { vertex_pair[ 2 ] = { z_inf, -z_sup, ? , ? } + + den = xor_d( vertex_p[ 2 ], sign_h ); + den = permute< sup_0, inf_0, sup_0, inf_0 >( den ); + num = gather_lanes< 0, 0 >( vertex_p[ 0 ], vertex_p[ 1 ] ); + } + else + { + __m256d num_x; + __m256d num_y; + + den = xor_d( vertex_p[ 2 ], sign_h ); + + int xy_signs = high_bits_64( edge[ 1 ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // edge[ 0 ] >= 0, edge[ 1 ] >= 0: box = { x0, x1, y1, y0 } + { + num_x = permute< inf_0, sup_1, non_1, non_1 >( vertex_p[ 0 ] ); + num_y = permute< non_0, non_0, inf_1, sup_0 >( vertex_p[ 1 ] ); + + den = permute< sup_0, inf_1, sup_1, inf_0 >( den ); + break; + } + case 1: // edge[ 0 ] < 0, edge[ 1 ] >= 0: box = { x0, x1, y0, y1 } + { + num_x = permute< inf_0, sup_1, non_1, non_1 >( vertex_p[ 0 ] ); + num_y = permute< non_0, non_0, inf_0, sup_1 >( vertex_p[ 1 ] ); + + den = permute< sup_0, inf_0, sup_0, inf_1 >( den ); + break; + } + case 2: // edge[ 0 ] >= 0, edge[ 1 ] < 0: box = {x1, x0, y1, y0 } + { + num_x = permute< inf_1, sup_0, non_1, non_1 >( vertex_p[ 0 ] ); + num_y = permute< non_0, non_0, inf_1, sup_0 >( vertex_p[ 1 ] ); + + den = permute< sup_1, inf_0, sup_1, inf_0 >( den ); + + break; + } + case 3: // edge[ 0 ] < 0, edge[ 1 ] < 0: box = { x1, x0, y0, y1 } + { + num_x = permute< inf_1, sup_0, non_1, non_1 >( vertex_p[ 0 ] ); + num_y = permute< non_0, non_0, inf_0, sup_1 >( vertex_p[ 1 ] ); + + den = permute< sup_1, inf_0, sup_0, inf_1 >( den ); + break; + } + } + + num = gather_lanes< 0, 1 >( num_x, num_y ); + } + } + else + { + int index = begin; + + auto set_x_max = [ & ]() + { + int previous_vertex = ( ( index > 0 ) ? 
index : end ); + int previous_pair = 3 * ( --previous_vertex / 2 ); + + __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h ); + + if( previous_vertex & 0x1 ) + { + z = permute< non_0, inf_1, non_1, non_1 >( z ); + __m256d x = gather_lanes< 1, 1 >( vertex_p[ previous_pair ] ); + num = blend< 0, 1, 0, 0 >( num, x ); + } + else + { + z = permute< non_0, inf_0, non_1, non_1 >( z ); + num = blend< 0, 1, 0, 0 >( num, vertex_p[ previous_pair ] ); + } + + den = blend< 0, 1, 0, 0 >( den, z ); + }; + + auto set_x_min = [ & ]() + { + int previous_vertex = ( ( index > 0 ) ? index : end ); + int previous_pair = 3 * ( --previous_vertex / 2 ); + + __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h ); + + if( previous_vertex & 0x1 ) + { + z = permute< sup_1, non_0, non_1, non_1 >( z ); + __m256d x = gather_lanes< 1, 1 >( vertex_p[ previous_pair ] ); + num = blend< 1, 0, 0, 0 >( num, x ); + } + else + { + z = permute< sup_0, non_0, non_1, non_1 >( z ); + num = blend< 1, 0, 0, 0 >( num, vertex_p[ previous_pair ] ); + } + + den = blend< 1, 0, 0, 0 >( den, z ); + }; + + auto set_y_max = [ & ]() + { + int previous_vertex = ( ( index > 0 ) ? index : end ); + int previous_pair = 3 * ( --previous_vertex / 2 ); + + __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h ); + + if( previous_vertex & 0x1 ) + { + z = permute< non_0, non_0, non_1, inf_1 >( z ); + num = blend< 0, 0, 0, 1 >( num, vertex_p[ previous_pair + 1 ] ); + } + else + { + z = permute< non_0, non_0, non_1, inf_0 >( z ); + __m256d y = gather_lanes< 0, 0 >( vertex_p[ previous_pair + 1 ] ); + num = blend< 0, 0, 0, 1 >( num, y ); + } + + den = blend< 0, 0, 0, 1 >( den, z ); + }; + + auto set_y_min = [ & ]() + { + int previous_vertex = ( ( index > 0 ) ? index : end ); + int previous_pair = 3 * ( --previous_vertex / 2 ); + + __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h ); + + if( previous_vertex & 0x1 ) + { + z = permute< non_1, non_1, sup_1, non_1 >( z ); + num = blend< 0, 0, 1, 0 >( num, vertex_p[ previous_pair + 1 ] ); + } + else + { + z = permute< non_0, non_0, sup_0, non_0 >( z ); + __m256d y = gather_lanes< 0, 0 >( vertex_p[ previous_pair + 1 ] ); + num = blend< 0, 0, 1, 0 >( num, y ); + } + + den = blend< 0, 0, 1, 0 >( den, z ); + }; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // normal[ 0 ] >= 0, normal[ 1 ] >= 0: first quadrant + { + if( index & 0x1 ) + goto quadrant_1_odd; + else + goto quadrant_1_even; + } + case 1: // normal[ 0 ] < 0, normal[ 1 ] >= 0: second quadrant + { + if( index & 0x1 ) + goto quadrant_2_odd; + else + goto quadrant_2_even; + } + case 2: // normal[ 0 ] >= 0, normal[ 1 ] < 0: fourth quadrant + { + if( index & 0x1 ) + goto quadrant_4_odd; + else + goto quadrant_4_even; + } + case 3: // normal[ 0 ] < 0, normal[ 1 ] < 0: third quadrant + { + if( index & 0x1 ) + goto quadrant_3_odd; + else + goto quadrant_3_even; + } + } + + quadrant_1_even: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // quadrant_1 + goto quadrant_1_odd; + case 1: // quadrant_2 + { + set_y_min(); + goto quadrant_2_odd; + } + case 3: // quadrant_3 + { + set_x_max(); + set_y_min(); + goto quadrant_3_odd; + } + } + } + + quadrant_1_odd: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // quadrant_1 + goto quadrant_1_even; + case 1: // quadrant_2 + { + set_y_min(); + goto quadrant_2_even; + } + case 3: // quadrant_3 + { + 
set_x_max(); + set_y_min(); + goto quadrant_3_even; + } + } + } + + quadrant_2_even: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 1: // quadrant_2 + goto quadrant_2_odd; + case 2:// quadrant_4 + { + set_x_max(); + set_y_max(); + goto quadrant_4_odd; + } + case 3: // quadrant_3 + { + set_x_max(); + goto quadrant_3_odd; + } + } + } + + quadrant_2_odd: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 1: // quadrant_2 + goto quadrant_2_even; + case 2:// quadrant_4 + { + set_x_max(); + set_y_max(); + goto quadrant_4_even; + } + case 3: // quadrant_3 + { + set_x_max(); + goto quadrant_3_even; + } + } + } + + quadrant_3_even: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // quadrant_1 + { + set_y_max(); + set_x_min(); + goto quadrant_1_odd; + } + case 2:// quadrant_4 + { + set_y_max(); + goto quadrant_4_odd; + } + case 3: // quadrant_3 + goto quadrant_3_odd; + } + } + + quadrant_3_odd: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // quadrant_1 + { + set_x_min(); + set_y_max(); + goto quadrant_1_even; + } + case 2:// quadrant_4 + { + set_y_max(); + goto quadrant_4_even; + } + case 3: // quadrant_3 + goto quadrant_3_even; + } + } + + quadrant_4_even: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // quadrant_1 + { + set_x_min(); + goto quadrant_1_odd; + } + case 1: // quadrant_2 + { + set_x_min(); + set_y_min(); + goto quadrant_2_odd; + } + case 2:// quadrant_4 + goto quadrant_4_odd; + } + } + + quadrant_4_odd: + { + if( ++index > end ) + goto done; + + int xy_signs = high_bits_64( edge[ index ] ) & 0x3; + + switch( xy_signs ) + { + case 0: // quadrant_1 + { + set_x_min(); + goto quadrant_1_even; + } + case 1: // quadrant_2 + { + set_x_min(); + set_y_min(); + goto quadrant_2_even; + } + case 2:// quadrant_4 + goto quadrant_4_even; + } + } + + done: {} + } + + num = _mm256_div_pd( num, den ); + num = xor_d( num, sign_h ); + return num; +} + +int bounded_convex::insert_vertex( int at ) +{ + int high = end - at; + int low = at - begin; + assert( ( low > 0 ) && ( high > 0 ) ); + + if( low <= high ) + { + __m256d* ed = edge + ( begin - 1 ); + __m256d* ed_max = ed + low; + + for( ; ed < ed_max; ++ed ) + { + __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( ed + 1 ) ); + _mm256_store_pd( reinterpret_cast< double* >( ed ), tmp ); + } + + int to_load; + + __m256d* vp = vertex_pair( begin ); + + __m256d a0 = _mm256_load_pd( reinterpret_cast< double* >( vp ) ); + __m256d a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 1 ) ); + __m256d a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 2 ) ); + + if( ( begin & 0x1 ) == 0 ) + { + __m256d c0 = gather_lanes< clear_lane, 0 >( a0 ); + __m256d c1 = gather_lanes< clear_lane, 0 >( a1 ); + __m256d c2 = gather_lanes< clear_lane, 0 >( a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp - 3 ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp - 2 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp - 1 ), c2 ); + + if( low == 1 ) + { + --begin; + return at - 1; + } + to_load = low - 2; + } + else + to_load = low - 1; + + int q = to_load / 4; + __m256d* vp_max = vp + 6 * q; + + while( vp < vp_max ) + { + __m256d b0 = 
_mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) ); + __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) ); + __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) ); + + __m256d c0 = gather_lanes< 1, 0 >( a0, b0 ); + __m256d c1 = gather_lanes< 1, 0 >( a1, b1 ); + __m256d c2 = gather_lanes< 1, 0 >( a2, b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); + + vp += 3; + + a0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) ); + a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) ); + a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) ); + + __m256d d0 = gather_lanes< 1, 0 >( b0, a0 ); + __m256d d1 = gather_lanes< 1, 0 >( b1, a1 ); + __m256d d2 = gather_lanes< 1, 0 >( b2, a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), d0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 ); + + vp += 3; + } + + int r = to_load % 4; + + if( r == 0 ) + { + __m256d c0 = gather_lanes< 1, clear_lane >( a0 ); + __m256d c1 = gather_lanes< 1, clear_lane >( a1 ); + __m256d c2 = gather_lanes< 1, clear_lane >( a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); + } + else + { + __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) ); + __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) ); + __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) ); + + __m256d c0 = gather_lanes< 1, 0 >( a0, b0 ); + __m256d c1 = gather_lanes< 1, 0 >( a1, b1 ); + __m256d c2 = gather_lanes< 1, 0 >( a2, b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); + + switch( r ) + { + case 1: + goto low_end; + case 2: + { + __m256d c0 = gather_lanes< 1, clear_lane >( b0 ); + __m256d c1 = gather_lanes< 1, clear_lane >( b1 ); + __m256d c2 = gather_lanes< 1, clear_lane >( b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp + 3 ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 4 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 5 ), c2 ); + goto low_end; + } + default: + { + vp += 3; + a0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) ); + a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) ); + a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) ); + + __m256d c3 = gather_lanes< 1, 0 >( b0, a0 ); + __m256d c4 = gather_lanes< 1, 0 >( b1, a1 ); + __m256d c5 = gather_lanes< 1, 0 >( b2, a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c3 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c4 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c5 ); + } + } + } + + low_end: + { + --begin; + return at - 1; + } + } + else + { + __m256d* ed = edge + end + 1; + __m256d* ed_min = ed - ( high + 1 ); + + for( ; ed > ed_min; --ed ) + { + __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( ed - 1 ) ); + _mm256_store_pd( reinterpret_cast< double* >( ed ), tmp ); + } + + __m256d* vp = vertex_pair( end ); + + __m256d a0; + __m256d a1; + __m256d a2; + + int to_load; + + if( ( end & 0x1 ) == 0 ) + { + a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) ); + a1 = 
_mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) ); + a2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) ); + + __m256d c0 = gather_lanes< 1, clear_lane >( a0 ); + __m256d c1 = gather_lanes< 1, clear_lane >( a1 ); + __m256d c2 = gather_lanes< 1, clear_lane >( a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); + + if( high == 1 ) + { + ++end; + return at; + } + + vp -= 3; + to_load = high - 2; + } + else + { + to_load = high - 1; + a0 = _mm256_load_pd( reinterpret_cast< double* >( vp ) ); + a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 1 ) ); + a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 2 ) ); + } + + int q = to_load / 4; + + __m256d* vp_min = vp - 6 * q; + + while( vp > vp_min ) + { + __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) ); + __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) ); + __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) ); + + __m256d c0 = gather_lanes< 1, 0 >( b0, a0 ); + __m256d c1 = gather_lanes< 1, 0 >( b1, a1 ); + __m256d c2 = gather_lanes< 1, 0 >( b2, a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); + + vp -= 3; + + a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) ); + a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) ); + a2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) ); + + __m256d d0 = gather_lanes< 1, 0 >( a0, b0 ); + __m256d d1 = gather_lanes< 1, 0 >( a1, b1 ); + __m256d d2 = gather_lanes< 1, 0 >( a2, b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), d0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 ); + + vp -= 3; + } + + int r = to_load % 4; + + if( r == 0 ) + { + __m256d c0 = gather_lanes< clear_lane, 0 >( a0 ); + __m256d c1 = gather_lanes< clear_lane, 0 >( a1 ); + __m256d c2 = gather_lanes< clear_lane, 0 >( a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); + } + else + { + __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) ); + __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) ); + __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) ); + + __m256d c0 = gather_lanes< 1, 0 >( b0, a0 ); + __m256d c1 = gather_lanes< 1, 0 >( b1, a1 ); + __m256d c2 = gather_lanes< 1, 0 >( b2, a2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 ); + + switch( r ) + { + case 1: + goto high_end; + case 2: + { + __m256d d0 = gather_lanes< clear_lane, 0 >( b0 ); + __m256d d1 = gather_lanes< clear_lane, 0 >( b1 ); + __m256d d2 = gather_lanes< clear_lane, 0 >( b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp - 3 ), d0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp - 2 ), d1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp - 1 ), d2 ); + goto high_end; + } + default: // case 3 + { + vp -= 3; + + a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) ); + a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) ); + a2 = _mm256_load_pd( 
reinterpret_cast< double* >( vp - 1 ) ); + + __m256d d0 = gather_lanes< 1, 0 >( a0, b0 ); + __m256d d1 = gather_lanes< 1, 0 >( a1, b1 ); + __m256d d2 = gather_lanes< 1, 0 >( a2, b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( vp ), d0 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 ); + _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 ); + } + } + } + + high_end: + { + ++end; + return at; + } + } +} + +void bounded_convex::push( __m256d* begin, __m256d* end ) +{ + for( ; begin < end; ++begin ) + { + int sign = _mm256_movemask_pd( *begin ) & 0x7; + + switch( sign ) + { + case 0b000: { locator< 0, 0, 0 >::insert( *this, *begin ); break; } + case 0b001: { locator< 1, 0, 0 >::insert( *this, *begin ); break; } + case 0b010: { locator< 0, 1, 0 >::insert( *this, *begin ); break; } + case 0b011: { locator< 1, 1, 0 >::insert( *this, *begin ); break; } + case 0b100: { locator< 0, 0, 1 >::insert( *this, *begin ); break; } + case 0b101: { locator< 1, 0, 1 >::insert( *this, *begin ); break; } + case 0b110: { locator< 0, 1, 1 >::insert( *this, *begin ); break; } + case 0b111: { locator< 1, 1, 1 >::insert( *this, *begin ); break; } + } + + if( type == bounded_convex_type::empty ) + return; + } +} + +int bounded_convex::remove_vertices( int begin_vertex, int end_vertex ) +{ + int re = end_vertex & 0x1; + int rb = begin_vertex & 0x1; + int low = begin_vertex - begin; + int high = end - end_vertex; + + if( low <= high ) + { + __m256d* e_dst = edge + end_vertex; + __m256d* e_dst_min = e_dst - low; + __m256d* e_src = edge + begin_vertex; + + while( --e_dst >= e_dst_min ) + { + __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( --e_src ) ); + _mm256_store_pd( reinterpret_cast< double* >( e_dst ), tmp ); + } + + __m256d* v_dst = vertex_pair( end_vertex ); + __m256d* v_src = vertex_pair( begin_vertex ); + + if( re == rb ) + { + int q; + + if( re & 0x1 ) + { + __m256d s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + __m256d s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + __m256d s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) ); + __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) ); + __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) ); + + __m256d b0 = blend< 0, 0, 1, 1 >( s0, d0 ); + __m256d b1 = blend< 0, 0, 1, 1 >( s1, d1 ); + __m256d b2 = blend< 0, 0, 1, 1 >( s2, d2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), b0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), b1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), b2 ); + + q = low; + } + else + q = ( low + 1 ); + + q /= 2; + __m256d* v_src_min = v_src - 3 * q; + + while( v_src > v_src_min ) + { + --v_src; + --v_dst; + __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), tmp ); + } + } + else + { + __m256d s0; + __m256d s1; + __m256d s2; + + int to_load; + + if( rb == 0 ) + { + v_src -= 3; + + s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d p0 = gather_lanes< 1, clear_lane >( s0 ); + __m256d p1 = gather_lanes< 1, clear_lane >( s1 ); + __m256d p2 = gather_lanes< 1, clear_lane >( s2 ); + + __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) ); + __m256d d1 = _mm256_load_pd( 
reinterpret_cast< double* >( v_dst + 1 ) ); + __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) ); + + __m256d b0 = blend< 0, 0, 1, 1 >( p0, d0 ); + __m256d b1 = blend< 0, 0, 1, 1 >( p1, d1 ); + __m256d b2 = blend< 0, 0, 1, 1 >( p2, d2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), b0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), b1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), b2 ); + + if( low == 1 ) + goto low_end; + + to_load = low - 2; + } + else + { + to_load = low - 1; + + s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + } + + int q = to_load / 4; + __m256d* v_src_min = v_src - 6 * q; + + while( v_src > v_src_min ) + { + v_src -= 3; + v_dst -= 3; + + __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d c0 = gather_lanes< 1, 0 >( b0, s0 ); + __m256d c1 = gather_lanes< 1, 0 >( b1, s1 ); + __m256d c2 = gather_lanes< 1, 0 >( b2, s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 ); + + v_dst -= 3; + v_src -= 3; + + s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d d0 = gather_lanes< 1, 0 >( s0, b0 ); + __m256d d1 = gather_lanes< 1, 0 >( s1, b1 ); + __m256d d2 = gather_lanes< 1, 0 >( s2, b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 ); + } + + v_dst -= 3; + int r = to_load % 4; + + if( r == 0 ) + { + __m256d c0 = gather_lanes< clear_lane, 0 >( s0 ); + __m256d c1 = gather_lanes< clear_lane, 0 >( s1 ); + __m256d c2 = gather_lanes< clear_lane, 0 >( s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 ); + } + else + { + v_src -= 3; + __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d c0 = gather_lanes< 1, 0 >( b0, s0 ); + __m256d c1 = gather_lanes< 1, 0 >( b1, s1 ); + __m256d c2 = gather_lanes< 1, 0 >( b2, s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 ); + + switch( r ) + { + case 1: + goto low_end; + case 2: + { + v_dst -= 3; + + __m256d d0 = gather_lanes< clear_lane, 0 >( b0 ); + __m256d d1 = gather_lanes< clear_lane, 0 >( b1 ); + __m256d d2 = gather_lanes< clear_lane, 0 >( b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 ); + goto low_end; + } + default: + { + v_dst -= 3; + v_src -= 3; + + s0 = _mm256_load_pd( 
reinterpret_cast< double* >( v_src ) ); + s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d d0 = gather_lanes< 1, 0 >( s0, b0 ); + __m256d d1 = gather_lanes< 1, 0 >( s1, b1 ); + __m256d d2 = gather_lanes< 1, 0 >( s2, b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 ); + + v_dst -= 3; + + s0 = gather_lanes< clear_lane, 0 >( s0, b0 ); + s1 = gather_lanes< clear_lane, 0 >( s1, b1 ); + s2 = gather_lanes< clear_lane, 0 >( s2, b2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), s0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), s1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), s2 ); + } + } + } + } + + low_end: + { + begin += ( end_vertex - begin_vertex ); + return end_vertex - 1; + } + } + else + { + __m256d* e_dst = edge + begin_vertex; + __m256d* e_src = edge + end_vertex; + __m256d* e_src_max = e_src + high + 1; + + for( ; e_src < e_src_max; ++e_src, ++e_dst ) + { + __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( e_src ) ); + _mm256_store_pd( reinterpret_cast< double* >( e_dst ), tmp ); + } + + __m256d s0; + __m256d s1; + __m256d s2; + + __m256d* v_dst = vertex_pair( begin_vertex ); + __m256d* v_src = vertex_pair( end_vertex ); + + if( re == rb ) + { + if( re & 0x1 ) + { + s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) ); + __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) ); + __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) ); + + __m256d c0 = blend< 0, 0, 1, 1 >( d0, s0 ); + __m256d c1 = blend< 0, 0, 1, 1 >( d1, s1 ); + __m256d c2 = blend< 0, 0, 1, 1 >( d2, s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 ); + + --high; + v_src += 3; + v_dst += 3; + } + + int q = 3 * ( ( high + 1 ) / 2 ); + + __m256d* v_src_max = vertex_pair( end + 1 ); + for( ; v_src < v_src_max; ++v_src, ++v_dst ) + { + __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), tmp ); + } + } + else + { + int to_load; + + __m256d s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + __m256d s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + __m256d s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + if( rb ) // re = 0 =: + { + __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) ); + __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) ); + __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) ); + + __m256d g0 = gather_lanes< 0, 0 >( d0, s0 ); + __m256d g1 = gather_lanes< 0, 0 >( d1, s1 ); + __m256d g2 = gather_lanes< 0, 0 >( d2, s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), g0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), g1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), g2 ); + + if( high == 1 ) + goto high_end; + + v_dst += 3; + to_load = high - 2; + } + else // re = 1 => only the second lane of the si are 
used + to_load = high - 1; + + int q = to_load / 4; + __m256d* v_src_max = v_src + 6 * q; + + while( v_src < v_src_max ) + { + v_src += 3; + + __m256d t0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + __m256d t1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + __m256d t2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d u0 = gather_lanes< 1, 0 >( s0, t0 ); + __m256d u1 = gather_lanes< 1, 0 >( s1, t1 ); + __m256d u2 = gather_lanes< 1, 0 >( s2, t2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 ); + + v_dst += 3; + v_src += 3; + + s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d v0 = gather_lanes< 1, 0 >( t0, s0 ); + __m256d v1 = gather_lanes< 1, 0 >( t1, s1 ); + __m256d v2 = gather_lanes< 1, 0 >( t2, s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 ); + + v_dst += 3; + } + + int r = to_load % 4; + + if( r == 0 ) + { + __m256d u0 = gather_lanes< 1, clear_lane >( s0 ); + __m256d u1 = gather_lanes< 1, clear_lane >( s1 ); + __m256d u2 = gather_lanes< 1, clear_lane >( s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 ); + } + else + { + v_src += 3; + + __m256d t0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + __m256d t1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + __m256d t2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d u0 = gather_lanes< 1, 0 >( s0, t0 ); + __m256d u1 = gather_lanes< 1, 0 >( s1, t1 ); + __m256d u2 = gather_lanes< 1, 0 >( s2, t2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 ); + + switch( r ) + { + case 1: + goto high_end; + case 2: + { + v_dst += 3; + __m256d v0 = gather_lanes< 1, clear_lane >( t0 ); + __m256d v1 = gather_lanes< 1, clear_lane >( t1 ); + __m256d v2 = gather_lanes< 1, clear_lane >( t2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 ); + + goto high_end; + } + default: // case 3 + { + v_dst += 3; + v_src += 3; + + s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) ); + s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) ); + s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) ); + + __m256d v0 = gather_lanes< 1, 0 >( t0, s0 ); + __m256d v1 = gather_lanes< 1, 0 >( t1, s1 ); + __m256d v2 = gather_lanes< 1, 0 >( t2, s2 ); + + _mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 ); + _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 ); + } + } + } + } + + high_end: + { + end -= ( end_vertex - begin_vertex ); + return begin_vertex - 1; + } + } +} + +void bounded_convex::set_to_last_segment() +{ + int last = end - 1; + edge[ 0 ] = edge[ last ]; + 
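+  // The feasible set has been reduced to the segment joining the last and the first
+  // vertex: the lines below compact the edges that cut out this segment and pack its
+  // two endpoints, stored as { v, -v } coordinate pairs, into vertex_p[ 0..2 ].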
edge[ 1 ] = edge[ end ];
+  edge[ 2 ] = edge[ begin + 1 ];
+
+  __m256d* vl = vertex_pair( last );
+  __m256d* vb = vertex_pair( begin );
+
+  if( last & 0x1 )
+  {
+    if( begin & 0x1 )
+    {
+      vertex_p[ 0 ] = gather_lanes< 1, 1 >( vl[ 0 ], vb[ 0 ] );
+      vertex_p[ 1 ] = gather_lanes< 1, 1 >( vl[ 1 ], vb[ 1 ] );
+      vertex_p[ 2 ] = gather_lanes< 1, 1 >( vl[ 2 ], vb[ 2 ] );
+    }
+    else
+    {
+      vertex_p[ 0 ] = gather_lanes< 1, 0 >( vl[ 0 ], vb[ 0 ] );
+      vertex_p[ 1 ] = gather_lanes< 1, 0 >( vl[ 1 ], vb[ 1 ] );
+      vertex_p[ 2 ] = gather_lanes< 1, 0 >( vl[ 2 ], vb[ 2 ] );
+    }
+  }
+  else
+  {
+    if( begin & 0x1 )
+    {
+      vertex_p[ 0 ] = blend< 0, 0, 1, 1 >( vl[ 0 ], vb[ 0 ] );
+      vertex_p[ 1 ] = blend< 0, 0, 1, 1 >( vl[ 1 ], vb[ 1 ] );
+      vertex_p[ 2 ] = blend< 0, 0, 1, 1 >( vl[ 2 ], vb[ 2 ] );
+    }
+    else
+    {
+      vertex_p[ 0 ] = gather_lanes< 0, 0 >( vl[ 0 ], vb[ 0 ] );
+      vertex_p[ 1 ] = gather_lanes< 0, 0 >( vl[ 1 ], vb[ 1 ] );
+      vertex_p[ 2 ] = gather_lanes< 0, 0 >( vl[ 2 ], vb[ 2 ] );
+    }
+  }
+
+  end = 2;
+  begin = 0;
+  type = bounded_convex_type::segment;
+}
+
+void bounded_convex::setup( __m256d box )
+{
+  type = bounded_convex_type::polygon;
+
+  __m256d zero = zero_d();
+  __m256i ones = ones_i();
+  __m256i signi = _mm256_slli_epi64( ones, 63 );
+  __m256i signi_h = _mm256_slli_si256( signi, 8 );
+  __m256d sign = _mm256_castsi256_pd( signi );
+  __m256d sign_h = _mm256_castsi256_pd( signi_h );
+  __m256i onei = _mm256_srli_epi64( ones, 54 );
+  onei = _mm256_slli_epi64( onei, 52 );          // { 1.0, 1.0, 1.0, 1.0 }
+  __m256d one = _mm256_castsi256_pd( onei );
+
+  box = _mm256_max_pd( box, zero );
+
+  begin = alloc / 2;
+  end = begin + 4;
+  __m256d* e = edge + begin;
+  __m256d x_box = gather_lanes< 0, 0 >( box );   // { x_min, x_max, x_min, x_max }
+  __m256d y_box = gather_lanes< 1, 1 >( box );   // { y_min, y_max, y_min, y_max }
+
+  e[ 0 ] = blend< 0, 1, 0, 0 >( one, zero );     // { 1, 0, 0, 0 }
+  e[ 0 ] = blend< 0, 0, 1, 1 >( e[ 0 ], x_box ); // { 1, 0, x_min, x_max }, for x >= x_min
+
+  e[ 1 ] = permute< 1, 0, 2, 3 >( e[ 0 ] );      // { 0, 1, x_min, x_max }
+  e[ 1 ] = blend< 0, 0, 1, 1 >( e[ 1 ], y_box ); // { 0, 1, y_min, y_max }, for y >= y_min
+
+  e[ 2 ] = permute< 0, 1, 3, 2 >( e[ 0 ] );      // { 1, 0, x_max, x_min }
+  e[ 2 ] = _mm256_xor_pd( e[ 2 ], sign );        // { -1, -0, -x_max, -x_min }, for -x >= -x_max
+
+  e[ 3 ] = permute< 0, 1, 3, 2 >( e[ 1 ] );      // { 0, 1, y_max, y_min }
+  e[ 3 ] = _mm256_xor_pd( e[ 3 ], sign );        // { -0, -1, -y_max, -y_min }, for -y >= -y_max
+
+  e[ 4 ] = e[ 0 ];                               // { 1, 0, x_min, x_max }
+
+  // initial vertices
+
+  __m256d* vp = vertex_pair( begin );
+
+  vp[ 0 ] = permute< 0, 0, 3, 3 >( x_box );   // { x_min, x_min, x_max, x_max } : vertices 0 and 1:
+  vp[ 0 ] = _mm256_xor_pd( vp[ 0 ], sign_h ); // { x_min, -x_min, x_max, -x_max } : vertices 0 and 1:
+
+  vp[ 1 ] = permute< 0, 0, 2, 2 >( y_box );   // { y_min, y_min, y_min, y_min }
+  vp[ 1 ] = _mm256_xor_pd( vp[ 1 ], sign_h ); // { y_min, -y_min, y_min, -y_min }
+
+  vp[ 2 ] = _mm256_xor_pd( one, sign_h );     // { 1.0, -1.0, 1.0, -1.0 }
+
+  vp[ 3 ] = permute< 1, 1, 2, 2 >( x_box );   // { x_max, x_max, x_min, x_min } : vertices 2 and 3:
+  vp[ 3 ] = _mm256_xor_pd( vp[ 3 ], sign_h ); // { x_max, -x_max, x_min, -x_min }
+
+  vp[ 4 ] = permute< 1, 1, 3, 3 >( y_box );   // { y_max, y_max, y_max, y_max }
+  vp[ 4 ] = _mm256_xor_pd( vp[ 4 ], sign_h ); // { y_max, -y_max, y_max, -y_max }
+
+  vp[ 5 ] = vp[ 2 ];
+}
+
+int normalize_equations( __m256d* quadrant[ 4 ], int n, __m256i tiny, __m256i huge )
+{
+  constexpr unsigned long long x_index_epi8 = 0x1 << 4;
+  constexpr unsigned long long y_index_epi8 = 0x1 << 12;
+  constexpr unsigned long long z_index_epi8 = 0x1 << 16;
+
+  int ret = 0;
+  __m256i ones = ones_i();                          // { 0xFFFF'FFFF'FFFF'FFFF, ... }
+  __m256i exp_mask = _mm256_srli_epi64( ones, 53 ); // { 0x0000'0000'0000'07FF, ... }
+  __m256i one = _mm256_srli_epi64( ones, 54 );      // { 0x0000'0000'0000'03FF, ... }
+  __m256i sign = _mm256_slli_epi64( ones, 63 );     // { 0x8000'0000'0000'0000, ... }
+  exp_mask = _mm256_slli_epi64( exp_mask, 52 );     // { 0x7FF0'0000'0000'0000, ... }
+  one = _mm256_slli_epi64( one, 52 );               // { 0x3FF0'0000'0000'0000, ... }
+  sign = gather_lanes< 1, clear_lane >( sign );     // { sign_bit, sign_bit, 0, 0 }
+
+  for( int iq = 0; iq < 4; ++iq )
+  {
+    __m256d* end = quadrant[ iq ] + n;
+
+    while( quadrant[ iq ] < end )
+    {
+      __m256d* q = quadrant[ iq ];
+      __m256i exp = and_i( *q, exp_mask );
+      __m256i cmp_huge = _mm256_cmpgt_epi32( exp, huge );
+
+      if( _mm256_testz_si256( cmp_huge, cmp_huge ) ) // no huge entries
+      {
+        __m256i cmp_tiny = _mm256_cmpgt_epi32( tiny, exp );
+
+        if( _mm256_testz_si256( cmp_tiny, cmp_tiny ) ) // no entries are huge or tiny
+        {
+          ++quadrant[ iq ];
+          continue;
+        }
+
+        // no entries are huge, but some are tiny. We scale by 2^( huge - max_exp[ i ] )
+
+        __m256i exp_b = permute< 1, 0, 2, 3 >( exp ); // { exp[ 1 ], exp[ 0 ], exp[ 2 ], ? }
+        __m256i exp_c = gather_lanes< 1, 1 >( exp );  // { exp[ 2 ], exp[ 3 ], exp[ 2 ], ? }
+        __m256i max = _mm256_max_epi32( exp, exp_b ); // { max( exp[ 0 ], exp[ 1 ] ), ?, ?, ? }
+        max = _mm256_max_epi32( max, exp_c );         // { max( exp[ 0 ], exp[ 1 ], exp[ 2 ] ), ... }
+
+        int64_t int_max = _mm256_extract_epi64( max, 0 );
+        if( int_max == 0 ) // all exponents are zero.
+        {
+          __m256d cmp = _mm256_cmp_pd( *q, zero_d(), _CMP_EQ_OQ );
+          if( ( _mm256_movemask_pd( cmp ) & 0x7 ) == 0x7 ) // all coefficients are zero
+          { // drop the constraint
+            *q = *( --end );
+            continue;
+          }
+        }
+
+        max = permute< 0, 0, 0, 0 >( max );
+        __m256i de = _mm256_sub_epi32( huge, max );
+
+        // de may be larger than 0x3ff and we must scale in two steps
+
+        *q = xor_d( *q, sign );
+
+        __m256i hs = _mm256_srli_epi64( de, 1 ); // de/2
+        hs = _mm256_and_si256( hs, exp_mask );   // throwing away the remainder
+        __m256d scale = cast_d( _mm256_add_epi64( hs, one ) );
+        *q = _mm256_mul_pd( scale, *q );
+
+        __m256i ls = _mm256_sub_epi64( de, hs ); // de - hs
+        scale = cast_d( _mm256_add_epi64( ls, one ) );
+        *q = _mm256_mul_pd( scale, *q );
+      }
+      else // some huge entries
+      {
+        __m256i exp_b = permute< 1, 0, 2, 3 >( exp ); // { exp[ 1 ], exp[ 0 ], exp[ 2 ], ? }
+        __m256i exp_c = gather_lanes< 1, 1 >( exp );  // { exp[ 2 ], exp[ 3 ], exp[ 2 ], ? }
+        __m256i max = _mm256_max_epi32( exp, exp_b ); // { max( exp[ 0 ], exp[ 1 ] ), ?, ?, ? }
+        max = _mm256_max_epi32( max, exp_c );         // { max( exp[ 0 ], exp[ 1 ], exp[ 2 ] ), ... }
+
+        max = permute< 0, 0, 0, 0 >( max );
+        __m256i de = _mm256_sub_epi32( huge, max );   // de > -0x3ff
+        __m256d scale = cast_d( _mm256_add_epi64( de, one ) ); // this does not underflow
+
+        *q = xor_d( *q, sign );
+        *q = _mm256_mul_pd( scale, *q ); // this may be inexact, but the inexact entries will be dropped later
+      }
+
+      // cleaning up the remaining tiny entries
+
+      __m256i exp_d = and_i( *q, exp_mask );
+      __m256i cmp_d = _mm256_cmpgt_epi32( exp_d, tiny ); // 8 x ( 4 byte masks )
+      int cmp_d_mask = _mm256_movemask_epi8( cmp_d );    // 8 bits for x, 8 bits for y, 8 bits for z
+
+      if( cmp_d_mask & ( x_index_epi8 | y_index_epi8 ) )
+      {
+        // To preserve the solution set we must relax the constraint, rounding the left
+        // hand side up and the right hand side down, so that every solution of the old
+        // problem satisfies the new constraints. Since the lhs entries were flipped,
+        // we simply round all entries down, that is:
+        //
+        //   we replace x in ( -tiny, 0 ) by -tiny and x in [ 0, tiny ) by +0.0
+        //   ( and fix the zeros later )
+
+        __m256d zero = zero_d();
+        __m256i exp_f = and_i( *q, exp_mask );
+        __m256i full_sign = gather_lanes< 0, 0 >( sign );
+        __m256i minus_tiny = _mm256_or_si256( tiny, full_sign );
+        __m256i is_tiny_f = _mm256_cmpgt_epi32( tiny, exp_f );
+        __m256d non_negative = _mm256_cmp_pd( *q, zero, _CMP_GE_OQ );
+
+        // tiny negatives should be replaced by -tiny, tiny non-negatives are replaced by 0.
+        __m256d fix = blend( cast_d( minus_tiny ), zero, non_negative );
+
+        // fixing the tiny entries
+        *q = blend( *q, fix, is_tiny_f );
+
+        // restoring the rhs sign
+        *q = xor_d( *q, sign );
+
+        // cleaning up the zeros
+        __m256i abs = _mm256_slli_epi64( cast_i( *q ), 1 ); // throwing away the sign bit
+        __m256i is_zero = _mm256_cmpeq_epi64( abs, cast_i( zero ) );
+
+        *q = blend( *q, zero, is_zero );
+
+        ++quadrant[ iq ];
+      }
+      else
+      {
+        // | a | <= tiny and | b | <= tiny and | c | >= huge:
+        // The constraint is either irrelevant or fatal.
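+        // With the left hand side effectively zero, the row reads 0 >= huge rhs: a
+        // positive right hand side makes the whole quadrant infeasible, a negative
+        // one makes the row vacuous, so we either flag the quadrant or drop the row.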
+ + int pb_sign = _mm256_movemask_pd( *q ); + if( !( pb_sign & 0x4 ) ) // drop the quadrant: the right hand side is too large + { + ret |= ( 0x1 << iq ); + break; + } + + // else, drop the constraint: the right hand side is too small + + *q = *( --end ); + } + } + } + + return ret; +} + +} // namespace wm::nlp::lp2d +//---------------------------------------------------------------------------------------- + +extern "C" +{ + +int interval_system_2d( double* solution, double const* box, double const* eq, + unsigned int n_equations ) +{ + constexpr int max_quadrant_size = 32; + + using namespace wm::nlp::lp2d; + + double* solsPtr = solution; + + printf("box: x_min = %f, x_max = %f, y_min = %f, y_max = %f\n", box[0], box[1], box[2], box[3]); + printf("n_equations = %u\n", n_equations); + for (int i = 0; i < n_equations; i++) { + printf("equation %u: a_min = %f, a_max = %f, b_min = %f, b_max = %f, c_min = %f, c_max = %f\n", i, eq[6*i], eq[6*i + 1], eq[6*i + 2], eq[6*i + 3], eq[6*i + 4], eq[6*i + 5]); + } + + if( n_equations > ( max_quadrant_size / 2 ) ) + return -1; + + if( std::bit_cast< uint64_t >( box ) & 0x1F ) + return -2; + + if( std::bit_cast< uint64_t >( eq ) & 0x1F ) + return -3; + + + + int ret = 0; + int n_equations_q = n_equations / 2; + int n_equations_r = n_equations % 2; + + double const* eq_max = eq + n_equations_q; + + __m256i huge; + __m256i tiny; + __m256i signi = bounds( &tiny, &huge ); + __m256i exp_mask = _mm256_srli_epi64( ones_i(), 53 ); + exp_mask = _mm256_slli_epi64( exp_mask, 52 ); // { 0x7FF0'0000'0000'0000... } + + __m256d sign = cast_d( signi ); + __m256d sign_01 = blend< 0, 0, 1, 1 >( sign, zero_d() ); // { -1: -1, 0, 0 } + + int old_rounding_mode = std::fegetround(); + + if( std::fesetround( FE_DOWNWARD ) ) + return -4; + + __m256d quadrants[ 4 * max_quadrant_size ]; + + __m256d* q[ 4 ] = { quadrants, + quadrants + max_quadrant_size, + quadrants + 2 * max_quadrant_size, + quadrants + 3 * max_quadrant_size }; + + int empty_quadrant = 0; + double const* p_eq = eq; + int quadrant_is_not_empty = 0xF; + + for( ; p_eq < eq_max; p_eq += 12 ) + { + __m256d a0 = _mm256_load_pd( p_eq ); + __m256d aux = _mm256_load_pd( p_eq + 4 ); + __m256d bc1 = _mm256_load_pd( p_eq + 8 ); + + // the equations in this group are + // + // a0[ 0, 1 ] x + a0[ 2, 3 ] y = aux[ 0, 1 ] + // aux[ 2, 3 ] x + bc1[ 0, 1 ] y = bc1[ 2, 3 ] + + __m256d bc0 = gather_lanes< 1, 0 >( a0, aux ); + __m256d a1 = gather_lanes< 1, 0 >( aux ); + + // the equations in this group are now + // + // a0[ 0, 1 ] x + bc0[ 0, 1 ] y = bc0[ 2, 3 ] + // a1[ 0, 1 ] x + bc1[ 0, 1 ] y = bc1[ 2, 3 ] + + __m256d max_c0 = permute< 0, 1, 3, 2 >( bc0 ); //{ ?, ?, bc0[ 3 ], ? } + __m256d max_c1 = permute< 0, 1, 3, 2 >( bc1 ); //{ ?, ?, bc1[ 3 ], ? } + + for( int iq = 0; iq < 4; ++iq ) // iqth quadrant + { + q[ iq ][ 0 ] = gather_low( a0, bc0 ); // { a0[ 0 ], bc0[ 0 ], ?, ? } + q[ iq ][ 1 ] = gather_high( a0, bc0 ); // { a0[ 1 ], bc0[ 1 ], ?, ? } + q[ iq ][ 2 ] = gather_low( a1, bc1 ); // { a1[ 0 ], bc1[ 0 ], ?, ? } + q[ iq ][ 3 ] = gather_high( a1, bc1 ); // { a1[ 1 ], bc1[ 1 ], ?, ? } + + q[ iq ][ 0 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 0 ], max_c0 ); // { a0[ 0 ], bc0[ 0 ], bc0[ 3 ], ? } + q[ iq ][ 1 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 1 ], bc0 ); // { a0[ 1 ], bc0[ 1 ], bc0[ 2 ], ? } + q[ iq ][ 2 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 2 ], max_c1 ); // { a1[ 0 ], bc1[ 0 ], bc1[ 3 ], ? } + q[ iq ][ 3 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 3 ], bc1 ); // { a1[ 1 ], bc1[ 1 ], bc1[ 2 ], ? 
} + + q[ iq ][ 0 ] = xor_d( q[ iq ][ 0 ], sign ); // { -a0[ 0 ], -bc0[ 0 ], -bc0[ 3 ], ? } + q[ iq ][ 2 ] = xor_d( q[ iq ][ 2 ], sign ); // { -a1[ 0 ], -bc1[ 0 ], -bc1[ 3 ], ? } + + // moving to the next quadrant + + if( iq & 0x1 ) // replacing y by - y + { + bc0 = xor_d( bc0, sign_01 ); + bc1 = xor_d( bc1, sign_01 ); + bc0 = permute< 1, 0, 2, 3 >( bc0 ); + bc1 = permute< 1, 0, 2, 3 >( bc1 ); + } + else // replacing x by - x + { + a0 = xor_d( a0, sign_01 ); + a1 = xor_d( a1, sign_01 ); + a0 = permute< 1, 0, 2, 3 >( a0 ); + a1 = permute< 1, 0, 2, 3 >( a1 ); + } + } + + //---------------------------------------------------------------------------------------- + // Checking for overflow and underflow: to avoid headaches with underflow and overflow, we + // normalize equation in which some elements are either +0.0 or have absolute value outside + // the range [ tiny, huge ]. In this process we may detect that the intersection of the + // feasible region with some quadrants is empty, or that some equations are irrelevant. + //---------------------------------------------------------------------------------------- + + __m256d ra = gather_lanes< 0, 0 >( a0, a1 ); // { row[ 0 ][ 0 ], row[ 0 ][ 1 ], row[ 3 ][ 0 ], row[ 3 ][ 1 ] } + __m256i expa = and_i( ra, exp_mask ); + __m256i exp0 = and_i( bc0, exp_mask ); + __m256i exp1 = and_i( bc1, exp_mask ); + + __m256i min = _mm256_min_epi32( expa, exp0 ); + __m256i max = _mm256_max_epi32( expa, exp0 ); + min = _mm256_min_epi32( min, exp1 ); + max = _mm256_max_epi32( max, exp1 ); + + __m256i c_min = _mm256_cmpgt_epi32( tiny, min ); + __m256i c_max = _mm256_cmpgt_epi32( max, huge ); + __m256i bad = or_i( c_min, c_max ); + + if( _mm256_testz_si256( bad, bad ) ) + { + q[ 0 ] += 4; + q[ 1 ] += 4; + q[ 2 ] += 4; + q[ 3 ] += 4; + } + else + empty_quadrant |= normalize_equations( q, 4, tiny, huge ); + } + + if( n_equations_r ) + { + __m256d ab = _mm256_load_pd( p_eq ); + __m128d aux = _mm_load_pd( p_eq + 4 ); + + __m256d sign_23 = gather_lanes< 1, 0 >( sign_01 ); + + // the equations in this group are + // ab[ 0, 1 ] x + ab[ 2, 3 ] y = aux[ 0, 1 ] + + __m256d c = _mm256_castpd128_pd256( aux ); + c = gather_lanes< 0, 0 >( c ); + // the equations in this group are now + // ab[ 0, 1 ] x + ab[ 2, 3 ] y = c[ 0, 1 ] + + __m256d max_c = permute< 0, 1, 3, 3 >( c ); //{ ?, ?, c[ 1 ], ? } + + for( int iq = 0; iq < 4; ++iq ) // iqth quadrant + { + q[ iq ][ 0 ] = permute< 0, 2, 2, 3 >( ab ); // { ab[ 0 ], ab[ 2 ], ?, ? } + q[ iq ][ 1 ] = permute< 1, 3, 2, 3 >( ab ); // { ab[ 1 ], ab[ 3 ], ?, ? } + + q[ iq ][ 0 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 0 ], max_c ); // { ab[ 0 ], ab[ 2 ], c[ 1 ], ? } + q[ iq ][ 1 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 1 ], c ); // { ab[ 1 ], ab[ 3 ], c[ 0 ], ? } + + q[ iq ][ 0 ] = xor_d( q[ iq ][ 0 ], sign ); // { -ab[ 0 ], -ab[ 2 ], -c[ 1 ], ? } + + // moving to the next quadrant + + if( iq & 0x1 ) // replacing y by - y + { + ab = xor_d( ab, sign_23 ); + ab = permute< 0, 1, 3, 2 >( ab ); + } + else // replacing x by - x + { + ab = xor_d( ab, sign_01 ); + ab = permute< 1, 0, 2, 3 >( ab ); + } + } + + //---------------------------------------------------------------------------------------- + // Checking for overflow and underflow: to avoid headaches with underflow and overflow, we + // normalize equation in which some elements are either +0.0 or have absolute value outside + // the range [ tiny, huge ]. In this process we may detect that the intersection of the + // feasible region with some quadrants is empty, or that some equations are irrelevant. 
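+  //
+  // ( Reminder: after reflecting a quadrant onto x, y >= 0, each interval equation
+  //   [ aL, aH ] x + [ bL, bH ] y = [ cL, cH ] contributes the two linear rows
+  //   aH x + bH y >= cL and -aL x - bL y >= -cH per quadrant, which together keep
+  //   exactly the points that at least one choice of coefficients can satisfy. )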
+ //---------------------------------------------------------------------------------------- + + __m256i exp_ab = and_i( ab, exp_mask ); + __m256i exp_c = and_i( c, exp_mask ); + exp_c = blend< 0, 0, 1, 1 >( exp_c, exp_ab ); + + __m256i min = _mm256_min_epi32( exp_ab, exp_c ); + __m256i max = _mm256_max_epi32( exp_ab, exp_c ); + + __m256i c_min = _mm256_cmpgt_epi32( tiny, min ); + __m256i c_max = _mm256_cmpgt_epi32( max, huge ); + __m256i bad = or_i( c_min, c_max ); + + if( _mm256_testz_si256( bad, bad ) ) + { + q[ 0 ] += 2; + q[ 1 ] += 2; + q[ 2 ] += 2; + q[ 3 ] += 2; + } + else + empty_quadrant |= normalize_equations( q, 2, tiny, huge ); + } + + __m256d avx_box = _mm256_load_pd( box ); + __m256d cmp = _mm256_cmp_pd( avx_box, zero_d(), _CMP_GE_OQ ); + int positive_box = _mm256_movemask_pd( cmp ); + + __m256d edge[ 2 * ( max_quadrant_size + 2 ) ]; + __m256d vertex_pair[ 3 * ( max_quadrant_size + 2 ) ]; + bounded_convex bc( 2 * ( max_quadrant_size + 2 ), edge, vertex_pair ); + + constexpr uint x_min_field = 0x1; + constexpr uint x_max_field = 0x2; + constexpr uint y_min_field = 0x4; + constexpr uint y_max_field = 0x8; + + constexpr uint first_quadrant_field = 0x1; + constexpr uint second_quadrant_field = 0x2; + constexpr uint third_quadrant_field = 0x4; + constexpr uint fourth_quadrant_field = 0x8; + + if( positive_box & x_max_field ) // x_max >= 0 + { + if( !( empty_quadrant & first_quadrant_field ) && + ( positive_box & y_max_field ) ) // y_max >= 0, first quadrant + { + bc.setup( avx_box ); + bc.push( quadrants, q[ 0 ] ); + if( bc.type != bounded_convex_type::empty ) + { + ++ret; + __m256d bc_box = bc.box(); + _mm256_store_pd( solution, bc_box ); + solution += 4; + } + } + + if( !( empty_quadrant & fourth_quadrant_field ) && + !( positive_box & y_min_field ) ) // y_min < 0, fourth quadrant + { + __m256i y_sign = _mm256_slli_epi64( ones_i(), 63 ); + y_sign = gather_lanes< clear_lane, 1 >( y_sign ); + + __m256d q_box = permute< 0, 1, 3, 2 >( avx_box ); + q_box = xor_d( q_box, y_sign ); + + bc.setup( q_box ); + bc.push( quadrants + 3 * max_quadrant_size, q[ 3 ] ); + + if( bc.type != bounded_convex_type::empty ) + { + ++ret; + __m256i y_sign = _mm256_slli_epi64( ones_i(), 63 ); + y_sign = gather_lanes< clear_lane, 1 >( y_sign ); + + __m256d bc_box = bc.box(); + bc_box = xor_d( bc_box, y_sign ); + bc_box = permute< 0, 1, 3, 2 >( bc_box ); + _mm256_store_pd( solution, bc_box ); + solution += 4; + } + } + } + + if( !( positive_box & x_min_field ) ) // x_min < 0 + { + if( !( empty_quadrant & second_quadrant_field) && + ( positive_box & y_max_field ) ) // y_max >= 0, second quadrant + { + __m256i x_sign = _mm256_slli_epi64( ones_i(), 63 ); + x_sign = gather_lanes< 0, clear_lane >( x_sign ); + + __m256d q_box = permute< 1, 0, 2, 3 >( avx_box ); + q_box = xor_d( q_box, x_sign ); + + bc.setup( q_box ); + bc.push( quadrants + max_quadrant_size, q[ 1 ] ); + + if( bc.type != bounded_convex_type::empty ) + { + ++ret; + __m256d bc_box = bc.box(); + __m256i x_sign = _mm256_slli_epi64( ones_i(), 63 ); + x_sign = gather_lanes< 0, clear_lane >( x_sign ); + bc_box = xor_d( bc_box, x_sign ); + bc_box = permute< 1, 0, 2, 3 >( bc_box ); + _mm256_store_pd( solution, bc_box ); + solution += 4; + } + } + + if( !( empty_quadrant & third_quadrant_field ) && + !( positive_box & y_min_field ) ) // y_min < 0, third quadrant + { + __m256i sign = _mm256_slli_epi64( ones_i(), 63 ); + avx_box = permute< 1, 0, 3, 2 >( avx_box ); + avx_box = xor_d( avx_box, sign ); + bc.setup( avx_box ); + bc.push( quadrants + 2 * 
max_quadrant_size, q[ 2 ] );
+
+      if( bc.type != bounded_convex_type::empty )
+      {
+        ++ret;
+        __m256d bc_box = bc.box();
+        __m256i sign = _mm256_slli_epi64( ones_i(), 63 );
+        bc_box = xor_d( bc_box, sign );
+        bc_box = permute< 1, 0, 3, 2 >( bc_box );
+        _mm256_store_pd( solution, bc_box );
+      }
+    }
+  }
+
+  std::fesetround( old_rounding_mode );
+
+  for (int i = 0; i < ret; i++) {
+    printf("solution %d: x_min = %f, x_max = %f, y_min = %f, y_max = %f\n", i, solsPtr[4*i], solsPtr[4*i + 1], solsPtr[4*i + 2], solsPtr[4*i + 3]);
+  }
+
+  return ret;
+}
+
+} // extern "C"
+//----------------------------------------------------------------------------------------
\ No newline at end of file
diff --git a/brush-strokes/cbits/lp_2d.hpp b/brush-strokes/cbits/lp_2d.hpp
new file mode 100644
index 0000000..32ffa0b
--- /dev/null
+++ b/brush-strokes/cbits/lp_2d.hpp
@@ -0,0 +1,1265 @@
+#pragma once
+
+//----------------------------------------------------------------------------------------
+// Copyright (c) 2024 Walter Mascarenhas
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The MPL has a "Disclaimer of Warranty" and a "Limitation of Liability",
+// and we provide no guarantee regarding the correctness of this source code.
+//
+//----------------------------------------------------------------------------------------
+
+#include <immintrin.h>
+#include <cassert>
+#include <cstdint>
+
+typedef unsigned int uint;
+
+//----------------------------------------------------------------------------------------
+namespace wm::nlp::lp2d {
+
+//----------------------------------------------------------------------------------------
+// avx2 wrappers, to simplify the notation a bit and clarify the use of immediates
+//----------------------------------------------------------------------------------------
+
+inline __m256d and_d( __m256d x, __m256d y )
+{
+  return _mm256_and_pd( x, y );
+}
+
+inline __m256i and_i( __m256i x, __m256d y )
+{
+  __m256i yi = _mm256_castpd_si256( y );
+  return _mm256_and_si256( x, yi );
+}
+
+inline __m256i and_i( __m256d x, __m256i y )
+{
+  __m256i xi = _mm256_castpd_si256( x );
+  __m256i r  = _mm256_and_si256( xi, y );
+  return r;
+}
+
+inline __m256i and_i( __m256i x, __m256i y )
+{
+  return _mm256_and_si256( x, y );
+}
+
+inline __m256d and_not_d( __m256i x, __m256i y )
+{
+  __m256d xd = _mm256_castsi256_pd( x );
+  __m256d yd = _mm256_castsi256_pd( y );
+  return _mm256_andnot_pd( xd, yd );
+}
+
+inline __m256d and_not_d( __m256d x, __m256d y )
+{
+  return _mm256_andnot_pd( x, y );
+}
+
+inline __m256i and_not_i( __m256i x, __m256i y )
+{
+  __m256d xd = _mm256_castsi256_pd( x );
+  __m256d yd = _mm256_castsi256_pd( y );
+  __m256d rd = _mm256_andnot_pd( xd, yd );
+  __m256i r  = _mm256_castpd_si256( rd );
+  return r;
+}
+
+template < uint I0, uint I1, uint I2, uint I3 >
+requires ( ( I0 < 2 ) && ( I1 < 2 ) && ( I2 < 2 ) && ( I3 < 2 ) )
+__m256d blend( __m256d x, __m256d y )
+{
+  __m256d p = _mm256_blend_pd( x, y, ( I0 | ( I1 << 1 ) | ( I2 << 2 ) | ( I3 << 3 ) ) );
+  return p;
+}
+
+template < uint I0, uint I1, uint I2, uint I3 >
+requires ( ( I0 < 2 ) && ( I1 < 2 ) && ( I2 < 2 ) && ( I3 < 2 ) )
+__m256i blend( __m256i x, __m256i y )
+{
+  __m256d xd = _mm256_castsi256_pd( x );
+  __m256d yd = _mm256_castsi256_pd( y );
+  __m256d pd = _mm256_blend_pd( xd, yd, ( I0 | ( I1 << 1 ) | ( I2 << 2 ) | ( I3 << 3 ) ) );
+  __m256i p  = _mm256_castpd_si256( pd );
+  return p;
+}
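+
+// Usage sketch ( illustrative only, not used by the solver ): the immediate
+// template arguments select, for each 64-bit lane, whether the result comes
+// from x ( 0 ) or from y ( 1 ):
+//
+//   __m256d x = _mm256_setr_pd( 1.0, 2.0, 3.0, 4.0 );
+//   __m256d y = _mm256_setr_pd( 5.0, 6.0, 7.0, 8.0 );
+//   __m256d z = blend< 0, 0, 1, 1 >( x, y ); // z = { 1.0, 2.0, 7.0, 8.0 }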
+
+inline __m256d blend( __m256d a, __m256d b, __m256d mask )
+{
+  __m256d blended = _mm256_blendv_pd( a, b, mask );
+  return blended;
+}
+
+inline __m256d blend( __m256d a, __m256d b, __m256i mask )
+{
+  __m256d mask_d  = _mm256_castsi256_pd( mask );
+  __m256d blended = _mm256_blendv_pd( a, b, mask_d );
+  return blended;
+}
+
+inline __m256i blend( __m256i a, __m256i b, __m256i mask )
+{
+  __m256d a_d       = _mm256_castsi256_pd( a );
+  __m256d b_d       = _mm256_castsi256_pd( b );
+  __m256d mask_d    = _mm256_castsi256_pd( mask );
+  __m256d blended_d = _mm256_blendv_pd( a_d, b_d, mask_d );
+  __m256i blended   = _mm256_castpd_si256( blended_d );
+  return blended;
+}
+
+inline __m256d blend_max( __m256d x, __m256d y, __m256d greater_mask )
+{
+  __m256d max = blend( y, x, greater_mask );
+  return max;
+}
+
+inline __m256d blend_min( __m256d x, __m256d y, __m256d greater_mask )
+{
+  __m256d min = blend( x, y, greater_mask );
+  return min;
+}
+
+inline __m256d cast_d( __m256i x )
+{
+  return _mm256_castsi256_pd( x );
+}
+
+inline __m256i cast_i( __m256d x )
+{
+  return _mm256_castpd_si256( x );
+}
+
+//----------------------------------------------------------------------------------------
+//
+// gather_lanes< a, b >( x ), with a, b in { 0, 1, clear_lane }, returns
+// { x.lane( a ), x.lane( b ) }, with x.lane( clear_lane ) = 0
+//
+//----------------------------------------------------------------------------------------
+
+constexpr int clear_lane = 8;
+
+template < uint I0, uint I1 >
+requires ( ( ( I0 < 2 ) || ( I0 == clear_lane ) ) && ( ( I1 < 2 ) || ( I1 == clear_lane ) ) )
+__m256i gather_lanes( __m256i x )
+{
+  constexpr int lane_0 = I0;
+  constexpr int lane_1 = I1 << 4;
+  __m256d xd = _mm256_castsi256_pd( x );
+  __m256d gd = _mm256_permute2f128_pd( xd, xd, lane_0 | lane_1 );
+  __m256i g  = _mm256_castpd_si256( gd );
+  return g;
+}
+
+template < uint I0, uint I1 >
+requires ( ( ( I0 < 2 ) || ( I0 == clear_lane ) ) && ( ( I1 < 2 ) || ( I1 == clear_lane ) ) )
+__m256d gather_lanes( __m256d x )
+{
+  constexpr int lane_0 = I0;
+  constexpr int lane_1 = I1 << 4;
+
+  __m256d p;
+  if constexpr ( ( I0 == 0 ) && ( I1 == 1 ) )
+    p = x;
+  else
+    p = _mm256_permute2f128_pd( x, x, lane_0 | lane_1 );
+  return p;
+}
+
+//----------------------------------------------------------------------------------------
+//
+// gather_lanes< a, b >( x, y ), with a, b in { 0, 1, clear_lane }, returns
+// { x.lane( a ), y.lane( b ) }, with x.lane( clear_lane ) = 0
+//
+//----------------------------------------------------------------------------------------
+
+template < uint I0, uint I1 >
+requires ( ( ( I0 < 2 ) || ( I0 == clear_lane ) ) && ( ( I1 < 2 ) || ( I1 == clear_lane ) ) )
+__m256d gather_lanes( __m256d x, __m256d y )
+{
+  constexpr int lane_0 = I0;
+  constexpr int lane_1 = ( ( I1 == clear_lane ) ? clear_lane : ( I1 + 2 ) ) << 4;
+
+  __m256d p;
+  if constexpr ( ( I0 == 0 ) && ( I1 == 1 ) )
+    p = blend< 0, 0, 1, 1 >( x, y );
+  else
+    p = _mm256_permute2f128_pd( x, y, lane_0 | lane_1 );
+  return p;
+}
+
+//----------------------------------------------------------------------------------------
+// gather_high returns { x[ 1 ], y[ 1 ], x[ 3 ], y[ 3 ] }
+//----------------------------------------------------------------------------------------
+
+inline __m256d gather_high( __m256d x, __m256d y )
+{
+  return _mm256_unpackhi_pd( x, y );
+}
+
+//----------------------------------------------------------------------------------------
+// gather_low returns { x[ 0 ], y[ 0 ], x[ 2 ], y[ 2 ] }
+//----------------------------------------------------------------------------------------
+
+inline __m256d gather_low( __m256d x, __m256d y )
+{
+  return _mm256_unpacklo_pd( x, y );
+}
+
+//----------------------------------------------------------------------------------------
+// high_bits_64 returns the bits 63, 64 + 63, 128 + 63 and 192 + 63 of x
+//----------------------------------------------------------------------------------------
+
+inline int high_bits_64( __m256d x )
+{
+  return _mm256_movemask_pd( x );
+}
+
+inline int high_bits_64( __m256i x )
+{
+  __m256d xd = cast_d( x );
+  return _mm256_movemask_pd( xd );
+}
+
+//----------------------------------------------------------------------------------------
+// is_greater returns, in each lane, the full mask of x[ i ] > y[ i ]
+//----------------------------------------------------------------------------------------
+
+inline __m256d is_greater( __m256d x, __m256d y )
+{
+  __m256d gt = _mm256_cmp_pd( x, y, _CMP_GT_OQ );
+  return gt;
+}
+
+inline __m256i ones_i()
+{
+  __m256i r = _mm256_undefined_si256();
+  return _mm256_cmpeq_epi64( r, r );
+}
+
+inline __m256d or_d( __m256d x, __m256d y )
+{
+  return _mm256_or_pd( x, y );
+}
+
+inline __m256i or_i( __m256i x, __m256i y )
+{
+  return _mm256_or_si256( x, y );
+}
+
+template < uint I0, uint I1, uint I2, uint I3 >
+requires ( ( I0 < 4 ) && ( I1 < 4 ) && ( I2 < 4 ) && ( I3 < 4 ) )
+__m256d permute( __m256d x )
+{
+  __m256d p;
+  if constexpr ( ( I0 < 2 ) && ( I1 < 2 ) && ( I2 > 1 ) && ( I3 > 1 ) )
+    p = _mm256_permute_pd( x, ( I0 | ( I1 << 1 ) | ( ( I2 - 2 ) << 2 ) | ( ( I3 - 2 ) << 3 ) ) );
+  else
+    p = _mm256_permute4x64_pd( x, ( I0 | ( I1 << 2 ) | ( I2 << 4 ) | ( I3 << 6 ) ) );
+  return p;
+}
+
+template < uint I0, uint I1, uint I2, uint I3 >
+requires ( ( I0 < 4 ) && ( I1 < 4 ) && ( I2 < 4 ) && ( I3 < 4 ) )
+__m256i permute( __m256i x )
+{
+  __m256d xd = _mm256_castsi256_pd( x );
+  __m256d pd = permute< I0, I1, I2, I3 >( xd );
+  __m256i p  = _mm256_castpd_si256( pd );
+  return p;
+}
+
+inline __m256d xor_d( __m256d x, __m256d y )
+{
+  return _mm256_xor_pd( x, y );
+}
+
+inline __m256d xor_d( __m256d x, __m256i y )
+{
+  __m256d yd = cast_d( y );
+  return _mm256_xor_pd( x, yd );
+}
+
+inline __m256d xor_d( __m256i x, __m256d y )
+{
+  __m256d xd = cast_d( x );
+  return _mm256_xor_pd( xd, y );
+}
+
+inline __m256i xor_i( __m256i x, __m256i y )
+{
+  return _mm256_xor_si256( x, y );
+}
+
+inline __m256d zero_d()
+{
+  return _mm256_setzero_pd();
+}
+
+inline __m256i zero_i()
+{
+  return _mm256_setzero_si256();
+}
+
+//----------------------------------------------------------------------------------------
+// types describing locations and bounded convex sets
+//----------------------------------------------------------------------------------------
+
+enum class location
+{
+  null = -1024,
+  out = -1,
+  border = 0,
+  in = 1
+};
+
+enum class bounded_convex_type
+{
+  empty = 0,
+  point = 1,
+  segment = 2,
+  polygon = 3
+};
+
+//----------------------------------------------------------------------------------------
+// vertex_0 and vertex_1 scatter the arrays
+//
+// { inf_y, inf_z, inf_x, ? } and
+// { m_sup_y, m_sup_z, m_sup_x, ? }
+//
+// to vertex_pair, in the x, y, z order
+//----------------------------------------------------------------------------------------
+
+struct vertex_0
+{
+  static void scatter( __m256d* vertex_pair, __m256d inf, __m256d m_sup )
+  {
+    __m256d y = gather_low( inf, m_sup );  // { inf( y ), m_sup( y ), inf( x ), m_sup( x ) }
+    __m256d z = gather_high( inf, m_sup ); // { inf( z ), m_sup( z ), ?, ? }
+    __m256d x = gather_lanes< 1, 1 >( y ); // { inf( x ), m_sup( x ), ?, ? }
+
+    vertex_pair[ 0 ] = blend< 0, 0, 1, 1 >( x, vertex_pair[ 0 ] );
+    vertex_pair[ 1 ] = blend< 0, 0, 1, 1 >( y, vertex_pair[ 1 ] );
+    vertex_pair[ 2 ] = blend< 0, 0, 1, 1 >( z, vertex_pair[ 2 ] );
+  }
+};
+
+struct vertex_1
+{
+  static void scatter( __m256d* vertex_pair, __m256d inf, __m256d m_sup )
+  {
+    __m256d x = gather_low( inf, m_sup );  // { inf( y ), m_sup( y ), inf( x ), m_sup( x ) }
+    __m256d z = gather_high( inf, m_sup ); // { inf( z ), m_sup( z ), ?, ? }
+    __m256d y = gather_lanes< 0, 0 >( x ); // { ?, ?, inf( y ), m_sup( y ) }
+    z = gather_lanes< 0, 0 >( z );         // { ?, ?, inf( z ), m_sup( z ) }
+
+    vertex_pair[ 0 ] = blend< 0, 0, 1, 1 >( vertex_pair[ 0 ], x );
+    vertex_pair[ 1 ] = blend< 0, 0, 1, 1 >( vertex_pair[ 1 ], y );
+    vertex_pair[ 2 ] = blend< 0, 0, 1, 1 >( vertex_pair[ 2 ], z );
+  }
+};
+
+//----------------------------------------------------------------------------------------
+// cross computes the solution ( x / z, y / z ) of the equations
+//
+// a[ 0 ] x + a[ 1 ] y = a[ 2 ]
+// b[ 0 ] x + b[ 1 ] y = b[ 2 ]
+//
+// which is
+//
+// x = a[ 2 ] b[ 1 ] - a[ 1 ] b[ 2 ]
+// y = a[ 0 ] b[ 2 ] - a[ 2 ] b[ 0 ]
+// z = a[ 0 ] b[ 1 ] - a[ 1 ] b[ 0 ]
+//
+// which is convenient to order as
+//
+// y = a[ 0 ] b[ 2 ] - a[ 2 ] b[ 0 ]
+// z = a[ 0 ] b[ 1 ] - a[ 1 ] b[ 0 ]
+// x = a[ 2 ] b[ 1 ] - a[ 1 ] b[ 2 ]
+//
+// It assumes that x, y >= 0 and z > 0 and makes sure that the rounded values of
+// x, y and z also satisfy these lower bounds.
+//----------------------------------------------------------------------------------------
+
+template < class Vertex >
+inline void cross( __m256d* vertex_pair, __m256d a, __m256d b )
+{
+  __m256d inf_a   = _mm256_movedup_pd( a );     // { a[ 0 ], a[ 0 ], a[ 2 ], ? }
+  __m256d m_sup_b = _mm256_movedup_pd( b );     // { b[ 0 ], b[ 0 ], b[ 2 ], ? }
+
+  __m256d inf_b   = permute< 2, 1, 1, 3 >( b ); // { b[ 2 ], b[ 1 ], b[ 1 ], ? }
+  __m256d m_sup_a = permute< 2, 1, 1, 3 >( a ); // { a[ 2 ], a[ 1 ], a[ 1 ], ? }
+
+  __m256d inf_h   = _mm256_mul_pd( inf_a, inf_b );
+  __m256d m_sup_h = _mm256_mul_pd( m_sup_a, m_sup_b );
+
+  __m256d inf   = _mm256_fnmadd_pd( m_sup_a, m_sup_b, inf_h ); // inf(x)  = rounded_down( x )
+  __m256d m_sup = _mm256_fnmadd_pd( inf_a, inf_b, m_sup_h );   // -sup(x) = rounded_down( -x )
+
+  // x, y must be >= 0 and z must be positive, and the computed values may
+  // violate this due to rounding. In the rare case in which inf[ i ] <= 0,
+  // I can prove that the exact value of inf[ i ] is m_sup[ i ] - inf[ i ].
+
+  __m256d zero = zero_d();
+  __m256d cmp  = _mm256_cmp_pd( inf, zero, _CMP_LE_OQ ); // rounded_down( x ) <= 0 ?
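+  // When the comparison fires, inf is replaced by ( m_sup - inf ), which, by
+  // the remark above, is the exact value in that case; this restores the
+  // invariants x, y >= 0 and z > 0 on the rounded results.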
+  __m256d ds = _mm256_sub_pd( m_sup, inf );
+  inf = blend( inf, ds, cmp );
+
+  Vertex::scatter( vertex_pair, inf, m_sup );
+}
+
+//----------------------------------------------------------------------------------------
+// exact_location returns the exact location of the intersection ( x, y ) of the lines
+//
+// b0[ 0 ] x + b0[ 1 ] y >= b0[ 2 ]
+// b1[ 0 ] x + b1[ 1 ] y >= b1[ 2 ]
+//
+// with respect to the set
+//
+// e0 x + e1 y >= e2
+//
+// under the assumption that
+//
+// d = b0[ 0 ] b1[ 1 ] - b0[ 1 ] b1[ 0 ] > 0
+//
+// where ei = edge[ i ], b0 = border[ 0 ] and b1 = border[ 1 ]
+//
+// This amounts to evaluating the sign of
+//
+// p = e0 r + e1 s - e2 d
+//
+// r = b0[ 2 ] b1[ 1 ] - b0[ 1 ] b1[ 2 ]
+// s = b0[ 0 ] b1[ 2 ] - b0[ 2 ] b1[ 0 ] ,
+//
+// which motivates the evaluation of the six products
+//
+// group 1                group 2
+// e0 b0[ 2 ] b1[ 1 ],    -e0 b0[ 1 ] b1[ 2 ]
+// e1 b0[ 0 ] b1[ 2 ],    -e1 b0[ 2 ] b1[ 0 ]
+// e2 b0[ 1 ] b1[ 0 ],    -e2 b0[ 0 ] b1[ 1 ]
+//
+// and we want to find the sign of their sum.
+//
+// The function assumes that the edge entries are either normal or zero, i.e.,
+// there are no subnormal entries.
+//----------------------------------------------------------------------------------------
+
+location exact_location( __m256d e, __m256d b0, __m256d b1 );
+
+//----------------------------------------------------------------------------------------
+// The struct bounded_convex represents a bounded convex set on the plane, which can
+// have one of the following types:
+//
+// a) empty
+// b) point
+// c) segment
+// d) polygon
+//
+//----------------------------------------------------------------------------------------
+
+struct bounded_convex final
+{
+  constexpr bounded_convex( int alloc, __m256d* edge, __m256d* vertex_p )
+    : alloc( alloc ), edge( edge ), vertex_p( vertex_p ){}
+
+  void setup( __m256d box );
+
+  int insert_vertex( int at );
+  int remove_vertices( int begin_vertex, int end_vertex );
+
+  void set_empty()
+  {
+    end = 0;
+    begin = 0;
+    type = bounded_convex_type::empty;
+  }
+
+  void set_point( int first_vertex )
+  {
+    end = 1;
+    begin = 0;
+    type = bounded_convex_type::point;
+    edge[ 0 ] = edge[ first_vertex ];
+    edge[ 1 ] = edge[ first_vertex + 1 ];
+
+    __m256d* src = vertex_pair( first_vertex );
+
+    if( first_vertex & 0x1 )
+    {
+      vertex_p[ 0 ] = gather_lanes< 1, 1 >( src[ 0 ] );
+      vertex_p[ 1 ] = gather_lanes< 1, 1 >( src[ 1 ] );
+      vertex_p[ 2 ] = gather_lanes< 1, 1 >( src[ 2 ] );
+    }
+    else
+    {
+      vertex_p[ 0 ] = src[ 0 ];
+      vertex_p[ 1 ] = src[ 1 ];
+      vertex_p[ 2 ] = src[ 2 ];
+    }
+  }
+
+  void set_to_last_segment();
+
+  void set_segment( int first_vertex )
+  {
+    end = 2;
+    begin = 0;
+    type = bounded_convex_type::segment;
+    edge[ 0 ] = edge[ first_vertex ];
+    edge[ 1 ] = edge[ first_vertex + 1 ];
+    edge[ 2 ] = edge[ first_vertex + 2 ];
+
+    __m256d* src = vertex_pair( first_vertex );
+
+    if( first_vertex & 0x1 )
+    {
+      vertex_p[ 0 ] = gather_lanes< 1, 0 >( src[ 0 ], src[ 3 ] );
+      vertex_p[ 1 ] = gather_lanes< 1, 0 >( src[ 1 ], src[ 4 ] );
+      vertex_p[ 2 ] = gather_lanes< 1, 0 >( src[ 2 ], src[ 5 ] );
+    }
+    else
+    {
+      vertex_p[ 0 ] = *src;
+      vertex_p[ 1 ] = *( ++src );
+      vertex_p[ 2 ] = *( ++src );
+    }
+  }
+
+  __m256d box() const;
+  void push( __m256d* begin, __m256d* end );
+
+  __m256d plain_edge( int i ) const
+  {
+    return edge[ begin + i ];
+  }
+
+  void plain_vertex( __m256d* inf_msup, int i ) const
+  {
+    int ri = ( begin + i );
+    __m256d const* vp = vertex_pair( ri );
+    int di = 2 * ( ( begin + i ) & 0x1 );
+    inf_msup[ 0 ] = _mm256_setr_pd( vp[ 0 ][ di ], vp[ 1 ][ di ], vp[ 2 ][ di ], 0.0 );
+    inf_msup[ 1 ] = _mm256_setr_pd( vp[ 0 ][ di + 1 ], vp[ 1 ][ di + 1 ], vp[ 2 ][ di + 1 ], 0.0 );
+  }
+
+  __m256d* vertex_pair( int index )
+  {
+    __m256d* p = vertex_p + 3 * ( index / 2 );
+    return p;
+  }
+
+  __m256d const* vertex_pair( int index ) const
+  {
+    __m256d* p = vertex_p + 3 * ( index / 2 );
+    return p;
+  }
+
+  int end;
+  int begin;
+  int const alloc;
+  __m256d* const edge;
+  __m256d* const vertex_p;
+  bounded_convex_type type;
+};
+
+// huge = { 0x5500'0000'0000'0000, 0x5500'0000'0000'0000, 0x5500'0000'0000'0000, 0x7FFF'FFFF'7FFF'FFFF }
+// tiny = { 0x2BD0'0000'0000'0000, 0x2BD0'0000'0000'0000, 0x2BD0'0000'0000'0000, 0x0000'0000'0000'0000 }
+
+__m256i bounds( __m256i* tiny, __m256i* huge );
+
+int normalize_equations( __m256d* quadrant[ 4 ], int n, __m256i tiny, __m256i huge );
+
+//----------------------------------------------------------------------------------------
+//
+// The class locator is used to find the location of a point with respect to the half
+// plane
+//
+// a x + b y >= c z
+//
+// The template parameters indicate the signs of the coefficients a, b and c.
+//
+//----------------------------------------------------------------------------------------
+
+template < int flip_x, int flip_y, int flip_z >
+struct locator final
+{
+  //----------------------------------------------------------------------------------------
+  // The raw method assumes that
+  //
+  // edges[ 0 ] = { a, a, a, a }
+  // edges[ 1 ] = { b, b, b, b }
+  // edges[ 2 ] = { c, c, c, c }
+  //
+  // vertex_pair[ 0 ] = { x0_inf, x0_msup, x1_inf, x1_msup }
+  // vertex_pair[ 1 ] = { y0_inf, y0_msup, y1_inf, y1_msup }
+  // vertex_pair[ 2 ] = { z0_inf, z0_msup, z1_inf, z1_msup }
+  //
+  // and checks the location of the vertices ( x0, y0 ) and ( x1, y1 ) with respect to
+  // the half plane
+  //
+  // (-1)^flip_x a x + (-1)^flip_y b y >= (-1)^flip_z c z
+  //
+  // This is done by computing a x + b y - c z rounded up and down.
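+  //
+  // Reading the returned mask ( as locate_vertex_0 / locate_vertex_1 below do ):
+  // bit 0 set means the rounded-down value at vertex 0 is positive, so the
+  // vertex is strictly inside the half plane; bit 1 set means the rounded-down
+  // value of the negated form is positive, so the vertex is strictly outside;
+  // neither bit set means the sign is undecided and exact_location must be
+  // consulted. Bits 2 and 3 encode the same information for vertex 1.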
+  //
+  //----------------------------------------------------------------------------------------
+
+  static int raw( __m256d const* edge, __m256d const* vertex_pair )
+  {
+    __m256d chk;
+    __m256d zero = zero_d();
+
+    if constexpr ( flip_x )
+      chk = _mm256_mul_pd( edge[ 0 ], permute< 1, 0, 3, 2 >( vertex_pair[ 0 ] ) );
+    else
+      chk = _mm256_mul_pd( edge[ 0 ], vertex_pair[ 0 ] );
+
+    if constexpr ( flip_y )
+      chk = _mm256_fmadd_pd( edge[ 1 ], permute< 1, 0, 3, 2 >( vertex_pair[ 1 ] ), chk );
+    else
+      chk = _mm256_fmadd_pd( edge[ 1 ], vertex_pair[ 1 ], chk );
+
+    if constexpr ( flip_z )
+      chk = _mm256_fmadd_pd( edge[ 2 ], vertex_pair[ 2 ], chk );
+    else
+      chk = _mm256_fmadd_pd( edge[ 2 ], permute< 1, 0, 3, 2 >( vertex_pair[ 2 ] ), chk );
+
+    __m256d cmp = _mm256_cmp_pd( chk, zero, _CMP_GT_OQ );
+
+    int cm = _mm256_movemask_pd( cmp );
+
+    return cm;
+  }
+
+  static location locate_vertex_0( __m256d const* edge, __m256d const* vertex_pair )
+  {
+    location loc;
+    int cm = raw( edge, vertex_pair );
+
+    if( cm & 0x1 ) // inf( x[ 0 ] ) > 0
+      loc = location::in;
+    else
+    {
+      if( cm & 0x2 )
+        loc = location::out;
+      else
+        loc = location::null;
+    }
+
+    return loc;
+  }
+
+  static location locate_vertex_1( __m256d const* edge, __m256d const* vertex_pair )
+  {
+    location loc;
+    int cm = raw( edge, vertex_pair );
+
+    if( cm & 0x4 ) // the second vertex is in
+      loc = location::in;
+    else
+    {
+      if( cm & 0x8 ) // the second vertex is out
+        loc = location::out;
+      else
+        loc = location::null;
+    }
+
+    return loc;
+  }
+
+  static void locate_pair( location loc[ 2 ], __m256d const* edge, __m256d const* vertex_pair )
+  {
+    // computing a vertex[ 0 ] + b vertex[ 1 ] + c vertex[ 2 ]
+
+    int cm = raw( edge, vertex_pair );
+    if( cm & 0x1 ) // the first vertex is in
+      loc[ 0 ] = location::in;
+    else
+    {
+      if( cm & 0x2 ) // the first vertex is out
+        loc[ 0 ] = location::out;
+      else
+        loc[ 0 ] = location::null;
+    }
+
+    if( cm & 0x4 ) // the second vertex is in
+      loc[ 1 ] = location::in;
+    else
+    {
+      if( cm & 0x8 ) // the second vertex is out
+        loc[ 1 ] = location::out;
+      else
+        loc[ 1 ] = location::null;
+    }
+  }
+
+  static void insert( bounded_convex& r, __m256d new_edge )
+  {
+    __m256i ones = ones_i();
+    ones = _mm256_slli_epi64( ones, 63 );
+    __m256d sign = _mm256_castsi256_pd( ones );
+
+    __m256d coefficients[ 3 ] = { permute< 0, 0, 0, 0 >( new_edge ),
+                                  permute< 1, 1, 1, 1 >( new_edge ),
+                                  permute< 2, 2, 2, 2 >( new_edge ) };
+
+    coefficients[ 0 ] = _mm256_andnot_pd( sign, coefficients[ 0 ] );
+    coefficients[ 1 ] = _mm256_andnot_pd( sign, coefficients[ 1 ] );
+    coefficients[ 2 ] = _mm256_andnot_pd( sign, coefficients[ 2 ] );
+
+    if( r.type != bounded_convex_type::polygon )
+    {
+      if( r.type == bounded_convex_type::point )
+      {
+        location loc = locate_vertex_0( coefficients, r.vertex_p );
+
+        if( loc == location::null )
+          loc = exact_location( new_edge, r.edge[ 0 ], r.edge[ 1 ] );
+
+        if( loc == location::out )
+          r.set_empty();
+
+        return;
+      }
+
+      location loc[ 2 ];
+      locate_pair( loc, coefficients, r.vertex_p );
+
+      if( loc[ 0 ] == location::null )
+        loc[ 0 ] = exact_location( new_edge, r.edge[ 0 ], r.edge[ 1 ] );
+
+      if( loc[ 1 ] == location::null )
+        loc[ 1 ] = exact_location( new_edge, r.edge[ 1 ], r.edge[ 2 ] );
+
+      switch( loc[ 0 ] )
+      {
+        case location::in:
+        {
+          if( loc[ 1 ] == location::out )
+          {
+            r.edge[ 2 ] = new_edge;
+            cross< vertex_1 >( r.vertex_p, r.edge[ 1 ], new_edge );
+          }
+          return;
+        }
+        case location::border:
+        {
+          if( loc[ 1 ] == location::out )
+          {
+ r.end = 1; + r.type = bounded_convex_type::point; + } + return; + } + default: // location::out + { + switch( loc[ 1 ] ) + { + case location::in: + { + r.edge[ 0 ] = new_edge; + cross< vertex_0 >( r.vertex_p, new_edge, r.edge[ 1 ] ); + return; + } + case location::border: + { + r.end = 1; + r.edge[ 0 ] = r.edge[ 1 ]; + r.edge[ 1 ] = r.edge[ 2 ]; + r.type = bounded_convex_type::point; + r.vertex_p[ 0 ] = gather_lanes< 1, 1 >( r.vertex_p[ 0 ] ); + r.vertex_p[ 1 ] = gather_lanes< 1, 1 >( r.vertex_p[ 1 ] ); + r.vertex_p[ 2 ] = gather_lanes< 1, 1 >( r.vertex_p[ 2 ] ); + return; + } + default: + { + r.set_empty(); + return; + } + } + } + } + } + + location loc[ 2 ]; + + int index = r.begin; + int intersection = -1; + bool intersection_at_vertex; + + __m256d* vertex_pair_at_index = r.vertex_pair( index ); + + auto scan = [ & ]() + { + if( ++index >= r.end ) + return location::null; + + if( index & 0x1 ) + return loc[ 1 ]; + + vertex_pair_at_index += 3; + + if( index + 1 >= r.end ) + { + loc[ 0 ] = locate_vertex_0( coefficients, vertex_pair_at_index ); + + if( loc[ 0 ] == location::null ) + loc[ 0 ] = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] ); + } + else + { + locate_pair( loc, coefficients, vertex_pair_at_index ); + + if( loc[ 0 ] == location::null ) + loc[ 0 ] = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] ); + + if( loc[ 1 ] == location::null ) + loc[ 1 ] = exact_location( new_edge, r.edge[ index + 1 ], r.edge[ index + 2 ] ); + } + + return loc[ 0 ]; + }; + + if( index & 0x1 ) + { + location begin = locate_vertex_1( coefficients, vertex_pair_at_index ); + + if( begin == location::null ) + begin = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] ); + + switch( begin ) + { + case location::in: + goto cursor_in; + case location::border: + { + intersection = index; + intersection_at_vertex = true; + + ++index; + vertex_pair_at_index += 3; + + locate_pair( loc, coefficients, vertex_pair_at_index ); + + if( loc[ 0 ] == location::null ) + loc[ 0 ] = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] ); + + if( loc[ 1 ] == location::null ) + loc[ 1 ] = exact_location( new_edge, r.edge[ index + 1 ], r.edge[ index + 2 ] ); + + switch( loc[ 0 ] ) + { + case location::in: + goto cursor_in; + case location::border: + { + if( loc[ 1 ] == location::out ) + r.set_segment( r.begin ); + return; + } + case location::out: + goto cursor_out; + } + } + default: + goto cursor_out; + } + } + + locate_pair( loc, coefficients, vertex_pair_at_index ); + + if( loc[ 0 ] == location::null ) + loc[ 0 ] = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] ); + + if( loc[ 1 ] == location::null ) + loc[ 1 ] = exact_location( new_edge, r.edge[ index + 1 ], r.edge[ index + 2 ] ); + + switch( loc[ 0 ] ) + { + case location::in: + goto cursor_in; + case location::border: + { + intersection = index; + intersection_at_vertex = true; + + ++index; + + switch( loc[ 1 ] ) + { + case location::in: + goto cursor_in; + case location::border: + { + location next = locate_vertex_0( coefficients, vertex_pair_at_index + 3 ); + + if( next == location::null ) + next = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] ); + + if( next == location::out ) + r.set_segment( r.begin ); + + return; + } + default: + goto cursor_out; + } + } + default: + goto cursor_out; + } + + cursor_in: + { + location next_loc = scan(); + + switch( next_loc ) + { + case location::in: + goto cursor_in; + case location::border: + { + if( intersection >= 0 ) + { + if( 
!intersection_at_vertex ) + { + __m256d* vp = r.vertex_pair( intersection - 1 ); + if( intersection & 0x1 ) + cross< vertex_0 >( vp, new_edge, r.edge[ intersection ] ); + else + cross< vertex_1 >( vp, new_edge, r.edge[ intersection ] ); + + --intersection; + } + + r.end = index + 1; + r.begin = intersection; + r.edge[ r.end ] = new_edge; + r.edge[ r.begin ] = new_edge; + } + else + { + next_loc = scan(); + + if( next_loc == location::out ) + { + intersection = index - 1; + intersection_at_vertex = true; + goto cursor_out; + } + } + return; + } + case location::out: + { + if( intersection < 0 ) + { + intersection = index; + intersection_at_vertex = false; + goto cursor_out; + } + + if( intersection_at_vertex ) + r.begin = intersection; + else + { + r.begin = intersection - 1; + __m256d* vp = r.vertex_pair( r.begin ); + + if( r.begin & 0x1 ) + cross< vertex_1 >( vp, new_edge, r.edge[ intersection ] ); + else + cross< vertex_0 >( vp, new_edge, r.edge[ intersection ] ); + } + + __m256d* vp = r.vertex_pair( index ); + if( index & 0x1 ) + cross< vertex_1 >( vp, r.edge[ index ], new_edge ); + else + cross< vertex_0 >( vp, r.edge[ index ], new_edge ); + + r.end = index + 1; + r.edge[ r.end ] = new_edge; + r.edge[ r.begin ] = new_edge; + return; + } + case location::null: + { + assert( index == r.end ); + + if( intersection >= 0 ) + { + if( intersection_at_vertex ) + { + if( intersection == r.begin ) + return; + } + else + { + __m256d* first_pair = r.vertex_pair( intersection - 1 ); + + if( intersection & 0x1 ) + cross< vertex_0 >( first_pair, new_edge, r.edge[ intersection ] ); + else + cross< vertex_1 >( first_pair, new_edge, r.edge[ intersection ] ); + + --intersection; + } + + __m256d* last_pair = r.vertex_pair( index ); + + if( index & 0x1 ) + cross< vertex_1 >( last_pair, r.edge[ index ], new_edge ); + else + cross< vertex_0 >( last_pair, r.edge[ index ], new_edge ); + + ++r.end; + r.begin = intersection; + r.edge[ r.end ] = new_edge; + r.edge[ r.begin ] = new_edge; + } + + return; + } + } + } + + cursor_out: + { + location new_loc = scan(); + + switch( new_loc ) + { + case location::in: + { + if( intersection < 0 ) + { + intersection = index; + intersection_at_vertex = false; + goto cursor_in; + } + + if( intersection + 1 == index ) + { + assert( !intersection_at_vertex ); + int new_vertex = r.insert_vertex( intersection ); + + if( new_vertex == intersection ) + { + __m256d* vp = r.vertex_pair( intersection ); + if( intersection & 0x1 ) + { + cross< vertex_1 >( vp, r.edge[ intersection ], new_edge ); + cross< vertex_0 >( vp + 3, new_edge, r.edge[ intersection + 2 ] ); + } + else + { + cross< vertex_0 >( vp, r.edge[ intersection ], new_edge ); + cross< vertex_1 >( vp, new_edge, r.edge[ intersection + 2 ] ); + } + + r.edge[ intersection + 1 ] = new_edge; + } + else + { + assert( new_vertex == intersection - 1 ); + + __m256d* vp = r.vertex_pair( intersection ); + + if( intersection & 0x1 ) + { + cross< vertex_1 >( vp, new_edge, r.edge[ index ] ); + cross< vertex_0 >( vp, r.edge[ intersection ], new_edge) ; + } + else + { + cross< vertex_0 >( vp, new_edge, r.edge[ index ] ); + cross< vertex_1 >( vp - 3, r.edge[ intersection ], new_edge) ; + } + + r.edge[ new_vertex ] = r.edge[ intersection ]; + r.edge[ intersection ] = new_edge; + } + return; + } + + if( !intersection_at_vertex ) + { + __m256d* vp = r.vertex_pair( intersection ); + + if( intersection & 0x1 ) + cross< vertex_1 >( vp, r.edge[ intersection ], new_edge ); + else + cross< vertex_0 >( vp, r.edge[ intersection ], new_edge ); + } + + 
if( intersection + 2 < index ) + index = r.remove_vertices( intersection + 1, index - 1 ) + 2; + + __m256d* vp = r.vertex_pair( index - 1 ); + if( index & 0x1 ) + cross< vertex_0 >( vp, new_edge, r.edge[ index ] ); + else + cross< vertex_1 >( vp, new_edge, r.edge[ index ] ); + + r.edge[ index - 1 ] = new_edge; + return; + } + case location::border: + { + if( intersection >= 0 ) + { + if( intersection_at_vertex ) + { + if( ( intersection == r.begin ) && ( index + 1 == r.end ) ) + { + r.set_to_last_segment(); + return; + } + } + else + { + __m256d* vp = r.vertex_pair( intersection ); + if( intersection & 0x1 ) + cross< vertex_1 >( vp, r.edge[ intersection ], new_edge ); + else + cross< vertex_0 >( vp, r.edge[ intersection ], new_edge ); + } + + if( intersection + 1 < index ) + intersection = r.remove_vertices( intersection + 1, index ); + + r.edge[ intersection + 1 ] = new_edge; + return; + } + + new_loc = scan(); + switch( new_loc ) + { + case location::in: + { + intersection = index - 1; + intersection_at_vertex = true; + goto cursor_in; + } + case location::border: + { + r.set_segment( index - 1 ); + return; + } + default: // location::out or location::null: + { + r.set_point( index - 1 ); + return; + } + } + } + case location::out: + goto cursor_out; + case location::null: + { + assert( index == r.end ); + + if( intersection < 0 ) + { + r.set_empty(); + return; + } + + if( intersection_at_vertex ) + { + if( intersection == r.begin ) + { + r.set_point( r.begin ); + return; + } + } + else + { + __m256d* vp = r.vertex_pair( intersection ); + + if( intersection & 0x1 ) + cross< vertex_1 >( vp, r.edge[ intersection ], new_edge ); + else + cross< vertex_0 >( vp, r.edge[ intersection ], new_edge ); + } + + ++intersection; + __m256d* vp = r.vertex_pair( intersection ); + + if( intersection & 0x1 ) + cross< vertex_1 >( vp, new_edge, r.edge[ index ] ); + else + cross< vertex_0 >( vp, new_edge, r.edge[ index ] ); + + r.end = intersection + 1; + r.edge[ intersection ] = new_edge; + r.edge[ r.end ] = r.edge[ r.begin ]; + } + + return; + } + } + } +}; + +} // namespace wm::nlp::lp2d + +//---------------------------------------------------------------------------------------- +// +// The function interval_system_2d solves the system of interval equations +// +// ai x + bi y = ci +// +// for 0 <= i < n_equations and +// +// ai = [ eq[ 6 * i ], eq[ 6 * i + 1 ] ] with eq[ 6 * i ] <= eq[ 6 * i + 1 ] +// bi = [ eq[ 6 * i + 2 ], eq[ 6 * i + 3 ] ] with eq[ 6 * i + 2 ] <= eq[ 6 * i + 3 ] +// ci = [ eq[ 6 * i + 4 ], eq[ 6 * i + 5 ] ] with eq[ 6 * i + 4 ] <= eq[ 6 * i + 5 ] +// +// The function finds the solutions inside the box +// +// box[ 0 ] <= x <= box[ 1 ] +// box[ 2 ] <= y <= box[ 3 ] +// +// The solution consists of up to 4 boxes bx[ 0 ], bx[ 1 ], bx[ 2 ] and bx[ 3 ], with +// +// bx[ i ] = inf_x[ i ] <= x <= sup_x[ i ] +// inf_y[ i ] <= y <= sup_y[ i ] +// for +// +// inf_x[ i ] = solution[ 4 * i ] +// sup_x[ i ] = solution[ 4 * i + 1 ] +// inf_y[ i ] = solution[ 4 * i + 2 ] +// sup_y[ i ] = solution[ 4 * i + 3 ] +// +// The function returns the number of boxes in the solutions, which can be 0,1,2,3 or 4, +// or a negative number if the input is invalid or if it cannot change the rounding mode. +// +// It assumes that: +// +// a) n_equations <= 16 +// b) all doubles are finite, ie., not nan or +/-inf +// +// c) box and eq are such that +// +// box[ 0 ] <= box[ 1 ] +// box[ 2 ] <= box[ 3] +// eq[ 2 * i ] <= eq[ 2 i + 1 ] for i = 0,..., 3 * n_equations. 
+// +// d) box and eq are aligned at a 32 byte boundary +// +// It does not verify the input and its results should not be trusted if the inputs are +// invalid. +// +//---------------------------------------------------------------------------------------- + +extern "C" +{ +int interval_system_2d( double* solution, double const* box, double const* eq, unsigned int n_equations ); +} diff --git a/brush-strokes/src/lib/Math/Root/Isolation.hs b/brush-strokes/src/lib/Math/Root/Isolation.hs index 55f8e07..9fc4d9f 100644 --- a/brush-strokes/src/lib/Math/Root/Isolation.hs +++ b/brush-strokes/src/lib/Math/Root/Isolation.hs @@ -28,10 +28,12 @@ module Math.Root.Isolation , BisectionOptions(..), BisectionCoordPicker , defaultBisectionOptions - -- * The interval Newton method with Gauss–Seidel step - , GaussSeidel + -- * The interval Newton method + , NewtonOptions(..) + , defaultNewtonOptions + -- ** Options for the Gauss–Seidel step , GaussSeidelOptions(..), Preconditioner(..) - , defaultGaussSeidelOptions + , GaussSeidelUpdateMethod(..) -- * Box-consistency methods @@ -79,8 +81,9 @@ import Math.Monomial import Math.Root.Isolation.Bisection import Math.Root.Isolation.Core -import Math.Root.Isolation.GaussSeidel import Math.Root.Isolation.Narrowing +import Math.Root.Isolation.Newton +import Math.Root.Isolation.Newton.GaussSeidel -------------------------------------------------------------------------------- @@ -134,16 +137,16 @@ defaultRootIsolationAlgorithms minWidth ε_eq history box = -- Currently: we try an interval Gauss–Seidel. -- (box(1)- and box(2)-consistency don't seem to help when using -- the complete interval union Gauss–Seidel step) - _ -> Right $ AlgoWithOptions @GaussSeidel _gsOptions + _ -> Right $ AlgoWithOptions @Newton _newtonOptions NE.:| [] where verySmall = and $ ( \ cd -> width cd <= minWidth ) <$> coordinates box - _bisOptions = defaultBisectionOptions @n @d minWidth ε_eq box - _gsOptions = defaultGaussSeidelOptions @n @d history - _box1Options = defaultBox1Options @n @d ( minWidth * 100 ) ε_eq - _box2Options = ( defaultBox2Options @n @d minWidth ε_eq ) { box2LambdaMin = 0.001 } + _bisOptions = defaultBisectionOptions @n @d minWidth ε_eq box + _newtonOptions = NewtonLP -- defaultNewtonOptions @n @d history + _box1Options = defaultBox1Options @n @d ( minWidth * 100 ) ε_eq + _box2Options = ( defaultBox2Options @n @d minWidth ε_eq ) { box2LambdaMin = 0.001 } -- Did we reduce the box width by at least ε_eq -- in at least one of the coordinates? diff --git a/brush-strokes/src/lib/Math/Root/Isolation/Core.hs b/brush-strokes/src/lib/Math/Root/Isolation/Core.hs index e6bcb29..901443b 100644 --- a/brush-strokes/src/lib/Math/Root/Isolation/Core.hs +++ b/brush-strokes/src/lib/Math/Root/Isolation/Core.hs @@ -78,7 +78,8 @@ type Box n = 𝕀ℝ n -- NB: we require n <= d (no support for under-constrained systems). -- -- NB: in practice, this constraint should specialise away. -type BoxCt n d = +type BoxCt n d = ( n ~ 2, d ~ 3 ) +{- ( KnownNat n, KnownNat d , 1 <= n, 1 <= d, n <= d @@ -95,7 +96,7 @@ type BoxCt n d = , Module Double ( T ( ℝ d ) ) , Representable Double ( ℝ d ) ) - +-} -- | Boxes we are done with and will not continue processing. 
data DoneBoxes n = DoneBoxes
diff --git a/brush-strokes/src/lib/Math/Root/Isolation/Newton.hs b/brush-strokes/src/lib/Math/Root/Isolation/Newton.hs
new file mode 100644
index 0000000..a6a4072
--- /dev/null
+++ b/brush-strokes/src/lib/Math/Root/Isolation/Newton.hs
@@ -0,0 +1,171 @@
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE UndecidableInstances #-}
+
+module Math.Root.Isolation.Newton
+  ( -- * The interval Newton method,
+    -- with Gauss–Seidel step or explicit linear programming
+    Newton
+  , intervalNewton
+
+    -- ** Configuration options
+  , NewtonOptions(..)
+  , defaultNewtonOptions
+  )
+  where
+
+-- base
+import Prelude hiding ( unzip )
+import Control.Arrow
+  ( first )
+import Data.Bifunctor
+  ( Bifunctor(bimap) )
+import Data.Kind
+  ( Type )
+import Data.List
+  ( partition )
+import GHC.TypeNats
+  ( Nat, KnownNat, type (<=) )
+
+-- transformers
+import Control.Monad.Trans.Writer.CPS
+  ( Writer, tell )
+
+-- MetaBrush
+import Math.Algebra.Dual
+  ( D )
+import Math.Interval
+import Math.Linear
+import Math.Module
+  ( Module(..) )
+import Math.Monomial
+  ( MonomialBasis(..), linearMonomial, zeroMonomial )
+import Math.Root.Isolation.Core
+import Math.Root.Isolation.Newton.GaussSeidel
+import Math.Root.Isolation.Newton.LP
+import Math.Root.Isolation.Utils
+
+--------------------------------------------------------------------------------
+-- Interval Newton
+
+-- | The interval Newton method; see 'intervalNewton'.
+data Newton
+instance BoxCt n d => RootIsolationAlgorithm Newton n d where
+  type instance StepDescription Newton = ()
+  type instance RootIsolationAlgorithmOptions Newton n d = NewtonOptions n d
+  rootIsolationAlgorithm opts _thisRoundHist _prevRoundsHist eqs box = do
+    res <- intervalNewton @n @d opts eqs box
+    return ( (), res )
+  {-# INLINEABLE rootIsolationAlgorithm #-}
+  {-# SPECIALISE rootIsolationAlgorithm
+        :: RootIsolationAlgorithmOptions Newton 2 3
+        -> [ ( RootIsolationStep, Box 2 ) ]
+        -> BoxHistory 2
+        -> ( 𝕀ℝ 2 -> D 1 ( 𝕀ℝ 2 ) ( 𝕀ℝ 3 ) )
+        -> Box 2
+        -> Writer ( DoneBoxes 2 ) ( StepDescription Newton, [ Box 2 ] ) #-}
+    -- NB: including this to be safe. The specialiser seems to sometimes
+    -- be able to generate this specialisation on its own, and sometimes not.
+
+-- | Options for the interval Newton method.
+type NewtonOptions :: Nat -> Nat -> Type
+data NewtonOptions n d where
+  -- | Use the Gauss–Seidel method to solve linear systems.
+  NewtonGaussSeidel
+    :: GaussSeidelOptions n d -> NewtonOptions n d
+  -- | Use linear programming to solve linear systems (2 dimensions only).
+  NewtonLP
+    :: NewtonOptions 2 d
+
+-- | Default options for the interval Newton method.
+defaultNewtonOptions
+  :: forall n d
+  .  ( KnownNat n, KnownNat d
+     , 1 <= n, 1 <= d, n <= d
+     , Representable Double ( ℝ n )
+     , Representable Double ( ℝ d )
+     )
+  => BoxHistory n
+  -> NewtonOptions n d
+defaultNewtonOptions history =
+  NewtonGaussSeidel $ defaultGaussSeidelOptions history
+{-# INLINEABLE defaultNewtonOptions #-}
+
+-- | The interval Newton method, taking either a Gauss–Seidel
+-- or a linear-programming step.
+intervalNewton
+  :: forall n d
+  . 
BoxCt n d + => NewtonOptions n d + -> ( 𝕀ℝ n -> D 1 ( 𝕀ℝ n ) ( 𝕀ℝ d ) ) + -- ^ equations + -> 𝕀ℝ n + -- ^ box + -> Writer ( DoneBoxes n ) [ 𝕀ℝ n ] +intervalNewton opts eqs x = case opts of + NewtonGaussSeidel + ( GaussSeidelOptions + { gsPreconditioner = precondMeth + , gsPickEqs = pickEqs + , gsUpdate + } ) -> + let x_mid = singleton $ boxMidpoint x + f :: 𝕀ℝ n -> 𝕀ℝ n + f = \ x_0 -> pickEqs $ eqs x_0 `monIndex` zeroMonomial + f'_x :: Vec n ( 𝕀ℝ n ) + f'_x = fmap ( \ i -> pickEqs $ eqs x `monIndex` linearMonomial i ) ( universe @n ) + + -- Interval Newton method: take one Gauss–Seidel step + -- for the system of equations f'(x) ( x - x_mid ) = - f(x_mid). + minus_f_x_mid = unT $ -1 *^ T ( boxMidpoint $ f x_mid ) + + -- Precondition the above linear system into A ( x - x_mid ) = B. + ( a, b ) = precondition precondMeth + f'_x ( singleton minus_f_x_mid ) + + -- NB: we have to change coordinates, putting the midpoint of the box + -- at the origin, in order to take a Gauss–Seidel step. + gsGuesses = map ( first ( \ x' -> unT $ x' ^+^ T x_mid ) ) + $ gaussSeidelUpdate gsUpdate a b ( T x ^-^ T x_mid ) + ( done, todo ) = bimap ( map fst ) ( map fst ) + $ partition snd gsGuesses + in -- If the Gauss–Seidel step was a contraction, then the box + -- contains a unique solution (by the Banach fixed point theorem). + -- + -- These boxes can thus be directly added to the solution set: + -- Newton's method is guaranteed to converge to the unique solution. + do tell $ noDoneBoxes { doneSolBoxes = done } + return todo + NewtonLP -> + -- TODO: reduce duplication with the above. + let x_mid = singleton $ boxMidpoint x + f :: 𝕀ℝ 2 -> 𝕀ℝ d + f = \ x_0 -> eqs x_0 `monIndex` zeroMonomial + f'_x :: Vec 2 ( 𝕀ℝ d ) + f'_x = fmap ( \ i -> eqs x `monIndex` linearMonomial i ) ( universe @2 ) + + minus_f_x_mid = unT $ -1 *^ T ( boxMidpoint $ f x_mid ) + ( a, b ) = ( f'_x, singleton minus_f_x_mid ) + lpGuesses = map ( first ( \ x' -> unT $ x' ^+^ T x_mid ) ) + $ solveIntervalLinearEquations a b ( T x ^-^ T x_mid ) + ( done, todo ) = bimap ( map fst ) ( map fst ) + $ partition snd lpGuesses + in do tell $ noDoneBoxes { doneSolBoxes = done } + return todo +{-# INLINEABLE intervalNewton #-} +{- + +mbDeg = topologicalDegree 0.005 f x +det = case f'_x of + Vec [ c1, c2 ] -> + let a_11 = c1 `index` Fin 1 + a_12 = c2 `index` Fin 1 + a_21 = c1 `index` Fin 2 + a_22 = c2 `index` Fin 2 + in a_11 * a_22 - a_12 * a_21 + _ -> error "TODO: just testing n=2 here" + +if | not $ 0 ∈ det + , mbDeg == Just 0 + -> return [] + -- If the Jacobian is invertible over the box, then the topological + -- degree tells us exactly how many solutions there are in the box. +-} diff --git a/brush-strokes/src/lib/Math/Root/Isolation/GaussSeidel.hs b/brush-strokes/src/lib/Math/Root/Isolation/Newton/GaussSeidel.hs similarity index 58% rename from brush-strokes/src/lib/Math/Root/Isolation/GaussSeidel.hs rename to brush-strokes/src/lib/Math/Root/Isolation/Newton/GaussSeidel.hs index b1b5ff1..99e8dd3 100644 --- a/brush-strokes/src/lib/Math/Root/Isolation/GaussSeidel.hs +++ b/brush-strokes/src/lib/Math/Root/Isolation/Newton/GaussSeidel.hs @@ -1,33 +1,26 @@ -{-# LANGUAGE ScopedTypeVariables #-} -{-# LANGUAGE UndecidableInstances #-} +{-# LANGUAGE ScopedTypeVariables #-} -module Math.Root.Isolation.GaussSeidel - ( -- * The interval Newton method with Gauss–Seidel step - GaussSeidel - , intervalGaussSeidel - - -- ** Configuration options - , GaussSeidelOptions(..), Preconditioner(..) 
+-- | The Gauss–Seidel method for solving systems of +-- interval linear equations +module Math.Root.Isolation.Newton.GaussSeidel + ( -- * Gauss–Seidel step + gaussSeidelUpdate + -- ** Options for the Gauss–Seidel method + , GaussSeidelOptions(..) + , Preconditioner(..), GaussSeidelUpdateMethod(..) , defaultGaussSeidelOptions + , precondition ) where -- base import Prelude hiding ( unzip ) -import Control.Arrow - ( first ) -import Data.Bifunctor - ( Bifunctor(bimap) ) import Data.Coerce ( coerce ) import Data.Kind ( Type ) import Data.Foldable - ( toList ) -import Data.List - ( partition ) -import Data.List.NonEmpty - ( unzip ) + ( toList ) import Data.Proxy ( Proxy(..) ) import Data.Type.Ord @@ -41,44 +34,16 @@ import qualified Eigen.Matrix as Eigen , generate, inverse, unsafeCoeff ) --- transformers -import Control.Monad.Trans.Writer.CPS - ( Writer, tell ) - -- MetaBrush -import Math.Algebra.Dual - ( D ) import Math.Interval import Math.Linear -import Math.Module - ( Module(..) ) -import Math.Monomial - ( MonomialBasis(..), linearMonomial, zeroMonomial ) import Math.Root.Isolation.Core +import Math.Root.Isolation.Utils -------------------------------------------------------------------------------- -- Gauss–Seidel --- | The interval Newton method with a Gauss–Seidel step; see 'intervalGaussSeidel'. -data GaussSeidel -instance BoxCt n d => RootIsolationAlgorithm GaussSeidel n d where - type instance StepDescription GaussSeidel = () - type instance RootIsolationAlgorithmOptions GaussSeidel n d = GaussSeidelOptions n d - rootIsolationAlgorithm opts _thisRoundHist _prevRoundsHist eqs box = do - res <- intervalGaussSeidel @n @d opts eqs box - return ( (), res ) - {-# INLINEABLE rootIsolationAlgorithm #-} - {-# SPECIALISE rootIsolationAlgorithm - :: RootIsolationAlgorithmOptions GaussSeidel 2 3 - -> [ ( RootIsolationStep, Box 2 ) ] - -> BoxHistory 2 - -> ( 𝕀ℝ 2 -> D 1 ( 𝕀ℝ 2 ) ( 𝕀ℝ 3 ) ) - -> Box 2 - -> Writer ( DoneBoxes 2 ) ( StepDescription GaussSeidel, [ Box 2 ] ) #-} - -- NB: including this to be safe. The specialiser seems to sometimes - -- be able to generate this specialisation on its own, and sometimes not. - --- | Options for the interval Gauss–Seidel method. +-- | Options for the Gauss–Seidel method. type GaussSeidelOptions :: Nat -> Nat -> Type data GaussSeidelOptions n d = GaussSeidelOptions @@ -98,7 +63,7 @@ data GaussSeidelUpdateMethod | GS_Complete deriving stock Show --- | Default options for the interval Gauss–Seidel method. +-- | Default options for the Gauss–Seidel step. defaultGaussSeidelOptions :: forall n d . ( KnownNat n, KnownNat d @@ -133,72 +98,6 @@ data Preconditioner | InverseMidpoint deriving stock Show --- | Interval Newton method with Gauss–Seidel step. -intervalGaussSeidel - :: forall n d - . BoxCt n d - => GaussSeidelOptions n d - -> ( 𝕀ℝ n -> D 1 ( 𝕀ℝ n ) ( 𝕀ℝ d ) ) - -- ^ equations - -> 𝕀ℝ n - -- ^ box - -> Writer ( DoneBoxes n ) [ 𝕀ℝ n ] -intervalGaussSeidel - ( GaussSeidelOptions - { gsPreconditioner = precondMeth - , gsPickEqs = pickEqs - , gsUpdate - } ) - eqs - x - | let x_mid = singleton $ boxMidpoint x - f :: 𝕀ℝ n -> 𝕀ℝ n - f = \ x_0 -> pickEqs $ eqs x_0 `monIndex` zeroMonomial - - f'_x :: Vec n ( 𝕀ℝ n ) - f'_x = fmap ( \ i -> pickEqs $ eqs x `monIndex` linearMonomial i ) ( universe @n ) - - = let -- Interval Newton method: take one Gauss–Seidel step - -- for the system of equations f'(x) ( x - x_mid ) = - f(x_mid). - minus_f_x_mid = unT $ -1 *^ T ( boxMidpoint $ f x_mid ) - - -- Precondition the above linear system into A ( x - x_mid ) = B. 
- ( a, b ) = precondition precondMeth - f'_x ( singleton minus_f_x_mid ) - - -- NB: we have to change coordinates, putting the midpoint of the box - -- at the origin, in order to take a Gauss–Seidel step. - gsGuesses = map ( first ( \ x' -> unT $ x' ^+^ T x_mid ) ) - $ gaussSeidelUpdate gsUpdate a b ( T x ^-^ T x_mid ) - ( done, todo ) = bimap ( map fst ) ( map fst ) - $ partition snd gsGuesses - in -- If the Gauss–Seidel step was a contraction, then the box - -- contains a unique solution (by the Banach fixed point theorem). - -- - -- These boxes can thus be directly added to the solution set: - -- Newton's method is guaranteed to converge to the unique solution. - do tell $ noDoneBoxes { doneSolBoxes = done } - return todo -{-# INLINEABLE intervalGaussSeidel #-} -{- - -mbDeg = topologicalDegree 0.005 f x -det = case f'_x of - Vec [ c1, c2 ] -> - let a_11 = c1 `index` Fin 1 - a_12 = c2 `index` Fin 1 - a_21 = c1 `index` Fin 2 - a_22 = c2 `index` Fin 2 - in a_11 * a_22 - a_12 * a_21 - _ -> error "TODO: just testing n=2 here" - -if | not $ 0 ∈ det - , mbDeg == Just 0 - -> return [] - -- If the Jacobian is invertible over the box, then the topological - -- degree tells us exactly how many solutions there are in the box. --} - -- | A partial or complete Gauss–Seidel step for the equation \( A X = B \), -- refining the initial guess box for \( X \) into up to \( 2^n \) (disjoint) new boxes. gaussSeidelUpdate @@ -291,22 +190,6 @@ gaussSeidelStep_Complete as b ( T x0 ) = coerce $ do return ( x', and subs ) {-# INLINEABLE gaussSeidelStep_Complete #-} -fromComponents - :: forall n - . ( Representable Double ( ℝ n ), n ~ RepDim ( ℝ n ) ) - => ( Fin n -> [ ( 𝕀 Double, Bool ) ] ) -> [ ( 𝕀ℝ n, Vec n Bool ) ] -fromComponents f = do - ( xs, bs ) <- unzip <$> traverse f ( universe @n ) - return $ ( tabulate $ \ i -> xs ! i, bs ) - -- TODO: this could be more efficient. -{-# INLINEABLE fromComponents #-} - --- | The midpoint of a box. -boxMidpoint :: Representable Double ( ℝ n ) => 𝕀ℝ n -> ℝ n -boxMidpoint box = - tabulate $ \ i -> midpoint ( box `index` i ) -{-# INLINEABLE boxMidpoint #-} - -- | Pre-condition the system \( AX = B \). precondition :: forall n @@ -340,16 +223,3 @@ precondition meth as b = ) ( universe @n ) {-# INLINEABLE precondition #-} - --- | Matrix multiplication \( A v \). -matMulVec - :: forall n m - . ( Representable Double ( ℝ n ), Representable Double ( ℝ m ) ) - => Vec m ( ℝ n ) -- ^ columns of the matrix \( A ) - -> 𝕀ℝ m -- ^ vector \( v \) - -> 𝕀ℝ n -matMulVec as v = tabulate $ \ r -> - sum [ scaleInterval ( a `index` r ) ( index v c ) - | ( c, a ) <- toList ( (,) <$> universe @m <*> as ) - ] -{-# INLINEABLE matMulVec #-} diff --git a/brush-strokes/src/lib/Math/Root/Isolation/Newton/LP.hs b/brush-strokes/src/lib/Math/Root/Isolation/Newton/LP.hs new file mode 100644 index 0000000..fe2802d --- /dev/null +++ b/brush-strokes/src/lib/Math/Root/Isolation/Newton/LP.hs @@ -0,0 +1,132 @@ +{-# LANGUAGE CApiFFI #-} +{-# LANGUAGE ForeignFunctionInterface #-} +{-# LANGUAGE ParallelListComp #-} +{-# LANGUAGE ScopedTypeVariables #-} + +-- | A linear programming approach for solving systems of +-- interval linear equations. +module Math.Root.Isolation.Newton.LP + ( solveIntervalLinearEquations ) + where + +-- base +import Control.Arrow + ( (&&&) ) +import Data.Coerce + ( coerce ) +import Data.Foldable + ( toList ) +import Foreign.C.Types + ( CDouble(..), CInt(..), CUInt(..) 
)
+import Foreign.Marshal
+  ( allocaArray, peekArray, pokeArray, with, withArray )
+import Foreign.Ptr
+  ( Ptr, castPtr )
+import Foreign.Storable
+  ( Storable(..) )
+import GHC.TypeNats
+  ( KnownNat, natVal' )
+import GHC.Exts
+  ( proxy# )
+import System.IO.Unsafe
+  ( unsafePerformIO )
+
+-- MetaBrush
+import Math.Interval
+import Math.Linear
+
+--------------------------------------------------------------------------------
+-- Linear programming approach to solving systems of linear equations
+-- (in two variables).
+
+-- | Solve the system of interval linear equations \( A X = B \)
+-- using linear programming.
+solveIntervalLinearEquations
+  :: ( KnownNat d, Representable Double ( ℝ d ), Show ( ℝ d ) )
+  => Vec 2 ( 𝕀ℝ d ) -- ^ columns of \( A \)
+  -> 𝕀ℝ d           -- ^ \( B \)
+  -> T ( 𝕀ℝ 2 )     -- ^ initial box \( X \)
+  -> [ ( T ( 𝕀ℝ 2 ), Bool ) ]
+solveIntervalLinearEquations a b x =
+  let !sols = unsafePerformIO $ intervalSystem2D_LP a b x
+  in if
+      | any hasNaNs sols
+      -> [ ( x, False ) ]
+      | length sols <= 1
+      -> map ( id &&& isSubBox x ) sols
+      | otherwise
+      -> map ( , False ) sols
+
+-- Assuming the second box is a subset of the first box, returns whether it
+-- is in fact a strict subset.
+isSubBox :: T ( 𝕀ℝ 2 ) -> T ( 𝕀ℝ 2 ) -> Bool
+isSubBox
+  ( T ( 𝕀 ( ℝ2 x1_min y1_min ) ( ℝ2 x1_max y1_max ) ) )
+  ( T ( 𝕀 ( ℝ2 x2_min y2_min ) ( ℝ2 x2_max y2_max ) ) )
+  =  ( ( x2_min > x1_min && x2_max < x1_max ) || x2_min == x2_max )
+  && ( ( y2_min > y1_min && y2_max < y1_max ) || y2_min == y2_max )
+
+hasNaNs :: T ( 𝕀ℝ 2 ) -> Bool
+hasNaNs ( T ( 𝕀 ( ℝ2 x_min y_min ) ( ℝ2 x_max y_max ) ) ) =
+  any ( \ x -> isNaN x || isInfinite x ) [ x_min, y_min, x_max, y_max ]
+
+intervalSystem2D_LP
+  :: forall d
+  .  ( KnownNat d, Representable Double ( ℝ d ) )
+  => Vec 2 ( 𝕀ℝ d )
+  -> 𝕀ℝ d
+  -> T ( 𝕀ℝ 2 )
+  -> IO [ T ( 𝕀ℝ 2 ) ]
+intervalSystem2D_LP a b x =
+  allocaArray 4 \ ptrSolutions ->
+  with ( CBox $ unT x ) \ ptrBox ->
+  withArray ( mkEquationArray a b ) \ ptrEqs -> do
+    CInt nbSols <-
+      interval_system_2d ptrSolutions ptrBox ptrEqs ( fromIntegral d )
+    if nbSols < 0
+    then
+      error $ unlines
+        [ "interval_system_2d returned with exit code " ++ show nbSols
+        , "This probably means it was given invalid input."
+        ]
+    else
+      coerce <$> peekArray ( fromIntegral nbSols ) ptrSolutions
+  where
+    d = natVal' @d proxy#
+
+mkEquationArray
+  :: ( KnownNat d, Representable Double ( ℝ d ) )
+  => Vec 2 ( 𝕀ℝ d ) -> 𝕀ℝ d -> [ CEqn ]
+mkEquationArray ( Vec [ a_x, a_y ] ) b =
+  [ CEqn a_x_i a_y_i b_i
+  | a_x_i <- toList $ coordinates a_x
+  | a_y_i <- toList $ coordinates a_y
+  | b_i   <- toList $ coordinates b
+  ]
+mkEquationArray _ _ = error "impossible"
+
+foreign import ccall "interval_system_2d"
+  interval_system_2d :: Ptr CBox -> Ptr CBox -> Ptr CEqn -> CUInt -> IO CInt
+
+data CEqn = CEqn !( 𝕀 Double ) !( 𝕀 Double ) !( 𝕀 Double )
+instance Storable CEqn where
+  sizeOf    _ = 6 * sizeOf    @Double undefined
+  alignment _ = 4 * alignment @Double undefined
+  peek ptr = do
+    [ CDouble a_min, CDouble a_max, CDouble b_min, CDouble b_max, CDouble c_min, CDouble c_max ]
+      <- peekArray 6 ( castPtr ptr :: Ptr CDouble )
+    return $
+      CEqn ( 𝕀 a_min a_max ) ( 𝕀 b_min b_max ) ( 𝕀 c_min c_max )
+  poke ptr ( CEqn ( 𝕀 a_min a_max ) ( 𝕀 b_min b_max ) ( 𝕀 c_min c_max ) )
+    = pokeArray ( castPtr ptr ) [ CDouble a_min, CDouble a_max, CDouble b_min, CDouble b_max, CDouble c_min, CDouble c_max ]
+
+newtype CBox = CBox ( 𝕀ℝ 2 )
+instance Storable CBox where
+  sizeOf    _ = 4 * sizeOf    @Double undefined
+  alignment _ = 4 * alignment @Double undefined
+  peek ptr = do
+    [ CDouble x_min, CDouble x_max, CDouble y_min, CDouble y_max ] <- peekArray 4 ( castPtr ptr :: Ptr CDouble )
+    return $
+      CBox ( 𝕀 ( ℝ2 x_min y_min ) ( ℝ2 x_max y_max ) )
+  poke ptr ( CBox ( 𝕀 ( ℝ2 x_min y_min ) ( ℝ2 x_max y_max ) ) ) =
+    pokeArray ( castPtr ptr ) [ CDouble x_min, CDouble x_max, CDouble y_min, CDouble y_max ]
diff --git a/brush-strokes/src/lib/Math/Root/Isolation/Utils.hs b/brush-strokes/src/lib/Math/Root/Isolation/Utils.hs
new file mode 100644
index 0000000..f273bcf
--- /dev/null
+++ b/brush-strokes/src/lib/Math/Root/Isolation/Utils.hs
@@ -0,0 +1,48 @@
+{-# LANGUAGE ScopedTypeVariables #-}
+
+-- | Utilities for root isolation
+module Math.Root.Isolation.Utils
+  ( fromComponents, matMulVec, boxMidpoint )
+  where
+
+-- base
+import Prelude hiding ( unzip )
+import Data.Foldable
+  ( toList )
+import Data.List.NonEmpty
+  ( unzip )
+
+-- MetaBrush
+import Math.Interval
+import Math.Linear
+
+--------------------------------------------------------------------------------
+
+fromComponents
+  :: forall n
+  .  ( Representable Double ( ℝ n ), n ~ RepDim ( ℝ n ) )
+  => ( Fin n -> [ ( 𝕀 Double, Bool ) ] ) -> [ ( 𝕀ℝ n, Vec n Bool ) ]
+fromComponents f = do
+  ( xs, bs ) <- unzip <$> traverse f ( universe @n )
+  return $ ( tabulate $ \ i -> xs ! i, bs )
+  -- TODO: this could be more efficient.
+{-# INLINEABLE fromComponents #-}
+
+-- | The midpoint of a box.
+boxMidpoint :: Representable Double ( ℝ n ) => 𝕀ℝ n -> ℝ n
+boxMidpoint box =
+  tabulate $ \ i -> midpoint ( box `index` i )
+{-# INLINEABLE boxMidpoint #-}
+
+-- | Matrix multiplication \( A v \).
+matMulVec
+  :: forall n m
+  .  ( Representable Double ( ℝ n ), Representable Double ( ℝ m ) )
+  => Vec m ( ℝ n ) -- ^ columns of the matrix \( A \)
+  -> 𝕀ℝ m          -- ^ vector \( v \)
+  -> 𝕀ℝ n
+matMulVec as v = tabulate $ \ r ->
+  sum [ scaleInterval ( a `index` r ) ( index v c )
+      | ( c, a ) <- toList ( (,) <$> universe @m <*> as )
+      ]
+{-# INLINEABLE matMulVec #-}
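+
+-- A usage sketch ( illustrative only, not part of the library; the printed
+-- forms below assume the obvious Show instances ): with the 2x2 identity
+-- matrix given column-wise, 'matMulVec' maps a box to itself, and
+-- 'boxMidpoint' returns its centre:
+--
+-- >>> matMulVec ( Vec [ ℝ2 1 0, ℝ2 0 1 ] ) ( 𝕀 ( ℝ2 0 0 ) ( ℝ2 1 1 ) )
+-- 𝕀 ( ℝ2 0.0 0.0 ) ( ℝ2 1.0 1.0 )
+-- >>> boxMidpoint ( 𝕀 ( ℝ2 0 0 ) ( ℝ2 1 1 ) )
+-- ℝ2 0.5 0.5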