//----------------------------------------------------------------------------------------
// Copyright (c) 2024 Walter Mascarenhas
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// The MPL has a "Disclaimer of Warranty" and a "Limitation of Liability",
// and we provide no guarantee regarding the correctness of this source code.
//
//----------------------------------------------------------------------------------------

#include <algorithm>
#include <bit>
#include <cassert>
#include <cfenv>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <iostream>

#include "lp_2d.hpp"

//----------------------------------------------------------------------------------------
namespace wm::nlp::lp2d {

__m256i bounds( __m256i* tiny, __m256i* huge )
{
  __m256i ones = ones_i();
  __m256i sign = _mm256_slli_epi64( ones, 63 );
  __m256i a = _mm256_srli_epi64( sign, 1 ); // 0b0100'0000'0000...
  __m256i b = _mm256_srli_epi64( sign, 3 ); // 0b0001'0000'0000...
  __m256i c = _mm256_or_si256( a, b );      // 0b0101'0000'0000...
  __m256i d = _mm256_srli_epi64( c, 4 );    // 0b0000'0101'0000...

  *huge = _mm256_or_si256( c, d ); // 0b0101'0101'0000... 0x5500...

  __m256i e = _mm256_srli_epi64( *huge, 1 ); // 0b0010'1010'1000...
  __m256i f = _mm256_srli_epi64( d, 2 );     // 0b0000'0001'0100...
  __m256i g = _mm256_srli_epi64( d, 4 );     // 0b0000'0000'0101...
  __m256i h = _mm256_srli_epi64( ones, 1 );
  __m256i i = _mm256_srli_epi64( sign, 32 );

  __m256i j = _mm256_or_si256( f, g ); // 0b0000'0001'0101...

  *tiny = _mm256_or_si256( e, j ); // 0b0010'1011'1101... 0x2bd...
  __m256i k = and_not_i( i, h );
  *huge = blend< 0, 0, 0, 1 >( *huge, k );
  *tiny = blend< 0, 0, 0, 1 >( *tiny, zero_i() );
  return sign;
}
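
// Editor's sketch (illustrative, not called by the library): according to the
// lane comments above, bounds() leaves the bit patterns 0x5500'0000'0000'0000
// ( *huge ) and 0x2BD0'0000'0000'0000 ( *tiny ) in the first three lanes. Read
// as double exponent fields these are 2^( 0x550 - 0x3FF ) = 2^337 and
// 2^( 0x2BD - 0x3FF ) = 2^( -322 ); normalize_equations below compares exponent
// fields against them with integer comparisons. The function name is ours.
namespace
{
  [[maybe_unused]] void bounds_thresholds_sketch()
  {
    double huge = std::bit_cast< double >( 0x5500'0000'0000'0000ULL ); // 2^337
    double tiny = std::bit_cast< double >( 0x2BD0'0000'0000'0000ULL ); // 2^( -322 )
    std::printf( "huge = %g, tiny = %g\n", huge, tiny );
  }
}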

//----------------------------------------------------------------------------------------
// exact_location returns the exact location of the intersection ( x, y ) of the lines
//
// b0[ 0 ] x + b0[ 1 ] y >= b0[ 2 ]
// b1[ 0 ] x + b1[ 1 ] y >= b1[ 2 ]
//
// with respect to the set
//
// e0 x + e1 y >= e2
//
// under the assumption that
//
// d = b0[ 0 ] b1[ 1 ] - b0[ 1 ] b1[ 0 ] > 0
//
// where ei = edge[ i ], b0 = border[ 0 ] and b1 = border[ 1 ]
//
// This amounts to evaluating the sign of
//
// p = e0 r + e1 s - e2 d
//
// r = b0[ 2 ] b1[ 1 ] - b0[ 1 ] b1[ 2 ]
// s = b0[ 0 ] b1[ 2 ] - b0[ 2 ] b1[ 0 ] ,
//
// which motivates the evaluation of the six products
//
//       group 1                 group 2
//   e0 b0[ 2 ] b1[ 1 ],     -e0 b0[ 1 ] b1[ 2 ]
//   e1 b0[ 0 ] b1[ 2 ],     -e1 b0[ 2 ] b1[ 0 ]
//   e2 b0[ 1 ] b1[ 0 ],     -e2 b0[ 0 ] b1[ 1 ]
//
// and we want to find the sign of their sum.
//
// The function assumes that the edge entries are either normal or zero, i.e.,
// there are no subnormal entries.
//----------------------------------------------------------------------------------------
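
// Editor's sketch (illustrative, not called by the library): the quantity whose
// sign is sought, in plain double arithmetic and with none of the control of
// rounding errors that the SIMD code below provides. naive_location_sign is a
// hypothetical name used only for exposition.
namespace
{
  [[maybe_unused]] int naive_location_sign( double const e[ 3 ], double const b0[ 3 ], double const b1[ 3 ] )
  {
    double r = b0[ 2 ] * b1[ 1 ] - b0[ 1 ] * b1[ 2 ];
    double s = b0[ 0 ] * b1[ 2 ] - b0[ 2 ] * b1[ 0 ];
    double d = b0[ 0 ] * b1[ 1 ] - b0[ 1 ] * b1[ 0 ];
    double p = e[ 0 ] * r + e[ 1 ] * s - e[ 2 ] * d;
    return ( p > 0.0 ) - ( p < 0.0 ); // > 0: in, < 0: out, 0: border ( up to rounding )
  }
}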

location exact_location( __m256d e, __m256d b0, __m256d b1 )
{
  __m256d b0_a = permute< 1, 2, 0, 3 >( b0 ); // { b0[ 1 ], b0[ 2 ], b0[ 0 ], ? }
  __m256d b0_b = permute< 2, 0, 1, 3 >( b0 ); // { b0[ 2 ], b0[ 0 ], b0[ 1 ], ? }
  __m256d b1_a = permute< 1, 2, 0, 3 >( b1 ); // { b1[ 1 ], b1[ 2 ], b1[ 0 ], ? }
  __m256d b1_b = permute< 2, 0, 1, 3 >( b1 ); // { b1[ 2 ], b1[ 0 ], b1[ 1 ], ? }

  __m256d g1 = _mm256_mul_pd( b0_b, b1_a ); // { b0[ 2 ] b1[ 1 ], b0[ 0 ] b1[ 2 ], b0[ 1 ] b1[ 0 ], ? }
  __m256d g2 = _mm256_mul_pd( b0_a, b1_b ); // { b0[ 1 ] b1[ 2 ], b0[ 2 ] b1[ 0 ], b0[ 0 ] b1[ 1 ], ? }

  __m256d g1b = _mm256_fmsub_pd( b0_b, b1_a, g1 ); // rounding errors of the products in g1
  __m256d g2b = _mm256_fmsub_pd( b0_a, b1_b, g2 ); // rounding errors of the products in g2

  __m256i sign_i = _mm256_slli_epi64( ones_i(), 63 );
  __m256d sign = cast_d( sign_i );
  __m256d minus_e = xor_d( e, sign );

  __m256d c[ 8 ];

  c[ 0 ] = _mm256_mul_pd( e, g1 );
  c[ 1 ] = _mm256_mul_pd( minus_e, g2 );
  c[ 2 ] = _mm256_mul_pd( e, g1b );
  c[ 3 ] = _mm256_mul_pd( minus_e, g2b );

  c[ 4 ] = _mm256_fmsub_pd( e, g1, c[ 0 ] );
  c[ 5 ] = _mm256_fmsub_pd( minus_e, g2, c[ 1 ] );
  c[ 6 ] = _mm256_fmsub_pd( e, g1b, c[ 2 ] );
  c[ 7 ] = _mm256_fmsub_pd( minus_e, g2b, c[ 3 ] );

  __m256d a[ 6 ];

  // transposing the products

  __m256d tmp_0 = gather_low( c[ 0 ], c[ 1 ] );
  __m256d tmp_1 = gather_high( c[ 0 ], c[ 1 ] );
  __m256d tmp_2 = gather_low( c[ 2 ], c[ 3 ] );
  __m256d tmp_3 = gather_high( c[ 2 ], c[ 3 ] );

  a[ 0 ] = _mm256_permute2f128_pd( tmp_0, tmp_2, 0x20 );
  a[ 1 ] = _mm256_permute2f128_pd( tmp_1, tmp_3, 0x20 );
  a[ 2 ] = _mm256_permute2f128_pd( tmp_0, tmp_2, 0x31 );

  __m256d tmp_4 = gather_low( c[ 4 ], c[ 5 ] );
  __m256d tmp_5 = gather_high( c[ 4 ], c[ 5 ] );
  __m256d tmp_6 = gather_low( c[ 6 ], c[ 7 ] );
  __m256d tmp_7 = gather_high( c[ 6 ], c[ 7 ] );

  a[ 3 ] = _mm256_permute2f128_pd( tmp_4, tmp_6, 0x20 );
  a[ 4 ] = _mm256_permute2f128_pd( tmp_5, tmp_7, 0x20 );
  a[ 5 ] = _mm256_permute2f128_pd( tmp_4, tmp_6, 0x31 );
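
  // Editor's note: gather_low / gather_high presumably wrap
  // _mm256_unpacklo_pd / _mm256_unpackhi_pd, so that, for example,
  // tmp_0 = { c0[ 0 ], c1[ 0 ], c0[ 2 ], c1[ 2 ] }; after the
  // _mm256_permute2f128_pd steps, a[ j ][ k ] = c[ k ][ j ] and
  // a[ 3 + j ][ k ] = c[ 4 + k ][ j ] for j = 0, 1, 2. The 24 entries of
  // a[ 0..5 ] therefore add up exactly to p, and the loop below distills them,
  // keeping the running minima in a[ 0 ] and the running maxima in a[ 5 ],
  // until the sign of the total sum can be decided.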

  __m256d gt_05 = is_greater( a[ 0 ], a[ 5 ] );
  __m256d tmp = blend_max( a[ 0 ], a[ 5 ], gt_05 );
  a[ 0 ] = blend_min( a[ 0 ], a[ 5 ], gt_05 );
  a[ 5 ] = tmp;

  //----------------------------------------------------------------------------------------
  // In the loop below we assume that a0[ j ] <= a5[ j ], j = 0, 1, 2, 3
  //----------------------------------------------------------------------------------------

  while( true )
  {
    __m256d gt_12 = is_greater( a[ 1 ], a[ 2 ] );
    __m256d gt_34 = is_greater( a[ 3 ], a[ 4 ] );

    __m256d min_12 = blend_min( a[ 1 ], a[ 2 ], gt_12 );
    __m256d max_12 = blend_max( a[ 1 ], a[ 2 ], gt_12 );

    __m256d min_34 = blend_min( a[ 3 ], a[ 4 ], gt_34 );
    __m256d max_34 = blend_max( a[ 3 ], a[ 4 ], gt_34 );

    __m256d gt_min = is_greater( min_12, min_34 );
    __m256d gt_max = is_greater( max_12, max_34 );

    __m256d min_1234 = blend_min( min_12, min_34, gt_min );
    a[ 2 ] = blend_max( min_12, min_34, gt_min );

    a[ 3 ] = blend_min( max_12, max_34, gt_max );
    __m256d max_1234 = blend_max( max_12, max_34, gt_max );

    __m256d gt_0_min_1234 = is_greater( a[ 0 ], min_1234 );
    __m256d gt_max_1234_5 = is_greater( max_1234, a[ 5 ] );

    a[ 1 ] = blend_max( a[ 0 ], min_1234, gt_0_min_1234 );
    a[ 0 ] = blend_min( a[ 0 ], min_1234, gt_0_min_1234 );

    a[ 4 ] = blend_min( max_1234, a[ 5 ], gt_max_1234_5 );
    a[ 5 ] = blend_max( max_1234, a[ 5 ], gt_max_1234_5 );

    // 1) a0[ j ] = min { ai[ j ], i = 0, ..., 5 }
    // 2) a5[ j ] = max { ai[ j ], i = 0, ..., 5 }

    __m256d zero = zero_d();
    __m256d cmp0 = _mm256_cmp_pd( a[ 0 ], zero, _CMP_LT_OQ );

    if( _mm256_testz_pd( cmp0, cmp0 ) ) // all a0[ j ] are >= 0
    {
      __m256d cmp = is_greater( a[ 5 ], zero );
      if( _mm256_testz_pd( cmp, cmp ) ) // no entry in a5 is positive
        return location::border;
      else
        return location::in;
    }

    __m256d cmp5 = _mm256_cmp_pd( a[ 5 ], zero, _CMP_GT_OQ );

    if( _mm256_testz_pd( cmp5, cmp5 ) ) // all a5[ j ] are <= 0
    {
      __m256d cmp = is_greater( zero, a[ 0 ] );
      if( _mm256_testz_pd( cmp, cmp ) ) // no entry in a0 is negative
        return location::border;
      else
        return location::out;
    }

    // now
    // 1) min ai[ j ] has its sign bit set
    // 2) max ai[ j ] does not have its sign bit set

    __m256d lo = gather_low( a[ 0 ], a[ 5 ] );  // { a0[ 0 ], a5[ 0 ], a0[ 2 ], a5[ 2 ] }
    __m256d hi = gather_high( a[ 0 ], a[ 5 ] ); // { a0[ 1 ], a5[ 1 ], a0[ 3 ], a5[ 3 ] }

    __m256d gt_lo_hi = is_greater( lo, hi );
    __m256d b0 = blend_min( lo, hi, gt_lo_hi );
    __m256d b5 = blend_max( lo, hi, gt_lo_hi );

    // 1) min{ ai[ j ], i, j = 0, ..., 5 } is in { b0[ 0 ], b0[ 2 ] }
    // 2) max{ ai[ j ], i, j = 0, ..., 5 } is in { b5[ 1 ], b5[ 3 ] }

    b0 = permute< 0, 2, 1, 3 >( b0 );
    b5 = permute< 3, 1, 2, 0 >( b5 );

    // 1) min{ ai[ j ], i, j = 0, ..., 5 } is in { b0[ 0 ], b0[ 1 ] }
    // 2) max{ ai[ j ], i, j = 0, ..., 5 } is in { b5[ 0 ], b5[ 1 ] }

    lo = gather_low( b0, b5 );  // { b0[ 0 ], b5[ 0 ], b0[ 2 ], b5[ 2 ] }
    hi = gather_high( b0, b5 ); // { b0[ 1 ], b5[ 1 ], b0[ 3 ], b5[ 3 ] }

    gt_lo_hi = is_greater( lo, hi );
    b0 = blend_min( lo, hi, gt_lo_hi );
    b5 = blend_max( lo, hi, gt_lo_hi );

    // 1) min{ ai[ j ], i, j = 0, ..., 5 } is b0[ 0 ]
    // 2) max{ ai[ j ], i, j = 0, ..., 5 } is b5[ 1 ]

    b5 = permute< 1, 0, 3, 2 >( b5 );

    // 1) min{ ai[ j ], i, j = 0, ..., 5 } is b0[ 0 ] <= 0
    // 2) max{ ai[ j ], i, j = 0, ..., 5 } is b5[ 0 ] >= 0

    // fixing a possible break of order in positions 1, 2, 3

    __m256d chk123 = is_greater( b0, b5 );
    __m256d aux = blend_min( b0, b5, chk123 );
    b5 = blend_max( b0, b5, chk123 );
    b0 = aux;

    __m256i b0i = cast_i( b0 );
    __m256i b5i = cast_i( b5 );

    uint64_t u0 = _mm256_extract_epi64( b0i, 0 ) & 0x7FFF'FFFF'FFFF'FFFFULL;
    uint64_t u5 = _mm256_extract_epi64( b5i, 0 );

    if( u0 >= u5 ) // | b0[ 0 ] | >= | b5[ 0 ] |
    {
      if( u0 > u5 + 0x0060'0000'0000'0000ULL ) // | b0 | > 2^5 | b5 | ( we use 0x006... to handle subnormals properly )
        return location::out;
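
      // Editor's note: the three operations below are the Fast2Sum error-free
      // transformation. In scalar form, with | x | >= | y |,
      //
      //   s = x + y;  t = y - ( s - x );
      //
      // recovers x + y = s + t exactly in round-to-nearest; here the same steps
      // are applied to x = b0[ 0 ], y = b5[ 0 ] under the prevailing downward
      // rounding, and the "rounded down" comments below record the author's
      // claim that the decomposition is still exact.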
      __m256d sum = _mm256_add_pd( b0, b5 );    // b0 + b5, rounded down
      __m256d cor = _mm256_sub_pd( sum, b0 );   // sum - b0, rounded down
      __m256d delta = _mm256_sub_pd( b5, cor ); // b5 - cor, rounded down.
      // now b0[ 0 ] + b5[ 0 ] = sum[ 0 ] + delta[ 0 ], exactly, with sum[ 0 ] <= 0 <= delta[ 0 ]
      a[ 0 ] = blend< 1, 0, 0, 0 >( b0, sum );
      a[ 5 ] = blend< 1, 0, 0, 0 >( b5, delta );
    }
    else
    {
      if( u5 > u0 + 0x0060'0000'0000'0000ULL )
        return location::in;

      __m256i sign_i = _mm256_slli_epi64( ones_i(), 63 );
      __m256d sign = cast_d( sign_i );
      __m256d minus_b0 = xor_d( b0, sign );

      __m256d sum = _mm256_sub_pd( minus_b0, b5 );    // ( -b5 ) + ( -b0 ), rounded down
      __m256d cor = _mm256_add_pd( sum, b5 );         // sum - ( -b5 ), rounded down
      __m256d delta = _mm256_sub_pd( minus_b0, cor ); // ( -b0 ) - cor, rounded down.

      // -( b0[ 0 ] + b5[ 0 ] ) = sum[ 0 ] + delta[ 0 ], exactly, with sum[ 0 ] <= 0 <= delta[ 0 ]

      sum = xor_d( sum, sign );
      delta = xor_d( delta, sign );

      // b0[ 0 ] + b5[ 0 ] = sum[ 0 ] + delta[ 0 ], exactly, with delta[ 0 ] <= 0 <= sum[ 0 ]

      a[ 0 ] = blend< 1, 0, 0, 0 >( b0, delta );
      a[ 5 ] = blend< 1, 0, 0, 0 >( b5, sum );
    }
  }
}

//----------------------------------------------------------------------------------------
// bounded_convex::box: finding the box of the bounded convex region, under the assumption
// that the normals are normalized, that is, for each edge e, either e[ i ] = +0.0 or
// e[ i ] >= tiny ~ 2^(-100), i.e., there are no signed zeros, no underflow or overflow,
// and no negative coordinates.
//----------------------------------------------------------------------------------------
__m256d bounded_convex::box() const
{
  constexpr int non_0 = 0;
  constexpr int non_1 = 2;
  constexpr int inf_0 = 0;
  constexpr int sup_0 = 1;
  constexpr int inf_1 = 2;
  constexpr int sup_1 = 3;

  assert( type != bounded_convex_type::empty );

  __m256d den;
  __m256d num;
  __m256i sign = _mm256_slli_epi64( ones_i(), 63 ); // { sign_bit, sign_bit, sign_bit, sign_bit }
  __m256i sign_h = _mm256_slli_si256( sign, 8 );    // { 0, sign_bit, 0, sign_bit }

  if( type != bounded_convex_type::polygon )
  {
    if( type == bounded_convex_type::point )
    {
      // vertex_pair[ 0 ] = { x_inf, -x_sup, ?, ? }
      // vertex_pair[ 1 ] = { y_inf, -y_sup, ?, ? }
      // vertex_pair[ 2 ] = { z_inf, -z_sup, ?, ? }
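
      // Editor's note: storing { v_inf, -v_sup } lets the code bound both
      // endpoints of an interval while rounding in a single (downward)
      // direction: operations on the negated upper bound round it up in
      // absolute terms, and the xor with sign_h at the end of this function
      // restores the true signs of the negated lanes.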
      den = xor_d( vertex_p[ 2 ], sign_h );
      den = permute< sup_0, inf_0, sup_0, inf_0 >( den );
      num = gather_lanes< 0, 0 >( vertex_p[ 0 ], vertex_p[ 1 ] );
    }
    else
    {
      __m256d num_x;
      __m256d num_y;

      den = xor_d( vertex_p[ 2 ], sign_h );

      int xy_signs = high_bits_64( edge[ 1 ] ) & 0x3;

      switch( xy_signs )
      {
        case 0: // edge[ 0 ] >= 0, edge[ 1 ] >= 0: box = { x0, x1, y1, y0 }
        {
          num_x = permute< inf_0, sup_1, non_1, non_1 >( vertex_p[ 0 ] );
          num_y = permute< non_0, non_0, inf_1, sup_0 >( vertex_p[ 1 ] );

          den = permute< sup_0, inf_1, sup_1, inf_0 >( den );
          break;
        }
        case 1: // edge[ 0 ] < 0, edge[ 1 ] >= 0: box = { x0, x1, y0, y1 }
        {
          num_x = permute< inf_0, sup_1, non_1, non_1 >( vertex_p[ 0 ] );
          num_y = permute< non_0, non_0, inf_0, sup_1 >( vertex_p[ 1 ] );

          den = permute< sup_0, inf_0, sup_0, inf_1 >( den );
          break;
        }
        case 2: // edge[ 0 ] >= 0, edge[ 1 ] < 0: box = { x1, x0, y1, y0 }
        {
          num_x = permute< inf_1, sup_0, non_1, non_1 >( vertex_p[ 0 ] );
          num_y = permute< non_0, non_0, inf_1, sup_0 >( vertex_p[ 1 ] );

          den = permute< sup_1, inf_0, sup_1, inf_0 >( den );

          break;
        }
        case 3: // edge[ 0 ] < 0, edge[ 1 ] < 0: box = { x1, x0, y0, y1 }
        {
          num_x = permute< inf_1, sup_0, non_1, non_1 >( vertex_p[ 0 ] );
          num_y = permute< non_0, non_0, inf_0, sup_1 >( vertex_p[ 1 ] );

          den = permute< sup_1, inf_0, sup_0, inf_1 >( den );
          break;
        }
      }

      num = gather_lanes< 0, 1 >( num_x, num_y );
    }
  }
  else
  {
    int index = begin;

    auto set_x_max = [ & ]()
    {
      int previous_vertex = ( ( index > 0 ) ? index : end );
      int previous_pair = 3 * ( --previous_vertex / 2 );

      __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h );

      if( previous_vertex & 0x1 )
      {
        z = permute< non_0, inf_1, non_1, non_1 >( z );
        __m256d x = gather_lanes< 1, 1 >( vertex_p[ previous_pair ] );
        num = blend< 0, 1, 0, 0 >( num, x );
      }
      else
      {
        z = permute< non_0, inf_0, non_1, non_1 >( z );
        num = blend< 0, 1, 0, 0 >( num, vertex_p[ previous_pair ] );
      }

      den = blend< 0, 1, 0, 0 >( den, z );
    };

    auto set_x_min = [ & ]()
    {
      int previous_vertex = ( ( index > 0 ) ? index : end );
      int previous_pair = 3 * ( --previous_vertex / 2 );

      __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h );

      if( previous_vertex & 0x1 )
      {
        z = permute< sup_1, non_0, non_1, non_1 >( z );
        __m256d x = gather_lanes< 1, 1 >( vertex_p[ previous_pair ] );
        num = blend< 1, 0, 0, 0 >( num, x );
      }
      else
      {
        z = permute< sup_0, non_0, non_1, non_1 >( z );
        num = blend< 1, 0, 0, 0 >( num, vertex_p[ previous_pair ] );
      }

      den = blend< 1, 0, 0, 0 >( den, z );
    };

    auto set_y_max = [ & ]()
    {
      int previous_vertex = ( ( index > 0 ) ? index : end );
      int previous_pair = 3 * ( --previous_vertex / 2 );

      __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h );

      if( previous_vertex & 0x1 )
      {
        z = permute< non_0, non_0, non_1, inf_1 >( z );
        num = blend< 0, 0, 0, 1 >( num, vertex_p[ previous_pair + 1 ] );
      }
      else
      {
        z = permute< non_0, non_0, non_1, inf_0 >( z );
        __m256d y = gather_lanes< 0, 0 >( vertex_p[ previous_pair + 1 ] );
        num = blend< 0, 0, 0, 1 >( num, y );
      }

      den = blend< 0, 0, 0, 1 >( den, z );
    };

    auto set_y_min = [ & ]()
    {
      int previous_vertex = ( ( index > 0 ) ? index : end );
      int previous_pair = 3 * ( --previous_vertex / 2 );

      __m256d z = xor_d( vertex_p[ previous_pair + 2 ], sign_h );

      if( previous_vertex & 0x1 )
      {
        z = permute< non_1, non_1, sup_1, non_1 >( z );
        num = blend< 0, 0, 1, 0 >( num, vertex_p[ previous_pair + 1 ] );
      }
      else
      {
        z = permute< non_0, non_0, sup_0, non_0 >( z );
        __m256d y = gather_lanes< 0, 0 >( vertex_p[ previous_pair + 1 ] );
        num = blend< 0, 0, 1, 0 >( num, y );
      }

      den = blend< 0, 0, 1, 0 >( den, z );
    };
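
    // Editor's note on the state machine below: the two sign bits of the edge
    // normal ( xy_signs ) identify the quadrant the normal lies in, and the
    // transitions suggest the edges are visited in order of increasing normal
    // angle, so the quadrant only advances (possibly skipping one). Each
    // crossing from one quadrant to the next makes the vertex between the two
    // edges extremal in x or in y, and the corresponding set_* lambda pins that
    // coordinate of the box. The _even / _odd label pairs track which 128-bit
    // lane of its vertex pair the current vertex occupies.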
    int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

    switch( xy_signs )
    {
      case 0: // normal[ 0 ] >= 0, normal[ 1 ] >= 0: first quadrant
      {
        if( index & 0x1 )
          goto quadrant_1_odd;
        else
          goto quadrant_1_even;
      }
      case 1: // normal[ 0 ] < 0, normal[ 1 ] >= 0: second quadrant
      {
        if( index & 0x1 )
          goto quadrant_2_odd;
        else
          goto quadrant_2_even;
      }
      case 2: // normal[ 0 ] >= 0, normal[ 1 ] < 0: fourth quadrant
      {
        if( index & 0x1 )
          goto quadrant_4_odd;
        else
          goto quadrant_4_even;
      }
      case 3: // normal[ 0 ] < 0, normal[ 1 ] < 0: third quadrant
      {
        if( index & 0x1 )
          goto quadrant_3_odd;
        else
          goto quadrant_3_even;
      }
    }

    quadrant_1_even:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 0: // quadrant_1
          goto quadrant_1_odd;
        case 1: // quadrant_2
        {
          set_y_min();
          goto quadrant_2_odd;
        }
        case 3: // quadrant_3
        {
          set_x_max();
          set_y_min();
          goto quadrant_3_odd;
        }
      }
    }

    quadrant_1_odd:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 0: // quadrant_1
          goto quadrant_1_even;
        case 1: // quadrant_2
        {
          set_y_min();
          goto quadrant_2_even;
        }
        case 3: // quadrant_3
        {
          set_x_max();
          set_y_min();
          goto quadrant_3_even;
        }
      }
    }

    quadrant_2_even:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 1: // quadrant_2
          goto quadrant_2_odd;
        case 2: // quadrant_4
        {
          set_x_max();
          set_y_max();
          goto quadrant_4_odd;
        }
        case 3: // quadrant_3
        {
          set_x_max();
          goto quadrant_3_odd;
        }
      }
    }

    quadrant_2_odd:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 1: // quadrant_2
          goto quadrant_2_even;
        case 2: // quadrant_4
        {
          set_x_max();
          set_y_max();
          goto quadrant_4_even;
        }
        case 3: // quadrant_3
        {
          set_x_max();
          goto quadrant_3_even;
        }
      }
    }

    quadrant_3_even:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 0: // quadrant_1
        {
          set_y_max();
          set_x_min();
          goto quadrant_1_odd;
        }
        case 2: // quadrant_4
        {
          set_y_max();
          goto quadrant_4_odd;
        }
        case 3: // quadrant_3
          goto quadrant_3_odd;
      }
    }

    quadrant_3_odd:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 0: // quadrant_1
        {
          set_x_min();
          set_y_max();
          goto quadrant_1_even;
        }
        case 2: // quadrant_4
        {
          set_y_max();
          goto quadrant_4_even;
        }
        case 3: // quadrant_3
          goto quadrant_3_even;
      }
    }

    quadrant_4_even:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 0: // quadrant_1
        {
          set_x_min();
          goto quadrant_1_odd;
        }
        case 1: // quadrant_2
        {
          set_x_min();
          set_y_min();
          goto quadrant_2_odd;
        }
        case 2: // quadrant_4
          goto quadrant_4_odd;
      }
    }

    quadrant_4_odd:
    {
      if( ++index > end )
        goto done;

      int xy_signs = high_bits_64( edge[ index ] ) & 0x3;

      switch( xy_signs )
      {
        case 0: // quadrant_1
        {
          set_x_min();
          goto quadrant_1_even;
        }
        case 1: // quadrant_2
        {
          set_x_min();
          set_y_min();
          goto quadrant_2_even;
        }
        case 2: // quadrant_4
          goto quadrant_4_even;
      }
    }

    done: {}
  }

  num = _mm256_div_pd( num, den );
  num = xor_d( num, sign_h );
  return num;
}
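
// Editor's note: vertices are stored two per "vertex pair", a pair being three
// consecutive __m256d ( x, y and z components ) with one vertex per 128-bit
// lane. Shifting the vertices by one position therefore moves data by half a
// register, which is why insert_vertex and remove_vertices splice lanes with
// gather_lanes / blend instead of doing plain block copies, and why the even /
// odd parity of begin, end and the insertion point matters throughout.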
int bounded_convex::insert_vertex( int at )
{
  int high = end - at;
  int low = at - begin;
  assert( ( low > 0 ) && ( high > 0 ) );

  if( low <= high )
  {
    __m256d* ed = edge + ( begin - 1 );
    __m256d* ed_max = ed + low;

    for( ; ed < ed_max; ++ed )
    {
      __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( ed + 1 ) );
      _mm256_store_pd( reinterpret_cast< double* >( ed ), tmp );
    }

    int to_load;

    __m256d* vp = vertex_pair( begin );

    __m256d a0 = _mm256_load_pd( reinterpret_cast< double* >( vp ) );
    __m256d a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 1 ) );
    __m256d a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 2 ) );

    if( ( begin & 0x1 ) == 0 )
    {
      __m256d c0 = gather_lanes< clear_lane, 0 >( a0 );
      __m256d c1 = gather_lanes< clear_lane, 0 >( a1 );
      __m256d c2 = gather_lanes< clear_lane, 0 >( a2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp - 3 ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp - 2 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp - 1 ), c2 );

      if( low == 1 )
      {
        --begin;
        return at - 1;
      }
      to_load = low - 2;
    }
    else
      to_load = low - 1;

    int q = to_load / 4;
    __m256d* vp_max = vp + 6 * q;

    while( vp < vp_max )
    {
      __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) );
      __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) );
      __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) );

      __m256d c0 = gather_lanes< 1, 0 >( a0, b0 );
      __m256d c1 = gather_lanes< 1, 0 >( a1, b1 );
      __m256d c2 = gather_lanes< 1, 0 >( a2, b2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );

      vp += 3;

      a0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) );
      a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) );
      a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) );

      __m256d d0 = gather_lanes< 1, 0 >( b0, a0 );
      __m256d d1 = gather_lanes< 1, 0 >( b1, a1 );
      __m256d d2 = gather_lanes< 1, 0 >( b2, a2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), d0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 );

      vp += 3;
    }

    int r = to_load % 4;

    if( r == 0 )
    {
      __m256d c0 = gather_lanes< 1, clear_lane >( a0 );
      __m256d c1 = gather_lanes< 1, clear_lane >( a1 );
      __m256d c2 = gather_lanes< 1, clear_lane >( a2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );
    }
    else
    {
      __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) );
      __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) );
      __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) );

      __m256d c0 = gather_lanes< 1, 0 >( a0, b0 );
      __m256d c1 = gather_lanes< 1, 0 >( a1, b1 );
      __m256d c2 = gather_lanes< 1, 0 >( a2, b2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );

      switch( r )
      {
        case 1:
          goto low_end;
        case 2:
        {
          __m256d c0 = gather_lanes< 1, clear_lane >( b0 );
          __m256d c1 = gather_lanes< 1, clear_lane >( b1 );
          __m256d c2 = gather_lanes< 1, clear_lane >( b2 );

          _mm256_store_pd( reinterpret_cast< double* >( vp + 3 ), c0 );
          _mm256_store_pd( reinterpret_cast< double* >( vp + 4 ), c1 );
          _mm256_store_pd( reinterpret_cast< double* >( vp + 5 ), c2 );
          goto low_end;
        }
        default: // case 3
        {
          vp += 3;
          a0 = _mm256_load_pd( reinterpret_cast< double* >( vp + 3 ) );
          a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 4 ) );
          a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 5 ) );

          __m256d c3 = gather_lanes< 1, 0 >( b0, a0 );
          __m256d c4 = gather_lanes< 1, 0 >( b1, a1 );
          __m256d c5 = gather_lanes< 1, 0 >( b2, a2 );

          _mm256_store_pd( reinterpret_cast< double* >( vp ), c3 );
          _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c4 );
          _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c5 );
        }
      }
    }

    low_end:
    {
      --begin;
      return at - 1;
    }
  }
  else
  {
    __m256d* ed = edge + end + 1;
    __m256d* ed_min = ed - ( high + 1 );

    for( ; ed > ed_min; --ed )
    {
      __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( ed - 1 ) );
      _mm256_store_pd( reinterpret_cast< double* >( ed ), tmp );
    }

    __m256d* vp = vertex_pair( end );

    __m256d a0;
    __m256d a1;
    __m256d a2;

    int to_load;

    if( ( end & 0x1 ) == 0 )
    {
      a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) );
      a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) );
      a2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) );

      __m256d c0 = gather_lanes< 1, clear_lane >( a0 );
      __m256d c1 = gather_lanes< 1, clear_lane >( a1 );
      __m256d c2 = gather_lanes< 1, clear_lane >( a2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );

      if( high == 1 )
      {
        ++end;
        return at;
      }

      vp -= 3;
      to_load = high - 2;
    }
    else
    {
      to_load = high - 1;
      a0 = _mm256_load_pd( reinterpret_cast< double* >( vp ) );
      a1 = _mm256_load_pd( reinterpret_cast< double* >( vp + 1 ) );
      a2 = _mm256_load_pd( reinterpret_cast< double* >( vp + 2 ) );
    }

    int q = to_load / 4;

    __m256d* vp_min = vp - 6 * q;

    while( vp > vp_min )
    {
      __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) );
      __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) );
      __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) );

      __m256d c0 = gather_lanes< 1, 0 >( b0, a0 );
      __m256d c1 = gather_lanes< 1, 0 >( b1, a1 );
      __m256d c2 = gather_lanes< 1, 0 >( b2, a2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );

      vp -= 3;

      a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) );
      a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) );
      a2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) );

      __m256d d0 = gather_lanes< 1, 0 >( a0, b0 );
      __m256d d1 = gather_lanes< 1, 0 >( a1, b1 );
      __m256d d2 = gather_lanes< 1, 0 >( a2, b2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), d0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 );

      vp -= 3;
    }

    int r = to_load % 4;

    if( r == 0 )
    {
      __m256d c0 = gather_lanes< clear_lane, 0 >( a0 );
      __m256d c1 = gather_lanes< clear_lane, 0 >( a1 );
      __m256d c2 = gather_lanes< clear_lane, 0 >( a2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );
    }
    else
    {
      __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) );
      __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) );
      __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) );

      __m256d c0 = gather_lanes< 1, 0 >( b0, a0 );
      __m256d c1 = gather_lanes< 1, 0 >( b1, a1 );
      __m256d c2 = gather_lanes< 1, 0 >( b2, a2 );

      _mm256_store_pd( reinterpret_cast< double* >( vp ), c0 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), c1 );
      _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), c2 );

      switch( r )
      {
        case 1:
          goto high_end;
        case 2:
        {
          __m256d d0 = gather_lanes< clear_lane, 0 >( b0 );
          __m256d d1 = gather_lanes< clear_lane, 0 >( b1 );
          __m256d d2 = gather_lanes< clear_lane, 0 >( b2 );

          _mm256_store_pd( reinterpret_cast< double* >( vp - 3 ), d0 );
          _mm256_store_pd( reinterpret_cast< double* >( vp - 2 ), d1 );
          _mm256_store_pd( reinterpret_cast< double* >( vp - 1 ), d2 );
          goto high_end;
        }
        default: // case 3
        {
          vp -= 3;

          a0 = _mm256_load_pd( reinterpret_cast< double* >( vp - 3 ) );
          a1 = _mm256_load_pd( reinterpret_cast< double* >( vp - 2 ) );
          a2 = _mm256_load_pd( reinterpret_cast< double* >( vp - 1 ) );

          __m256d d0 = gather_lanes< 1, 0 >( a0, b0 );
          __m256d d1 = gather_lanes< 1, 0 >( a1, b1 );
          __m256d d2 = gather_lanes< 1, 0 >( a2, b2 );

          _mm256_store_pd( reinterpret_cast< double* >( vp ), d0 );
          _mm256_store_pd( reinterpret_cast< double* >( vp + 1 ), d1 );
          _mm256_store_pd( reinterpret_cast< double* >( vp + 2 ), d2 );
        }
      }
    }

    high_end:
    {
      ++end;
      return at;
    }
  }
}
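
// Editor's note: _mm256_movemask_pd collects the sign bits of the constraint
// { a, b, c, ? } for a x + b y >= c; the low three bits select a locator
// specialization, so the sign tests are presumably resolved at compile time
// inside locator< sa, sb, sc >::insert.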
void bounded_convex::push( __m256d* begin, __m256d* end )
{
  for( ; begin < end; ++begin )
  {
    int sign = _mm256_movemask_pd( *begin ) & 0x7;

    switch( sign )
    {
      case 0b000: { locator< 0, 0, 0 >::insert( *this, *begin ); break; }
      case 0b001: { locator< 1, 0, 0 >::insert( *this, *begin ); break; }
      case 0b010: { locator< 0, 1, 0 >::insert( *this, *begin ); break; }
      case 0b011: { locator< 1, 1, 0 >::insert( *this, *begin ); break; }
      case 0b100: { locator< 0, 0, 1 >::insert( *this, *begin ); break; }
      case 0b101: { locator< 1, 0, 1 >::insert( *this, *begin ); break; }
      case 0b110: { locator< 0, 1, 1 >::insert( *this, *begin ); break; }
      case 0b111: { locator< 1, 1, 1 >::insert( *this, *begin ); break; }
    }

    if( type == bounded_convex_type::empty )
      return;
  }
}
int bounded_convex::remove_vertices( int begin_vertex, int end_vertex )
{
  int re = end_vertex & 0x1;
  int rb = begin_vertex & 0x1;
  int low = begin_vertex - begin;
  int high = end - end_vertex;

  if( low <= high )
  {
    __m256d* e_dst = edge + end_vertex;
    __m256d* e_dst_min = e_dst - low;
    __m256d* e_src = edge + begin_vertex;

    while( --e_dst >= e_dst_min )
    {
      __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( --e_src ) );
      _mm256_store_pd( reinterpret_cast< double* >( e_dst ), tmp );
    }

    __m256d* v_dst = vertex_pair( end_vertex );
    __m256d* v_src = vertex_pair( begin_vertex );

    if( re == rb )
    {
      int q;

      if( re & 0x1 )
      {
        __m256d s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        __m256d s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        __m256d s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) );
        __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) );
        __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) );

        __m256d b0 = blend< 0, 0, 1, 1 >( s0, d0 );
        __m256d b1 = blend< 0, 0, 1, 1 >( s1, d1 );
        __m256d b2 = blend< 0, 0, 1, 1 >( s2, d2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), b0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), b1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), b2 );

        q = low;
      }
      else
        q = ( low + 1 );

      q /= 2;
      __m256d* v_src_min = v_src - 3 * q;

      while( v_src > v_src_min )
      {
        --v_src;
        --v_dst;
        __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), tmp );
      }
    }
    else
    {
      __m256d s0;
      __m256d s1;
      __m256d s2;

      int to_load;

      if( rb == 0 )
      {
        v_src -= 3;

        s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d p0 = gather_lanes< 1, clear_lane >( s0 );
        __m256d p1 = gather_lanes< 1, clear_lane >( s1 );
        __m256d p2 = gather_lanes< 1, clear_lane >( s2 );

        __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) );
        __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) );
        __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) );

        __m256d b0 = blend< 0, 0, 1, 1 >( p0, d0 );
        __m256d b1 = blend< 0, 0, 1, 1 >( p1, d1 );
        __m256d b2 = blend< 0, 0, 1, 1 >( p2, d2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), b0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), b1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), b2 );

        if( low == 1 )
          goto low_end;

        to_load = low - 2;
      }
      else
      {
        to_load = low - 1;

        s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );
      }

      int q = to_load / 4;
      __m256d* v_src_min = v_src - 6 * q;

      while( v_src > v_src_min )
      {
        v_src -= 3;
        v_dst -= 3;

        __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d c0 = gather_lanes< 1, 0 >( b0, s0 );
        __m256d c1 = gather_lanes< 1, 0 >( b1, s1 );
        __m256d c2 = gather_lanes< 1, 0 >( b2, s2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 );

        v_dst -= 3;
        v_src -= 3;

        s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d d0 = gather_lanes< 1, 0 >( s0, b0 );
        __m256d d1 = gather_lanes< 1, 0 >( s1, b1 );
        __m256d d2 = gather_lanes< 1, 0 >( s2, b2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 );
      }

      v_dst -= 3;
      int r = to_load % 4;

      if( r == 0 )
      {
        __m256d c0 = gather_lanes< clear_lane, 0 >( s0 );
        __m256d c1 = gather_lanes< clear_lane, 0 >( s1 );
        __m256d c2 = gather_lanes< clear_lane, 0 >( s2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 );
      }
      else
      {
        v_src -= 3;
        __m256d b0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        __m256d b1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        __m256d b2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d c0 = gather_lanes< 1, 0 >( b0, s0 );
        __m256d c1 = gather_lanes< 1, 0 >( b1, s1 );
        __m256d c2 = gather_lanes< 1, 0 >( b2, s2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 );

        switch( r )
        {
          case 1:
            goto low_end;
          case 2:
          {
            v_dst -= 3;

            __m256d d0 = gather_lanes< clear_lane, 0 >( b0 );
            __m256d d1 = gather_lanes< clear_lane, 0 >( b1 );
            __m256d d2 = gather_lanes< clear_lane, 0 >( b2 );

            _mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 );
            goto low_end;
          }
          default: // case 3
          {
            v_dst -= 3;
            v_src -= 3;

            s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
            s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
            s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

            __m256d d0 = gather_lanes< 1, 0 >( s0, b0 );
            __m256d d1 = gather_lanes< 1, 0 >( s1, b1 );
            __m256d d2 = gather_lanes< 1, 0 >( s2, b2 );

            _mm256_store_pd( reinterpret_cast< double* >( v_dst ), d0 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), d1 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), d2 );

            v_dst -= 3;

            s0 = gather_lanes< clear_lane, 0 >( s0, b0 );
            s1 = gather_lanes< clear_lane, 0 >( s1, b1 );
            s2 = gather_lanes< clear_lane, 0 >( s2, b2 );

            _mm256_store_pd( reinterpret_cast< double* >( v_dst ), s0 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), s1 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), s2 );
          }
        }
      }
    }

    low_end:
    {
      begin += ( end_vertex - begin_vertex );
      return end_vertex - 1;
    }
  }
  else
  {
    __m256d* e_dst = edge + begin_vertex;
    __m256d* e_src = edge + end_vertex;
    __m256d* e_src_max = e_src + high + 1;

    for( ; e_src < e_src_max; ++e_src, ++e_dst )
    {
      __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( e_src ) );
      _mm256_store_pd( reinterpret_cast< double* >( e_dst ), tmp );
    }

    __m256d s0;
    __m256d s1;
    __m256d s2;

    __m256d* v_dst = vertex_pair( begin_vertex );
    __m256d* v_src = vertex_pair( end_vertex );

    if( re == rb )
    {
      if( re & 0x1 )
      {
        s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) );
        __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) );
        __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) );

        __m256d c0 = blend< 0, 0, 1, 1 >( d0, s0 );
        __m256d c1 = blend< 0, 0, 1, 1 >( d1, s1 );
        __m256d c2 = blend< 0, 0, 1, 1 >( d2, s2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), c0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), c1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), c2 );

        --high;
        v_src += 3;
        v_dst += 3;
      }

      int q = 3 * ( ( high + 1 ) / 2 );

      __m256d* v_src_max = vertex_pair( end + 1 );
      for( ; v_src < v_src_max; ++v_src, ++v_dst )
      {
        __m256d tmp = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), tmp );
      }
    }
    else
    {
      int to_load;

      __m256d s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
      __m256d s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
      __m256d s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

      if( rb ) // re = 0
      {
        __m256d d0 = _mm256_load_pd( reinterpret_cast< double* >( v_dst ) );
        __m256d d1 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 1 ) );
        __m256d d2 = _mm256_load_pd( reinterpret_cast< double* >( v_dst + 2 ) );

        __m256d g0 = gather_lanes< 0, 0 >( d0, s0 );
        __m256d g1 = gather_lanes< 0, 0 >( d1, s1 );
        __m256d g2 = gather_lanes< 0, 0 >( d2, s2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), g0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), g1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), g2 );

        if( high == 1 )
          goto high_end;

        v_dst += 3;
        to_load = high - 2;
      }
      else // re = 1 => only the second lane of the si are used
        to_load = high - 1;

      int q = to_load / 4;
      __m256d* v_src_max = v_src + 6 * q;

      while( v_src < v_src_max )
      {
        v_src += 3;

        __m256d t0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        __m256d t1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        __m256d t2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d u0 = gather_lanes< 1, 0 >( s0, t0 );
        __m256d u1 = gather_lanes< 1, 0 >( s1, t1 );
        __m256d u2 = gather_lanes< 1, 0 >( s2, t2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 );

        v_dst += 3;
        v_src += 3;

        s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d v0 = gather_lanes< 1, 0 >( t0, s0 );
        __m256d v1 = gather_lanes< 1, 0 >( t1, s1 );
        __m256d v2 = gather_lanes< 1, 0 >( t2, s2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 );

        v_dst += 3;
      }

      int r = to_load % 4;

      if( r == 0 )
      {
        __m256d u0 = gather_lanes< 1, clear_lane >( s0 );
        __m256d u1 = gather_lanes< 1, clear_lane >( s1 );
        __m256d u2 = gather_lanes< 1, clear_lane >( s2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 );
      }
      else
      {
        v_src += 3;

        __m256d t0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
        __m256d t1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
        __m256d t2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

        __m256d u0 = gather_lanes< 1, 0 >( s0, t0 );
        __m256d u1 = gather_lanes< 1, 0 >( s1, t1 );
        __m256d u2 = gather_lanes< 1, 0 >( s2, t2 );

        _mm256_store_pd( reinterpret_cast< double* >( v_dst ), u0 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), u1 );
        _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), u2 );

        switch( r )
        {
          case 1:
            goto high_end;
          case 2:
          {
            v_dst += 3;
            __m256d v0 = gather_lanes< 1, clear_lane >( t0 );
            __m256d v1 = gather_lanes< 1, clear_lane >( t1 );
            __m256d v2 = gather_lanes< 1, clear_lane >( t2 );

            _mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 );

            goto high_end;
          }
          default: // case 3
          {
            v_dst += 3;
            v_src += 3;

            s0 = _mm256_load_pd( reinterpret_cast< double* >( v_src ) );
            s1 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 1 ) );
            s2 = _mm256_load_pd( reinterpret_cast< double* >( v_src + 2 ) );

            __m256d v0 = gather_lanes< 1, 0 >( t0, s0 );
            __m256d v1 = gather_lanes< 1, 0 >( t1, s1 );
            __m256d v2 = gather_lanes< 1, 0 >( t2, s2 );

            _mm256_store_pd( reinterpret_cast< double* >( v_dst ), v0 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 1 ), v1 );
            _mm256_store_pd( reinterpret_cast< double* >( v_dst + 2 ), v2 );
          }
        }
      }
    }

    high_end:
    {
      end -= ( end_vertex - begin_vertex );
      return begin_vertex - 1;
    }
  }
}
void bounded_convex::set_to_last_segment()
{
  int last = end - 1;
  edge[ 0 ] = edge[ last ];
  edge[ 1 ] = edge[ end ];
  edge[ 2 ] = edge[ begin + 1 ];

  __m256d* vl = vertex_pair( last );
  __m256d* vb = vertex_pair( begin );

  if( last & 0x1 )
  {
    if( begin & 0x1 )
    {
      vertex_p[ 0 ] = gather_lanes< 1, 1 >( vl[ 0 ], vb[ 0 ] );
      vertex_p[ 1 ] = gather_lanes< 1, 1 >( vl[ 1 ], vb[ 1 ] );
      vertex_p[ 2 ] = gather_lanes< 1, 1 >( vl[ 2 ], vb[ 2 ] );
    }
    else
    {
      vertex_p[ 0 ] = gather_lanes< 1, 0 >( vl[ 0 ], vb[ 0 ] );
      vertex_p[ 1 ] = gather_lanes< 1, 0 >( vl[ 1 ], vb[ 1 ] );
      vertex_p[ 2 ] = gather_lanes< 1, 0 >( vl[ 2 ], vb[ 2 ] );
    }
  }
  else
  {
    if( begin & 0x1 )
    {
      vertex_p[ 0 ] = blend< 0, 0, 1, 1 >( vl[ 0 ], vb[ 0 ] );
      vertex_p[ 1 ] = blend< 0, 0, 1, 1 >( vl[ 1 ], vb[ 1 ] );
      vertex_p[ 2 ] = blend< 0, 0, 1, 1 >( vl[ 2 ], vb[ 2 ] );
    }
    else
    {
      vertex_p[ 0 ] = gather_lanes< 0, 0 >( vl[ 0 ], vb[ 0 ] );
      vertex_p[ 1 ] = gather_lanes< 0, 0 >( vl[ 1 ], vb[ 1 ] );
      vertex_p[ 2 ] = gather_lanes< 0, 0 >( vl[ 2 ], vb[ 2 ] );
    }
  }

  end = 2;
  begin = 0;
  type = bounded_convex_type::segment;
}
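
// Editor's note: like bounds() above, setup() builds its floating point
// constants with integer bit tricks: ones >> 54 leaves 0x3FF in each 64-bit
// element, and shifting that left by 52 produces 0x3FF0'0000'0000'0000, the bit
// pattern of 1.0; ones << 63 isolates the sign bit, and _mm256_slli_si256 by 8
// bytes keeps it only in the upper double of each 128-bit lane.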
void bounded_convex::setup( __m256d box )
{
  type = bounded_convex_type::polygon;

  __m256d zero = zero_d();
  __m256i ones = ones_i();
  __m256i signi = _mm256_slli_epi64( ones, 63 );
  __m256i signi_h = _mm256_slli_si256( signi, 8 );
  __m256d sign = _mm256_castsi256_pd( signi );
  __m256d sign_h = _mm256_castsi256_pd( signi_h );
  __m256i onei = _mm256_srli_epi64( ones, 54 );
  onei = _mm256_slli_epi64( onei, 52 ); // { 1.0, 1.0, 1.0, 1.0 }
  __m256d one = _mm256_castsi256_pd( onei );

  box = _mm256_max_pd( box, zero );

  begin = alloc / 2;
  end = begin + 4;
  __m256d* e = edge + begin;
  __m256d x_box = gather_lanes< 0, 0 >( box ); // { x_min, x_max, x_min, x_max }
  __m256d y_box = gather_lanes< 1, 1 >( box ); // { y_min, y_max, y_min, y_max }

  e[ 0 ] = blend< 0, 1, 0, 0 >( one, zero );     // { 1, 0, 0, 0 }
  e[ 0 ] = blend< 0, 0, 1, 1 >( e[ 0 ], x_box ); // { 1, 0, x_min, x_max }, for x >= x_min

  e[ 1 ] = permute< 1, 0, 2, 3 >( e[ 0 ] );      // { 0, 1, x_min, x_max }
  e[ 1 ] = blend< 0, 0, 1, 1 >( e[ 1 ], y_box ); // { 0, 1, y_min, y_max }, for y >= y_min

  e[ 2 ] = permute< 0, 1, 3, 2 >( e[ 0 ] ); // { 1, 0, x_max, x_min }
  e[ 2 ] = _mm256_xor_pd( e[ 2 ], sign );   // { -1, -0, -x_max, -x_min }, for -x >= -x_max

  e[ 3 ] = permute< 0, 1, 3, 2 >( e[ 1 ] ); // { 0, 1, y_max, y_min }
  e[ 3 ] = _mm256_xor_pd( e[ 3 ], sign );   // { -0, -1, -y_max, -y_min }, for -y >= -y_max

  e[ 4 ] = e[ 0 ]; // { 1, 0, x_min, x_max }

  // initial vertices

  __m256d* vp = vertex_pair( begin );

  vp[ 0 ] = permute< 0, 0, 3, 3 >( x_box );   // { x_min, x_min, x_max, x_max }: vertices 0 and 1
  vp[ 0 ] = _mm256_xor_pd( vp[ 0 ], sign_h ); // { x_min, -x_min, x_max, -x_max }

  vp[ 1 ] = permute< 0, 0, 2, 2 >( y_box );   // { y_min, y_min, y_min, y_min }
  vp[ 1 ] = _mm256_xor_pd( vp[ 1 ], sign_h ); // { y_min, -y_min, y_min, -y_min }

  vp[ 2 ] = _mm256_xor_pd( one, sign_h ); // { 1.0, -1.0, 1.0, -1.0 }

  vp[ 3 ] = permute< 1, 1, 2, 2 >( x_box );   // { x_max, x_max, x_min, x_min }: vertices 2 and 3
  vp[ 3 ] = _mm256_xor_pd( vp[ 3 ], sign_h ); // { x_max, -x_max, x_min, -x_min }

  vp[ 4 ] = permute< 1, 1, 3, 3 >( y_box );   // { y_max, y_max, y_max, y_max }
  vp[ 4 ] = _mm256_xor_pd( vp[ 4 ], sign_h ); // { y_max, -y_max, y_max, -y_max }

  vp[ 5 ] = vp[ 2 ];
}
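
//----------------------------------------------------------------------------------------
// normalize_equations (editor's summary, from the code below): scales each
// constraint of each quadrant by a power of two so that its largest coefficient
// exponent lands at "huge", drops constraints that are identically zero or whose
// right hand side is negligible, and replaces coefficients below "tiny" by 0 or
// -tiny in a way that only enlarges the solution set. The returned bitmask has
// bit iq set when quadrant iq is dropped because its right hand side is too
// large to satisfy.
//----------------------------------------------------------------------------------------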
int normalize_equations( __m256d* quadrant[ 4 ], int n, __m256i tiny, __m256i huge )
{
  constexpr unsigned long long x_index_epi8 = 0x1 << 4;
  constexpr unsigned long long y_index_epi8 = 0x1 << 12;
  constexpr unsigned long long z_index_epi8 = 0x1 << 16;

  int ret = 0;
  __m256i ones = ones_i();                           // { 0xFFFF'FFFF'FFFF'FFFF, ... }
  __m256i exp_mask = _mm256_srli_epi64( ones, 53 );  // { 0x0000'0000'0000'07FF, ... }
  __m256i one = _mm256_srli_epi64( ones, 54 );       // { 0x0000'0000'0000'03FF, ... }
  __m256i sign = _mm256_slli_epi64( ones, 63 );      // { 0x8000'0000'0000'0000, ... }
  exp_mask = _mm256_slli_epi64( exp_mask, 52 );      // { 0x7FF0'0000'0000'0000, ... }
  one = _mm256_slli_epi64( one, 52 );                // { 0x3FF0'0000'0000'0000, ... }
  sign = gather_lanes< 1, clear_lane >( sign );      // { sign_bit, sign_bit, 0, 0 }

  for( int iq = 0; iq < 4; ++iq )
  {
    __m256d* end = quadrant[ iq ] + n;

    while( quadrant[ iq ] < end )
    {
      __m256d* q = quadrant[ iq ];
      __m256i exp = and_i( *q, exp_mask );
      __m256i cmp_huge = _mm256_cmpgt_epi32( exp, huge );

      if( _mm256_testz_si256( cmp_huge, cmp_huge ) ) // no huge entries
      {
        __m256i cmp_tiny = _mm256_cmpgt_epi32( tiny, exp );

        if( _mm256_testz_si256( cmp_tiny, cmp_tiny ) ) // no entries are huge or tiny
        {
          ++quadrant[ iq ];
          continue;
        }

        // no entries are huge, but some are tiny. We scale by 2^( huge - max_exp[ i ] )

        __m256i exp_b = permute< 1, 0, 2, 3 >( exp );  // { exp[ 1 ], exp[ 0 ], exp[ 2 ], ? }
        __m256i exp_c = gather_lanes< 1, 1 >( exp );   // { exp[ 2 ], exp[ 3 ], exp[ 2 ], ? }
        __m256i max = _mm256_max_epi32( exp, exp_b );  // { max( exp[ 0 ], exp[ 1 ] ), ?, ?, ? }
        max = _mm256_max_epi32( max, exp_c );          // { max( exp[ 0 ], exp[ 1 ], exp[ 2 ] ), ... }

        int64_t int_max = _mm256_extract_epi64( max, 0 );
        if( int_max == 0 ) // all exponents are zero.
        {
          __m256d cmp = _mm256_cmp_pd( *q, zero_d(), _CMP_EQ_OQ );
          if( ( _mm256_movemask_pd( cmp ) & 0x7 ) == 0x7 ) // all coefficients are zero
          { // drop the constraint
            *q = *( --end );
            continue;
          }
        }

        max = permute< 0, 0, 0, 0 >( max );
        __m256i de = _mm256_sub_epi32( huge, max );

        // de may be larger than 0x3ff and we must scale in two steps

        *q = xor_d( *q, sign );
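
        // Editor's note: de holds ( huge_exp - max_exp ) << 52, a scaling
        // exponent expressed directly in the exponent field. Adding half of it
        // to the bit pattern of 1.0 yields the double 2^floor( k / 2 ) for
        // k = huge_exp - max_exp, so the two multiplications below scale by
        // 2^floor( k / 2 ) * 2^ceil( k / 2 ) = 2^k; a single factor 2^k need
        // not be representable, since k may exceed the largest double exponent,
        // 0x3FF.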
__m256i hs = _mm256_srli_epi64( de, 1 ); // de/2
|
||
|
hs = _mm256_and_si256( hs, exp_mask ); // throwing away the remainder
|
||
|
__m256d scale = cast_d( _mm256_add_epi64( hs, one ) );
|
||
|
*q = _mm256_mul_pd( scale, *q );
|
||
|
|
||
|
__m256i ls = _mm256_sub_epi64( de, hs ); // de - hs
|
||
|
scale = cast_d( _mm256_add_epi64( ls, one ) );
|
||
|
*q = _mm256_mul_pd( scale, *q );
|
||
|
}
|
||
|
else // some huge entries
|
||
|
{
|
||
|
__m256i exp_b = permute< 1, 0, 2, 3 >( exp ); // { exp[ 1 ], exp[ 0 ], exp[ 2 ], ? }
|
||
|
__m256i exp_c = gather_lanes< 1, 1 >( exp ); // { exp[ 2 ], exp[ 3 ], exp[ 2 ], ? }
|
||
|
__m256i max = _mm256_max_epi32( exp, exp_b ); // { max( exp[ 0 ], exp[ 1 ] ), ?. ?. ? }
|
||
|
max = _mm256_max_epi32( max, exp_c ); // { max{ exp[ 0 ], exp[ 1 ], exp[ 2 ] ), ....}
|
||
|
|
||
|
max = permute< 0, 0, 0, 0 >( max );
|
||
|
__m256i de = _mm256_sub_epi32( huge, max ); // de > -0x3ff
|
||
|
__m256d scale = cast_d( _mm256_add_epi64( de, one ) ); // this does not underflow
|
||
|
|
||
|
*q = xor_d( *q, sign );
|
||
|
*q = _mm256_mul_pd( scale, *q ); // this may be inexact, but the inexact entries will be dropped later
|
||
|
}
|
||
|
|
||
|
// cleaning up the remaining tiny entries
|
||
|
|
||
|
__m256i exp_d = and_i( *q, exp_mask );
|
||
|
__m256i cmp_d = _mm256_cmpgt_epi32( exp_d, tiny ); // 8 x (4 byte masks)
|
||
|
int cmp_d_mask = _mm256_movemask_epi8( cmp_d ); // 8 bits for x, 8 bits for y, 8 bits for z
|
||
|
|
||
|
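      // movemask_epi8 yields one bit per byte, so each double lane contributes 8 bits
      // to cmp_d_mask. x_index_epi8 and y_index_epi8 (defined elsewhere in this
      // library) are assumed here to select the bits of lanes 0 and 1, e.g. 0x00FF and
      // 0xFF00; if any of those bits is set, the x or y coefficient is still above tiny.
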
      if( cmp_d_mask & ( x_index_epi8 | y_index_epi8 ) )
      {
        // In order to preserve the solution set, we must round the right hand side down
        // and the left hand side up, so that every solution of the old problem satisfies
        // the new constraints. Since the signs of the lhs entries were flipped above,
        // rounding every entry down achieves both at once, that is:
        //
        // we replace x in ( -tiny, 0 ) by -tiny and x in [ 0, tiny ) by +0.0
        // ( and fix the zeros later )

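        // A scalar sketch of this rounding rule (an illustration only; tiny stands for
        // the smallest magnitude this code keeps as a normal number):
        //
        //   double round_down_tiny( double x, double tiny )
        //   {
        //     if( x > -tiny && x < 0.0 )  return -tiny; // round down to -tiny
        //     if( x >= 0.0 && x < tiny )  return  0.0;  // round down to +0.0
        //     return x;                                 // already representable
        //   }
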
        __m256d zero = zero_d();
        __m256i exp_f = and_i( *q, exp_mask );
        __m256i full_sign = gather_lanes< 0, 0 >( sign );
        __m256i minus_tiny = _mm256_or_si256( tiny, full_sign );
        __m256i is_tiny_f = _mm256_cmpgt_epi32( tiny, exp_f );
        __m256d non_negative = _mm256_cmp_pd( *q, zero, _CMP_GE_OQ );

        // tiny negatives are replaced by -tiny, tiny non-negatives are replaced by 0.
        __m256d fix = blend( cast_d( minus_tiny ), zero, non_negative );

        // fixing the tiny entries
        *q = blend( *q, fix, is_tiny_f );

        // restoring the rhs sign
        *q = xor_d( *q, sign );

        // cleaning up the zeros
        __m256i abs = _mm256_slli_epi64( cast_i( *q ), 1 ); // throwing away the sign bit
        __m256i is_zero = _mm256_cmpeq_epi64( abs, cast_i( zero ) );

        *q = blend( *q, zero, is_zero );

        ++quadrant[ iq ];
      }
      else
      {
        // | a | <= tiny and | b | <= tiny, and | c | >= huge:
        // the constraint is either irrelevant or fatal.

        int pb_sign = _mm256_movemask_pd( *q );
        if( !( pb_sign & 0x4 ) ) // drop the quadrant: the right hand side is too large
        {
          ret |= ( 0x1 << iq );
          break;
        }

        // else, drop the constraint: the right hand side is too small

        *q = *( --end );
      }
    }
  }

  return ret;
}

} // namespace wm::nlp::lp2d
//----------------------------------------------------------------------------------------

extern "C"
{

int interval_system_2d( double* solution, double const* box, double const* eq,
                        unsigned int n_equations )
{
  constexpr int max_quadrant_size = 32;

  using namespace wm::nlp::lp2d;

  double* solsPtr = solution;

  printf( "box: x_min = %f, x_max = %f, y_min = %f, y_max = %f\n",
          box[ 0 ], box[ 1 ], box[ 2 ], box[ 3 ] );
  printf( "n_equations = %u\n", n_equations );
  for( unsigned int i = 0; i < n_equations; ++i )
  {
    printf( "equation %u: a_min = %f, a_max = %f, b_min = %f, b_max = %f, c_min = %f, c_max = %f\n",
            i, eq[ 6*i ], eq[ 6*i + 1 ], eq[ 6*i + 2 ], eq[ 6*i + 3 ], eq[ 6*i + 4 ], eq[ 6*i + 5 ] );
  }

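  // Expected memory layout (inferred from the debug output above): box holds
  // { x_min, x_max, y_min, y_max } and eq holds six doubles per interval equation,
  // { a_min, a_max, b_min, b_max, c_min, c_max }, for the constraint a x + b y = c
  // with interval coefficients. Both pointers must be 32-byte aligned (checked below).
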
  if( n_equations > ( max_quadrant_size / 2 ) )
    return -1;

  if( std::bit_cast< uint64_t >( box ) & 0x1F )
    return -2;

  if( std::bit_cast< uint64_t >( eq ) & 0x1F )
    return -3;

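  // A minimal caller-side sketch of how to satisfy the checks above (an illustration
  // only; alignas( 32 ) is one portable way to get 32-byte storage, and the solution
  // buffer should be aligned as well, since it is written with aligned AVX stores):
  //
  //   alignas( 32 ) double box[ 4 ]  = { x_min, x_max, y_min, y_max };
  //   alignas( 32 ) double eq[ 6*N ] = { /* a_min, a_max, b_min, b_max, c_min, c_max, ... */ };
  //   alignas( 32 ) double sol[ 16 ]; // at most 4 boxes of 4 doubles
  //   int n = interval_system_2d( sol, box, eq, N ); // N <= max_quadrant_size / 2
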
  int ret = 0;
  int n_equations_q = n_equations / 2;
  int n_equations_r = n_equations % 2;

  // each iteration of the main loop below consumes a pair of equations, i.e., 12 doubles
  double const* eq_max = eq + 12 * n_equations_q;

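  // The equations are processed two at a time: each iteration of the main loop loads
  // 12 doubles ( two equations of 6 doubles each ) and mirrors them into the four
  // quadrants. An odd trailing equation ( n_equations_r == 1 ) is handled separately
  // after the loop.
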
  __m256i huge;
  __m256i tiny;
  __m256i signi = bounds( &tiny, &huge );
  __m256i exp_mask = _mm256_srli_epi64( ones_i(), 53 );
  exp_mask = _mm256_slli_epi64( exp_mask, 52 ); // { 0x7FF0'0000'0000'0000... }

  __m256d sign = cast_d( signi );
  __m256d sign_01 = blend< 0, 0, 1, 1 >( sign, zero_d() ); // { sign_bit, sign_bit, 0, 0 }

  int old_rounding_mode = std::fegetround();

  if( std::fesetround( FE_DOWNWARD ) )
    return -4;

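  // The idea behind FE_DOWNWARD: all arithmetic below rounds toward -infinity, and the
  // lhs coefficients are stored with flipped signs, so rounding a flipped product down
  // is equivalent to rounding the original product up. Every computed constraint is
  // therefore relaxed outward, and no feasible point of the exact problem is cut off.
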
  __m256d quadrants[ 4 * max_quadrant_size ];

  __m256d* q[ 4 ] = { quadrants,
                      quadrants + max_quadrant_size,
                      quadrants + 2 * max_quadrant_size,
                      quadrants + 3 * max_quadrant_size };

  int empty_quadrant = 0;
  double const* p_eq = eq;
  int quadrant_is_not_empty = 0xF;

  for( ; p_eq < eq_max; p_eq += 12 )
  {
    __m256d a0 = _mm256_load_pd( p_eq );
    __m256d aux = _mm256_load_pd( p_eq + 4 );
    __m256d bc1 = _mm256_load_pd( p_eq + 8 );

    // the equations in this group are
    //
    // a0[ 0, 1 ] x + a0[ 2, 3 ] y = aux[ 0, 1 ]
    // aux[ 2, 3 ] x + bc1[ 0, 1 ] y = bc1[ 2, 3 ]

    __m256d bc0 = gather_lanes< 1, 0 >( a0, aux );
    __m256d a1 = gather_lanes< 1, 0 >( aux );

    // the equations in this group are now
    //
    // a0[ 0, 1 ] x + bc0[ 0, 1 ] y = bc0[ 2, 3 ]
    // a1[ 0, 1 ] x + bc1[ 0, 1 ] y = bc1[ 2, 3 ]

    __m256d max_c0 = permute< 0, 1, 3, 2 >( bc0 ); // { ?, ?, bc0[ 3 ], ? }
    __m256d max_c1 = permute< 0, 1, 3, 2 >( bc1 ); // { ?, ?, bc1[ 3 ], ? }

    for( int iq = 0; iq < 4; ++iq ) // iqth quadrant
    {
      q[ iq ][ 0 ] = gather_low( a0, bc0 );  // { a0[ 0 ], bc0[ 0 ], ?, ? }
      q[ iq ][ 1 ] = gather_high( a0, bc0 ); // { a0[ 1 ], bc0[ 1 ], ?, ? }
      q[ iq ][ 2 ] = gather_low( a1, bc1 );  // { a1[ 0 ], bc1[ 0 ], ?, ? }
      q[ iq ][ 3 ] = gather_high( a1, bc1 ); // { a1[ 1 ], bc1[ 1 ], ?, ? }

      q[ iq ][ 0 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 0 ], max_c0 ); // { a0[ 0 ], bc0[ 0 ], bc0[ 3 ], ? }
      q[ iq ][ 1 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 1 ], bc0 );    // { a0[ 1 ], bc0[ 1 ], bc0[ 2 ], ? }
      q[ iq ][ 2 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 2 ], max_c1 ); // { a1[ 0 ], bc1[ 0 ], bc1[ 3 ], ? }
      q[ iq ][ 3 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 3 ], bc1 );    // { a1[ 1 ], bc1[ 1 ], bc1[ 2 ], ? }

      q[ iq ][ 0 ] = xor_d( q[ iq ][ 0 ], sign ); // { -a0[ 0 ], -bc0[ 0 ], -bc0[ 3 ], ? }
      q[ iq ][ 2 ] = xor_d( q[ iq ][ 2 ], sign ); // { -a1[ 0 ], -bc1[ 0 ], -bc1[ 3 ], ? }

      // moving to the next quadrant

      if( iq & 0x1 ) // replacing y by - y
      {
        bc0 = xor_d( bc0, sign_01 );
        bc1 = xor_d( bc1, sign_01 );
        bc0 = permute< 1, 0, 2, 3 >( bc0 );
        bc1 = permute< 1, 0, 2, 3 >( bc1 );
      }
      else // replacing x by - x
      {
        a0 = xor_d( a0, sign_01 );
        a1 = xor_d( a1, sign_01 );
        a0 = permute< 1, 0, 2, 3 >( a0 );
        a1 = permute< 1, 0, 2, 3 >( a1 );
      }
    }

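    //--------------------------------------------------------------------------------------
    // Quadrant bookkeeping: quadrant iq solves the problem after the substitutions
    //
    //   iq = 0 : ( x, y ) -> (  x,  y )    iq = 1 : ( x, y ) -> ( -x,  y )
    //   iq = 2 : ( x, y ) -> ( -x, -y )    iq = 3 : ( x, y ) -> (  x, -y )
    //
    // so each quadrant sees a problem with x >= 0 and y >= 0. Flipping the sign of an
    // interval coefficient also swaps its endpoints, which is why each xor above is
    // paired with a permute that exchanges the min/max lanes.
    //--------------------------------------------------------------------------------------
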
    //----------------------------------------------------------------------------------------
    // Checking for overflow and underflow: to avoid headaches with underflow and overflow,
    // we normalize equations in which some elements are either +0.0 or have absolute value
    // outside the range [ tiny, huge ]. In this process we may detect that the intersection
    // of the feasible region with some quadrants is empty, or that some equations are
    // irrelevant.
    //----------------------------------------------------------------------------------------

    __m256d ra = gather_lanes< 0, 0 >( a0, a1 ); // { row[ 0 ][ 0 ], row[ 0 ][ 1 ], row[ 3 ][ 0 ], row[ 3 ][ 1 ] }
    __m256i expa = and_i( ra, exp_mask );
    __m256i exp0 = and_i( bc0, exp_mask );
    __m256i exp1 = and_i( bc1, exp_mask );

    __m256i min = _mm256_min_epi32( expa, exp0 );
    __m256i max = _mm256_max_epi32( expa, exp0 );
    min = _mm256_min_epi32( min, exp1 );
    max = _mm256_max_epi32( max, exp1 );

    __m256i c_min = _mm256_cmpgt_epi32( tiny, min );
    __m256i c_max = _mm256_cmpgt_epi32( max, huge );
    __m256i bad = or_i( c_min, c_max );

    if( _mm256_testz_si256( bad, bad ) )
    {
      q[ 0 ] += 4;
      q[ 1 ] += 4;
      q[ 2 ] += 4;
      q[ 3 ] += 4;
    }
    else
      empty_quadrant |= normalize_equations( q, 4, tiny, huge );
  }

  if( n_equations_r )
  {
    __m256d ab = _mm256_load_pd( p_eq );
    __m128d aux = _mm_load_pd( p_eq + 4 );

    __m256d sign_23 = gather_lanes< 1, 0 >( sign_01 );

    // the equations in this group are
    // ab[ 0, 1 ] x + ab[ 2, 3 ] y = aux[ 0, 1 ]

    __m256d c = _mm256_castpd128_pd256( aux );
    c = gather_lanes< 0, 0 >( c );
    // the equations in this group are now
    // ab[ 0, 1 ] x + ab[ 2, 3 ] y = c[ 0, 1 ]

    __m256d max_c = permute< 0, 1, 3, 3 >( c ); // { ?, ?, c[ 1 ], ? }

    for( int iq = 0; iq < 4; ++iq ) // iqth quadrant
    {
      q[ iq ][ 0 ] = permute< 0, 2, 2, 3 >( ab ); // { ab[ 0 ], ab[ 2 ], ?, ? }
      q[ iq ][ 1 ] = permute< 1, 3, 2, 3 >( ab ); // { ab[ 1 ], ab[ 3 ], ?, ? }

      q[ iq ][ 0 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 0 ], max_c ); // { ab[ 0 ], ab[ 2 ], c[ 1 ], ? }
      q[ iq ][ 1 ] = blend< 0, 0, 1, 1 >( q[ iq ][ 1 ], c );     // { ab[ 1 ], ab[ 3 ], c[ 0 ], ? }

      q[ iq ][ 0 ] = xor_d( q[ iq ][ 0 ], sign ); // { -ab[ 0 ], -ab[ 2 ], -c[ 1 ], ? }

      // moving to the next quadrant

      if( iq & 0x1 ) // replacing y by - y
      {
        ab = xor_d( ab, sign_23 );
        ab = permute< 0, 1, 3, 2 >( ab );
      }
      else // replacing x by - x
      {
        ab = xor_d( ab, sign_01 );
        ab = permute< 1, 0, 2, 3 >( ab );
      }
    }

    //----------------------------------------------------------------------------------------
    // Checking for overflow and underflow: to avoid headaches with underflow and overflow,
    // we normalize equations in which some elements are either +0.0 or have absolute value
    // outside the range [ tiny, huge ]. In this process we may detect that the intersection
    // of the feasible region with some quadrants is empty, or that some equations are
    // irrelevant.
    //----------------------------------------------------------------------------------------

    __m256i exp_ab = and_i( ab, exp_mask );
    __m256i exp_c = and_i( c, exp_mask );
    exp_c = blend< 0, 0, 1, 1 >( exp_c, exp_ab );

    __m256i min = _mm256_min_epi32( exp_ab, exp_c );
    __m256i max = _mm256_max_epi32( exp_ab, exp_c );

    __m256i c_min = _mm256_cmpgt_epi32( tiny, min );
    __m256i c_max = _mm256_cmpgt_epi32( max, huge );
    __m256i bad = or_i( c_min, c_max );

    if( _mm256_testz_si256( bad, bad ) )
    {
      q[ 0 ] += 2;
      q[ 1 ] += 2;
      q[ 2 ] += 2;
      q[ 3 ] += 2;
    }
    else
      empty_quadrant |= normalize_equations( q, 2, tiny, huge );
  }

  __m256d avx_box = _mm256_load_pd( box );
  __m256d cmp = _mm256_cmp_pd( avx_box, zero_d(), _CMP_GE_OQ );
  int positive_box = _mm256_movemask_pd( cmp );

  __m256d edge[ 2 * ( max_quadrant_size + 2 ) ];
  __m256d vertex_pair[ 3 * ( max_quadrant_size + 2 ) ];
  bounded_convex bc( 2 * ( max_quadrant_size + 2 ), edge, vertex_pair );

  constexpr uint x_min_field = 0x1;
  constexpr uint x_max_field = 0x2;
  constexpr uint y_min_field = 0x4;
  constexpr uint y_max_field = 0x8;

  constexpr uint first_quadrant_field  = 0x1;
  constexpr uint second_quadrant_field = 0x2;
  constexpr uint third_quadrant_field  = 0x4;
  constexpr uint fourth_quadrant_field = 0x8;

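  // positive_box packs the sign tests of { x_min, x_max, y_min, y_max } into bits 0..3,
  // and empty_quadrant carries one bit per quadrant from normalize_equations. For
  // instance, a box that straddles the y axis has x_min < 0 and x_max >= 0, so both
  // the x_max_field branch below and the x_min branch further down run, and up to four
  // quadrant sub-problems may be solved in one call.
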
  if( positive_box & x_max_field ) // x_max >= 0
  {
    if( !( empty_quadrant & first_quadrant_field ) &&
        ( positive_box & y_max_field ) ) // y_max >= 0, first quadrant
    {
      bc.setup( avx_box );
      bc.push( quadrants, q[ 0 ] );
      if( bc.type != bounded_convex_type::empty )
      {
        ++ret;
        __m256d bc_box = bc.box();
        _mm256_store_pd( solution, bc_box );
        solution += 4;
      }
    }

    if( !( empty_quadrant & fourth_quadrant_field ) &&
        !( positive_box & y_min_field ) ) // y_min < 0, fourth quadrant
    {
      __m256i y_sign = _mm256_slli_epi64( ones_i(), 63 );
      y_sign = gather_lanes< clear_lane, 1 >( y_sign );

      __m256d q_box = permute< 0, 1, 3, 2 >( avx_box );
      q_box = xor_d( q_box, y_sign );

      bc.setup( q_box );
      bc.push( quadrants + 3 * max_quadrant_size, q[ 3 ] );

      if( bc.type != bounded_convex_type::empty )
      {
        ++ret;
        __m256i y_sign = _mm256_slli_epi64( ones_i(), 63 );
        y_sign = gather_lanes< clear_lane, 1 >( y_sign );

        __m256d bc_box = bc.box();
        bc_box = xor_d( bc_box, y_sign );
        bc_box = permute< 0, 1, 3, 2 >( bc_box );
        _mm256_store_pd( solution, bc_box );
        solution += 4;
      }
    }
  }

  if( !( positive_box & x_min_field ) ) // x_min < 0
  {
    if( !( empty_quadrant & second_quadrant_field ) &&
        ( positive_box & y_max_field ) ) // y_max >= 0, second quadrant
    {
      __m256i x_sign = _mm256_slli_epi64( ones_i(), 63 );
      x_sign = gather_lanes< 0, clear_lane >( x_sign );

      __m256d q_box = permute< 1, 0, 2, 3 >( avx_box );
      q_box = xor_d( q_box, x_sign );

      bc.setup( q_box );
      bc.push( quadrants + max_quadrant_size, q[ 1 ] );

      if( bc.type != bounded_convex_type::empty )
      {
        ++ret;
        __m256d bc_box = bc.box();
        __m256i x_sign = _mm256_slli_epi64( ones_i(), 63 );
        x_sign = gather_lanes< 0, clear_lane >( x_sign );
        bc_box = xor_d( bc_box, x_sign );
        bc_box = permute< 1, 0, 2, 3 >( bc_box );
        _mm256_store_pd( solution, bc_box );
        solution += 4;
      }
    }

    if( !( empty_quadrant & third_quadrant_field ) &&
        !( positive_box & y_min_field ) ) // y_min < 0, third quadrant
    {
      __m256i sign = _mm256_slli_epi64( ones_i(), 63 );
      avx_box = permute< 1, 0, 3, 2 >( avx_box );
      avx_box = xor_d( avx_box, sign );
      bc.setup( avx_box );
      bc.push( quadrants + 2 * max_quadrant_size, q[ 2 ] );

      if( bc.type != bounded_convex_type::empty )
      {
        ++ret;
        __m256d bc_box = bc.box();
        __m256i sign = _mm256_slli_epi64( ones_i(), 63 );
        bc_box = xor_d( bc_box, sign );
        bc_box = permute< 1, 0, 3, 2 >( bc_box );
        _mm256_store_pd( solution, bc_box );
        solution += 4; // kept consistent with the other quadrants
      }
    }
  }

  std::fesetround( old_rounding_mode );

  for( int i = 0; i < ret; ++i )
  {
    printf( "solution %d: x_min = %f, x_max = %f, y_min = %f, y_max = %f\n",
            i, solsPtr[ 4*i ], solsPtr[ 4*i + 1 ], solsPtr[ 4*i + 2 ], solsPtr[ 4*i + 3 ] );
  }

  return ret;
}

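//----------------------------------------------------------------------------------------
// Calling convention for this C entry point: a negative return reports a precondition
// failure ( -1: too many equations, -2 / -3: box or eq not 32-byte aligned, -4: the
// rounding mode could not be set ), and a non-negative return is the number of
// per-quadrant bounding boxes written to solution, four doubles each.
//----------------------------------------------------------------------------------------
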
} // extern "C"
//----------------------------------------------------------------------------------------