metabrush/brush-strokes/cbits/lp_2d.hpp

#pragma once

//----------------------------------------------------------------------------------------
// Copyright (c) 2024 Walter Mascarenhas
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// The MPL has a "Disclaimer of Warranty" and a "Limitation of Liability",
// and we provide no guarantee regarding the correctness of this source code.
//
//----------------------------------------------------------------------------------------

#include <bit>
#include <cassert>
#include <immintrin.h>

typedef unsigned int uint;

//----------------------------------------------------------------------------------------
namespace wm::nlp::lp2d {

//----------------------------------------------------------------------------------------
// avx2 wrappers, to simplify the notation a bit and clarify the use of immediates
//----------------------------------------------------------------------------------------

inline __m256d and_d( __m256d x, __m256d y )
{
  return _mm256_and_pd( x, y );
}

inline __m256i and_i( __m256i x, __m256d y )
{
  __m256i yi = _mm256_castpd_si256( y );
  return _mm256_and_si256( x, yi );
}

inline __m256i and_i( __m256d x, __m256i y )
{
  __m256i xi = _mm256_castpd_si256( x );
  __m256i  r = _mm256_and_si256( xi, y );
  return r;
}

inline __m256i and_i( __m256i x, __m256i y )
{
  return _mm256_and_si256( x, y );
}

inline __m256d and_not_d( __m256i x, __m256i y )
{
  __m256d xd = _mm256_castsi256_pd( x );
  __m256d yd = _mm256_castsi256_pd( y );
  return _mm256_andnot_pd( xd, yd );
}

inline __m256d and_not_d( __m256d x, __m256d y )
{
  return _mm256_andnot_pd( x, y );
}

inline __m256i and_not_i( __m256i x, __m256i y )
{
  __m256d xd = _mm256_castsi256_pd( x );
  __m256d yd = _mm256_castsi256_pd( y );
  __m256d rd = _mm256_andnot_pd( xd, yd );
  __m256i r  = _mm256_castpd_si256( rd );
  return r;
}

template < uint I0, uint I1, uint I2, uint I3 >
requires ( ( I0 < 2 ) && ( I1 < 2 ) && ( I2 < 2 ) && ( I3 < 2 ) )
__m256d blend( __m256d x, __m256d y )
{
  __m256d p = _mm256_blend_pd( x, y, ( I0 | ( I1 << 1 ) | ( I2 << 2 ) | ( I3 << 3 ) ) );
  return p;
}

template < uint I0, uint I1, uint I2, uint I3 >
requires ( ( I0 < 2 ) && ( I1 < 2 ) && ( I2 < 2 ) && ( I3 < 2 ) )
__m256i blend( __m256i x, __m256i y )
{
  __m256d xd = _mm256_castsi256_pd( x );
  __m256d yd = _mm256_castsi256_pd( y );
  __m256d pd = _mm256_blend_pd( xd, yd, ( I0 | ( I1 << 1 ) | ( I2 << 2 ) | ( I3 << 3 ) ) );
  __m256i p = _mm256_castpd_si256( pd );
  return p;
}

inline __m256d blend( __m256d a, __m256d b, __m256d mask )
{
  __m256d blended = _mm256_blendv_pd( a, b, mask );
  return blended;
}

inline __m256d blend( __m256d a, __m256d b, __m256i mask )
{
  __m256d mask_d  = _mm256_castsi256_pd( mask );
  __m256d blended = _mm256_blendv_pd( a, b, mask_d );
  return blended;
}

inline __m256i blend( __m256i a, __m256i b, __m256i mask )
{
  __m256d a_d       = _mm256_castsi256_pd( a );
  __m256d b_d       = _mm256_castsi256_pd( b );
  __m256d mask_d    = _mm256_castsi256_pd( mask );
  __m256d blended_d = _mm256_blendv_pd( a_d, b_d, mask_d );
  __m256i blended   = _mm256_castpd_si256( blended_d );
  return blended;
}

inline __m256d blend_max( __m256d x, __m256d y, __m256d greater_mask )
{
  __m256d max = blend( y, x, greater_mask );
  return max;
}

inline __m256d blend_min( __m256d x, __m256d y, __m256d greater_mask )
{
  __m256d min = blend( x, y, greater_mask );
  return min;
}

inline __m256d cast_d( __m256i x )
{
  return _mm256_castsi256_pd( x );
}

inline __m256i cast_i( __m256d x )
{
  return _mm256_castpd_si256( x );
}

//----------------------------------------------------------------------------------------
//
// gather_lanes< a, b >( x ), with a,b in { 0, 1, 2 } returns { x.lane( a ), x.lane( b ) }
// with x.lane( 2 ) = 0
//
//----------------------------------------------------------------------------------------

constexpr int clear_lane = 8;

template < uint I0, uint I1 >
requires ( ( ( I0 < 2 ) || ( I0 == clear_lane ) ) && ( ( I1 < 2 ) || ( I1 == clear_lane ) ) )
__m256i gather_lanes( __m256i x )
{
  constexpr int lane_0 = I0;
  constexpr int lane_1 = I1 << 4;
  __m256d xd = _mm256_castsi256_pd( x );
  __m256d gd = _mm256_permute2f128_pd( xd, xd, lane_0 | lane_1 );
  __m256i g  = _mm256_castpd_si256( gd );
  return g;
}

template < uint I0, uint I1 >
requires ( ( ( I0 < 2 ) || ( I0 == clear_lane ) ) && ( ( I1 < 2 ) || ( I1 == clear_lane ) ) )
__m256d gather_lanes( __m256d x )
{
  constexpr int lane_0 = I0;
  constexpr int lane_1 = I1 << 4;

  __m256d p;
  if constexpr ( ( I0 == 0 ) && ( I1 == 1 ) )
    p = x;
  else
    p = _mm256_permute2f128_pd( x, x, lane_0 | lane_1 );
  return p;
}

//----------------------------------------------------------------------------------------
//
// gather_lanes< a, b >( x, y ), with a,b in { 0, 1, clear_lane } returns { x.lane( a ), y.lane( b ) }
// with x.lane( clear_lane ) = 0
//
//----------------------------------------------------------------------------------------

template < uint I0, uint I1 >
requires ( ( ( I0 < 2 ) || ( I0 == clear_lane ) ) && ( ( I1 < 2 ) || ( I1 == clear_lane ) ) )
__m256d gather_lanes( __m256d x, __m256d y )
{
  constexpr int lane_0 = I0;
  constexpr int lane_1 = ( (I1 == clear_lane) ? clear_lane : ( I1 + 2 ) ) << 4;

  __m256d p;
  if( ( I0 == 0 ) && ( I1 == 1 ) )
    p = blend< 0, 0, 1, 1 >( x, y );
  else
    p = _mm256_permute2f128_pd( x, y, lane_0 | lane_1 );
  return p;
}

//----------------------------------------------------------------------------------------
// gather_high returns { x[ 1 ], y[ 1 ], x[ 3 ], y[ 3 ] }
//----------------------------------------------------------------------------------------

inline __m256d gather_high( __m256d x, __m256d y )
{
  return _mm256_unpackhi_pd( x, y );
}

//----------------------------------------------------------------------------------------
// gather_low returns { x[ 0 ], y[ 0 ], x[ 2 ], y[ 2 ] }
//----------------------------------------------------------------------------------------

inline __m256d gather_low( __m256d x, __m256d y )
{
  return _mm256_unpacklo_pd( x, y );
}

//----------------------------------------------------------------------------------------
// high_bits_64 returns the bits 63, 64 + 63, 128 + 63 and 196 + 63 of x
//----------------------------------------------------------------------------------------

inline int high_bits_64( __m256d x )
{
  return _mm256_movemask_pd( x );
}

inline int high_bits_64( __m256i x )
{
  __m256d xd = cast_d( x );
  return _mm256_movemask_pd( xd );
}

//----------------------------------------------------------------------------------------
// gather_low returns { x[ 0 ], y[ 0 ], x[ 2 ], y[ 2 ] }
//----------------------------------------------------------------------------------------

inline __m256d is_greater( __m256d x, __m256d y )
{
  __m256d gt = _mm256_cmp_pd( x, y, _CMP_GT_OQ );
  return gt;
}

inline __m256i ones_i()
{
  __m256i r;
  return _mm256_cmpeq_epi64( r, r );
}

inline __m256d or_d( __m256d x, __m256d y )
{
  return _mm256_or_pd( x, y );
}

inline __m256i or_i( __m256i x, __m256i y )
{
  return _mm256_or_si256( x, y );
}

template < uint I0, uint I1, uint I2, uint I3 >
requires ( ( I0 < 4 ) && ( I1 < 4 ) && ( I2 < 4 ) && ( I3 < 4 ) )
__m256d permute( __m256d x )
{
  __m256d p;
  if constexpr ( ( I0 < 2 ) && ( I1 < 2 ) && ( I2 > 1 ) && ( I3 > 1 ) )
    p = _mm256_permute_pd( x, ( I0 | ( I1 << 1 ) | ( ( I2 - 2 ) << 2 ) | ( ( I3 - 2 ) << 3 ) ) );
  else
    p = _mm256_permute4x64_pd( x, ( I0 | ( I1 << 2 ) | ( I2 << 4 ) | ( I3 << 6 ) ) );
  return p;
}

template < uint I0, uint I1, uint I2, uint I3 >
requires ( ( I0 < 4 ) && ( I1 < 4 ) && ( I2 < 4 ) && ( I3 < 4 ) )
__m256i permute( __m256i x )
{
  __m256d xd = _mm256_castsi256_pd( x );
  __m256d pd = permute< I0, I1, I2, I3 >( xd );
  __m256i p = _mm256_castpd_si256( pd );
  return p;
}

inline __m256d xor_d( __m256d x, __m256d y )
{
  return _mm256_xor_pd( x, y );
}

inline __m256d xor_d( __m256d x, __m256i y )
{
  __m256d yd = cast_d( y );
  return _mm256_xor_pd( x, yd );
}

inline __m256d xor_d( __m256i x, __m256d y )
{
  __m256d xd = cast_d( x );
  return _mm256_xor_pd( xd, y );
}

inline __m256i xor_i( __m256i x, __m256i y )
{
  return _mm256_xor_si256( x, y );
}

inline __m256d zero_d()
{
  __m256d r;
  return _mm256_xor_pd( r, r );
}

inline __m256i zero_i()
{
  __m256i r;
  return _mm256_xor_si256( r, r );
}

//----------------------------------------------------------------------------------------
// avx2 wrappers, to simplify the notation a bit and clarify the use of immediates
//----------------------------------------------------------------------------------------

enum class location
{
  null   =  -1024,
  out    =  -1,
  border =   0,
  in     =   1
};

enum class bounded_convex_type
{
  empty = 0,
  point = 1,
  segment = 2,
  polygon = 3
};

//----------------------------------------------------------------------------------------
// vertex_0 and vertex_1 scatter the arrays
//
// {   inf_y,   inf_z, inf_x,   ? } and
// { m_sup_y, m_sup_z, m_sup_x, ? }
//
//  to vertex_pair, in the x, y, z order
//----------------------------------------------------------------------------------------

struct vertex_0
{
  static void scatter( __m256d* vertex_pair, __m256d inf, __m256d m_sup )
  {
    __m256d y = gather_low(  inf, m_sup );  // { inf( y ), m_sup( y ), inf( x ), m_sup( x ) }
    __m256d z = gather_high( inf, m_sup );  // { inf( z ), m_sup( z ), ? ? }
    __m256d x = gather_lanes< 1, 1 >( y );  // { inf( x ), m_sup( x ), ? ? }

    vertex_pair[ 0 ] = blend< 0, 0, 1, 1 >( x, vertex_pair[ 0 ] );
    vertex_pair[ 1 ] = blend< 0, 0, 1, 1 >( y, vertex_pair[ 1 ] );
    vertex_pair[ 2 ] = blend< 0, 0, 1, 1 >( z, vertex_pair[ 2 ] );
  }
};

struct vertex_1
{
  static void scatter( __m256d* vertex_pair, __m256d inf, __m256d m_sup )
  {
    __m256d x = gather_low(  inf, m_sup );  // { inf( y ), m_sup( y ), inf( x ), m_sup( x ) }
    __m256d z = gather_high( inf, m_sup );  // { inf( z ), m_sup( z ),        ?,          ? }
    __m256d y = gather_lanes< 0, 0 >( x );  // {        ?,          ?, inf( y ), m_sup( y ) }
            z = gather_lanes< 0, 0 >( z );  // {        ?.           . inf( z ), m_sup( z ) }

    vertex_pair[ 0 ] = blend< 0, 0, 1, 1 >( vertex_pair[ 0 ], x );
    vertex_pair[ 1 ] = blend< 0, 0, 1, 1 >( vertex_pair[ 1 ], y );
    vertex_pair[ 2 ] = blend< 0, 0, 1, 1 >( vertex_pair[ 2 ], z );
  }
};

//----------------------------------------------------------------------------------------
// cross computes the solution ( x / z, y / z ) of the equation
//
// a[ 0 ] x + a[ 1 ] y = a[ 2 ]
// b[ 0 ] x + b[ 1 ] y = b[ 2 ]
//
// which is
//
// x = a[ 2 ] b[ 1 ] - a[ 1 ] b[ 2 ]
// y = a[ 0 ] b[ 2 ] - a[ 2 ] b[ 0 ]
// z = a[ 0 ] b[ 1 ] - a[ 1 ] b[ 0 ]
//
// which is convenient to order as
//
// y = a[ 0 ] b[ 2 ] - a[ 2 ] b[ 0 ]
// z = a[ 0 ] b[ 1 ] - a[ 1 ] b[ 0 ]
// x = a[ 2 ] b[ 1 ] - a[ 1 ] b[ 2 ]
//
// It assumes that x, y >= 0 and z > 0 and makes sure that the rounded values of
// x, y and z also satisfy these lower bounds.
//----------------------------------------------------------------------------------------

template < class Vertex >
inline void cross( __m256d* vertex_pair, __m256d a, __m256d b )
{
  __m256d inf_a   = _mm256_movedup_pd( a );     // { a[ 0 ], a[ 0 ], a[ 2 ], ? }
  __m256d m_sup_b = _mm256_movedup_pd( b );     // { b[ 0 ], b[ 0 ], b[ 2 ], ? }

  __m256d inf_b   = permute< 2, 1, 1, 3 >( b ); // { b[ 2 ], b[ 1 ], b[ 1 ], ? }
  __m256d m_sup_a = permute< 2, 1, 1, 3 >( a ); // { a[ 2 ], a[ 1 ], a[ 1 ], ? }

  __m256d inf_h   = _mm256_mul_pd( inf_a,   inf_b );
  __m256d m_sup_h = _mm256_mul_pd( m_sup_a, m_sup_b );

  __m256d inf   = _mm256_fnmadd_pd( m_sup_a, m_sup_b, inf_h ); // inf(x)  = rounded_down(  x )
  __m256d m_sup = _mm256_fnmadd_pd( inf_a, inf_b, m_sup_h   ); // -sup(x) = rounded_down( -x )

  // x,y must be >= 0 and z must be positive and the computed values violate this due to
  // rounding. In this rare case in which inf[ i ] <= 0 I can prove that the exact value
  // of inf[ i ] is m_sup[ i ] - inf[ i ]

  __m256d zero = zero_d();
  __m256d cmp  = _mm256_cmp_pd( inf, zero, _CMP_LE_OQ ); // rounded_down( x ) <= 0 ?
  __m256d ds = _mm256_sub_pd( m_sup, inf );
  inf = blend( inf, ds, cmp );

  Vertex::scatter( vertex_pair, inf, m_sup );
}

//----------------------------------------------------------------------------------------
// exact_location returns the exact location of the intersection ( x, y ) of the lines
//
// b0[ 0 ] x + b0[ 1 ] y >= b0[ 2 ]
// b1[ 1 ] x + b1[ 1 ] y >= b1[ 2 ]
//
// with respect to the set
//
// e0 x + e1 y >= e2
//
// under the assumption that
//
// d = b0[ 0 ] b1[ 1 ] - b0[ 1 ] b1[ 1 ] > 0
//
// where ei = edge[ i ],  b0 = border[ 0 ] and b1 = b[ 1 ]
//
// This amounts to evaluating the sign of
//
// p = e0 r + e1 s - e2 d
//
// r = b0[ 2 ] b1[ 1 ] - b0[ 1 ] b1[ 2 ]
// s = b0[ 0 ] b1[ 2 ] - b0[ 2 ] b1[ 0 ] ,
//
// which motivates the evaluation of the six products
//
//   group 1                    group 2
//  e0 b0[ 2 ] b1[ 1 ],   -e0 b0[ 1 ] b1[ 2 ]
//  e1 b0[ 0 ] b1[ 2 ],   -e1 b0[ 2 ] b1[ 0 ]
//  e2 b0[ 1 ] b1[ 0 ].   -e2 b0[ 0 ] b1[ 1 ]
//
// and we want to find the sign of their sum.
//
// The function assumes that the edges entries are either normal or zero, ie.,
// there are no subnormal entries.
//----------------------------------------------------------------------------------------

location exact_location( __m256d e, __m256d b0, __m256d b1 );

//----------------------------------------------------------------------------------------
// The struct bounded convex represents a bounded convex set on the plane, which can
// have one of the following types:
//
// a) empty
// b) ṕoint
// c) segment
// d) polygon
//
//----------------------------------------------------------------------------------------

struct bounded_convex final
{
  constexpr bounded_convex( int alloc, __m256d* edge, __m256d* vertex_p )
  : alloc( alloc ), edge( edge ), vertex_p( vertex_p ){}

  void setup( __m256d box );

  int insert_vertex( int at );
  int remove_vertices( int begin_vertex, int end_vertex );

  void set_empty()
  {
    end = 0;
    begin = 0;
    type = bounded_convex_type::empty;
  }

  void set_point( int first_vertex )
  {
    end = 1;
    begin = 0;
    type = bounded_convex_type::point;
    edge[ 0 ] = edge[ first_vertex     ];
    edge[ 1 ] = edge[ first_vertex + 1 ];

    __m256d* src = vertex_pair( first_vertex );

    if( first_vertex & 0x1 )
    {
      vertex_p[ 0 ] = gather_lanes< 1, 1 >( src[ 0 ] );
      vertex_p[ 1 ] = gather_lanes< 1, 1 >( src[ 1 ] );
      vertex_p[ 2 ] = gather_lanes< 1, 1 >( src[ 2 ] );
    }
    else
    {
      vertex_p[ 0 ] = src[ 0 ];
      vertex_p[ 1 ] = src[ 1 ];
      vertex_p[ 2 ] = src[ 2 ];
    }
  }

  void set_to_last_segment();

  void set_segment( int first_vertex )
  {
    end = 2;
    begin = 0;
    type = bounded_convex_type::segment;
    edge[ 0 ] = edge[ first_vertex     ];
    edge[ 1 ] = edge[ first_vertex + 1 ];
    edge[ 2 ] = edge[ first_vertex + 2 ];

    __m256d* src = vertex_pair( first_vertex );

    if( first_vertex & 0x1 )
    {
      vertex_p[ 0 ] = gather_lanes< 1, 0 >( src[ 0 ], src[ 3 ] );
      vertex_p[ 1 ] = gather_lanes< 1, 0 >( src[ 1 ], src[ 4 ] );
      vertex_p[ 2 ] = gather_lanes< 1, 0 >( src[ 2 ], src[ 5 ] );
    }
    else
    {
      vertex_p[ 0 ] = *src;
      vertex_p[ 1 ] = *( ++src );
      vertex_p[ 2 ] = *( ++src );
    }
  }

  __m256d box() const;
  void push( __m256d* begin, __m256d* end );

  __m256d plain_edge( int i ) const
  {
    return edge[ begin + i ];
  }

  void plain_vertex( __m256d* inf_msup, int i ) const
  {
    int ri = ( begin + i );
    __m256d const* vp = vertex_pair( ri );
    int di = 2 * ( ( begin + i ) & 0x1 );
    inf_msup[ 0 ] = _mm256_setr_pd( vp[ 0 ][ di     ], vp[ 1 ][ di     ], vp[ 2 ][ di     ], 0.0 );
    inf_msup[ 1 ] = _mm256_setr_pd( vp[ 0 ][ di + 1 ], vp[ 1 ][ di + 1 ], vp[ 2 ][ di + 1 ], 0.0 );
  }

  __m256d* vertex_pair( int index )
  {
    __m256d* p = vertex_p + 3 * ( index / 2 );
    return p;
  }

  __m256d const* vertex_pair( int index ) const
  {
    __m256d* p = vertex_p + 3 * ( index / 2 );
    return p;
  }

  int end;
  int begin;
  int const alloc;
  __m256d* const edge;
  __m256d* const vertex_p;
  bounded_convex_type type;
};

// huge = { 0x5500'0000'0000'0000, 0x5500'0000'0000'0000, 0x5500'0000'0000'0000, 0x7FFF'7FFF'7FFF' }
// tiny = { 0x2bd0'0000'0000'0000, 0x2bd0'0000'0000'0000, 0x2bd0'0000'0000'0000, 0x0000'0000'0000'00000 }

__m256i bounds( __m256i* tiny, __m256i* huge );

int normalize_equations( __m256d* quadrant[ 4 ], int n, __m256i tiny, __m256i huge );

//----------------------------------------------------------------------------------------
//
// The class locator is used to find the location of a point with respect to the half
// plane.
//
// a x + b y >= c z
//
// The template parameters indicate the signs of the coefficients a, b and c.
//
//----------------------------------------------------------------------------------------

template < int flip_x, int flip_y, int flip_z >
struct locator final
{
  //----------------------------------------------------------------------------------------
  // The raw method assumes that
  //
  //  edges[ 0 ] = { a, a, a, a }
  //  edges[ 1 ] = { b, b, b, b }
  //  edges[ 2 ] = { c, c, c, c }
  //
  //  vertex_pair[ 0 ] = { x0_inf, x0_msup, x1_inf, x1_msup }
  //  vertex_pair[ 1 ] = { y0_inf, y0_msup, y1_inf, y1_msup }
  //  vertex_pair[ 2 ] = { z0_inf, z0_msup, z1_inf, z1_msup }
  //
  // and checks the location of the vertices ( x0, y0 ) and ( x1, y1 ) with respect to
  // the half plane
  //
  // (-1)^flip_x a x + (-1)^flip_y b y >= (-1)^flip_z c z
  //
  // This is done by computing a x + b y - c z  rounded up and down.
  //
  //----------------------------------------------------------------------------------------

  static int raw( __m256d const* edge, __m256d const* vertex_pair )
  {
    __m256d chk;
    __m256d zero_d = _mm256_xor_pd( zero_d, zero_d );

    if constexpr ( flip_x )
      chk = _mm256_mul_pd( edge[ 0 ], permute< 1, 0, 3, 2 >( vertex_pair[ 0 ] ) );
    else
      chk = _mm256_mul_pd( edge[ 0 ], vertex_pair[ 0 ] );

    if constexpr ( flip_y)
      chk = _mm256_fmadd_pd( edge[ 1 ], permute< 1, 0, 3, 2 >( vertex_pair[ 1 ] ), chk );
    else
      chk = _mm256_fmadd_pd( edge[ 1 ], vertex_pair[ 1 ], chk );

    if constexpr ( flip_z )
      chk = _mm256_fmadd_pd( edge[ 2 ], vertex_pair[ 2 ], chk );
    else
      chk = _mm256_fmadd_pd( edge[ 2 ], permute< 1, 0, 3, 2 >( vertex_pair[ 2 ] ), chk );

    __m256d cmp = _mm256_cmp_pd( chk, zero_d, _CMP_GT_OQ );

    int cm = _mm256_movemask_pd( cmp );

    return cm;
  }

  static location locate_vertex_0( __m256d const* edge, __m256d const* vertex_pair )
  {
    location loc;
    int cm = raw( edge, vertex_pair );

    if( cm & 0x1 ) // inf( x[ 0 ] ) > 0
      loc = location::in;
    else
    {
      if( cm & 0x2 )
        loc = location::out;
      else
        loc = location::null;
    }

    return loc;
  }

  static location locate_vertex_1( __m256d const* edge, __m256d const* vertex_pair )
  {
    location loc;
    int cm = raw( edge, vertex_pair );

    if( cm & 0x4 ) // the second vertex is in
      loc = location::in;
    else
    {
      if( cm & 0x8 ) // the second vertex is out
        loc = location::out;
      else
        loc = location::null;
    }

    return loc;
  }

  static void locate_pair( location loc[ 2 ],  __m256d const* edge, __m256d const* vertex_pair )
  {
    // computing a vertex[ 0 ] + b vertex[ 1 ] + c vertex[ 2 ]

    int cm = raw( edge, vertex_pair );
    if( cm & 0x1 ) // the first vertex is in
      loc[ 0 ] = location::in;
    else
    {
      if( cm & 0x2 ) // the first vertex is out
        loc[ 0 ] = location::out;
      else
        loc[ 0 ] = location::null;
    }

    if( cm & 0x4 ) // the second vertex is in
      loc[ 1 ] = location::in;
    else
    {
      if( cm & 0x8 ) // the second vertex is out
        loc[ 1 ] = location::out;
      else
        loc[ 1 ] = location::null;
    }
  }

  static void insert( bounded_convex& r, __m256d new_edge )
  {
    __m256i ones = _mm256_cmpeq_epi64( ones, ones );
            ones = _mm256_slli_epi64( ones, 63 );
    __m256d sign = _mm256_castsi256_pd( ones );

    __m256d coefficients[ 3 ] = { permute< 0, 0, 0, 0 >( new_edge ),
                                  permute< 1, 1, 1, 1 >( new_edge ),
                                  permute< 2, 2, 2, 2 >( new_edge ) };

    coefficients[ 0 ] = _mm256_andnot_pd( sign, coefficients[ 0 ] );
    coefficients[ 1 ] = _mm256_andnot_pd( sign, coefficients[ 1 ] );
    coefficients[ 2 ] = _mm256_andnot_pd( sign, coefficients[ 2 ] );

    if( r.type != bounded_convex_type::polygon )
    {
      if( r.type == bounded_convex_type::point )
      {
        location loc = locate_vertex_0( coefficients, r.vertex_p );

        if( loc == location::null )
          loc = exact_location( new_edge, r.edge[ 0 ], r.edge[ 1 ] );

        if( loc == location::out )
          r.set_empty();

        return;
      }

      location loc[ 2 ];
      locate_pair( loc, coefficients, r.vertex_p );

      if( loc[ 0 ] == location::null )
        loc[ 0 ] = exact_location( new_edge, r.edge[ 0 ], r.edge[ 1 ] );

      if( loc[ 1 ] == location::null )
        loc[ 1 ] = exact_location( new_edge, r.edge[ 1 ], r.edge[ 2 ] );

      switch( loc[ 0 ] )
      {
        case location::in:
        {
          if( loc[ 1 ] == location::out )
          {
            r.edge[ 2 ] = new_edge;
            cross< vertex_1 >( r.vertex_p, r.edge[ 1 ], new_edge );
          }
          return;
        }
        case location::border:
        {
          if( loc[ 1 ] == location::out )
          {
            r.end  = 1;
            r.type = bounded_convex_type::point;
          }
          return;
        }
        default: // location::out
        {
          switch( loc[ 1 ] )
          {
            case location::in:
            {
              r.edge[ 0 ] = new_edge;
              cross< vertex_0 >( r.vertex_p, new_edge, r.edge[ 1 ] );
              return;
            }
            case location::border:
            {
              r.end = 1;
              r.edge[ 0 ] = r.edge[ 1 ];
              r.edge[ 1 ] = r.edge[ 2 ];
              r.type = bounded_convex_type::point;
              r.vertex_p[ 0 ] = gather_lanes< 1, 1 >( r.vertex_p[ 0 ] );
              r.vertex_p[ 1 ] = gather_lanes< 1, 1 >( r.vertex_p[ 1 ] );
              r.vertex_p[ 2 ] = gather_lanes< 1, 1 >( r.vertex_p[ 2 ] );
              return;
            }
            default:
            {
              r.set_empty();
              return;
            }
          }
        }
      }
    }

    location loc[ 2 ];

    int index = r.begin;
    int intersection = -1;
    bool intersection_at_vertex;

    __m256d* vertex_pair_at_index = r.vertex_pair( index );

    auto scan = [ & ]()
    {
      if( ++index >= r.end )
        return location::null;

      if( index & 0x1 )
        return loc[ 1 ];

      vertex_pair_at_index += 3;

      if( index + 1 >= r.end )
      {
        loc[ 0 ] = locate_vertex_0( coefficients, vertex_pair_at_index );

        if( loc[ 0 ] == location::null )
          loc[ 0 ] = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] );
      }
      else
      {
        locate_pair( loc, coefficients, vertex_pair_at_index );

        if( loc[ 0 ] == location::null )
          loc[ 0 ] = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ]  );

        if( loc[ 1 ] == location::null )
          loc[ 1 ] = exact_location( new_edge, r.edge[ index + 1 ], r.edge[ index + 2 ] );
      }

      return loc[ 0 ];
    };

    if( index & 0x1 )
    {
      location begin = locate_vertex_1( coefficients, vertex_pair_at_index );

      if( begin == location::null )
        begin = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] );

      switch( begin )
      {
        case location::in:
          goto cursor_in;
        case location::border:
        {
          intersection = index;
          intersection_at_vertex = true;

          ++index;
          vertex_pair_at_index += 3;

          locate_pair( loc, coefficients, vertex_pair_at_index );

          if( loc[ 0 ] == location::null )
            loc[ 0 ] = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] );

          if( loc[ 1 ] == location::null )
            loc[ 1 ] = exact_location( new_edge, r.edge[ index + 1 ], r.edge[ index + 2 ] );

          switch( loc[ 0 ] )
          {
            case location::in:
              goto cursor_in;
            case location::border:
            {
              if( loc[ 1 ] == location::out )
                r.set_segment( r.begin );
              return;
            }
            case location::out:
              goto cursor_out;
          }
        }
        default:
          goto cursor_out;
      }
    }

    locate_pair( loc, coefficients, vertex_pair_at_index );

    if( loc[ 0 ] == location::null )
      loc[ 0 ] = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] );

    if( loc[ 1 ] == location::null )
      loc[ 1 ] = exact_location( new_edge, r.edge[ index + 1 ], r.edge[ index + 2 ] );

    switch( loc[ 0 ] )
    {
      case location::in:
        goto cursor_in;
      case location::border:
      {
        intersection = index;
        intersection_at_vertex = true;

        ++index;

        switch( loc[ 1 ] )
        {
          case location::in:
            goto cursor_in;
          case location::border:
          {
            location next = locate_vertex_0( coefficients, vertex_pair_at_index + 3 );

            if( next == location::null )
              next = exact_location( new_edge, r.edge[ index ], r.edge[ index + 1 ] );

            if( next == location::out )
              r.set_segment( r.begin );

            return;
          }
          default:
            goto cursor_out;
        }
      }
      default:
        goto cursor_out;
    }

    cursor_in:
    {
      location next_loc = scan();

      switch( next_loc )
      {
        case location::in:
          goto cursor_in;
        case location::border:
        {
          if( intersection >= 0 )
          {
            if( !intersection_at_vertex )
            {
              __m256d* vp = r.vertex_pair( intersection - 1 );
              if( intersection & 0x1 )
                cross< vertex_0 >( vp, new_edge, r.edge[ intersection ] );
              else
                cross< vertex_1 >( vp, new_edge, r.edge[ intersection ] );

              --intersection;
            }

            r.end = index + 1;
            r.begin = intersection;
            r.edge[ r.end ]   = new_edge;
            r.edge[ r.begin ] = new_edge;
          }
          else
          {
            next_loc = scan();

            if( next_loc == location::out )
            {
              intersection = index - 1;
              intersection_at_vertex = true;
              goto cursor_out;
            }
          }
          return;
        }
        case location::out:
        {
          if( intersection < 0 )
          {
            intersection = index;
            intersection_at_vertex = false;
            goto cursor_out;
          }

          if( intersection_at_vertex )
            r.begin = intersection;
          else
          {
            r.begin = intersection - 1;
            __m256d* vp = r.vertex_pair( r.begin );

            if( r.begin & 0x1 )
              cross< vertex_1 >( vp, new_edge, r.edge[ intersection ] );
            else
              cross< vertex_0 >( vp, new_edge, r.edge[ intersection ] );
          }

          __m256d* vp = r.vertex_pair( index );
          if( index & 0x1 )
            cross< vertex_1 >( vp, r.edge[ index ], new_edge );
          else
            cross< vertex_0 >( vp, r.edge[ index ], new_edge );

          r.end = index + 1;
          r.edge[ r.end   ] = new_edge;
          r.edge[ r.begin ] = new_edge;
          return;
        }
        case location::null:
        {
          assert( index == r.end );

          if( intersection >= 0 )
          {
            if( intersection_at_vertex )
            {
              if( intersection == r.begin )
                return;
            }
            else
            {
              __m256d* first_pair = r.vertex_pair( intersection - 1 );

              if( intersection & 0x1 )
                cross< vertex_0 >( first_pair, new_edge, r.edge[ intersection ] );
              else
                cross< vertex_1 >( first_pair, new_edge, r.edge[ intersection ] );

              --intersection;
            }

            __m256d* last_pair = r.vertex_pair( index );

            if( index & 0x1 )
              cross< vertex_1 >( last_pair, r.edge[ index ], new_edge );
            else
              cross< vertex_0 >( last_pair, r.edge[ index ], new_edge );

            ++r.end;
            r.begin = intersection;
            r.edge[ r.end ]   = new_edge;
            r.edge[ r.begin ] = new_edge;
          }

          return;
        }
      }
    }

    cursor_out:
    {
      location new_loc = scan();

      switch( new_loc )
      {
        case location::in:
        {
          if( intersection < 0 )
          {
            intersection = index;
            intersection_at_vertex = false;
            goto cursor_in;
          }

          if( intersection + 1 == index )
          {
            assert( !intersection_at_vertex );
            int new_vertex = r.insert_vertex( intersection );

            if( new_vertex == intersection )
            {
              __m256d* vp = r.vertex_pair( intersection );
              if( intersection & 0x1 )
              {
                cross< vertex_1 >( vp, r.edge[ intersection ], new_edge );
                cross< vertex_0 >( vp + 3, new_edge, r.edge[ intersection + 2 ] );
              }
              else
              {
                cross< vertex_0 >( vp, r.edge[ intersection ], new_edge );
                cross< vertex_1 >( vp, new_edge, r.edge[ intersection + 2 ] );
              }

              r.edge[ intersection + 1 ] = new_edge;
            }
            else
            {
              assert( new_vertex == intersection - 1  );

              __m256d* vp = r.vertex_pair( intersection );

              if( intersection & 0x1 )
              {
                cross< vertex_1 >( vp, new_edge, r.edge[ index ] );
                cross< vertex_0 >( vp, r.edge[ intersection ], new_edge) ;
              }
              else
              {
                cross< vertex_0 >( vp, new_edge, r.edge[ index ] );
                cross< vertex_1 >( vp - 3, r.edge[ intersection ], new_edge) ;
              }

              r.edge[ new_vertex ] = r.edge[ intersection ];
              r.edge[ intersection ] = new_edge;
            }
            return;
          }

          if( !intersection_at_vertex )
          {
            __m256d* vp = r.vertex_pair( intersection );

            if( intersection & 0x1 )
              cross< vertex_1 >( vp, r.edge[ intersection ], new_edge );
            else
              cross< vertex_0 >( vp, r.edge[ intersection ], new_edge );
          }

          if( intersection + 2 < index )
            index = r.remove_vertices( intersection + 1, index - 1 ) + 2;

          __m256d* vp = r.vertex_pair( index - 1 );
          if( index & 0x1 )
            cross< vertex_0 >( vp, new_edge, r.edge[ index ] );
          else
            cross< vertex_1 >( vp, new_edge, r.edge[ index ] );

          r.edge[ index - 1 ] = new_edge;
          return;
        }
        case location::border:
        {
          if( intersection >= 0 )
          {
            if( intersection_at_vertex )
            {
              if( ( intersection == r.begin ) && ( index + 1 == r.end ) )
              {
                r.set_to_last_segment();
                return;
              }
            }
            else
            {
              __m256d* vp = r.vertex_pair( intersection );
              if( intersection & 0x1 )
                cross< vertex_1 >( vp, r.edge[ intersection ], new_edge );
              else
                cross< vertex_0 >( vp, r.edge[ intersection ], new_edge );
            }

            if( intersection + 1 < index )
              intersection = r.remove_vertices( intersection + 1, index );

            r.edge[ intersection + 1 ] = new_edge;
            return;
          }

          new_loc = scan();
          switch( new_loc )
          {
            case location::in:
            {
              intersection = index - 1;
              intersection_at_vertex = true;
              goto cursor_in;
            }
            case location::border:
            {
              r.set_segment( index - 1 );
              return;
            }
            default: // location::out or location::null:
            {
              r.set_point( index -  1 );
              return;
            }
          }
        }
        case location::out:
          goto cursor_out;
        case location::null:
        {
          assert( index == r.end );

          if( intersection < 0 )
          {
            r.set_empty();
            return;
          }

          if( intersection_at_vertex )
          {
            if( intersection == r.begin )
            {
              r.set_point( r.begin );
              return;
            }
          }
          else
          {
            __m256d* vp = r.vertex_pair( intersection );

            if( intersection & 0x1 )
              cross< vertex_1 >( vp, r.edge[ intersection ], new_edge );
            else
              cross< vertex_0 >( vp, r.edge[ intersection ], new_edge );
          }

          ++intersection;
          __m256d* vp = r.vertex_pair( intersection );

          if( intersection & 0x1 )
            cross< vertex_1 >( vp, new_edge, r.edge[ index ] );
          else
            cross< vertex_0 >( vp, new_edge, r.edge[ index ] );

          r.end = intersection + 1;
          r.edge[ intersection ] = new_edge;
          r.edge[ r.end ] = r.edge[ r.begin ];
        }

        return;
      }
    }
  }
};

} // namespace wm::nlp::lp2d

//----------------------------------------------------------------------------------------
//
// The function interval_system_2d solves the system of interval equations
//
// ai x + bi y = ci
//
// for 0 <= i < n_equations and
//
// ai = [ eq[ 6 * i     ], eq[ 6 * i + 1 ] ]  with  eq[ 6 * i     ] <= eq[ 6 * i + 1 ]
// bi = [ eq[ 6 * i + 2 ], eq[ 6 * i + 3 ] ]  with  eq[ 6 * i + 2 ] <= eq[ 6 * i + 3 ]
// ci = [ eq[ 6 * i + 4 ], eq[ 6 * i + 5 ] ]  with  eq[ 6 * i + 4 ] <= eq[ 6 * i + 5 ]
//
// The function finds the solutions inside the box
//
// box[ 0 ] <= x <= box[ 1 ]
// box[ 2 ] <= y <= box[ 3 ]
//
// The solution consists of up to 4 boxes bx[ 0 ], bx[ 1 ], bx[ 2 ] and bx[ 3 ], with
//
// bx[ i ] =  inf_x[ i ] <= x <= sup_x[ i ]
//            inf_y[ i ] <= y <= sup_y[ i ]
// for
//
// inf_x[ i ] = solution[ 4 * i     ]
// sup_x[ i ] = solution[ 4 * i + 1 ]
// inf_y[ i ] = solution[ 4 * i + 2 ]
// sup_y[ i ] = solution[ 4 * i + 3 ]
//
// The function returns the number of boxes in the solutions, which can be 0,1,2,3 or 4,
// or a negative number if the input is invalid or if it cannot change the rounding mode.
//
// It assumes that:
//
// a) n_equations <= 16
// b) all doubles are finite, ie., not nan or +/-inf
//
// c) box and eq are such that
//
//    box[ 0 ] <= box[ 1 ]
//    box[ 2 ] <= box[ 3]
//    eq[ 2 * i ] <= eq[ 2 i + 1 ] for i = 0,..., 3 * n_equations.
//
// d) box and eq are aligned at a 32 byte boundary
//
// It does not verify the input and its results should not be trusted if the inputs are
// invalid.
//
//----------------------------------------------------------------------------------------

extern "C"
{
int interval_system_2d( double* solution, double const* box, double const* eq, unsigned int n_equations );
}