goldendict-ng/ftshelpers.cc

/* This file is (c) 2014 Abs62
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#ifdef USE_XAPIAN
#include "xapian.h"
#include <stdlib.h>
#endif
#include "fulltextsearch.hh"
#include "ftshelpers.hh"
#include "wstring_qt.hh"
#include "file.hh"
#include "gddebug.hh"
#include "folding.hh"
#include "utils.hh"

#include <vector>
#include <string>

#include <QVector>

#include <QRegularExpression>

#include "wildcard.hh"
#include <QtConcurrent>
#include "base/globalregex.hh"
#include <QFutureSynchronizer>
#include <QSemaphoreReleaser>

using std::vector;
using std::string;

DEF_EX( exUserAbort, "User abort", Dictionary::Ex )

namespace FtsHelpers
{
// finished  reversed   dehsinif
const static std::string finish_mark = std::string( "dehsinif" );

bool ftsIndexIsOldOrBad( string const & indexFile,
                         BtreeIndexing::BtreeDictionary * dict )
{
#ifdef USE_XAPIAN
  try
  {
    Xapian::WritableDatabase db( dict->ftsIndexName() );
    auto docid = db.get_lastdocid();
    auto document = db.get_document(docid);

    qDebug()<<document.get_data().c_str();
    //use a special document to mark the end of the index.
    return document.get_data().compare(finish_mark)!=0;
  }
  catch( Xapian::Error & e )
  {
    qWarning() << e.get_description().c_str();
    //the file is corrupted,remove it.
    QFile::remove(QString::fromStdString(dict->ftsIndexName()));
    return true;
  }
  catch( ... )
  {
    return true;
  }
  return false;
#endif

  File::Class idx( indexFile, "rb" );

  FtsIdxHeader header;

  return idx.readRecords( &header, sizeof( header ), 1 ) != 1 ||
         header.signature != FtsSignature ||
         header.formatVersion != CurrentFtsFormatVersion + dict->getFtsIndexVersion();
}

static QString makeHiliteRegExpString( QStringList const & words,
                                       int searchMode, int distanceBetweenWords, bool hasCJK = false, bool ignoreWordsOrder = false )
{
  QString searchString( "(" );

  QString stripWords( "(?:\\W+\\w+){0," );

  if( hasCJK )
  {
    stripWords = "(?:[\\W\\w]){0,";
  }

  if( distanceBetweenWords >= 0 )
    stripWords += QString::number( distanceBetweenWords );
  stripWords += "}";

  if(!hasCJK)
  {
    stripWords += "\\W+";
  }

  QString boundWord( searchMode == FTS::WholeWords ? "\\b" : "(?:\\w*)");
  if(hasCJK)
  {
    //no boundary for CJK
    boundWord.clear();
  }

  for( int x = 0; x < words.size(); x++ )
  {
    if( x )
    {
      searchString += stripWords;
      if(ignoreWordsOrder)
        searchString += "(";
    }

    searchString += boundWord + words[ x ] + boundWord;

    if( x )
    {
      if( ignoreWordsOrder )
        searchString += ")?";
    }

  }

  searchString += ")";
  return searchString;
}

void tokenizeCJK( QStringList & indexWords, QRegularExpression wordRegExp, QStringList list )
{
  QStringList wordList, hieroglyphList;
  for( int i = 0; i < list.size(); i ++ )
  {
    QString word = list.at( i );

    // Check for CJK symbols in word
    bool parsed = false;
    QString hieroglyph;
    for( int x = 0; x < word.size(); x++ )
      if( isCJKChar( word.at( x ).unicode() ) )
      {
        parsed = true;
        hieroglyph.append( word[ x ] );

        if( QChar( word.at( x ) ).isHighSurrogate()
            &&  QChar( word[ x + 1 ] ).isLowSurrogate() )
          hieroglyph.append( word[ ++x ] );

        hieroglyphList.append( hieroglyph );
        hieroglyph.clear();
      }

    // If word don't contains CJK symbols put it in list as is
    if( !parsed )
      wordList.append( word );
  }

  indexWords = wordList.filter( wordRegExp );
  indexWords.removeDuplicates();

  hieroglyphList.removeDuplicates();
  indexWords += hieroglyphList;
}

bool containCJK( QString const & str)
{
  bool hasCJK = false;
  for( int x = 0; x < str.size(); x++ )
    if( isCJKChar( str.at( x ).unicode() ) )
    {
      hasCJK = true;
      break;
    }
  return hasCJK;
}

bool parseSearchString( QString const & str, QStringList & indexWords,
                        QStringList & searchWords,
                        QRegExp & searchRegExp, int searchMode,
                        bool matchCase,
                        int distanceBetweenWords,
                        bool & hasCJK,
                        bool ignoreWordsOrder )
{
  searchWords.clear();
  indexWords.clear();
  // QRegularExpression spacesRegExp( "\\W+", QRegularExpression::UseUnicodePropertiesOption );
  // QRegularExpression wordRegExp( QString( "\\w{" ) + QString::number( FTS::MinimumWordSize ) + ",}", QRegularExpression::UseUnicodePropertiesOption );
  // QRegularExpression setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption );
  // QRegularExpression regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})", QRegularExpression::CaseInsensitiveOption);

  hasCJK = containCJK( str );

  if( searchMode == FTS::WholeWords || searchMode == FTS::PlainText )
  {
    // Make words list for search in article text
    searchWords = str.normalized( QString::NormalizationForm_C ).split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );
    // Make words list for index search
    QStringList list =
      str.normalized( QString::NormalizationForm_C ).toLower().split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );

    QString searchString;
    if( hasCJK )
    {
      tokenizeCJK( indexWords, RX::Ftx::wordRegExp, list );
      // QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts );
      searchString         = makeHiliteRegExpString( list, searchMode, distanceBetweenWords, hasCJK , ignoreWordsOrder);
    }
    else
    {
      indexWords = list.filter( RX::Ftx::wordRegExp );
      indexWords.removeDuplicates();

      // Make regexp for results hilite

      QStringList allWords = str.split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );
      searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords,false, ignoreWordsOrder );
    }
    searchRegExp = QRegExp( searchString, matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive, QRegExp::RegExp2 );
    searchRegExp.setMinimal( true );
    return !indexWords.isEmpty();
  }
  else
  {
    // Make words list for index search

    QString tmp = str;

    // Remove RegExp commands
    if( searchMode == FTS::RegExp )
      tmp.replace( RX::Ftx::regexRegExp, " " );

    // Remove all symbol sets
    tmp.replace( RX::Ftx::setsRegExp, " " );

    QStringList list = tmp.normalized( QString::NormalizationForm_C )
                          .toLower().split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );

    if( hasCJK )
    {
      tokenizeCJK( indexWords, RX::Ftx::wordRegExp, list );
    }
    else
    {
      indexWords = list.filter( RX::Ftx::wordRegExp );
      indexWords.removeDuplicates();
    }

    searchRegExp = QRegExp( str, matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive,
                            searchMode == FTS::Wildcards ? QRegExp::WildcardUnix : QRegExp::RegExp2 );
    searchRegExp.setMinimal( true );
  }

  return true;
}
//definition;
Mutex lockMutex;

void parseArticleForFts( uint32_t articleAddress, QString & articleText,
                         QMap< QString, QVector< uint32_t > > & words,
                         bool handleRoundBrackets )
{
  if( articleText.isEmpty() )
    return;

  QStringList articleWords = articleText.normalized( QString::NormalizationForm_C )
      .split( handleRoundBrackets ? RX::Ftx::handleRoundBracket : RX::Ftx::noRoundBracket,
                                                Qt::SkipEmptyParts );

  QVector< QString > setOfWords;
  setOfWords.reserve( articleWords.size() );

  for( int x = 0; x < articleWords.size(); x++ )
  {
    QString word = articleWords.at( x ).toLower();

    bool hasCJK = false;
    QString hieroglyph;

    // If word contains CJK symbols we add to index only these symbols separately
    for( int y = 0; y < word.size(); y++ )
      if( isCJKChar( word.at( y ).unicode() ) )
      {
        hasCJK = true;
        hieroglyph.append( word[ y ] );

        if( QChar( word.at( y ) ).isHighSurrogate()
            &&  QChar( word[ y + 1 ] ).isLowSurrogate() )
          hieroglyph.append( word[ ++y ] );

        //if( !setOfWords.contains( hieroglyph ) )
        {
          setOfWords.push_back( hieroglyph );
          /*Mutex::Lock _( _mapLock );
          words[ hieroglyph ].push_back( articleAddress );*/
        }

        hieroglyph.clear();
      }

    if( !hasCJK )
    {
      // Else we add word to index as is
      if( word.size() < FTS::MinimumWordSize )
        continue;

      if( handleRoundBrackets && ( word.indexOf( '(' ) >= 0 || word.indexOf( ')' ) >= 0 ) )
      {
        // Special handle for words with round brackets - DSL feature
        QStringList list;

        QStringList oldVariant = word.split( RX::Ftx::regSplit, Qt::SkipEmptyParts );
        for( QStringList::iterator it = oldVariant.begin(); it != oldVariant.end(); ++it )
          if( it->size() >= FTS::MinimumWordSize && !list.contains( *it ) )
            list.append( *it );

        QRegularExpressionMatch match = RX::Ftx::regBrackets.match( word );
        if( match.hasMatch() )
        {
          QStringList parts = match.capturedTexts();
          // Add empty strings for compatibility with QRegExp behaviour
          for( int i = match.lastCapturedIndex() + 1; i < 6; i++ )
            parts.append( QString() );
          QString parsedWord = parts[ 2 ] + parts[ 4 ]; // Brackets removed

          if( parsedWord.size() >= FTS::MinimumWordSize && !list.contains( parsedWord ) )
            list.append( parsedWord );

          parsedWord = parts[ 1 ].remove( '(' ).remove( ')' )
                       + parts[ 2 ]
                       + parts[ 3 ].remove( '(' ).remove( ')' )
                       + parts[ 4 ]
                       + parts[ 5 ].remove( '(' ).remove( ')' ); // Brackets expansed

          if( parsedWord.size() >= FTS::MinimumWordSize && !list.contains( parsedWord ) )
            list.append( parsedWord );
        }

        for( QStringList::iterator it = list.begin(); it != list.end(); ++it )
        {
          //if( !setOfWords.contains( *it ) )
          {
            setOfWords.push_back( *it );
            /*Mutex::Lock _( _mapLock );
            words[ *it ].push_back( articleAddress );*/
          }
        }
      }
      else
      //if( !setOfWords.contains( word ) )
      {
        setOfWords.push_back( word );
        /*Mutex::Lock _( _mapLock );
        words[ word ].push_back( articleAddress );*/
      }
    }
  }

  {
    Mutex::Lock _( lockMutex );

    for( const QString & word : setOfWords )
    {
      words[ word ].push_back( articleAddress );
    }
  }
}

void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled )
{
#ifdef USE_XAPIAN
  return makeFTSIndexXapian(dict,isCancelled);
#endif

  Mutex::Lock _( dict->getFtsMutex() );

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    throw exUserAbort();

  File::Class ftsIdx( dict->ftsIndexName(), "wb" );

  FtsIdxHeader ftsIdxHeader;
  memset( &ftsIdxHeader, 0, sizeof( ftsIdxHeader ) );

  // We write a dummy header first. At the end of the process the header
  // will be rewritten with the right values.

  ftsIdx.write( ftsIdxHeader );

  ChunkedStorage::Writer chunks( ftsIdx );

  BtreeIndexing::IndexedWords indexedWords;

  QSet< uint32_t > setOfOffsets;
  setOfOffsets.reserve( dict->getArticleCount() );

  dict->findArticleLinks( 0, &setOfOffsets, 0, &isCancelled );

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    throw exUserAbort();

  QVector< uint32_t > offsets;
  offsets.resize( setOfOffsets.size() );
  uint32_t * ptr = &offsets.front();

  for( QSet< uint32_t >::ConstIterator it = setOfOffsets.constBegin();
       it != setOfOffsets.constEnd(); ++it )
  {
    *ptr = *it;
    ptr++;
  }

  // Free memory
  setOfOffsets.clear();

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    throw exUserAbort();

  dict->sortArticlesOffsetsForFTS( offsets, isCancelled );

  QMap< QString, QVector< uint32_t > > ftsWords;

  bool needHandleBrackets;
  {
    QString name = QString::fromUtf8( dict->getDictionaryFilenames()[ 0 ].c_str() ).toLower();
    needHandleBrackets = name.endsWith( ".dsl" ) || name.endsWith( "dsl.dz" );
  }

  QSemaphore sem( QThread::idealThreadCount() );
  //QFutureSynchronizer< void > synchronizer;

  long indexedFtsDoc=0;
  for( auto & address : offsets )
  {
    indexedFtsDoc++;
    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    {
        //wait the future to be finished.
      sem.acquire( QThread::idealThreadCount() );
      return;
    }

    sem.acquire();
    QFuture< void > f = QtConcurrent::run(
      [ & ]()
      {
        QSemaphoreReleaser releaser( sem );
        if( Utils::AtomicInt::loadAcquire( isCancelled ) )
        {
          return;
        }

        QString headword, articleStr;

        dict->getArticleText( address, headword, articleStr );

        parseArticleForFts( address, articleStr, ftsWords, needHandleBrackets );
      } );
    //synchronizer.addFuture( f );

    dict->setIndexedFtsDoc(indexedFtsDoc);
  }
  sem.acquire( QThread::idealThreadCount() );
  // Free memory
  offsets.clear();

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    throw exUserAbort();

  QMap< QString, QVector< uint32_t > >::iterator it = ftsWords.begin();
  while( it != ftsWords.end() )
  {
    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      throw exUserAbort();

    uint32_t offset = chunks.startNewBlock();
    uint32_t size = it.value().size();

    chunks.addToBlock( &size, sizeof(uint32_t) );
    chunks.addToBlock( it.value().data(), size * sizeof(uint32_t) );

    indexedWords.addSingleWord( gd::toWString( it.key() ), offset );

    it = ftsWords.erase( it );
  }

  ftsIdxHeader.chunksOffset = chunks.finish();
  ftsIdxHeader.wordCount = indexedWords.size();

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    throw exUserAbort();

  BtreeIndexing::IndexInfo ftsIdxInfo = BtreeIndexing::buildIndex( indexedWords, ftsIdx );

  // Free memory
  indexedWords.clear();

  ftsIdxHeader.indexBtreeMaxElements = ftsIdxInfo.btreeMaxElements;
  ftsIdxHeader.indexRootOffset = ftsIdxInfo.rootOffset;

  ftsIdxHeader.signature = FtsHelpers::FtsSignature;
  ftsIdxHeader.formatVersion = FtsHelpers::CurrentFtsFormatVersion + dict->getFtsIndexVersion();

  ftsIdx.rewind();
  ftsIdx.writeRecords( &ftsIdxHeader, sizeof(ftsIdxHeader), 1 );
}

// use xapian to create the index
#ifdef USE_XAPIAN
void makeFTSIndexXapian( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled )
{
  Mutex::Lock _( dict->getFtsMutex() );

  try {
  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    throw exUserAbort();

  // Open the database for update, creating a new database if necessary.
  Xapian::WritableDatabase db(dict->ftsIndexName(), Xapian::DB_CREATE_OR_OPEN);

  Xapian::TermGenerator indexer;
//  Xapian::Stem stemmer("english");
//  indexer.set_stemmer(stemmer);
//  indexer.set_stemming_strategy(indexer.STEM_SOME_FULL_POS);
  indexer.set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM);

  BtreeIndexing::IndexedWords indexedWords;

  QSet< uint32_t > setOfOffsets;
  setOfOffsets.reserve( dict->getArticleCount() );

  dict->findArticleLinks( 0, &setOfOffsets, 0, &isCancelled );

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    throw exUserAbort();

  QVector< uint32_t > offsets;
  offsets.resize( setOfOffsets.size() );
  uint32_t * ptr = &offsets.front();

  for( QSet< uint32_t >::ConstIterator it = setOfOffsets.constBegin();
       it != setOfOffsets.constEnd(); ++it )
  {
    *ptr = *it;
    ptr++;
  }

  // Free memory
  setOfOffsets.clear();

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    throw exUserAbort();

  dict->sortArticlesOffsetsForFTS( offsets, isCancelled );

  // incremental build the index.
  // get the last address.
  bool skip            = true;
  uint32_t lastAddress = -1;
  try
  {
    Xapian::Document lastDoc = db.get_document( db.get_lastdocid() );
    lastAddress              = atoi( lastDoc.get_data().c_str() );
  }
  catch( Xapian::Error & e )
  {
    qDebug() << e.get_description().c_str();
    skip = false;
  }

  long indexedDoc=0L;

  for( auto & address : offsets )
  {
    indexedDoc++;

    if(address==lastAddress){
      skip = false;
    }
    //skip until to the lastAddress;
    if((address!=lastAddress)&&skip){
      continue;
    }

    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    {
      return;
    }

    QString headword, articleStr;

    dict->getArticleText( address, headword, articleStr );

    Xapian::Document doc;

    indexer.set_document( doc );
    indexer.index_text_without_positions( articleStr.toStdString() );
    doc.set_data( std::to_string( address ) );
    // Add the document to the database.
    db.add_document( doc );

    dict->setIndexedFtsDoc(indexedDoc);
  }

  //add a special document to mark the end of the index.
  Xapian::Document doc;
  doc.set_data( finish_mark );
  // Add the document to the database.
  db.add_document( doc );

  // Free memory
  offsets.clear();

  db.commit();
  } catch (Xapian::Error & e) {
    qWarning()<<QString::fromStdString(e.get_description());
  }
}
#endif

bool isCJKChar( ushort ch )
{
  return Utils::isCJKChar(ch);
}

void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
                                       QStringList const & words,
                                       QRegExp const & searchRegexp )
{
  // const int parallel_count = QThread::idealThreadCount()/2;
  // QSemaphore sem( parallel_count  < 1 ? 1 : parallel_count  );
  //
  // QFutureSynchronizer< void > synchronizer;
  const auto searchRegularExpression = createMatchRegex( searchRegexp );

  for( auto & address : offsets )
  {
    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    {
      return;
    }
    checkSingleArticle( address, words, searchRegularExpression );
  }
}

QRegularExpression FTSResultsRequest::createMatchRegex( QRegExp const & searchRegexp )
{
  QRegularExpression searchRegularExpression;

  if( searchMode == FTS::Wildcards )
    searchRegularExpression.setPattern( wildcardsToRegexp( searchRegexp.pattern() ) );
  else
    searchRegularExpression.setPattern( searchRegexp.pattern() );
  QRegularExpression::PatternOptions patternOptions =
    QRegularExpression::DotMatchesEverythingOption | QRegularExpression::UseUnicodePropertiesOption
    | QRegularExpression::MultilineOption | QRegularExpression::InvertedGreedinessOption;
  if( searchRegexp.caseSensitivity() == Qt::CaseInsensitive )
    patternOptions |= QRegularExpression::CaseInsensitiveOption;
  searchRegularExpression.setPatternOptions( patternOptions );
  if( !searchRegularExpression.isValid() )
    searchRegularExpression.setPattern( "" );
  return searchRegularExpression;
}

void FTSResultsRequest::checkSingleArticle( uint32_t  offset,
                                            QStringList const & words,
                                            QRegularExpression const & searchRegularExpression )
{
  // int results = 0;
  QString headword, articleText;
  QList< uint32_t > offsetsForHeadwords;
  QVector< QStringList > hiliteRegExps;

  QString id = QString::fromUtf8( dict.getId().c_str() );

  // RegExp mode

  if( searchMode == FTS::Wildcards || searchMode == FTS::RegExp )
  {
    // for( int i = 0; i < offsets.size(); i++ )
      if( Utils::AtomicInt::loadAcquire( isCancelled ) )
        return;

      // auto article_address = offsets.at( i );
      dict.getArticleText( offset, headword, articleText );
      articleText = articleText.normalized( QString::NormalizationForm_C );

      if( ignoreDiacritics )
        articleText = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( articleText ) ) );

      if( articleText.contains( searchRegularExpression ) )
      {
        if( headword.isEmpty() )
          offsetsForHeadwords.append( offset );
        else
        {
          Mutex::Lock _( dataMutex );
          foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
        }

        ++results;
        if( maxResults > 0 && results >= maxResults )
          return;
      }
  }
  else
  {
    // Words mode

    QVector< QPair< QString, bool > > wordsList;
    if( ignoreWordsOrder )
    {
      for( QStringList::const_iterator it = words.begin(); it != words.end(); ++it )
        wordsList.append( QPair< QString, bool >( *it, true ) );
    }

    // for( int i = 0; i < offsets.size(); i++ )
      if( Utils::AtomicInt::loadAcquire( isCancelled ) )
        return;

      if( ignoreWordsOrder )
      {
        for( int i = 0; i < wordsList.size(); i++ )
          wordsList[ i ].second = true;
      }

      dict.getArticleText( offset, headword, articleText );

      articleText = articleText.normalized( QString::NormalizationForm_C );

      if( ignoreDiacritics )
        articleText = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( articleText ) ) );

      if( ignoreWordsOrder )
      {
        bool allMatch = true;
        foreach( QString word, words )
        {
          if( containCJK( word ) || searchMode == FTS::PlainText )
          {
            if( !articleText.contains( word ) )
            {
              allMatch = false;
              break;
            }
          }
          else if( searchMode == FTS::WholeWords )
          {
            QRegularExpression tmpReg( QString( "\b%1\b" ).arg( word ),
                                       QRegularExpression::CaseInsensitiveOption
                                         | QRegularExpression::UseUnicodePropertiesOption );
            if( !articleText.contains( tmpReg ) )
            {
              allMatch = false;
              break;
            }
          }
        }

        if( !allMatch )
        {
          return;
        }

        if( distanceBetweenWords >= 0 )
        {
          // the article text contains all the needed words.
          // determine if distance restriction is meet
          const QRegularExpression replaceReg( QString( "(%1)" ).arg( words.join( '|' ) ),
                                               QRegularExpression::CaseInsensitiveOption
                                               | QRegularExpression::UseUnicodePropertiesOption );
          // use a string that could not be presented in the article.
          articleText = articleText.replace( replaceReg, "=@XXXXX@=" );

          auto hasCJK = false;
          foreach( QString word, words )
          {
            if( containCJK( word ) )
            {
              hasCJK = true;
              break;
            }
          }

          // hascjk value ,perhaps should depend on each word
          const auto searchRegStr = makeHiliteRegExpString( Utils::repeat( "=@XXXXX@=", words.size() ),
                                                            searchMode,
                                                            distanceBetweenWords,
                                                            hasCJK );
          const QRegularExpression distanceOrderReg( searchRegStr,
                                                     QRegularExpression::CaseInsensitiveOption
                                                     | QRegularExpression::UseUnicodePropertiesOption );
          // use a string that could not be presented in the article.
          if( articleText.contains( distanceOrderReg ) )
          {
            if( headword.isEmpty() )
              offsetsForHeadwords.append( offset );
            else
            {
              Mutex::Lock _( dataMutex );
              foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
            }

            ++results;
            if( maxResults > 0 && results >= maxResults )
              return;
          }
        }
      }
      else
      {
        if( articleText.contains( searchRegularExpression ) )
        {
          if( headword.isEmpty() )
            offsetsForHeadwords.append( offset );
          else
          {
            Mutex::Lock _( dataMutex );
            foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
          }

          ++results;
          if( maxResults > 0 && results >= maxResults )
            return;
        }
      }
  }
  if( !offsetsForHeadwords.isEmpty() )
  {
    QVector< QString > headwords;
    Mutex::Lock _( dataMutex );

    dict.getHeadwordsFromOffsets( offsetsForHeadwords, headwords, &isCancelled );
    for( int x = 0; x < headwords.size(); x++ )
    {
      foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ),
                                                id,
                                                x < hiliteRegExps.size() ? hiliteRegExps.at( x ) : QStringList(),
                                                matchCase ) );
    }
  }
}

void FTSResultsRequest::indexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
                                     sptr< ChunkedStorage::Reader > chunks,
                                     QStringList & indexWords,
                                     QStringList & searchWords, QRegExp & regexp )
{
  // Find articles which contains all requested words

  QSet< uint32_t > setOfOffsets;

  if( indexWords.isEmpty() )
    return;

  QList< QSet< uint32_t > > addressLists;

  auto findLinks = [ & ]( const QString & word )
  {
    QSet< uint32_t > tmp;
    uint32_t size;

    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    {
      addressLists << tmp;
      return;
    }

    vector< BtreeIndexing::WordArticleLink > links =
      ftsIndex.findArticles( gd::toWString( word ), ignoreDiacritics );
    for( unsigned x = 0; x < links.size(); x++ )
    {
      if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      {
        addressLists << tmp;
        return;
      }

      vector< char > chunk;
      char * linksPtr;
      {
        // Mutex::Lock _( dict.getFtsMutex() );
        linksPtr = chunks->getBlock( links[ x ].articleOffset, chunk );
      }

      memcpy( &size, linksPtr, sizeof( uint32_t ) );
      linksPtr += sizeof( uint32_t );
      for( uint32_t y = 0; y < size; y++ )
      {
        tmp.insert( *( reinterpret_cast< uint32_t * >( linksPtr ) ) );
        linksPtr += sizeof( uint32_t );
      }
    }

    links.clear();
    {
      Mutex::Lock _( dataMutex );
      addressLists << tmp;
    }
  };
  // int n = indexWords.length();
  // QtConcurrent::blockingMap( indexWords, findLinks );

  for(QString word:indexWords)
  {
    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    {
      return;
    }
    findLinks( word );
  }

  // blocked execution.

  int i = 0;
  for( auto & elem : addressLists )
  {
    if( i++ == 0 )
      setOfOffsets = elem;
    else
      setOfOffsets = setOfOffsets.intersect( elem );
  }


  if( setOfOffsets.isEmpty() )
    return;

  QVector< uint32_t > offsets;
  offsets.resize( setOfOffsets.size() );
  uint32_t * ptr = &offsets.front();

  for( QSet< uint32_t >::ConstIterator it = setOfOffsets.constBegin();
       it != setOfOffsets.constEnd(); ++it )
  {
    *ptr = *it;
    ptr++;
  }

  setOfOffsets.clear();

  dict.sortArticlesOffsetsForFTS( offsets, isCancelled );

  checkArticles( offsets, searchWords, regexp );
}

void FTSResultsRequest::combinedIndexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
                                             sptr< ChunkedStorage::Reader > chunks,
                                             QStringList & indexWords,
                                             QStringList & searchWords,
                                             QRegExp & regexp )
{
  // Special case - combination of index search for hieroglyphs
  // and full index search for other words

  QSet< uint32_t > setOfOffsets;
  uint32_t size;

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    return;

  if( indexWords.isEmpty() )
    return;

  QStringList wordsList, hieroglyphsList;

  for( int x = 0; x < indexWords.size(); x++ )
  {
    QString const & word = indexWords.at( x );
    if( isCJKChar( word[ 0 ].unicode() ) )
      hieroglyphsList.append( word );
    else
      wordsList.append( word );
  }

  QVector< QSet< uint32_t > > allWordsLinks;

  int n = wordsList.size();
  if( !hieroglyphsList.isEmpty() )
  {
    wordsList += hieroglyphsList;
    n += 1;
  }

  allWordsLinks.resize( n );

  if( !wordsList.empty() )
  {
    QList< QSet< uint32_t > > sets;
    auto fn_wordLink = [ & ](const QString & word )
    {
      QSet< uint32_t > tmp;
      vector< BtreeIndexing::WordArticleLink > links = ftsIndex.findArticles( gd::toWString( word ) );
      for( unsigned x = 0; x < links.size(); x++ )
      {
        if( Utils::AtomicInt::loadAcquire( isCancelled ) )
        {
          Mutex::Lock _( dataMutex );
          sets << tmp;
          return;
        }

        vector< char > chunk;
        char * linksPtr;
        {
          // Mutex::Lock _( dict.getFtsMutex() );
          linksPtr = chunks->getBlock( links[ x ].articleOffset, chunk );
        }

        memcpy( &size, linksPtr, sizeof( uint32_t ) );
        linksPtr += sizeof( uint32_t );
        // across chunks, need further investigation
        uint32_t max = ( chunk.size() - ( linksPtr - &chunk.front() )) / 4;

        tmp.reserve( size );
        uint32_t q_max = qMin(size,max);
        for( uint32_t y = 0; y < q_max; y++ )
        {
          tmp.insert( *( reinterpret_cast< uint32_t * >( linksPtr ) ) );
          linksPtr += sizeof( uint32_t );
        }
      }

      links.clear();

      {
        Mutex::Lock _( dataMutex );
        sets << tmp;
      }
    };
    // QtConcurrent::blockingMap( wordsList, fn_wordLink );

    {

      for(const auto & word : wordsList )
      {
        if( Utils::AtomicInt::loadAcquire( isCancelled ) )
        {
          return;
        }
        fn_wordLink( word );
      }
    }
    //blocked execution.

    int i = 0;
    for( auto & elem : sets )
    {
      if( i++ == 0 )
        setOfOffsets = elem;
      else
        setOfOffsets = setOfOffsets.intersect( elem );
    }

    // allWordsLinks[ wordNom ] = setOfOffsets;
    // setOfOffsets.clear();
    // wordNom += 1;
  }

  if( setOfOffsets.isEmpty() )
    return;

  allWordsLinks.clear();

  QVector< uint32_t > offsets( setOfOffsets.begin(),setOfOffsets.end() );
  // offsets.resize( setOfOffsets.size() );
  // uint32_t * ptr = &offsets.front();
  //
  // for( QSet< uint32_t >::ConstIterator it = setOfOffsets.constBegin();
  //      it != setOfOffsets.constEnd(); ++it )
  // {
  //   *ptr = *it;
  //   ptr++;
  // }

  setOfOffsets.clear();

  dict.sortArticlesOffsetsForFTS( offsets, isCancelled );

  checkArticles( offsets, searchWords, regexp );
}

void FTSResultsRequest::fullIndexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
                                         sptr< ChunkedStorage::Reader > chunks,
                                         QStringList & indexWords,
                                         QStringList & searchWords,
                                         QRegExp & regexp )
{
  QSet< uint32_t > setOfOffsets;
  uint32_t size;
  QVector< BtreeIndexing::WordArticleLink > links;

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    return;

  if( indexWords.isEmpty() )
    return;

  links.reserve( wordsInIndex );
  ftsIndex.findArticleLinks( &links, 0, 0, &isCancelled );

  QVector< QSet< uint32_t > > allWordsLinks;
  allWordsLinks.resize( indexWords.size() );

  for( int x = 0; x < links.size(); x++ )
  {
    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      return;

    QString word = QString::fromUtf8( links[ x ].word.data(), links[ x ].word.size() );

    if( ignoreDiacritics )
      word = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( word ) ) );

    for( int i = 0; i < indexWords.size(); i++ )
    {
      if( word.length() >= indexWords.at( i ).length() && word.contains( indexWords.at( i ) ) )
      {
        vector< char > chunk;
        char * linksPtr;
        {
          // Mutex::Lock _( dict.getFtsMutex() );
          linksPtr = chunks->getBlock( links[ x ].articleOffset, chunk );
        }

        memcpy( &size, linksPtr, sizeof(uint32_t) );
        linksPtr += sizeof(uint32_t);
        for( uint32_t y = 0; y < size; y++ )
        {
          allWordsLinks[ i ].insert( *( reinterpret_cast< uint32_t * >( linksPtr ) ) );
          linksPtr += sizeof(uint32_t);
        }
        if( searchMode == FTS::PlainText )
          break;
      }
    }
  }

  links.clear();

  for( int i = 0; i < allWordsLinks.size(); i++ )
  {
    if( i == 0 )
      setOfOffsets = allWordsLinks.at( i );
    else
      setOfOffsets = setOfOffsets.intersect( allWordsLinks.at( i ) );
  }

  if( setOfOffsets.isEmpty() )
    return;

  allWordsLinks.clear();

  QVector< uint32_t > offsets( setOfOffsets.begin(), setOfOffsets.end() );
  // offsets.resize( setOfOffsets.size() );
  // uint32_t * ptr = &offsets.front();
  //
  // for( QSet< uint32_t >::ConstIterator it = setOfOffsets.constBegin();
  //      it != setOfOffsets.constEnd(); ++it )
  // {
  //   *ptr = *it;
  //   ptr++;
  // }

  setOfOffsets.clear();

  dict.sortArticlesOffsetsForFTS( offsets, isCancelled );

  checkArticles( offsets, searchWords, regexp );
}

void FTSResultsRequest::fullSearch( QStringList & searchWords, QRegExp & regexp )
{
  // Whole file survey

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    return;

  QSet< uint32_t > setOfOffsets;
  setOfOffsets.reserve( dict.getArticleCount() );
  dict.findArticleLinks( 0, &setOfOffsets, 0, &isCancelled );

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    return;

  QVector< uint32_t > offsets;
  offsets.resize( setOfOffsets.size() );
  uint32_t * ptr = &offsets.front();

  for( QSet< uint32_t >::ConstIterator it = setOfOffsets.constBegin();
       it != setOfOffsets.constEnd(); ++it )
  {
    *ptr = *it;
    ptr++;
  }

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    return;

  setOfOffsets.clear();

  dict.sortArticlesOffsetsForFTS( offsets, isCancelled );

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    return;

  checkArticles( offsets, searchWords, regexp );
}

void FTSResultsRequest::run()
{
#ifdef USE_XAPIAN
  return runXapian();
#endif

  if ( dict.ensureInitDone().size() )
  {
    setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
    finish();
    return;
  }

  try
  {
    QStringList indexWords, searchWords;
    QRegExp searchRegExp;

    if( !FtsHelpers::parseSearchString( searchString, indexWords, searchWords, searchRegExp,
                                        searchMode, matchCase, distanceBetweenWords, hasCJK, ignoreWordsOrder ) )
    {
      finish();
      return;
    }

    if( dict.haveFTSIndex() && !indexWords.isEmpty() )
    {
      FtsIdxHeader ftsIdxHeader;
      BtreeIndexing::BtreeIndex ftsIndex;
      sptr< ChunkedStorage::Reader > chunks;

      File::Class ftsIdx( dict.ftsIndexName(), "rb" );

      {
        Mutex::Lock _( dict.getFtsMutex() );

        ftsIdxHeader = ftsIdx.read< FtsIdxHeader >();

        wordsInIndex = ftsIdxHeader.wordCount;

        ftsIndex.openIndex( BtreeIndexing::IndexInfo( ftsIdxHeader.indexBtreeMaxElements,
                                                      ftsIdxHeader.indexRootOffset ),
                            ftsIdx, dict.getFtsMutex() );

        chunks = new ChunkedStorage::Reader( ftsIdx, ftsIdxHeader.chunksOffset );
      }

      if( hasCJK )
        combinedIndexSearch( ftsIndex, chunks, indexWords, searchWords, searchRegExp );
      else
      {
        if( searchMode == FTS::WholeWords )
          indexSearch( ftsIndex, chunks, indexWords, searchWords, searchRegExp );
        else
          fullIndexSearch( ftsIndex, chunks, indexWords, searchWords, searchRegExp );
      }
    }
    else
    {
      fullSearch( searchWords, searchRegExp );
    }

    if( foundHeadwords && foundHeadwords->size() > 0 )
    {
      Mutex::Lock _( dataMutex );
      data.resize( sizeof( foundHeadwords ) );
      memcpy( &data.front(), &foundHeadwords, sizeof( foundHeadwords ) );
      foundHeadwords = 0;
      hasAnyData = true;
    }
  }
  catch( std::exception &ex )
  {
    gdWarning( "FTS: Failed full-text search for \"%s\", reason: %s\n",
               dict.getName().c_str(), ex.what() );
    // Results not loaded -- we don't set the hasAnyData flag then
  }

  finish();
}

#ifdef USE_XAPIAN
void FTSResultsRequest::runXapian()
{
  if ( dict.ensureInitDone().size() )
  {
    setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
    finish();
    return;
  }

  try
  {
    if( dict.haveFTSIndex() )
    {
      //no need to parse the search string,  use xapian directly.
      //if the search mode is wildcard, change xapian search query flag?
      // Open the database for searching.
      Xapian::Database db(dict.ftsIndexName());

      // Start an enquire session.
      Xapian::Enquire enquire( db );

      // Combine the rest of the command line arguments with spaces between
      // them, so that simple queries don't have to be quoted at the shell
      // level.
      string query_string( searchString.toStdString() );

      // Parse the query string to produce a Xapian::Query object.
      Xapian::QueryParser qp;
      qp.set_database( db );
      Xapian::QueryParser::feature_flag flag = Xapian::QueryParser::FLAG_DEFAULT;
      if( searchMode == FTS::Wildcards )
        flag = Xapian::QueryParser::FLAG_WILDCARD;
      Xapian::Query query = qp.parse_query( query_string, flag|Xapian::QueryParser::FLAG_CJK_NGRAM );
      qDebug() << "Parsed query is: " << query.get_description().c_str();

      // Find the top 100 results for the query.
      enquire.set_query( query );
      Xapian::MSet matches = enquire.get_mset( 0, 100 );

      emit matchCount(matches.get_matches_estimated());
      // Display the results.
      qDebug() << matches.get_matches_estimated() << " results found.\n";
      qDebug() << "Matches 1-" << matches.size() << ":\n\n";
      QList< uint32_t > offsetsForHeadwords;
      for( Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i )
      {
        qDebug() << i.get_rank() + 1 << ": " << i.get_weight() << " docid=" << *i << " ["
                 << i.get_document().get_data().c_str() << "]";
        if(i.get_document().get_data()==finish_mark)
          continue;
        offsetsForHeadwords.append( atoi( i.get_document().get_data().c_str() ) );
      }

      if( !offsetsForHeadwords.isEmpty() )
      {
        QVector< QString > headwords;
        Mutex::Lock _( dataMutex );
        QString id = QString::fromUtf8( dict.getId().c_str() );
        dict.getHeadwordsFromOffsets( offsetsForHeadwords, headwords, &isCancelled );
        for( int x = 0; x < headwords.size(); x++ )
        {
          foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ), id, QStringList(), matchCase ) );
        }
      }
    }
    else
    {
      QStringList indexWords, searchWords;
      QRegExp searchRegExp;
      if( !FtsHelpers::parseSearchString( searchString, indexWords, searchWords, searchRegExp,
                                          searchMode, matchCase, distanceBetweenWords, hasCJK, ignoreWordsOrder ) )
      {
        finish();
        return;
      }
      fullSearch( searchWords, searchRegExp );
    }

    if( foundHeadwords && foundHeadwords->size() > 0 )
    {
      Mutex::Lock _( dataMutex );
      data.resize( sizeof( foundHeadwords ) );
      memcpy( &data.front(), &foundHeadwords, sizeof( foundHeadwords ) );
      foundHeadwords = 0;
      hasAnyData = true;
    }
  }
  catch (const Xapian::Error &e) {
    qWarning() << e.get_description().c_str();
  }
  catch( std::exception &ex )
  {
    gdWarning( "FTS: Failed full-text search for \"%s\", reason: %s\n",
               dict.getName().c_str(), ex.what() );
    // Results not loaded -- we don't set the hasAnyData flag then
  }

  finish();
}
#endif

} // namespace