goldendict-ng/ftshelpers.cc

1391 lines
39 KiB
C++
Raw Normal View History

2014-04-16 16:18:28 +00:00
/* This file is (c) 2014 Abs62
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#ifdef USE_XAPIAN
#include "xapian.h"
#include <stdlib.h>
#endif
2014-04-16 16:18:28 +00:00
#include "fulltextsearch.hh"
#include "ftshelpers.hh"
#include "wstring_qt.hh"
#include "file.hh"
#include "gddebug.hh"
#include "folding.hh"
#include "utils.hh"
2014-04-16 16:18:28 +00:00
#include <vector>
#include <string>
#include <QVector>
#include <QRegularExpression>
#include "wildcard.hh"
2022-06-04 15:22:14 +00:00
#include <QtConcurrent>
#include "base/globalregex.hh"
2022-06-16 12:34:32 +00:00
#include <QFutureSynchronizer>
#include <QSemaphoreReleaser>
2014-04-16 16:18:28 +00:00
using std::vector;
using std::string;
DEF_EX( exUserAbort, "User abort", Dictionary::Ex )
namespace FtsHelpers
{
2022-10-07 02:34:27 +00:00
// Data of the sentinel document that is appended after the last article of a
// Xapian index to mark the index as completely built. The value is the word
// "finished" spelled backwards.
const static std::string finish_mark = std::string( "dehsinif" );
2014-04-16 16:18:28 +00:00
// Tells whether the full-text-search index of `dict` has to be rebuilt.
//
// Xapian build: the index is accepted only when the data of its last document
// equals finish_mark. A Xapian error is logged, the index file is removed and
// the index is reported as bad; any other exception also reports it as bad.
//
// Legacy build: the header stored in `indexFile` must be readable and carry
// the FTS signature and the expected format version.
//
// Returns true when the index must be rebuilt, false when it is usable.
bool ftsIndexIsOldOrBad( string const & indexFile,
                         BtreeIndexing::BtreeDictionary * dict )
{
#ifdef USE_XAPIAN
  Q_UNUSED( indexFile );

  try
  {
    Xapian::WritableDatabase db( dict->ftsIndexName() );

    const auto lastDocId = db.get_lastdocid();
    auto lastDocument    = db.get_document( lastDocId );

    qDebug() << lastDocument.get_data().c_str();

    // A special document marks the end of a complete index.
    return lastDocument.get_data() != finish_mark;
  }
  catch( Xapian::Error & e )
  {
    qWarning() << e.get_description().c_str();
    // The file is corrupted, remove it.
    QFile::remove( QString::fromStdString( dict->ftsIndexName() ) );
    return true;
  }
  catch( ... )
  {
    return true;
  }
#else
  File::Class idx( indexFile, "rb" );
  FtsIdxHeader header;

  if( idx.readRecords( &header, sizeof( header ), 1 ) != 1 )
    return true;

  if( header.signature != FtsSignature )
    return true;

  return header.formatVersion != CurrentFtsFormatVersion + dict->getFtsIndexVersion();
#endif
}
// Builds the regular expression source used to find/highlight `words` in an
// article.
//
// The words are joined by a "gap" sub-pattern that allows up to
// `distanceBetweenWords` intermediate items (unlimited when the distance is
// negative). For CJK text the gap counts single characters and no word
// boundary is added around the words; otherwise the gap counts words and each
// word is wrapped in "\b" (whole words mode) or "(?:\w*)" (any other mode).
// With `ignoreWordsOrder` every word after the first one becomes an optional
// group. The whole pattern is enclosed in one capturing group.
static QString makeHiliteRegExpString( QStringList const & words,
                                       int searchMode, int distanceBetweenWords, bool hasCJK = false, bool ignoreWordsOrder = false )
{
  // Sub-pattern placed between two consecutive words.
  QString gap = hasCJK ? QString( "(?:[\\W\\w]){0," ) : QString( "(?:\\W+\\w+){0," );
  if( distanceBetweenWords >= 0 )
    gap += QString::number( distanceBetweenWords );
  gap += "}";

  // Sub-pattern placed on both sides of every word; empty for CJK.
  QString edge;
  if( !hasCJK )
  {
    gap += "\\W+";
    edge = QString( searchMode == FTS::WholeWords ? "\\b" : "(?:\\w*)" );
  }

  QString pattern( "(" );
  bool isFirst = true;
  for( QString const & word : words )
  {
    const bool optionalGroup = !isFirst && ignoreWordsOrder;

    if( !isFirst )
      pattern += gap;
    if( optionalGroup )
      pattern += "(";

    pattern += edge + word + edge;

    if( optionalGroup )
      pattern += ")?";

    isFirst = false;
  }
  pattern += ")";

  return pattern;
}
2022-06-03 07:19:58 +00:00
// Splits the search tokens in `list` into index words.
//
// A token without CJK characters is kept as a whole word; a token containing
// CJK characters contributes only those characters, each as a separate entry
// (a surrogate pair is kept together as one entry).
//
// On return `indexWords` holds the whole words that match `wordRegExp`
// (duplicates removed) followed by the distinct CJK characters.
void tokenizeCJK( QStringList & indexWords, QRegularExpression wordRegExp, QStringList list )
{
  QStringList wordList, hieroglyphList;

  for( int i = 0; i < list.size(); i++ )
  {
    QString const & word = list.at( i );

    // Check for CJK symbols in word
    bool parsed = false;

    for( int x = 0; x < word.size(); x++ )
    {
      if( !isCJKChar( word.at( x ).unicode() ) )
        continue;

      parsed = true;

      QString hieroglyph( word.at( x ) );

      // Take the low surrogate too, but only if there is a next character:
      // a word ending in a lone high surrogate must not be indexed past its end.
      if( word.at( x ).isHighSurrogate()
          && x + 1 < word.size()
          && word.at( x + 1 ).isLowSurrogate() )
        hieroglyph.append( word.at( ++x ) );

      hieroglyphList.append( hieroglyph );
    }

    // If the word does not contain CJK symbols put it in the list as is
    if( !parsed )
      wordList.append( word );
  }

  indexWords = wordList.filter( wordRegExp );
  indexWords.removeDuplicates();

  hieroglyphList.removeDuplicates();
  indexWords += hieroglyphList;
}
// Returns true as soon as one UTF-16 code unit of `str` is classified as CJK
// by isCJKChar(), false when none is.
bool containCJK( QString const & str)
{
  for( QChar const ch : str )
  {
    if( isCJKChar( ch.unicode() ) )
      return true;
  }

  return false;
}
2014-04-16 16:18:28 +00:00
// Parses the user's search string.
//
// Outputs:
//   indexWords   - lower-cased words (and single CJK characters) to look up
//                  in the FTS index;
//   searchWords  - the normalized words to look for in the article text
//                  (filled in WholeWords/PlainText modes only);
//   searchRegExp - expression used to match/highlight the article text;
//   hasCJK       - whether `str` contains CJK characters.
//
// Returns false only in WholeWords/PlainText modes when no index word could be
// extracted; the wildcard and regexp modes always return true.
bool parseSearchString( QString const & str, QStringList & indexWords,
                        QStringList & searchWords,
                        QRegExp & searchRegExp, int searchMode,
                        bool matchCase,
                        int distanceBetweenWords,
                        bool & hasCJK,
                        bool ignoreWordsOrder )
{
  searchWords.clear();
  indexWords.clear();

  hasCJK = containCJK( str );

  Qt::CaseSensitivity const caseMode = matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive;
  bool const wordsMode = ( searchMode == FTS::WholeWords || searchMode == FTS::PlainText );

  if( !wordsMode )
  {
    // Wildcards / RegExp: strip the pattern syntax before extracting index words.
    QString stripped = str;

    // Remove RegExp commands
    if( searchMode == FTS::RegExp )
      stripped.replace( RX::Ftx::regexRegExp, " " );

    // Remove all symbol sets
    stripped.replace( RX::Ftx::setsRegExp, " " );

    QStringList const tokens = stripped.normalized( QString::NormalizationForm_C )
                                       .toLower()
                                       .split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );

    if( hasCJK )
    {
      tokenizeCJK( indexWords, RX::Ftx::wordRegExp, tokens );
    }
    else
    {
      indexWords = tokens.filter( RX::Ftx::wordRegExp );
      indexWords.removeDuplicates();
    }

    QRegExp::PatternSyntax const syntax =
      ( searchMode == FTS::Wildcards ) ? QRegExp::WildcardUnix : QRegExp::RegExp2;

    searchRegExp = QRegExp( str, caseMode, syntax );
    searchRegExp.setMinimal( true );
    return true;
  }

  QString const normalized = str.normalized( QString::NormalizationForm_C );

  // Words to look for in the article text
  searchWords = normalized.split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );

  // Words to look up in the index
  QStringList const loweredWords = normalized.toLower().split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );

  QString hilitePattern;
  if( hasCJK )
  {
    tokenizeCJK( indexWords, RX::Ftx::wordRegExp, loweredWords );
    hilitePattern = makeHiliteRegExpString( loweredWords, searchMode, distanceBetweenWords, hasCJK, ignoreWordsOrder );
  }
  else
  {
    indexWords = loweredWords.filter( RX::Ftx::wordRegExp );
    indexWords.removeDuplicates();

    // The highlight expression is built from the words exactly as typed.
    QStringList const typedWords = str.split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );
    hilitePattern = makeHiliteRegExpString( typedWords, searchMode, distanceBetweenWords, false, ignoreWordsOrder );
  }

  searchRegExp = QRegExp( hilitePattern, caseMode, QRegExp::RegExp2 );
  searchRegExp.setMinimal( true );

  return !indexWords.isEmpty();
}
// Guards the shared word map while parseArticleForFts() merges the words of
// one article into it; makeFTSIndex() calls that function from QtConcurrent
// tasks.
Mutex lockMutex;
2014-04-16 16:18:28 +00:00
void parseArticleForFts( uint32_t articleAddress, QString & articleText,
QMap< QString, QVector< uint32_t > > & words,
bool handleRoundBrackets )
2014-04-16 16:18:28 +00:00
{
if( articleText.isEmpty() )
return;
QStringList articleWords = articleText.normalized( QString::NormalizationForm_C )
.split( handleRoundBrackets ? RX::Ftx::handleRoundBracket : RX::Ftx::noRoundBracket,
Qt::SkipEmptyParts );
QVector< QString > setOfWords;
setOfWords.reserve( articleWords.size() );
2014-04-16 16:18:28 +00:00
for( int x = 0; x < articleWords.size(); x++ )
{
QString word = articleWords.at( x ).toLower();
bool hasCJK = false;
QString hieroglyph;
2014-04-16 16:18:28 +00:00
// If word contains CJK symbols we add to index only these symbols separately
for( int y = 0; y < word.size(); y++ )
if( isCJKChar( word.at( y ).unicode() ) )
{
hasCJK = true;
hieroglyph.append( word[ y ] );
if( QChar( word.at( y ) ).isHighSurrogate()
&& QChar( word[ y + 1 ] ).isLowSurrogate() )
hieroglyph.append( word[ ++y ] );
//if( !setOfWords.contains( hieroglyph ) )
{
setOfWords.push_back( hieroglyph );
/*Mutex::Lock _( _mapLock );
words[ hieroglyph ].push_back( articleAddress );*/
}
hieroglyph.clear();
}
if( !hasCJK )
2014-04-16 16:18:28 +00:00
{
// Else we add word to index as is
if( word.size() < FTS::MinimumWordSize )
continue;
if( handleRoundBrackets && ( word.indexOf( '(' ) >= 0 || word.indexOf( ')' ) >= 0 ) )
{
// Special handle for words with round brackets - DSL feature
QStringList list;
QStringList oldVariant = word.split( RX::Ftx::regSplit, Qt::SkipEmptyParts );
for( QStringList::iterator it = oldVariant.begin(); it != oldVariant.end(); ++it )
if( it->size() >= FTS::MinimumWordSize && !list.contains( *it ) )
list.append( *it );
QRegularExpressionMatch match = RX::Ftx::regBrackets.match( word );
if( match.hasMatch() )
{
QStringList parts = match.capturedTexts();
// Add empty strings for compatibility with QRegExp behaviour
for( int i = match.lastCapturedIndex() + 1; i < 6; i++ )
parts.append( QString() );
QString parsedWord = parts[ 2 ] + parts[ 4 ]; // Brackets removed
if( parsedWord.size() >= FTS::MinimumWordSize && !list.contains( parsedWord ) )
list.append( parsedWord );
parsedWord = parts[ 1 ].remove( '(' ).remove( ')' )
+ parts[ 2 ]
+ parts[ 3 ].remove( '(' ).remove( ')' )
+ parts[ 4 ]
+ parts[ 5 ].remove( '(' ).remove( ')' ); // Brackets expansed
if( parsedWord.size() >= FTS::MinimumWordSize && !list.contains( parsedWord ) )
list.append( parsedWord );
}
for( QStringList::iterator it = list.begin(); it != list.end(); ++it )
{
//if( !setOfWords.contains( *it ) )
{
setOfWords.push_back( *it );
/*Mutex::Lock _( _mapLock );
words[ *it ].push_back( articleAddress );*/
}
}
}
else
//if( !setOfWords.contains( word ) )
{
setOfWords.push_back( word );
/*Mutex::Lock _( _mapLock );
words[ word ].push_back( articleAddress );*/
}
2014-04-16 16:18:28 +00:00
}
}
{
Mutex::Lock _( lockMutex );
for( const QString & word : setOfWords )
{
words[ word ].push_back( articleAddress );
}
2014-04-16 16:18:28 +00:00
}
}
// Builds the full-text-search index of `dict`.
//
// With USE_XAPIAN the work is delegated to makeFTSIndexXapian(). Otherwise the
// legacy index file is written: a placeholder header, one chunk per word with
// the addresses of the articles containing it, the b-tree over the words, and
// finally the real header.
//
// Articles are parsed by QtConcurrent tasks; a semaphore sized by
// QThread::idealThreadCount() limits how many tasks are in flight.
//
// Throws exUserAbort when `isCancelled` is set at one of the checkpoints.
// A cancellation noticed while the articles are being scheduled makes the
// function return silently after the running tasks have finished; the header
// is then still the zeroed placeholder.
void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled )
{
#ifdef USE_XAPIAN
  return makeFTSIndexXapian(dict,isCancelled);
#endif

  Mutex::Lock _( dict->getFtsMutex() );

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    throw exUserAbort();

  File::Class ftsIdx( dict->ftsIndexName(), "wb" );

  FtsIdxHeader ftsIdxHeader;
  memset( &ftsIdxHeader, 0, sizeof( ftsIdxHeader ) );

  // We write a dummy header first. At the end of the process the header
  // will be rewritten with the right values.
  ftsIdx.write( ftsIdxHeader );

  ChunkedStorage::Writer chunks( ftsIdx );

  BtreeIndexing::IndexedWords indexedWords;

  QSet< uint32_t > setOfOffsets;
  setOfOffsets.reserve( dict->getArticleCount() );

  dict->findArticleLinks( 0, &setOfOffsets, 0, &isCancelled );

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    throw exUserAbort();

  // Range construction is also valid for an empty set, unlike writing
  // through &offsets.front().
  QVector< uint32_t > offsets( setOfOffsets.begin(), setOfOffsets.end() );

  // Free memory
  setOfOffsets.clear();

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    throw exUserAbort();

  dict->sortArticlesOffsetsForFTS( offsets, isCancelled );

  QMap< QString, QVector< uint32_t > > ftsWords;

  bool needHandleBrackets;
  {
    QString name = QString::fromUtf8( dict->getDictionaryFilenames()[ 0 ].c_str() ).toLower();
    needHandleBrackets = name.endsWith( ".dsl" ) || name.endsWith( "dsl.dz" );
  }

  const int maxTasks = QThread::idealThreadCount();
  QSemaphore sem( maxTasks );

  long indexedFtsDoc = 0;

  for( const uint32_t address : offsets )
  {
    indexedFtsDoc++;

    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    {
      // Wait for the running tasks to finish: they reference local objects.
      sem.acquire( maxTasks );
      return;
    }

    // Blocks while the maximum number of tasks is already running.
    sem.acquire();

    // The address is captured by value; everything else outlives the task
    // because all semaphore slots are re-acquired before leaving the function.
    QFuture< void > f = QtConcurrent::run(
      [ &, address ]()
      {
        QSemaphoreReleaser releaser( sem );

        if( Utils::AtomicInt::loadAcquire( isCancelled ) )
        {
          return;
        }

        QString headword, articleStr;

        dict->getArticleText( address, headword, articleStr );

        parseArticleForFts( address, articleStr, ftsWords, needHandleBrackets );
      } );

    dict->setIndexedFtsDoc( indexedFtsDoc );
  }

  // Wait until every task has released its slot.
  sem.acquire( maxTasks );

  // Free memory
  offsets.clear();

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    throw exUserAbort();

  // Write one chunk per word; entries are erased as they are written to
  // release memory early.
  QMap< QString, QVector< uint32_t > >::iterator it = ftsWords.begin();
  while( it != ftsWords.end() )
  {
    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      throw exUserAbort();

    uint32_t offset = chunks.startNewBlock();
    uint32_t size = it.value().size();

    chunks.addToBlock( &size, sizeof(uint32_t) );
    chunks.addToBlock( it.value().data(), size * sizeof(uint32_t) );

    indexedWords.addSingleWord( gd::toWString( it.key() ), offset );

    it = ftsWords.erase( it );
  }

  ftsIdxHeader.chunksOffset = chunks.finish();
  ftsIdxHeader.wordCount = indexedWords.size();

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    throw exUserAbort();

  BtreeIndexing::IndexInfo ftsIdxInfo = BtreeIndexing::buildIndex( indexedWords, ftsIdx );

  // Free memory
  indexedWords.clear();

  ftsIdxHeader.indexBtreeMaxElements = ftsIdxInfo.btreeMaxElements;
  ftsIdxHeader.indexRootOffset = ftsIdxInfo.rootOffset;

  ftsIdxHeader.signature = FtsHelpers::FtsSignature;
  ftsIdxHeader.formatVersion = FtsHelpers::CurrentFtsFormatVersion + dict->getFtsIndexVersion();

  ftsIdx.rewind();
  ftsIdx.writeRecords( &ftsIdxHeader, sizeof(ftsIdxHeader), 1 );
}
// use xapian to create the index
#ifdef USE_XAPIAN
// Builds (or continues building) the Xapian full-text index of `dict`.
//
// Every article becomes one Xapian document whose data is the decimal article
// address. The build is incremental: when the database already has documents,
// articles are skipped until the address stored in the last document is met
// and indexing continues from that article on. After the last article a
// sentinel document carrying finish_mark is added and the database is
// committed.
//
// Throws exUserAbort when `isCancelled` is set before the article loop; a
// cancellation inside the loop returns silently. Xapian errors are logged and
// swallowed.
void makeFTSIndexXapian( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled )
{
  Mutex::Lock _( dict->getFtsMutex() );

  try
  {
    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      throw exUserAbort();

    // Open the database for update, creating a new database if necessary.
    Xapian::WritableDatabase db( dict->ftsIndexName(), Xapian::DB_CREATE_OR_OPEN );

    Xapian::TermGenerator indexer;
    indexer.set_flags( Xapian::TermGenerator::FLAG_CJK_NGRAM );

    QSet< uint32_t > setOfOffsets;
    setOfOffsets.reserve( dict->getArticleCount() );

    dict->findArticleLinks( 0, &setOfOffsets, 0, &isCancelled );

    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      throw exUserAbort();

    // Range construction is also valid for an empty set, unlike writing
    // through &offsets.front().
    QVector< uint32_t > offsets( setOfOffsets.begin(), setOfOffsets.end() );

    // Free memory
    setOfOffsets.clear();

    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      throw exUserAbort();

    dict->sortArticlesOffsetsForFTS( offsets, isCancelled );

    // Incremental build: find the address stored in the last document.
    bool skip = true;
    uint32_t lastAddress = static_cast< uint32_t >( -1 );
    try
    {
      Xapian::Document lastDoc = db.get_document( db.get_lastdocid() );
      // Addresses are written with std::to_string( uint32_t ); parse them as
      // unsigned so that values above INT_MAX survive the round trip.
      lastAddress = static_cast< uint32_t >( strtoul( lastDoc.get_data().c_str(), nullptr, 10 ) );
    }
    catch( const Xapian::Error & e )
    {
      // No last document: index everything from the start.
      qDebug() << e.get_description().c_str();
      skip = false;
    }

    long indexedDoc = 0L;

    for( const uint32_t address : offsets )
    {
      indexedDoc++;

      if( address == lastAddress )
        skip = false;

      // Skip until lastAddress is reached.
      // NOTE(review): the article at lastAddress itself is indexed again here,
      // which looks like it adds a second document for it -- confirm intent.
      if( skip )
        continue;

      if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      {
        return;
      }

      QString headword, articleStr;

      dict->getArticleText( address, headword, articleStr );

      Xapian::Document doc;

      indexer.set_document( doc );
      indexer.index_text_without_positions( articleStr.toStdString() );

      doc.set_data( std::to_string( address ) );

      // Add the document to the database.
      db.add_document( doc );

      dict->setIndexedFtsDoc( indexedDoc );
    }

    // Add a special document to mark the end of the index.
    Xapian::Document doc;
    doc.set_data( finish_mark );
    db.add_document( doc );

    // Free memory
    offsets.clear();

    db.commit();
  }
  catch( const Xapian::Error & e )
  {
    qWarning() << QString::fromStdString( e.get_description() );
  }
}
#endif
// Forwards the CJK classification of one UTF-16 code unit to
// Utils::isCJKChar().
bool isCJKChar( ushort ch )
{
  const bool cjk = Utils::isCJKChar( ch );
  return cjk;
}
2014-04-16 16:18:28 +00:00
// Checks the articles at `offsets` one by one against the search request.
//
// The QRegExp is converted once into a QRegularExpression and shared by all
// checks. The loop stops when the request is cancelled or when `maxResults`
// (if positive) has been reached.
void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
                                       QStringList const & words,
                                       QRegExp const & searchRegexp )
{
  const auto searchRegularExpression = createMatchRegex( searchRegexp );

  for( auto const & address : offsets )
  {
    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      return;

    // checkSingleArticle() only handles a single article, so the result
    // limit has to be enforced here to stop scanning further articles.
    if( maxResults > 0 && results >= maxResults )
      return;

    checkSingleArticle( address, words, searchRegularExpression );
  }
}
// Converts the QRegExp produced by the search-string parser into the
// QRegularExpression used to test article texts.
//
// In wildcard mode the pattern is first translated by wildcardsToRegexp().
// The expression always uses dot-matches-everything, Unicode properties,
// multiline and inverted greediness; case-insensitivity follows
// `searchRegexp`. An invalid pattern is replaced by the empty pattern.
QRegularExpression FTSResultsRequest::createMatchRegex( QRegExp const & searchRegexp )
{
  QRegularExpression::PatternOptions options =
    QRegularExpression::DotMatchesEverythingOption | QRegularExpression::UseUnicodePropertiesOption
    | QRegularExpression::MultilineOption | QRegularExpression::InvertedGreedinessOption;

  if( searchRegexp.caseSensitivity() == Qt::CaseInsensitive )
    options |= QRegularExpression::CaseInsensitiveOption;

  QRegularExpression matcher;

  if( searchMode == FTS::Wildcards )
    matcher.setPattern( wildcardsToRegexp( searchRegexp.pattern() ) );
  else
    matcher.setPattern( searchRegexp.pattern() );

  matcher.setPatternOptions( options );

  if( !matcher.isValid() )
    matcher.setPattern( "" );

  return matcher;
}
void FTSResultsRequest::checkSingleArticle( uint32_t offset,
QStringList const & words,
QRegularExpression const & searchRegularExpression )
{
// int results = 0;
QString headword, articleText;
QList< uint32_t > offsetsForHeadwords;
QVector< QStringList > hiliteRegExps;
QString id = QString::fromUtf8( dict.getId().c_str() );
// RegExp mode
2022-06-04 15:22:14 +00:00
if( searchMode == FTS::Wildcards || searchMode == FTS::RegExp )
{
// for( int i = 0; i < offsets.size(); i++ )
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
return;
// auto article_address = offsets.at( i );
dict.getArticleText( offset, headword, articleText );
articleText = articleText.normalized( QString::NormalizationForm_C );
if( ignoreDiacritics )
articleText = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( articleText ) ) );
if( articleText.contains( searchRegularExpression ) )
{
if( headword.isEmpty() )
offsetsForHeadwords.append( offset );
else
{
Mutex::Lock _( dataMutex );
2022-06-04 15:22:14 +00:00
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
}
2022-06-04 15:22:14 +00:00
++results;
if( maxResults > 0 && results >= maxResults )
return;
}
}
else
{
// Words mode
QVector< QPair< QString, bool > > wordsList;
if( ignoreWordsOrder )
{
for( QStringList::const_iterator it = words.begin(); it != words.end(); ++it )
wordsList.append( QPair< QString, bool >( *it, true ) );
}
// for( int i = 0; i < offsets.size(); i++ )
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
return;
if( ignoreWordsOrder )
{
for( int i = 0; i < wordsList.size(); i++ )
wordsList[ i ].second = true;
}
dict.getArticleText( offset, headword, articleText );
articleText = articleText.normalized( QString::NormalizationForm_C );
if( ignoreDiacritics )
articleText = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( articleText ) ) );
if( ignoreWordsOrder )
{
bool allMatch = true;
foreach( QString word, words )
{
if( containCJK( word ) || searchMode == FTS::PlainText )
{
if( !articleText.contains( word ) )
{
allMatch = false;
break;
}
}
else if( searchMode == FTS::WholeWords )
{
QRegularExpression tmpReg( QString( "\b%1\b" ).arg( word ),
QRegularExpression::CaseInsensitiveOption
| QRegularExpression::UseUnicodePropertiesOption );
if( !articleText.contains( tmpReg ) )
{
allMatch = false;
break;
}
}
}
if( !allMatch )
{
return;
}
if( distanceBetweenWords >= 0 )
{
// the article text contains all the needed words.
// determine if distance restriction is meet
const QRegularExpression replaceReg( QString( "(%1)" ).arg( words.join( '|' ) ),
QRegularExpression::CaseInsensitiveOption
| QRegularExpression::UseUnicodePropertiesOption );
2022-06-04 15:22:14 +00:00
// use a string that could not be presented in the article.
articleText = articleText.replace( replaceReg, "=@XXXXX@=" );
auto hasCJK = false;
foreach( QString word, words )
{
if( containCJK( word ) )
{
hasCJK = true;
break;
}
}
// hascjk value ,perhaps should depend on each word
const auto searchRegStr = makeHiliteRegExpString( Utils::repeat( "=@XXXXX@=", words.size() ),
searchMode,
distanceBetweenWords,
hasCJK );
const QRegularExpression distanceOrderReg( searchRegStr,
QRegularExpression::CaseInsensitiveOption
| QRegularExpression::UseUnicodePropertiesOption );
2022-06-04 15:22:14 +00:00
// use a string that could not be presented in the article.
if( articleText.contains( distanceOrderReg ) )
{
if( headword.isEmpty() )
offsetsForHeadwords.append( offset );
else
{
Mutex::Lock _( dataMutex );
2022-06-04 15:22:14 +00:00
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
}
2022-06-04 15:22:14 +00:00
++results;
if( maxResults > 0 && results >= maxResults )
return;
}
}
}
else
{
if( articleText.contains( searchRegularExpression ) )
{
if( headword.isEmpty() )
offsetsForHeadwords.append( offset );
else
{
Mutex::Lock _( dataMutex );
2022-06-04 15:22:14 +00:00
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
}
2022-06-04 15:22:14 +00:00
++results;
if( maxResults > 0 && results >= maxResults )
return;
}
}
}
if( !offsetsForHeadwords.isEmpty() )
{
QVector< QString > headwords;
2022-06-16 12:20:33 +00:00
Mutex::Lock _( dataMutex );
2022-06-04 15:22:14 +00:00
dict.getHeadwordsFromOffsets( offsetsForHeadwords, headwords, &isCancelled );
for( int x = 0; x < headwords.size(); x++ )
{
2022-06-04 15:22:14 +00:00
foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ),
id,
x < hiliteRegExps.size() ? hiliteRegExps.at( x ) : QStringList(),
matchCase ) );
}
2022-06-04 15:22:14 +00:00
}
}
2014-04-16 16:18:28 +00:00
// Whole-word search through the FTS index.
//
// For every index word the addresses of the articles containing it are read
// from the chunked storage; the intersection of these sets (articles that
// contain all the words) is sorted and passed to checkArticles() together
// with `searchWords` and `regexp`. Returns early when there are no index
// words, when the request is cancelled or when the intersection is empty.
void FTSResultsRequest::indexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
                                     sptr< ChunkedStorage::Reader > chunks,
                                     QStringList & indexWords,
                                     QStringList & searchWords, QRegExp & regexp )
{
  if( indexWords.isEmpty() )
    return;

  // One set of article addresses per index word.
  QList< QSet< uint32_t > > addressLists;

  auto findLinks = [ & ]( const QString & word )
  {
    QSet< uint32_t > tmp;

    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    {
      addressLists << tmp;
      return;
    }

    vector< BtreeIndexing::WordArticleLink > links =
      ftsIndex.findArticles( gd::toWString( word ), ignoreDiacritics );

    for( auto const & link : links )
    {
      if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      {
        addressLists << tmp;
        return;
      }

      vector< char > chunk;
      char * linksPtr = chunks->getBlock( link.articleOffset, chunk );

      // Block layout: a 32-bit count followed by that many 32-bit article
      // addresses. The data sits in a byte buffer, so every value is copied
      // out with memcpy instead of being read through a casted pointer.
      uint32_t size;
      memcpy( &size, linksPtr, sizeof( uint32_t ) );
      linksPtr += sizeof( uint32_t );

      for( uint32_t y = 0; y < size; y++ )
      {
        uint32_t articleAddress;
        memcpy( &articleAddress, linksPtr, sizeof( uint32_t ) );
        tmp.insert( articleAddress );
        linksPtr += sizeof( uint32_t );
      }
    }

    Mutex::Lock _( dataMutex );
    addressLists << tmp;
  };

  for( QString const & word : indexWords )
  {
    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    {
      return;
    }
    findLinks( word );
  }

  // Keep only the articles which contain all requested words.
  QSet< uint32_t > setOfOffsets;
  bool first = true;
  for( auto const & elem : addressLists )
  {
    if( first )
      setOfOffsets = elem;
    else
      setOfOffsets.intersect( elem );
    first = false;
  }

  if( setOfOffsets.isEmpty() )
    return;

  QVector< uint32_t > offsets( setOfOffsets.begin(), setOfOffsets.end() );

  setOfOffsets.clear();

  dict.sortArticlesOffsetsForFTS( offsets, isCancelled );

  checkArticles( offsets, searchWords, regexp );
}
// Special case - combination of index search for hieroglyphs and index search
// for other words.
//
// The index words are looked up one by one (plain words first, then the CJK
// characters); the intersection of the article-address sets is sorted and
// passed to checkArticles() together with `searchWords` and `regexp`.
// Returns early when the request is cancelled, when there are no index words
// or when the intersection is empty.
void FTSResultsRequest::combinedIndexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
                                             sptr< ChunkedStorage::Reader > chunks,
                                             QStringList & indexWords,
                                             QStringList & searchWords,
                                             QRegExp & regexp )
{
  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    return;

  if( indexWords.isEmpty() )
    return;

  QStringList wordsList, hieroglyphsList;

  for( int x = 0; x < indexWords.size(); x++ )
  {
    QString const & word = indexWords.at( x );
    if( isCJKChar( word[ 0 ].unicode() ) )
      hieroglyphsList.append( word );
    else
      wordsList.append( word );
  }

  wordsList += hieroglyphsList;

  // One set of article addresses per word.
  QList< QSet< uint32_t > > sets;

  auto fn_wordLink = [ & ]( const QString & word )
  {
    QSet< uint32_t > tmp;
    vector< BtreeIndexing::WordArticleLink > links = ftsIndex.findArticles( gd::toWString( word ) );

    for( auto const & link : links )
    {
      if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      {
        Mutex::Lock _( dataMutex );
        sets << tmp;
        return;
      }

      vector< char > chunk;
      char * linksPtr = chunks->getBlock( link.articleOffset, chunk );

      // Block layout: a 32-bit count followed by that many 32-bit article
      // addresses, copied out of the byte buffer with memcpy.
      uint32_t size;
      memcpy( &size, linksPtr, sizeof( uint32_t ) );
      linksPtr += sizeof( uint32_t );

      // across chunks, need further investigation
      uint32_t max = ( chunk.size() - ( linksPtr - &chunk.front() ) ) / 4;

      tmp.reserve( size );

      // Never read more entries than the loaded chunk holds.
      uint32_t q_max = qMin( size, max );
      for( uint32_t y = 0; y < q_max; y++ )
      {
        uint32_t articleAddress;
        memcpy( &articleAddress, linksPtr, sizeof( uint32_t ) );
        tmp.insert( articleAddress );
        linksPtr += sizeof( uint32_t );
      }
    }

    Mutex::Lock _( dataMutex );
    sets << tmp;
  };

  for( const auto & word : wordsList )
  {
    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    {
      return;
    }
    fn_wordLink( word );
  }

  // Keep only the articles which contain all the words.
  QSet< uint32_t > setOfOffsets;
  bool first = true;
  for( auto const & elem : sets )
  {
    if( first )
      setOfOffsets = elem;
    else
      setOfOffsets.intersect( elem );
    first = false;
  }

  if( setOfOffsets.isEmpty() )
    return;

  QVector< uint32_t > offsets( setOfOffsets.begin(), setOfOffsets.end() );

  setOfOffsets.clear();

  dict.sortArticlesOffsetsForFTS( offsets, isCancelled );

  checkArticles( offsets, searchWords, regexp );
}
2014-04-16 16:18:28 +00:00
// Substring search through the whole FTS index.
//
// Every word stored in the index is compared with every requested index word;
// when the stored word contains the requested one, the articles of the stored
// word are added to the set of that requested word. The intersection of all
// sets is sorted and passed to checkArticles() together with `searchWords`
// and `regexp`. Returns early when the request is cancelled, when there are
// no index words or when the intersection is empty.
void FTSResultsRequest::fullIndexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
                                         sptr< ChunkedStorage::Reader > chunks,
                                         QStringList & indexWords,
                                         QStringList & searchWords,
                                         QRegExp & regexp )
{
  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    return;

  if( indexWords.isEmpty() )
    return;

  QVector< BtreeIndexing::WordArticleLink > links;
  links.reserve( wordsInIndex );
  ftsIndex.findArticleLinks( &links, 0, 0, &isCancelled );

  // One set of article addresses per requested word.
  QVector< QSet< uint32_t > > allWordsLinks;
  allWordsLinks.resize( indexWords.size() );

  for( int x = 0; x < links.size(); x++ )
  {
    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      return;

    QString word = QString::fromUtf8( links[ x ].word.data(), links[ x ].word.size() );

    if( ignoreDiacritics )
      word = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( word ) ) );

    for( int i = 0; i < indexWords.size(); i++ )
    {
      QString const & wanted = indexWords.at( i );

      if( word.length() < wanted.length() || !word.contains( wanted ) )
        continue;

      vector< char > chunk;
      char * linksPtr = chunks->getBlock( links[ x ].articleOffset, chunk );

      // Block layout: a 32-bit count followed by that many 32-bit article
      // addresses, copied out of the byte buffer with memcpy.
      uint32_t size;
      memcpy( &size, linksPtr, sizeof( uint32_t ) );
      linksPtr += sizeof( uint32_t );

      for( uint32_t y = 0; y < size; y++ )
      {
        uint32_t articleAddress;
        memcpy( &articleAddress, linksPtr, sizeof( uint32_t ) );
        allWordsLinks[ i ].insert( articleAddress );
        linksPtr += sizeof( uint32_t );
      }

      // In plain text mode a stored word is credited to the first matching
      // requested word only.
      if( searchMode == FTS::PlainText )
        break;
    }
  }

  links.clear();

  // Keep only the articles which contain all requested words.
  QSet< uint32_t > setOfOffsets;
  for( int i = 0; i < allWordsLinks.size(); i++ )
  {
    if( i == 0 )
      setOfOffsets = allWordsLinks.at( i );
    else
      setOfOffsets.intersect( allWordsLinks.at( i ) );
  }

  if( setOfOffsets.isEmpty() )
    return;

  allWordsLinks.clear();

  QVector< uint32_t > offsets( setOfOffsets.begin(), setOfOffsets.end() );

  setOfOffsets.clear();

  dict.sortArticlesOffsetsForFTS( offsets, isCancelled );

  checkArticles( offsets, searchWords, regexp );
}
// Whole file survey: collects the offsets of all articles of the dictionary,
// sorts them and checks every article with checkArticles(). The request's
// cancellation flag is honoured between the steps.
void FTSResultsRequest::fullSearch( QStringList & searchWords, QRegExp & regexp )
{
  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    return;

  QSet< uint32_t > setOfOffsets;
  setOfOffsets.reserve( dict.getArticleCount() );
  dict.findArticleLinks( 0, &setOfOffsets, 0, &isCancelled );

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    return;

  // Range construction is also valid for an empty set, unlike writing
  // through &offsets.front().
  QVector< uint32_t > offsets( setOfOffsets.begin(), setOfOffsets.end() );

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    return;

  setOfOffsets.clear();

  dict.sortArticlesOffsetsForFTS( offsets, isCancelled );

  if( Utils::AtomicInt::loadAcquire( isCancelled ) )
    return;

  checkArticles( offsets, searchWords, regexp );
}
void FTSResultsRequest::run()
{
#ifdef USE_XAPIAN
return runXapian();
#endif
2014-04-16 16:18:28 +00:00
if ( dict.ensureInitDone().size() )
{
setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
finish();
return;
}
try
{
QStringList indexWords, searchWords;
QRegExp searchRegExp;
if( !FtsHelpers::parseSearchString( searchString, indexWords, searchWords, searchRegExp,
searchMode, matchCase, distanceBetweenWords, hasCJK, ignoreWordsOrder ) )
2014-04-16 16:18:28 +00:00
{
finish();
return;
}
if( dict.haveFTSIndex() && !indexWords.isEmpty() )
{
FtsIdxHeader ftsIdxHeader;
BtreeIndexing::BtreeIndex ftsIndex;
sptr< ChunkedStorage::Reader > chunks;
File::Class ftsIdx( dict.ftsIndexName(), "rb" );
{
Mutex::Lock _( dict.getFtsMutex() );
ftsIdxHeader = ftsIdx.read< FtsIdxHeader >();
wordsInIndex = ftsIdxHeader.wordCount;
2014-04-16 16:18:28 +00:00
ftsIndex.openIndex( BtreeIndexing::IndexInfo( ftsIdxHeader.indexBtreeMaxElements,
ftsIdxHeader.indexRootOffset ),
ftsIdx, dict.getFtsMutex() );
chunks = std::shared_ptr<ChunkedStorage::Reader>(new ChunkedStorage::Reader(ftsIdx, ftsIdxHeader.chunksOffset));
2014-04-16 16:18:28 +00:00
}
if( hasCJK )
combinedIndexSearch( ftsIndex, chunks, indexWords, searchWords, searchRegExp );
2014-04-16 16:18:28 +00:00
else
{
if( searchMode == FTS::WholeWords )
indexSearch( ftsIndex, chunks, indexWords, searchWords, searchRegExp );
else
fullIndexSearch( ftsIndex, chunks, indexWords, searchWords, searchRegExp );
}
2014-04-16 16:18:28 +00:00
}
else
{
fullSearch( searchWords, searchRegExp );
}
if( foundHeadwords && foundHeadwords->size() > 0 )
{
Mutex::Lock _( dataMutex );
data.resize( sizeof( foundHeadwords ) );
memcpy( &data.front(), &foundHeadwords, sizeof( foundHeadwords ) );
foundHeadwords = 0;
hasAnyData = true;
}
}
catch( std::exception &ex )
{
gdWarning( "FTS: Failed full-text search for \"%s\", reason: %s\n",
dict.getName().c_str(), ex.what() );
// Results not loaded -- we don't set the hasAnyData flag then
}
finish();
}
#ifdef USE_XAPIAN
/// Xapian-backed implementation of the search request.
///
/// If the dictionary has an FTS index, the raw search string is handed to
/// Xapian's QueryParser (wildcard flag in FTS::Wildcards mode, CJK n-gram
/// flag always), the first 100 matches are fetched and each match's document
/// data is interpreted as a decimal article offset. Those offsets are
/// resolved to headwords through the dictionary. Without an index the request
/// falls back to parseSearchString() + fullSearch().
///
/// Found headwords are published by copying the foundHeadwords pointer into
/// `data` under dataMutex. Xapian and standard exceptions are logged and
/// swallowed; finish() is called on every exit path.
void FTSResultsRequest::runXapian()
{
  // A non-empty string from ensureInitDone() is reported as the request error.
  const string & initError = dict.ensureInitDone();
  if( !initError.empty() )
  {
    setErrorString( QString::fromUtf8( initError.c_str() ) );
    finish();
    return;
  }

  try
  {
    if( dict.haveFTSIndex() )
    {
      // No need to parse the search string ourselves; Xapian does it.

      // Open the database for searching and start an enquire session.
      Xapian::Database db( dict.ftsIndexName() );
      Xapian::Enquire enquire( db );

      // The user's search string is passed to the query parser verbatim.
      string query_string( searchString.toStdString() );

      // Parse the query string to produce a Xapian::Query object.
      Xapian::QueryParser qp;
      qp.set_database( db );

      Xapian::QueryParser::feature_flag flag = Xapian::QueryParser::FLAG_DEFAULT;
      if( searchMode == FTS::Wildcards )
        flag = Xapian::QueryParser::FLAG_WILDCARD;

      Xapian::Query query = qp.parse_query( query_string, flag | Xapian::QueryParser::FLAG_CJK_NGRAM );
      qDebug() << "Parsed query is: " << query.get_description().c_str();

      // Find the top 100 results for the query.
      enquire.set_query( query );
      Xapian::MSet matches = enquire.get_mset( 0, 100 );

      emit matchCount( matches.get_matches_estimated() );

      // Display the results.
      qDebug() << matches.get_matches_estimated() << " results found.\n";
      qDebug() << "Matches 1-" << matches.size() << ":\n\n";

      QList< uint32_t > offsetsForHeadwords;
      for( Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i )
      {
        // Fetch the document payload once per match instead of three times.
        const std::string docData = i.get_document().get_data();

        qDebug() << i.get_rank() + 1 << ": " << i.get_weight() << " docid=" << *i << " ["
                 << docData.c_str() << "]";

        // The special end-of-index marker document is not an article.
        if( docData == finish_mark )
          continue;

        // Offsets are unsigned 32-bit values; atoi() would overflow int (UB)
        // for anything above INT_MAX, so parse them as unsigned long.
        offsetsForHeadwords.append( static_cast< uint32_t >( strtoul( docData.c_str(), nullptr, 10 ) ) );
      }

      if( !offsetsForHeadwords.isEmpty() )
      {
        QVector< QString > headwords;
        Mutex::Lock _( dataMutex );
        QString id = QString::fromUtf8( dict.getId().c_str() );
        dict.getHeadwordsFromOffsets( offsetsForHeadwords, headwords, &isCancelled );
        for( int x = 0; x < headwords.size(); x++ )
        {
          foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ), id, QStringList(), matchCase ) );
        }
      }
    }
    else
    {
      // No index available: parse the query ourselves and scan all articles.
      QStringList indexWords, searchWords;
      QRegExp searchRegExp;

      if( !FtsHelpers::parseSearchString( searchString, indexWords, searchWords, searchRegExp,
                                          searchMode, matchCase, distanceBetweenWords, hasCJK, ignoreWordsOrder ) )
      {
        finish();
        return;
      }

      fullSearch( searchWords, searchRegExp );
    }

    if( foundHeadwords && foundHeadwords->size() > 0 )
    {
      // The result list is handed over by storing the raw pointer value in
      // the data buffer; the member is then reset so it is not reused here.
      Mutex::Lock _( dataMutex );
      data.resize( sizeof( foundHeadwords ) );
      memcpy( &data.front(), &foundHeadwords, sizeof( foundHeadwords ) );
      foundHeadwords = 0;
      hasAnyData = true;
    }
  }
  catch( const Xapian::Error & e )
  {
    qWarning() << e.get_description().c_str();
  }
  catch( const std::exception & ex )
  {
    gdWarning( "FTS: Failed full-text search for \"%s\", reason: %s\n",
               dict.getName().c_str(), ex.what() );
    // Results not loaded -- we don't set the hasAnyData flag then
  }

  finish();
}
#endif
2014-04-16 16:18:28 +00:00
} // namespace