opt: mdx fulltext lock separated from normal search (#759)

* opt: refactor mdx fullindex creation

* opt: incremental fulltext creation logic change

* opt: incremental fulltext creation logic change

* opt: progress of fulltext creation logic

* opt: code smell

* fix: code smell

* fix: code smell

* fix: code smell

* fix: code smell

* fix: code smell

* 🎨 apply clang-format changes

* fix: code smell

* fix: code smell

---------

Co-authored-by: xiaoyifang <xiaoyifang@users.noreply.github.com>
This commit is contained in:
xiaoyifang 2023-05-29 00:01:21 +08:00 committed by GitHub
parent 5d15ffbc14
commit 4eb8374a35
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 210 additions and 230 deletions

View file

@ -34,7 +34,7 @@ public:
unsigned currentGroupId; unsigned currentGroupId;
QString translateLineText{}; QString translateLineText{};
//hold the dictionary id; //hold the dictionary id;
QSet<QString> collapsedDicts; QSet< QString > collapsedDicts;
QMap< QString, QSet< QString > > folderFavoritesMap; QMap< QString, QSet< QString > > folderFavoritesMap;
QMap< unsigned, QString > groupFolderMap; QMap< unsigned, QString > groupFolderMap;
@ -42,6 +42,8 @@ public:
signals: signals:
void dictionaryChanges( ActiveDictIds ad ); void dictionaryChanges( ActiveDictIds ad );
void dictionaryClear( ActiveDictIds ad ); void dictionaryClear( ActiveDictIds ad );
void indexingDictionary( QString );
}; };
#endif // GLOBAL_GLOBALBROADCASTER_H #endif // GLOBAL_GLOBALBROADCASTER_H

View file

@ -17,6 +17,7 @@
#include "config.hh" #include "config.hh"
#include "utils.hh" #include "utils.hh"
#include <QString> #include <QString>
#include "globalbroadcaster.hh"
/// Abstract dictionary-related stuff /// Abstract dictionary-related stuff
namespace Dictionary { namespace Dictionary {
@ -261,12 +262,16 @@ Q_DECLARE_FLAGS( Features, Feature )
Q_DECLARE_OPERATORS_FOR_FLAGS( Features ) Q_DECLARE_OPERATORS_FOR_FLAGS( Features )
/// A dictionary. Can be used to query words. /// A dictionary. Can be used to query words.
class Class class Class: public QObject
{ {
Q_OBJECT
string id; string id;
vector< string > dictionaryFiles; vector< string > dictionaryFiles;
long indexedFtsDoc; long indexedFtsDoc;
long lastProgress = 0;
protected: protected:
QString dictionaryDescription; QString dictionaryDescription;
QIcon dictionaryIcon, dictionaryNativeIcon; QIcon dictionaryIcon, dictionaryNativeIcon;
@ -339,8 +344,16 @@ public:
/// Returns the number of articles in the dictionary. /// Returns the number of articles in the dictionary.
virtual unsigned long getArticleCount() noexcept=0; virtual unsigned long getArticleCount() noexcept=0;
void setIndexedFtsDoc(long _indexedFtsDoc){ void setIndexedFtsDoc(long _indexedFtsDoc)
{
indexedFtsDoc = _indexedFtsDoc; indexedFtsDoc = _indexedFtsDoc;
auto newProgress = getIndexingFtsProgress();
if ( newProgress != lastProgress ) {
lastProgress = newProgress;
emit GlobalBroadcaster::instance()->indexingDictionary(
QString( "%1......%%2" ).arg( QString::fromStdString( getName() ) ).arg( newProgress ) );
}
} }
int getIndexingFtsProgress(){ int getIndexingFtsProgress(){

View file

@ -23,11 +23,8 @@
#include <map> #include <map>
#include <set> #include <set>
#include <list> #include <list>
#include <ctype.h>
#include <stdlib.h>
#ifdef _MSC_VER #ifdef _MSC_VER
#include <stub_msvc.h> #include <stub_msvc.h>
#endif #endif
#include "globalregex.hh" #include "globalregex.hh"
@ -37,9 +34,7 @@
#include <QCryptographicHash> #include <QCryptographicHash>
#include <QDir> #include <QDir>
#include <QRegularExpression> #include <QRegularExpression>
#include <QSemaphore>
#include <QString> #include <QString>
#include <QTextDocument>
#include <QThreadPool> #include <QThreadPool>
#include <QtConcurrent> #include <QtConcurrent>
@ -198,10 +193,11 @@ public:
}; };
class MdxDictionary: public QObject, public BtreeIndexing::BtreeDictionary class MdxDictionary: public BtreeIndexing::BtreeDictionary
{ {
Mutex idxMutex; Mutex idxMutex;
File::Class idx; File::Class idx;
string idxFileName;
IdxHeader idxHeader; IdxHeader idxHeader;
string encoding; string encoding;
ChunkedStorage::Reader chunks; ChunkedStorage::Reader chunks;
@ -220,7 +216,7 @@ public:
MdxDictionary( string const & id, string const & indexFile, vector<string> const & dictionaryFiles ); MdxDictionary( string const & id, string const & indexFile, vector<string> const & dictionaryFiles );
~MdxDictionary(); ~MdxDictionary() override;
void deferredInit() override; void deferredInit() override;
@ -231,7 +227,7 @@ public:
map< Dictionary::Property, string > getProperties() noexcept override map< Dictionary::Property, string > getProperties() noexcept override
{ {
return map< Dictionary::Property, string >(); return {};
} }
unsigned long getArticleCount() noexcept override unsigned long getArticleCount() noexcept override
@ -273,7 +269,7 @@ public:
void setFTSParameters( Config::FullTextSearch const & fts ) override void setFTSParameters( Config::FullTextSearch const & fts ) override
{ {
if( ensureInitDone().size() ) if ( !ensureInitDone().empty() )
return; return;
can_FTS = fts.enabled can_FTS = fts.enabled
@ -305,16 +301,15 @@ private:
void removeDirectory( QString const & directory ); void removeDirectory( QString const & directory );
friend class MdxHeadwordsRequest;
friend class MdxArticleRequest; friend class MdxArticleRequest;
friend class MddResourceRequest; friend class MddResourceRequest;
void loadResourceFile( const wstring & resourceName, vector< char > & data ); void loadResourceFile( const wstring & resourceName, vector< char > & data );
}; };
MdxDictionary::MdxDictionary( string const & id, string const & indexFile, MdxDictionary::MdxDictionary( string const & id, string const & indexFile, vector< string > const & dictionaryFiles ):
vector<string> const & dictionaryFiles ):
BtreeDictionary( id, dictionaryFiles ), BtreeDictionary( id, dictionaryFiles ),
idx( indexFile, "rb" ), idx( indexFile, "rb" ),
idxFileName( indexFile ),
idxHeader( idx.read< IdxHeader >() ), idxHeader( idx.read< IdxHeader >() ),
chunks( idx, idxHeader.chunksOffset ), chunks( idx, idxHeader.chunksOffset ),
deferredInitRunnableStarted( false ) deferredInitRunnableStarted( false )
@ -479,8 +474,8 @@ void MdxDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration
if( haveFTSIndex() ) if( haveFTSIndex() )
return; return;
if( ensureInitDone().size() ) // if( !ensureInitDone().empty() )
return; // return;
if( firstIteration && getArticleCount() > FTS::MaxDictionarySizeForFastSearch ) if( firstIteration && getArticleCount() > FTS::MaxDictionarySizeForFastSearch )
return; return;
@ -490,7 +485,10 @@ void MdxDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration
try try
{ {
FtsHelpers::makeFTSIndex( this, isCancelled ); auto _dict = std::make_shared< MdxDictionary >( this->getId(), idxFileName, this->getDictionaryFilenames() );
if( !_dict->ensureInitDone().empty() )
return;
FtsHelpers::makeFTSIndex( _dict.get(), isCancelled );
FTS_index_completed.ref(); FTS_index_completed.ref();
} }
catch( std::exception &ex ) catch( std::exception &ex )
@ -559,7 +557,7 @@ public:
isCancelled.ref(); isCancelled.ref();
} }
~MdxArticleRequest() ~MdxArticleRequest() override
{ {
isCancelled.ref(); isCancelled.ref();
f.waitForFinished(); f.waitForFinished();
@ -575,8 +573,7 @@ void MdxArticleRequest::run()
return; return;
} }
if ( dict.ensureInitDone().size() ) if ( !dict.ensureInitDone().empty() ) {
{
setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) ); setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
finish(); finish();
return; return;
@ -584,10 +581,9 @@ void MdxArticleRequest::run()
vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics ); vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
for ( unsigned x = 0; x < alts.size(); ++x ) for ( const auto & alt : alts ) {
{
/// Make an additional query for each alt /// Make an additional query for each alt
vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics ); vector< WordArticleLink > altChain = dict.findArticles( alt, ignoreDiacritics );
chain.insert( chain.end(), altChain.begin(), altChain.end() ); chain.insert( chain.end(), altChain.begin(), altChain.end() );
} }
@ -1345,30 +1341,27 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
{ {
vector< sptr< Dictionary::Class > > dictionaries; vector< sptr< Dictionary::Class > > dictionaries;
for ( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end(); ++i ) for ( const auto & fileName : fileNames ) {
{
// Skip files with the extensions different to .mdx to speed up the // Skip files with the extensions different to .mdx to speed up the
// scanning // scanning
if ( i->size() < 4 || strcasecmp( i->c_str() + ( i->size() - 4 ), ".mdx" ) != 0 ) if ( fileName.size() < 4 || strcasecmp( fileName.c_str() + ( fileName.size() - 4 ), ".mdx" ) != 0 )
continue; continue;
vector< string > dictFiles( 1, *i ); vector< string > dictFiles( 1, fileName );
findResourceFiles( *i, dictFiles ); findResourceFiles( fileName, dictFiles );
string dictId = Dictionary::makeDictionaryId( dictFiles ); string dictId = Dictionary::makeDictionaryId( dictFiles );
string indexFile = indicesDir + dictId; string indexFile = indicesDir + dictId;
if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) || if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) || indexIsOldOrBad( dictFiles, indexFile ) ) {
indexIsOldOrBad( dictFiles, indexFile ) )
{
// Building the index // Building the index
gdDebug( "MDict: Building the index for dictionary: %s\n", i->c_str() ); gdDebug( "MDict: Building the index for dictionary: %s\n", fileName.c_str() );
MdictParser parser; MdictParser parser;
list< sptr< MdictParser > > mddParsers; list< sptr< MdictParser > > mddParsers;
if ( !parser.open( i->c_str() ) ) if ( !parser.open( fileName.c_str() ) )
continue; continue;
string title = parser.title().toStdString(); string title = parser.title().toStdString();
@ -1470,52 +1463,46 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
// Save dictionary stylesheets // Save dictionary stylesheets
{ {
MdictParser::StyleSheets const & styleSheets = parser.styleSheets(); MdictParser::StyleSheets const & styleSheets = parser.styleSheets();
idxHeader.styleSheetAddress = idx.tell(); idxHeader.styleSheetAddress = idx.tell();
idxHeader.styleSheetCount = styleSheets.size(); idxHeader.styleSheetCount = styleSheets.size();
for ( MdictParser::StyleSheets::const_iterator iter = styleSheets.begin(); for ( auto const & [ key, value ] : styleSheets ) {
iter != styleSheets.end(); ++iter ) string const styleBegin( value.first.toStdString() );
{ string const styleEnd( value.second.toStdString() );
string styleBegin(iter->second.first.toStdString());
string styleEnd( iter->second.second.toStdString() );
// key // key
idx.write<qint32>( iter->first ); idx.write< qint32 >( key );
// styleBegin // styleBegin
idx.write<quint32>( ( quint32 )styleBegin.size() + 1 ); idx.write< quint32 >( (quint32)styleBegin.size() + 1 );
idx.write( styleBegin.c_str(), styleBegin.size() + 1 ); idx.write( styleBegin.c_str(), styleBegin.size() + 1 );
// styleEnd // styleEnd
idx.write<quint32>( ( quint32 )styleEnd.size() + 1 ); idx.write< quint32 >( (quint32)styleEnd.size() + 1 );
idx.write( styleEnd.c_str(), styleEnd.size() + 1 ); idx.write( styleEnd.c_str(), styleEnd.size() + 1 );
} }
} }
// read languages // read languages
QPair<quint32, quint32> langs = LangCoder::findIdsForFilename( QString::fromStdString( *i ) ); QPair< quint32, quint32 > langs = LangCoder::findIdsForFilename( QString::fromStdString( fileName ) );
// if no languages found, try dictionary's name // if no languages found, try dictionary's name
if ( langs.first == 0 || langs.second == 0 ) if ( langs.first == 0 || langs.second == 0 ) {
{
langs = LangCoder::findIdsForFilename( parser.title() ); langs = LangCoder::findIdsForFilename( parser.title() );
} }
idxHeader.langFrom = langs.first; idxHeader.langFrom = langs.first;
idxHeader.langTo = langs.second; idxHeader.langTo = langs.second;
// Build index info for each mdd file // Build index info for each mdd file
vector< IndexInfo > mddIndexInfos; vector< IndexInfo > mddIndexInfos;
for ( vector< sptr< IndexedWords > >::const_iterator mddIndexIter = mddIndices.begin(); for ( const auto & mddIndice : mddIndices ) {
mddIndexIter != mddIndices.end(); ++mddIndexIter ) IndexInfo const resourceIdxInfo = BtreeIndexing::buildIndex( *mddIndice, idx );
{
IndexInfo resourceIdxInfo = BtreeIndexing::buildIndex( *( *mddIndexIter ), idx );
mddIndexInfos.push_back( resourceIdxInfo ); mddIndexInfos.push_back( resourceIdxInfo );
} }
// Save address of IndexInfos for resource files // Save address of IndexInfos for resource files
idxHeader.mddIndexInfosOffset = idx.tell(); idxHeader.mddIndexInfosOffset = idx.tell();
idxHeader.mddIndexInfosCount = mddIndexInfos.size(); idxHeader.mddIndexInfosCount = mddIndexInfos.size();
for ( uint32_t mi = 0; mi < mddIndexInfos.size(); mi++ ) for ( uint32_t mi = 0; mi < mddIndexInfos.size(); mi++ ) {
{
const string & mddfile = mddFileNames[ mi ]; const string & mddfile = mddFileNames[ mi ];
idx.write<quint32>( ( quint32 )mddfile.size() + 1 ); idx.write<quint32>( ( quint32 )mddfile.size() + 1 );

View file

@ -495,7 +495,7 @@ void ZimDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration
getName().c_str() ); getName().c_str() );
try try
{ {
return FtsHelpers::makeFTSIndexXapian(this,isCancelled); return FtsHelpers::makeFTSIndex(this,isCancelled);
} }
catch( std::exception &ex ) catch( std::exception &ex )
{ {

View file

@ -1,7 +1,7 @@
/* This file is (c) 2014 Abs62 /* This file is (c) 2014 Abs62
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "xapian.h" #include "xapian.h"
#include <stdlib.h> #include <cstdlib>
#include "fulltextsearch.hh" #include "fulltextsearch.hh"
#include "ftshelpers.hh" #include "ftshelpers.hh"
#include "wstring_qt.hh" #include "wstring_qt.hh"
@ -18,9 +18,7 @@
#include <QRegularExpression> #include <QRegularExpression>
#include "wildcard.hh" #include "wildcard.hh"
#include <QtConcurrent>
#include "globalregex.hh" #include "globalregex.hh"
#include <QFutureSynchronizer>
#include <QSemaphoreReleaser> #include <QSemaphoreReleaser>
using std::vector; using std::vector;
@ -44,7 +42,7 @@ bool ftsIndexIsOldOrBad( string const & indexFile,
qDebug()<<document.get_data().c_str(); qDebug()<<document.get_data().c_str();
//use a special document to mark the end of the index. //use a special document to mark the end of the index.
return document.get_data().compare(finish_mark)!=0; return document.get_data()!=finish_mark;
} }
catch( Xapian::Error & e ) catch( Xapian::Error & e )
{ {
@ -114,10 +112,8 @@ static QString makeHiliteRegExpString( QStringList const & words,
void tokenizeCJK( QStringList & indexWords, QRegularExpression wordRegExp, QStringList list ) void tokenizeCJK( QStringList & indexWords, QRegularExpression wordRegExp, QStringList list )
{ {
QStringList wordList, hieroglyphList; QStringList wordList, hieroglyphList;
for( int i = 0; i < list.size(); i ++ ) for(auto word : list)
{ {
QString word = list.at( i );
// Check for CJK symbols in word // Check for CJK symbols in word
bool parsed = false; bool parsed = false;
QString hieroglyph; QString hieroglyph;
@ -150,8 +146,8 @@ void tokenizeCJK( QStringList & indexWords, QRegularExpression wordRegExp, QStri
bool containCJK( QString const & str) bool containCJK( QString const & str)
{ {
bool hasCJK = false; bool hasCJK = false;
for( int x = 0; x < str.size(); x++ ) for(auto x : str)
if( isCJKChar( str.at( x ).unicode() ) ) if( isCJKChar( x.unicode() ) )
{ {
hasCJK = true; hasCJK = true;
break; break;
@ -255,9 +251,9 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
QVector< QString > setOfWords; QVector< QString > setOfWords;
setOfWords.reserve( articleWords.size() ); setOfWords.reserve( articleWords.size() );
for( int x = 0; x < articleWords.size(); x++ ) for(const auto & articleWord : articleWords)
{ {
QString word = articleWords.at( x ).toLower(); QString word = articleWord.toLower();
bool hasCJK = false; bool hasCJK = false;
QString hieroglyph; QString hieroglyph;
@ -295,9 +291,9 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
QStringList list; QStringList list;
QStringList oldVariant = word.split( RX::Ftx::regSplit, Qt::SkipEmptyParts ); QStringList oldVariant = word.split( RX::Ftx::regSplit, Qt::SkipEmptyParts );
for( QStringList::iterator it = oldVariant.begin(); it != oldVariant.end(); ++it ) for ( auto const & it : oldVariant )
if( it->size() >= FTS::MinimumWordSize && !list.contains( *it ) ) if ( it.size() >= FTS::MinimumWordSize && !list.contains( it ) )
list.append( *it ); list.append( it );
QRegularExpressionMatch match = RX::Ftx::regBrackets.match( word ); QRegularExpressionMatch match = RX::Ftx::regBrackets.match( word );
if( match.hasMatch() ) if( match.hasMatch() )
@ -321,11 +317,10 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
list.append( parsedWord ); list.append( parsedWord );
} }
for( QStringList::iterator it = list.begin(); it != list.end(); ++it ) for ( auto const & it : list ) {
{
//if( !setOfWords.contains( *it ) ) //if( !setOfWords.contains( *it ) )
{ {
setOfWords.push_back( *it ); setOfWords.push_back( it );
/*Mutex::Lock _( _mapLock ); /*Mutex::Lock _( _mapLock );
words[ *it ].push_back( articleAddress );*/ words[ *it ].push_back( articleAddress );*/
} }
@ -352,118 +347,115 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
} }
void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled ) void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled )
{
return makeFTSIndexXapian(dict,isCancelled);
}
// use xapian to create the index
void makeFTSIndexXapian( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled )
{ {
Mutex::Lock _( dict->getFtsMutex() ); Mutex::Lock _( dict->getFtsMutex() );
//check the index again.
if ( dict->haveFTSIndex() )
return;
try { try {
if( Utils::AtomicInt::loadAcquire( isCancelled ) ) if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort(); throw exUserAbort();
// Open the database for update, creating a new database if necessary. // Open the database for update, creating a new database if necessary.
Xapian::WritableDatabase db(dict->ftsIndexName(), Xapian::DB_CREATE_OR_OPEN); Xapian::WritableDatabase db( dict->ftsIndexName(), Xapian::DB_CREATE_OR_OPEN );
Xapian::TermGenerator indexer; Xapian::TermGenerator indexer;
// Xapian::Stem stemmer("english"); // Xapian::Stem stemmer("english");
// indexer.set_stemmer(stemmer); // indexer.set_stemmer(stemmer);
// indexer.set_stemming_strategy(indexer.STEM_SOME_FULL_POS); // indexer.set_stemming_strategy(indexer.STEM_SOME_FULL_POS);
indexer.set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM); indexer.set_flags( Xapian::TermGenerator::FLAG_CJK_NGRAM );
BtreeIndexing::IndexedWords indexedWords; BtreeIndexing::IndexedWords indexedWords;
QSet< uint32_t > setOfOffsets; QSet< uint32_t > setOfOffsets;
setOfOffsets.reserve( dict->getArticleCount() ); setOfOffsets.reserve( dict->getArticleCount() );
dict->findArticleLinks( 0, &setOfOffsets, 0, &isCancelled ); dict->findArticleLinks( nullptr, &setOfOffsets, nullptr, &isCancelled );
if( Utils::AtomicInt::loadAcquire( isCancelled ) ) if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort(); throw exUserAbort();
QVector< uint32_t > offsets; QVector< uint32_t > offsets;
offsets.resize( setOfOffsets.size() ); offsets.resize( setOfOffsets.size() );
uint32_t * ptr = &offsets.front(); uint32_t * ptr = &offsets.front();
for( QSet< uint32_t >::ConstIterator it = setOfOffsets.constBegin(); for ( QSet< uint32_t >::ConstIterator it = setOfOffsets.constBegin(); it != setOfOffsets.constEnd(); ++it ) {
it != setOfOffsets.constEnd(); ++it ) *ptr = *it;
{ ptr++;
*ptr = *it; }
ptr++;
}
// Free memory // Free memory
setOfOffsets.clear(); setOfOffsets.clear();
if( Utils::AtomicInt::loadAcquire( isCancelled ) ) if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort(); throw exUserAbort();
dict->sortArticlesOffsetsForFTS( offsets, isCancelled ); dict->sortArticlesOffsetsForFTS( offsets, isCancelled );
// incremental build the index. // incremental build the index.
// get the last address. // get the last address.
bool skip = true; bool skip = true;
uint32_t lastAddress = -1; uint32_t lastAddress = -1;
try try {
{ if ( db.get_lastdocid() > 0 ) {
Xapian::Document lastDoc = db.get_document( db.get_lastdocid() ); Xapian::Document lastDoc = db.get_document( db.get_lastdocid() );
lastAddress = atoi( lastDoc.get_data().c_str() ); lastAddress = atoi( lastDoc.get_data().c_str() );
} }
catch( Xapian::Error & e ) else {
{ skip = false;
qDebug() << e.get_description().c_str(); }
skip = false; }
} catch ( Xapian::Error & e ) {
qDebug() << "get last doc failed: " << e.get_description().c_str();
long indexedDoc=0L;
for( auto & address : offsets )
{
indexedDoc++;
if(address==lastAddress){
skip = false; skip = false;
} }
//skip until to the lastAddress;
if((address!=lastAddress)&&skip){ long indexedDoc = 0L;
continue;
for ( auto const & address : offsets ) {
indexedDoc++;
if ( address > lastAddress && skip ) {
skip = false;
}
//skip until to the lastAddress;
if ( skip ) {
continue;
}
if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) {
return;
}
QString headword, articleStr;
dict->getArticleText( address, headword, articleStr );
Xapian::Document doc;
indexer.set_document( doc );
indexer.index_text_without_positions( articleStr.toStdString() );
doc.set_data( std::to_string( address ) );
// Add the document to the database.
db.add_document( doc );
dict->setIndexedFtsDoc( indexedDoc );
} }
if( Utils::AtomicInt::loadAcquire( isCancelled ) ) //add a special document to mark the end of the index.
{
return;
}
QString headword, articleStr;
dict->getArticleText( address, headword, articleStr );
Xapian::Document doc; Xapian::Document doc;
doc.set_data( finish_mark );
indexer.set_document( doc );
indexer.index_text_without_positions( articleStr.toStdString() );
doc.set_data( std::to_string( address ) );
// Add the document to the database. // Add the document to the database.
db.add_document( doc ); db.add_document( doc );
dict->setIndexedFtsDoc(indexedDoc); // Free memory
offsets.clear();
db.commit();
} }
catch ( Xapian::Error & e ) {
//add a special document to mark the end of the index. qWarning() << "create xapian index:" << QString::fromStdString( e.get_description() );
Xapian::Document doc;
doc.set_data( finish_mark );
// Add the document to the database.
db.add_document( doc );
// Free memory
offsets.clear();
db.commit();
} catch (Xapian::Error & e) {
qWarning()<<QString::fromStdString(e.get_description());
} }
} }
@ -492,7 +484,7 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
} }
} }
QRegularExpression FTSResultsRequest::createMatchRegex( QRegExp const & searchRegexp ) QRegularExpression FTSResultsRequest::createMatchRegex( QRegExp const & searchRegexp ) const
{ {
QRegularExpression searchRegularExpression; QRegularExpression searchRegularExpression;
@ -559,18 +551,17 @@ void FTSResultsRequest::checkSingleArticle( uint32_t offset,
QVector< QPair< QString, bool > > wordsList; QVector< QPair< QString, bool > > wordsList;
if( ignoreWordsOrder ) if( ignoreWordsOrder )
{ {
for( QStringList::const_iterator it = words.begin(); it != words.end(); ++it ) for(const auto & word : words)
wordsList.append( QPair< QString, bool >( *it, true ) ); wordsList.append( QPair< QString, bool >( word, true ) );
} }
// for( int i = 0; i < offsets.size(); i++ ) // for( int i = 0; i < offsets.size(); i++ )
if( Utils::AtomicInt::loadAcquire( isCancelled ) ) if( Utils::AtomicInt::loadAcquire( isCancelled ) )
return; return;
if( ignoreWordsOrder ) if ( ignoreWordsOrder ) {
{ for ( auto & [ fst, snd ] : wordsList )
for( int i = 0; i < wordsList.size(); i++ ) snd = true;
wordsList[ i ].second = true;
} }
dict.getArticleText( offset, headword, articleText ); dict.getArticleText( offset, headword, articleText );
@ -717,10 +708,8 @@ void FTSResultsRequest::indexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
vector< BtreeIndexing::WordArticleLink > links = vector< BtreeIndexing::WordArticleLink > links =
ftsIndex.findArticles( gd::removeTrailingZero( word ), ignoreDiacritics ); ftsIndex.findArticles( gd::removeTrailingZero( word ), ignoreDiacritics );
for( unsigned x = 0; x < links.size(); x++ ) for ( auto const & link : links ) {
{ if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) {
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
{
addressLists << tmp; addressLists << tmp;
return; return;
} }
@ -729,7 +718,7 @@ void FTSResultsRequest::indexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
char * linksPtr; char * linksPtr;
{ {
// Mutex::Lock _( dict.getFtsMutex() ); // Mutex::Lock _( dict.getFtsMutex() );
linksPtr = chunks->getBlock( links[ x ].articleOffset, chunk ); linksPtr = chunks->getBlock( link.articleOffset, chunk );
} }
memcpy( &size, linksPtr, sizeof( uint32_t ) ); memcpy( &size, linksPtr, sizeof( uint32_t ) );
@ -750,7 +739,7 @@ void FTSResultsRequest::indexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
// int n = indexWords.length(); // int n = indexWords.length();
// QtConcurrent::blockingMap( indexWords, findLinks ); // QtConcurrent::blockingMap( indexWords, findLinks );
for(QString word:indexWords) for(const QString& word:indexWords)
{ {
if( Utils::AtomicInt::loadAcquire( isCancelled ) ) if( Utils::AtomicInt::loadAcquire( isCancelled ) )
{ {
@ -812,9 +801,8 @@ void FTSResultsRequest::combinedIndexSearch( BtreeIndexing::BtreeIndex & ftsInde
QStringList wordsList, hieroglyphsList; QStringList wordsList, hieroglyphsList;
for( int x = 0; x < indexWords.size(); x++ ) for(const auto & word : indexWords)
{ {
QString const & word = indexWords.at( x );
if( isCJKChar( word[ 0 ].unicode() ) ) if( isCJKChar( word[ 0 ].unicode() ) )
hieroglyphsList.append( word ); hieroglyphsList.append( word );
else else
@ -839,10 +827,8 @@ void FTSResultsRequest::combinedIndexSearch( BtreeIndexing::BtreeIndex & ftsInde
{ {
QSet< uint32_t > tmp; QSet< uint32_t > tmp;
vector< BtreeIndexing::WordArticleLink > links = ftsIndex.findArticles( gd::removeTrailingZero( word ) ); vector< BtreeIndexing::WordArticleLink > links = ftsIndex.findArticles( gd::removeTrailingZero( word ) );
for( unsigned x = 0; x < links.size(); x++ ) for ( auto const & link : links ) {
{ if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) {
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
{
Mutex::Lock _( dataMutex ); Mutex::Lock _( dataMutex );
sets << tmp; sets << tmp;
return; return;
@ -852,7 +838,7 @@ void FTSResultsRequest::combinedIndexSearch( BtreeIndexing::BtreeIndex & ftsInde
char * linksPtr; char * linksPtr;
{ {
// Mutex::Lock _( dict.getFtsMutex() ); // Mutex::Lock _( dict.getFtsMutex() );
linksPtr = chunks->getBlock( links[ x ].articleOffset, chunk ); linksPtr = chunks->getBlock( link.articleOffset, chunk );
} }
memcpy( &size, linksPtr, sizeof( uint32_t ) ); memcpy( &size, linksPtr, sizeof( uint32_t ) );
@ -945,17 +931,17 @@ void FTSResultsRequest::fullIndexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
return; return;
links.reserve( wordsInIndex ); links.reserve( wordsInIndex );
ftsIndex.findArticleLinks( &links, 0, 0, &isCancelled ); ftsIndex.findArticleLinks( &links, nullptr, nullptr, &isCancelled );
QVector< QSet< uint32_t > > allWordsLinks; QVector< QSet< uint32_t > > allWordsLinks;
allWordsLinks.resize( indexWords.size() ); allWordsLinks.resize( indexWords.size() );
for( int x = 0; x < links.size(); x++ ) for(auto & link : links)
{ {
if( Utils::AtomicInt::loadAcquire( isCancelled ) ) if( Utils::AtomicInt::loadAcquire( isCancelled ) )
return; return;
QString word = QString::fromUtf8( links[ x ].word.data(), links[ x ].word.size() ); QString word = QString::fromUtf8( link.word.data(), link.word.size() );
if( ignoreDiacritics ) if( ignoreDiacritics )
word = QString::fromStdU32String( Folding::applyDiacriticsOnly( gd::toWString( word ) ) ); word = QString::fromStdU32String( Folding::applyDiacriticsOnly( gd::toWString( word ) ) );
@ -968,7 +954,7 @@ void FTSResultsRequest::fullIndexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
char * linksPtr; char * linksPtr;
{ {
// Mutex::Lock _( dict.getFtsMutex() ); // Mutex::Lock _( dict.getFtsMutex() );
linksPtr = chunks->getBlock( links[ x ].articleOffset, chunk ); linksPtr = chunks->getBlock( link.articleOffset, chunk );
} }
memcpy( &size, linksPtr, sizeof(uint32_t) ); memcpy( &size, linksPtr, sizeof(uint32_t) );
@ -1026,7 +1012,7 @@ void FTSResultsRequest::fullSearch( QStringList & searchWords, QRegExp & regexp
QSet< uint32_t > setOfOffsets; QSet< uint32_t > setOfOffsets;
setOfOffsets.reserve( dict.getArticleCount() ); setOfOffsets.reserve( dict.getArticleCount() );
dict.findArticleLinks( 0, &setOfOffsets, 0, &isCancelled ); dict.findArticleLinks( nullptr, &setOfOffsets, nullptr, &isCancelled );
if( Utils::AtomicInt::loadAcquire( isCancelled ) ) if( Utils::AtomicInt::loadAcquire( isCancelled ) )
return; return;
@ -1119,9 +1105,9 @@ void FTSResultsRequest::runXapian()
Mutex::Lock _( dataMutex ); Mutex::Lock _( dataMutex );
QString id = QString::fromUtf8( dict.getId().c_str() ); QString id = QString::fromUtf8( dict.getId().c_str() );
dict.getHeadwordsFromOffsets( offsetsForHeadwords, headwords, &isCancelled ); dict.getHeadwordsFromOffsets( offsetsForHeadwords, headwords, &isCancelled );
for( int x = 0; x < headwords.size(); x++ ) for(const auto & headword : headwords)
{ {
foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ), id, QStringList(), matchCase ) ); foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
} }
} }
} }
@ -1143,7 +1129,7 @@ void FTSResultsRequest::runXapian()
Mutex::Lock _( dataMutex ); Mutex::Lock _( dataMutex );
data.resize( sizeof( foundHeadwords ) ); data.resize( sizeof( foundHeadwords ) );
memcpy( &data.front(), &foundHeadwords, sizeof( foundHeadwords ) ); memcpy( &data.front(), &foundHeadwords, sizeof( foundHeadwords ) );
foundHeadwords = 0; foundHeadwords = nullptr;
hasAnyData = true; hasAnyData = true;
} }
} }

View file

@ -64,7 +64,6 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
bool handleRoundBrackets = false ); bool handleRoundBrackets = false );
void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled ); void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled );
void makeFTSIndexXapian( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled );
bool isCJKChar( ushort ch ); bool isCJKChar( ushort ch );
class FTSResultsRequest : public Dictionary::DataRequest class FTSResultsRequest : public Dictionary::DataRequest
@ -91,7 +90,7 @@ class FTSResultsRequest : public Dictionary::DataRequest
void checkArticles( QVector< uint32_t > const & offsets, void checkArticles( QVector< uint32_t > const & offsets,
QStringList const & words, QStringList const & words,
QRegExp const & searchRegexp = QRegExp() ); QRegExp const & searchRegexp = QRegExp() );
QRegularExpression createMatchRegex( QRegExp const & searchRegexp ); QRegularExpression createMatchRegex( QRegExp const & searchRegexp ) const;
void checkSingleArticle( uint32_t offset, void checkSingleArticle( uint32_t offset,
QStringList const & words, QStringList const & words,

View file

@ -22,44 +22,30 @@
namespace FTS namespace FTS
{ {
enum
{
MinDistanceBetweenWords = 0,
MaxDistanceBetweenWords = 15,
MinArticlesPerDictionary = 1,
MaxArticlesPerDictionary = 10000
};
void Indexing::run() void Indexing::run()
{ {
try try
{ {
timerThread->start(); timerThread->start();
// First iteration - dictionaries with no more than MaxDictionarySizeForFastSearch articles // First iteration - dictionaries with no more than MaxDictionarySizeForFastSearch articles
for( size_t x = 0; x < dictionaries.size(); x++ ) for ( const auto & dictionary : dictionaries ) {
{ if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
break; break;
if( dictionaries.at( x )->canFTS() if ( dictionary->canFTS() && !dictionary->haveFTSIndex() ) {
&&!dictionaries.at( x )->haveFTSIndex() ) emit sendNowIndexingName( QString::fromUtf8( dictionary->getName().c_str() ) );
{ dictionary->makeFTSIndex( isCancelled, true );
emit sendNowIndexingName( QString::fromUtf8( dictionaries.at( x )->getName().c_str() ) );
dictionaries.at( x )->makeFTSIndex( isCancelled, true );
} }
} }
// Second iteration - all remaining dictionaries // Second iteration - all remaining dictionaries
for( size_t x = 0; x < dictionaries.size(); x++ ) for ( const auto & dictionary : dictionaries ) {
{ if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
break; break;
if( dictionaries.at( x )->canFTS() if ( dictionary->canFTS() && !dictionary->haveFTSIndex() ) {
&&!dictionaries.at( x )->haveFTSIndex() ) emit sendNowIndexingName( QString::fromUtf8( dictionary->getName().c_str() ) );
{ dictionary->makeFTSIndex( isCancelled, false );
emit sendNowIndexingName( QString::fromUtf8( dictionaries.at( x )->getName().c_str() ) );
dictionaries.at( x )->makeFTSIndex( isCancelled, false );
} }
} }
@ -68,26 +54,27 @@ void Indexing::run()
} }
catch( std::exception &ex ) catch( std::exception &ex )
{ {
gdWarning( "Exception occured while full-text search: %s", ex.what() ); gdWarning( "Exception occurred while full-text search: %s", ex.what() );
} }
emit sendNowIndexingName( QString() ); emit sendNowIndexingName( QString() );
} }
void Indexing::timeout(){ void Indexing::timeout()
for( size_t x = 0; x < dictionaries.size(); x++ ) {
{ // Emitting every dictionary's name in this loop may leave only one name visible,
if( Utils::AtomicInt::loadAcquire( isCancelled ) ) // because the interval between updates is so short.
for ( const auto & dictionary : dictionaries ) {
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
break; break;
auto progress = dictionaries.at( x )->getIndexingFtsProgress(); auto newProgress = dictionary->getIndexingFtsProgress();
if( progress>0&&progress<100) if ( newProgress > 0 && newProgress < 100 ) {
{ emit sendNowIndexingName(
emit sendNowIndexingName( QString::fromUtf8( dictionaries.at( x )->getName().c_str() )+QString("......%1%2").arg("%").arg(progress) ); QString( "%1......%%2" ).arg( QString::fromStdString( dictionary->getName() ) ).arg( newProgress ) );
} }
} }
} }
FtsIndexing::FtsIndexing( std::vector< sptr< Dictionary::Class > > const & dicts): FtsIndexing::FtsIndexing( std::vector< sptr< Dictionary::Class > > const & dicts):
dictionaries( dicts ), dictionaries( dicts ),
started( false ) started( false )
@ -128,7 +115,7 @@ void FtsIndexing::stopIndexing()
} }
} }
void FtsIndexing::setNowIndexedName( QString name ) void FtsIndexing::setNowIndexedName( const QString & name )
{ {
{ {
Mutex::Lock _( nameMutex ); Mutex::Lock _( nameMutex );
@ -235,6 +222,10 @@ FullTextSearchDialog::FullTextSearchDialog( QWidget * parent,
setNewIndexingName( ftsIdx.nowIndexingName() ); setNewIndexingName( ftsIdx.nowIndexingName() );
connect( &ftsIdx, &FtsIndexing::newIndexingName, this, &FullTextSearchDialog::setNewIndexingName ); connect( &ftsIdx, &FtsIndexing::newIndexingName, this, &FullTextSearchDialog::setNewIndexingName );
connect( GlobalBroadcaster::instance(),
&GlobalBroadcaster::indexingDictionary,
this,
&FullTextSearchDialog::setNewIndexingName );
ui.searchMode->addItem( tr( "Whole words" ), WholeWords ); ui.searchMode->addItem( tr( "Whole words" ), WholeWords );
ui.searchMode->addItem( tr( "Plain text"), PlainText ); ui.searchMode->addItem( tr( "Plain text"), PlainText );

View file

@ -97,8 +97,6 @@ public:
~Indexing() ~Indexing()
{ {
emit sendNowIndexingName( QString() ); emit sendNowIndexingName( QString() );
hasExited.release(); hasExited.release();
} }
@ -146,7 +144,7 @@ protected:
Mutex nameMutex; Mutex nameMutex;
private slots: private slots:
void setNowIndexedName( QString name ); void setNowIndexedName( const QString & name );
signals: signals:
void newIndexingName( QString name ); void newIndexingName( QString name );

View file

@ -691,6 +691,10 @@ MainWindow::MainWindow( Config::Class & cfg_ ):
groupListInToolbar->installEventFilter( this ); groupListInToolbar->installEventFilter( this );
connect( &ftsIndexing, &FTS::FtsIndexing::newIndexingName, this, &MainWindow::showFTSIndexingName ); connect( &ftsIndexing, &FTS::FtsIndexing::newIndexingName, this, &MainWindow::showFTSIndexingName );
connect( GlobalBroadcaster::instance(),
&GlobalBroadcaster::indexingDictionary,
this,
&MainWindow::showFTSIndexingName );
applyProxySettings(); applyProxySettings();