feat: use parallel thread to create the fts (#980)

* feat: use parallel thread to create the fts

* [autofix.ci] apply automated fixes

* feat: use parallel thread to create the fts

* feat: add an option to control the thread count when creating the full-text index

* [autofix.ci] apply automated fixes

* feat: set the default parallel thread count to half the number of CPU cores

* feat: initialize the default parallel threads to 1/3 of cpu cores

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
xiaoyifang 2023-07-20 22:50:32 +08:00 committed by GitHub
parent df4bc68248
commit e5f91f6a3e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 101 additions and 116 deletions

View file

@ -1074,6 +1074,9 @@ Class load()
if ( !fts.namedItem( "maxDictionarySize" ).isNull() )
c.preferences.fts.maxDictionarySize = fts.namedItem( "maxDictionarySize" ).toElement().text().toUInt();
if ( !fts.namedItem( "parallelThreads" ).isNull() )
c.preferences.fts.parallelThreads = fts.namedItem( "parallelThreads" ).toElement().text().toUInt();
}
}
@ -2052,6 +2055,10 @@ void save( Class const & c )
opt = dd.createElement( "maxDictionarySize" );
opt.appendChild( dd.createTextNode( QString::number( c.preferences.fts.maxDictionarySize ) ) );
hd.appendChild( opt );
opt = dd.createElement( "parallelThreads" );
opt.appendChild( dd.createTextNode( QString::number( c.preferences.fts.parallelThreads ) ) );
hd.appendChild( opt );
}
}

View file

@ -16,6 +16,7 @@
#include <QDomDocument>
#include <QLocale>
#include <optional>
#include <QThread>
/// GoldenDict's configuration
namespace Config {
@ -202,6 +203,7 @@ struct FullTextSearch
bool enablePosition = false;
quint32 maxDictionarySize;
quint32 parallelThreads = QThread::idealThreadCount() / 3 + 1;
QByteArray dialogGeometry;
QString disabledTypes;

View file

@ -1,5 +1,6 @@
/* This file is (c) 2014 Abs62
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
// xapian.h must be the first header included, to avoid collisions with macro definitions from other headers.
#include "xapian.h"
#include <cstdlib>
#include "fulltextsearch.hh"
@ -13,13 +14,9 @@
#include <vector>
#include <string>
#include <QVector>
#include <QRegularExpression>
#include "wildcard.hh"
#include "globalregex.hh"
#include <QSemaphoreReleaser>
using std::vector;
using std::string;
@ -52,52 +49,6 @@ bool ftsIndexIsOldOrBad( BtreeIndexing::BtreeDictionary * dict )
}
}
/// Assembles a regular-expression source string that matches the given words
/// one after another, with a bounded gap allowed between neighbours.
///
/// The result always has the form "( word0 [gap word1 [gap word2 ...]] )".
///
/// @param words                Words to chain together; each one is inserted
///                             verbatim (no escaping is applied here).
/// @param searchMode           When equal to FTS::WholeWords each word is
///                             wrapped in "\b"; otherwise in "(?:\w*)".
/// @param distanceBetweenWords Upper bound written into the gap quantifier
///                             "{0,N}". A negative value leaves the bound
///                             out, producing "{0,}".
/// @param hasCJK               Switches the gap to count arbitrary characters
///                             instead of whole words and drops the per-word
///                             wrapper entirely.
/// @param ignoreWordsOrder     Wraps every word after the first in an
///                             optional group "( ... )?".
/// @return The assembled pattern text.
static QString makeHiliteRegExpString( QStringList const & words,
                                       int searchMode,
                                       int distanceBetweenWords,
                                       bool hasCJK = false,
                                       bool ignoreWordsOrder = false )
{
  // Gap expression placed between two consecutive words.
  QString gap( hasCJK ? "(?:[\\W\\w]){0," : "(?:\\W+\\w+){0," );
  if ( distanceBetweenWords >= 0 ) {
    gap += QString::number( distanceBetweenWords );
  }
  gap += "}";
  if ( !hasCJK ) {
    // Word-based gaps need a trailing separator before the next word.
    gap += "\\W+";
  }

  // Wrapper put on both sides of every word; stays empty for CJK text,
  // which gets no boundary.
  QString edge;
  if ( !hasCJK ) {
    edge = ( searchMode == FTS::WholeWords ) ? "\\b" : "(?:\\w*)";
  }

  QString pattern( "(" );
  bool isFirst = true;
  for ( QString const & word : words ) {
    // Only words after the first get a gap and, optionally, an optional group.
    bool const optionalGroup = !isFirst && ignoreWordsOrder;

    if ( !isFirst ) {
      pattern += gap;
    }
    if ( optionalGroup ) {
      pattern += "(";
    }

    pattern += edge + word + edge;

    if ( optionalGroup ) {
      pattern += ")?";
    }
    isFirst = false;
  }
  pattern += ")";

  return pattern;
}
void tokenizeCJK( QStringList & indexWords, QRegularExpression wordRegExp, QStringList list )
{

View file

@ -25,28 +25,30 @@ void Indexing::run()
{
try {
timerThread->start();
// First iteration - dictionaries with no more MaxDictionarySizeForFastSearch articles
for ( const auto & dictionary : dictionaries ) {
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
break;
const int parallel_count = GlobalBroadcaster::instance()->getPreference()->fts.parallelThreads;
QSemaphore sem( parallel_count < 1 ? 1 : parallel_count );
if ( dictionary->canFTS() && !dictionary->haveFTSIndex() ) {
emit sendNowIndexingName( QString::fromUtf8( dictionary->getName().c_str() ) );
dictionary->makeFTSIndex( isCancelled, true );
}
QFutureSynchronizer< void > synchronizer;
qDebug() << "starting create the fts with thread:" << parallel_count;
for ( const auto & dictionary : dictionaries ) {
if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) {
// synchronizer.setCancelOnWait( true );
break;
}
// Second iteration - all remaining dictionaries
for ( const auto & dictionary : dictionaries ) {
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
break;
if ( dictionary->canFTS() && !dictionary->haveFTSIndex() ) {
sem.acquire();
QFuture< void > const f = QtConcurrent::run( [ this, &sem, &dictionary ]() {
QSemaphoreReleaser const _( sem );
emit sendNowIndexingName( QString::fromUtf8( dictionary->getName().c_str() ) );
dictionary->makeFTSIndex( isCancelled, false );
} );
synchronizer.addFuture( f );
}
}
qDebug() << "waiting for all the fts creation to finish.";
synchronizer.waitForFinished();
qDebug() << "finished/cancel all the fts creation";
timerThread->quit();
timerThread->wait();
}

View file

@ -7,6 +7,7 @@
#include <QDir>
#include <QFontDatabase>
#include <QMessageBox>
#include <QThread>
#include <QWebEngineProfile>
#include <QWebEngineSettings>
#include <QStyleFactory>
@ -373,6 +374,9 @@ Preferences::Preferences( QWidget * parent, Config::Class & cfg_ ):
ui.allowEpwing->hide();
#endif
ui.maxDictionarySize->setValue( p.fts.maxDictionarySize );
ui.parallelThreads->setMaximum( QThread::idealThreadCount() );
ui.parallelThreads->setValue( p.fts.parallelThreads );
}
void Preferences::buildDisabledTypes( QString & disabledTypes, bool is_checked, QString name )
@ -506,6 +510,7 @@ Config::Preferences Preferences::getPreferences()
p.fts.enabled = ui.ftsGroupBox->isChecked();
p.fts.maxDictionarySize = ui.maxDictionarySize->value();
p.fts.parallelThreads = ui.parallelThreads->value();
p.fts.enablePosition = ui.enablePosition->isChecked();
buildDisabledTypes( p.fts.disabledTypes, ui.allowAard->isChecked(), "AARD" );

View file

@ -1409,6 +1409,37 @@ download page.</string>
<bool>true</bool>
</property>
<layout class="QGridLayout" name="gridLayout_4">
<item row="3" column="1">
<widget class="QCheckBox" name="allowZim">
<property name="text">
<string notr="true">Zim</string>
</property>
</widget>
</item>
<item row="4" column="0">
<widget class="QCheckBox" name="allowMDict">
<property name="text">
<string notr="true">MDict</string>
</property>
</widget>
</item>
<item row="7" column="0" colspan="2">
<widget class="QCheckBox" name="enablePosition">
<property name="toolTip">
<string>Positional information is required to use Xapian's phrase searching and NEAR operator, but the database size will be much bigger. Applies only to new incoming dictionaries.</string>
</property>
<property name="text">
<string>Enable index with positional information</string>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QCheckBox" name="allowBGL">
<property name="text">
<string notr="true">BGL</string>
</property>
</widget>
</item>
<item row="2" column="0">
<widget class="QCheckBox" name="allowDictD">
<property name="text">
@ -1423,24 +1454,10 @@ download page.</string>
</property>
</widget>
</item>
<item row="0" column="1">
<widget class="QCheckBox" name="allowSlob">
<item row="5" column="0">
<widget class="QCheckBox" name="allowSDict">
<property name="text">
<string notr="true">Slob</string>
</property>
</widget>
</item>
<item row="5" column="1">
<widget class="QCheckBox" name="allowGls">
<property name="text">
<string notr="true">GLS</string>
</property>
</widget>
</item>
<item row="0" column="0">
<widget class="QCheckBox" name="allowAard">
<property name="text">
<string notr="true">Aard</string>
<string notr="true">SDict</string>
</property>
</widget>
</item>
@ -1451,27 +1468,6 @@ download page.</string>
</property>
</widget>
</item>
<item row="4" column="0">
<widget class="QCheckBox" name="allowMDict">
<property name="text">
<string notr="true">MDict</string>
</property>
</widget>
</item>
<item row="3" column="1">
<widget class="QCheckBox" name="allowZim">
<property name="text">
<string notr="true">Zim</string>
</property>
</widget>
</item>
<item row="1" column="1">
<widget class="QCheckBox" name="allowStardict">
<property name="text">
<string notr="true">Stardict</string>
</property>
</widget>
</item>
<item row="3" column="0">
<widget class="QCheckBox" name="allowDSL">
<property name="text">
@ -1479,17 +1475,24 @@ download page.</string>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QCheckBox" name="allowBGL">
<item row="5" column="1">
<widget class="QCheckBox" name="allowGls">
<property name="text">
<string notr="true">BGL</string>
<string notr="true">GLS</string>
</property>
</widget>
</item>
<item row="5" column="0">
<widget class="QCheckBox" name="allowSDict">
<item row="0" column="1">
<widget class="QCheckBox" name="allowSlob">
<property name="text">
<string notr="true">SDict</string>
<string notr="true">Slob</string>
</property>
</widget>
</item>
<item row="1" column="1">
<widget class="QCheckBox" name="allowStardict">
<property name="text">
<string notr="true">Stardict</string>
</property>
</widget>
</item>
@ -1534,16 +1537,31 @@ download page.</string>
</item>
</layout>
</item>
<item row="7" column="0" colspan="2">
<widget class="QCheckBox" name="enablePosition">
<property name="toolTip">
<string>Positional information is required to use Xapian's phrase searching and NEAR operator, but the database size will be much bigger. Applies only to new incoming dictionaries.</string>
</property>
<item row="0" column="0">
<widget class="QCheckBox" name="allowAard">
<property name="text">
<string>Enable index with positional information</string>
<string notr="true">Aard</string>
</property>
</widget>
</item>
<item row="8" column="0">
<layout class="QHBoxLayout" name="horizontalLayout_18">
<item>
<widget class="QLabel" name="label_7">
<property name="text">
<string>Create fulltext index with parallel threads </string>
</property>
</widget>
</item>
<item>
<widget class="QSpinBox" name="parallelThreads">
<property name="minimum">
<number>1</number>
</property>
</widget>
</item>
</layout>
</item>
</layout>
</widget>
</item>