opt: add xapian support

opt: add xapian fullindex support
This commit is contained in:
Xiao YiFang 2022-10-01 21:57:55 +08:00 committed by Xiao YiFang
parent 0ce1ff8566
commit 6a34804df2
7 changed files with 295 additions and 37 deletions

View file

@ -567,13 +567,24 @@ bool needToRebuildIndex( vector< string > const & dictionaryFiles,
if ( ts > lastModified ) if ( ts > lastModified )
lastModified = ts; lastModified = ts;
} }
#ifndef USE_XAPIAN
QDir d(FsEncoding::decode( indexFile.c_str() ));
if(d.exists()){
d.removeRecursively();
}
QFileInfo fileInfo( FsEncoding::decode( indexFile.c_str() ) ); QFileInfo fileInfo( FsEncoding::decode( indexFile.c_str() ) );
if ( !fileInfo.exists() ) if ( !fileInfo.exists() )
return true; return true;
return fileInfo.lastModified().toSecsSinceEpoch() < lastModified; return fileInfo.lastModified().toSecsSinceEpoch() < lastModified;
#else
QDir d(FsEncoding::decode( indexFile.c_str() ));
if(!d.exists()){
return true;
}
return false;
#endif
} }
QString generateRandomDictionaryId() QString generateRandomDictionaryId()

View file

@ -114,6 +114,7 @@ void Class::open( char const * filename, char const * mode )
f.setFileName( FsEncoding::decode( filename ) ); f.setFileName( FsEncoding::decode( filename ) );
//maybe directory, the xapian use directory to store the index.
if ( !f.open( openMode ) ) if ( !f.open( openMode ) )
throw exCantOpen( std::string( filename ) + ": " + f.errorString().toUtf8().data() ); throw exCantOpen( std::string( filename ) + ": " + f.errorString().toUtf8().data() );
} }

View file

@ -1,6 +1,9 @@
/* This file is (c) 2014 Abs62 /* This file is (c) 2014 Abs62
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#ifdef USE_XAPIAN
#include "xapian.h"
#include <stdlib.h>
#endif
#include "fulltextsearch.hh" #include "fulltextsearch.hh"
#include "ftshelpers.hh" #include "ftshelpers.hh"
#include "wstring_qt.hh" #include "wstring_qt.hh"
@ -33,6 +36,25 @@ namespace FtsHelpers
bool ftsIndexIsOldOrBad( string const & indexFile, bool ftsIndexIsOldOrBad( string const & indexFile,
BtreeIndexing::BtreeDictionary * dict ) BtreeIndexing::BtreeDictionary * dict )
{ {
#ifdef USE_XAPIAN
try
{
Xapian::WritableDatabase db( dict->ftsIndexName() );
}
catch( const Xapian::Error & e )
{
qWarning() << e.get_description().c_str();
//the file is corrupted,remove it.
QFile::remove(QString::fromStdString(dict->ftsIndexName()));
return true;
}
catch( ... )
{
return true;
}
return false;
#endif
File::Class idx( indexFile, "rb" ); File::Class idx( indexFile, "rb" );
FtsIdxHeader header; FtsIdxHeader header;
@ -321,8 +343,6 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
words[ word ].push_back( articleAddress );*/ words[ word ].push_back( articleAddress );*/
} }
} }
} }
{ {
@ -337,6 +357,10 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled ) void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled )
{ {
#ifdef USE_XAPIAN
return makeFTSIndexXapian(dict,isCancelled);
#endif
Mutex::Lock _( dict->getFtsMutex() ); Mutex::Lock _( dict->getFtsMutex() );
if( Utils::AtomicInt::loadAcquire( isCancelled ) ) if( Utils::AtomicInt::loadAcquire( isCancelled ) )
@ -466,6 +490,83 @@ void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancell
ftsIdx.writeRecords( &ftsIdxHeader, sizeof(ftsIdxHeader), 1 ); ftsIdx.writeRecords( &ftsIdxHeader, sizeof(ftsIdxHeader), 1 );
} }
// use xapian to create the index
#ifdef USE_XAPIAN
/// Builds the Xapian full-text index for @p dict.
///
/// Every article offset collected through dict->findArticleLinks() becomes one
/// Xapian document: the article text is fed to the term generator, and the
/// offset (as a decimal string) is stored both as a boolean term and as the
/// document data so that a search can map a hit back to its article.
///
/// @param dict         dictionary to index; the database is created or opened
///                     at dict->ftsIndexName().
/// @param isCancelled  cancellation flag, polled between the stages and before
///                     every article.
/// @throws exUserAbort whenever cancellation is observed.
///
/// Xapian errors are only logged with qWarning(); they are not propagated.
void makeFTSIndexXapian( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled )
{
  Mutex::Lock _( dict->getFtsMutex() );

  try
  {
    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      throw exUserAbort();

    // Open the database for update, creating a new database if necessary.
    Xapian::WritableDatabase db( dict->ftsIndexName(), Xapian::DB_CREATE_OR_OPEN );

    // The stemmer language is fixed to English regardless of the dictionary.
    Xapian::TermGenerator indexer;
    Xapian::Stem stemmer( "english" );
    indexer.set_stemmer( stemmer );
    indexer.set_stemming_strategy( indexer.STEM_SOME_FULL_POS );

    QSet< uint32_t > setOfOffsets;
    setOfOffsets.reserve( dict->getArticleCount() );

    dict->findArticleLinks( 0, &setOfOffsets, 0, &isCancelled );

    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      throw exUserAbort();

    // Copy the offsets into a vector. append() is used instead of writing
    // through &offsets.front(), which is invalid when the set is empty.
    QVector< uint32_t > offsets;
    offsets.reserve( setOfOffsets.size() );
    for( QSet< uint32_t >::ConstIterator it = setOfOffsets.constBegin();
         it != setOfOffsets.constEnd(); ++it )
    {
      offsets.append( *it );
    }

    // Free memory
    setOfOffsets.clear();

    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      throw exUserAbort();

    dict->sortArticlesOffsetsForFTS( offsets, isCancelled );

    for( auto & address : offsets )
    {
      // Abort the same way as the checks above. Returning normally here would
      // make a cancelled (partial) build indistinguishable from a finished one.
      // NOTE(review): documents added so far may still be flushed when `db` is
      // destroyed -- confirm the caller discards the index on exUserAbort.
      if( Utils::AtomicInt::loadAcquire( isCancelled ) )
        throw exUserAbort();

      QString headword, articleStr;

      dict->getArticleText( address, headword, articleStr );

      Xapian::Document doc;

      indexer.set_document( doc );
      indexer.index_text( articleStr.toStdString() );

      // The article offset is what the search side reads back from the hit.
      doc.add_boolean_term( std::to_string( address ) );
      doc.set_data( std::to_string( address ) );

      // Add the document to the database.
      db.add_document( doc );
    }

    // Free memory
    offsets.clear();

    db.commit();
  }
  catch( const Xapian::Error & e )
  {
    qWarning() << QString::fromStdString( e.get_description() );
  }
}
#endif
bool isCJKChar( ushort ch ) bool isCJKChar( ushort ch )
{ {
return Utils::isCJKChar(ch); return Utils::isCJKChar(ch);
@ -1056,6 +1157,10 @@ void FTSResultsRequest::fullSearch( QStringList & searchWords, QRegExp & regexp
void FTSResultsRequest::run() void FTSResultsRequest::run()
{ {
#ifdef USE_XAPIAN
return runXapian();
#endif
if ( dict.ensureInitDone().size() ) if ( dict.ensureInitDone().size() )
{ {
setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) ); setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
@ -1131,5 +1236,107 @@ void FTSResultsRequest::run()
finish(); finish();
} }
#ifdef USE_XAPIAN
/// Runs a full-text search for searchString using the dictionary's Xapian index.
///
/// When dict.haveFTSIndex() is true, the search string is handed to
/// Xapian::QueryParser unchanged, the top 100 matches are fetched and the
/// article offset stored in each matching document is resolved to a headword.
/// Otherwise the search string is parsed with FtsHelpers::parseSearchString()
/// and fullSearch() is used instead.
///
/// Found headwords are published through `data` / `hasAnyData`. Xapian and
/// std exceptions are logged and not propagated. finish() is called on every
/// path.
void FTSResultsRequest::runXapian()
{
  if ( dict.ensureInitDone().size() )
  {
    setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
    finish();
    return;
  }

  try
  {
    if( dict.haveFTSIndex() )
    {
      //no need to parse the search string, use xapian directly.
      //if the search mode is wildcard, change xapian search query flag?

      // Open the database for searching.
      Xapian::Database db( dict.ftsIndexName() );

      // Start an enquire session.
      Xapian::Enquire enquire( db );

      // The user's search string is passed to the query parser as typed.
      string query_string( searchString.toStdString() );

      // Parse the query string to produce a Xapian::Query object.
      Xapian::QueryParser qp;
      Xapian::Stem stemmer( "english" );
      qp.set_stemmer( stemmer );
      qp.set_database( db );
      qp.set_stemming_strategy( Xapian::QueryParser::STEM_SOME );

      Xapian::QueryParser::feature_flag flag = Xapian::QueryParser::FLAG_DEFAULT;
      if( searchMode == FTS::Wildcards )
        flag = Xapian::QueryParser::FLAG_WILDCARD;

      Xapian::Query query = qp.parse_query( query_string, flag );
      qDebug() << "Parsed query is: " << query.get_description().c_str();

      // Find the top 100 results for the query.
      enquire.set_query( query );
      Xapian::MSet matches = enquire.get_mset( 0, 100 );

      // Display the results.
      qDebug() << matches.get_matches_estimated() << " results found.\n";
      qDebug() << "Matches 1-" << matches.size() << ":\n\n";

      QList< uint32_t > offsetsForHeadwords;
      for( Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i )
      {
        // Fetch the document payload once; it is read here as the decimal
        // article offset.
        const std::string offsetText = i.get_document().get_data();

        qDebug() << i.get_rank() + 1 << ": " << i.get_weight() << " docid=" << *i << " ["
                 << offsetText.c_str() << "]";

        // Offsets are uint32_t: parse as unsigned, because atoi() is undefined
        // for values above INT_MAX.
        offsetsForHeadwords.append(
          static_cast< uint32_t >( strtoul( offsetText.c_str(), nullptr, 10 ) ) );
      }

      if( !offsetsForHeadwords.isEmpty() )
      {
        QVector< QString > headwords;
        Mutex::Lock _( dataMutex );
        QString id = QString::fromUtf8( dict.getId().c_str() );
        dict.getHeadwordsFromOffsets( offsetsForHeadwords, headwords, &isCancelled );
        for( int x = 0; x < headwords.size(); x++ )
        {
          foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ), id, QStringList(), matchCase ) );
        }
      }
    }
    else
    {
      // No index available: fall back to parsing the search string and
      // running fullSearch().
      QStringList indexWords, searchWords;
      QRegExp searchRegExp;

      if( !FtsHelpers::parseSearchString( searchString, indexWords, searchWords, searchRegExp,
                                          searchMode, matchCase, distanceBetweenWords, hasCJK, ignoreWordsOrder ) )
      {
        finish();
        return;
      }

      fullSearch( searchWords, searchRegExp );
    }

    if( foundHeadwords && foundHeadwords->size() > 0 )
    {
      Mutex::Lock _( dataMutex );
      // The result is handed over by copying the pointer value itself into
      // `data`; the member is then reset to 0.
      data.resize( sizeof( foundHeadwords ) );
      memcpy( &data.front(), &foundHeadwords, sizeof( foundHeadwords ) );
      foundHeadwords = 0;
      hasAnyData = true;
    }
  }
  catch( const Xapian::Error & e )
  {
    qWarning() << e.get_description().c_str();
  }
  catch( const std::exception & ex )
  {
    gdWarning( "FTS: Failed full-text search for \"%s\", reason: %s\n",
               dict.getName().c_str(), ex.what() );
    // Results not loaded -- we don't set the hasAnyData flag then
  }

  finish();
}
#endif
} // namespace } // namespace

View file

@ -64,7 +64,9 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
bool handleRoundBrackets = false ); bool handleRoundBrackets = false );
void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled ); void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled );
#ifdef USE_XAPIAN
void makeFTSIndexXapian( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled );
#endif
bool isCJKChar( ushort ch ); bool isCJKChar( ushort ch );
class FTSResultsRequest : public Dictionary::DataRequest class FTSResultsRequest : public Dictionary::DataRequest
@ -142,7 +144,9 @@ public:
} }
void run(); void run();
#ifdef USE_XAPIAN
void runXapian();
#endif
virtual void cancel() virtual void cancel()
{ {
isCancelled.ref(); isCancelled.ref();

View file

@ -225,7 +225,19 @@ FullTextSearchDialog::FullTextSearchDialog( QWidget * parent,
ui.searchMode->addItem( tr( "Whole words" ), WholeWords ); ui.searchMode->addItem( tr( "Whole words" ), WholeWords );
ui.searchMode->addItem( tr( "Plain text"), PlainText ); ui.searchMode->addItem( tr( "Plain text"), PlainText );
ui.searchMode->addItem( tr( "Wildcards" ), Wildcards ); ui.searchMode->addItem( tr( "Wildcards" ), Wildcards );
#ifndef USE_XAPIAN
ui.searchMode->addItem( tr( "RegExp" ), RegExp ); ui.searchMode->addItem( tr( "RegExp" ), RegExp );
#else
ui.matchCase->hide();
ui.articlesPerDictionary->hide();
ui.checkBoxArticlesPerDictionary->hide();
ui.checkBoxIgnoreDiacritics->hide();
ui.checkBoxDistanceBetweenWords->hide();
ui.distanceBetweenWords->hide();
ui.checkBoxIgnoreWordOrder->hide();
ui.searchLine->setToolTip(tr("support xapian search syntax,such as AND OR +/- etc"));
#endif
ui.searchMode->setCurrentIndex( cfg.preferences.fts.searchMode ); ui.searchMode->setCurrentIndex( cfg.preferences.fts.searchMode );
ui.searchProgressBar->hide(); ui.searchProgressBar->hide();
@ -550,6 +562,26 @@ void FullTextSearchDialog::itemClicked( const QModelIndex & idx )
{ {
QString headword = results[ idx.row() ].headword; QString headword = results[ idx.row() ].headword;
QRegExp reg; QRegExp reg;
#ifdef USE_XAPIAN
auto searchText = ui.searchLine->text();
searchText.replace(
QRegularExpression( "[\\*\\?\\+\\\"]|\\bAnd\\b|\\bOR\\b", QRegularExpression::CaseInsensitiveOption ),
" " );
auto parts = searchText.split( QRegularExpression( "\\s" ), Qt::SkipEmptyParts );
QString firstAvailbeItem;
for( auto & p : parts )
{
if( p.startsWith( '-' ) )
continue;
firstAvailbeItem = p;
break;
}
if( !firstAvailbeItem.isEmpty() )
{
reg = QRegExp( firstAvailbeItem, Qt::CaseInsensitive, QRegExp::RegExp2 );
reg.setMinimal( true );
}
#else
if( !results[ idx.row() ].foundHiliteRegExps.isEmpty() ) if( !results[ idx.row() ].foundHiliteRegExps.isEmpty() )
{ {
reg = QRegExp( results[ idx.row() ].foundHiliteRegExps.join( "|"), reg = QRegExp( results[ idx.row() ].foundHiliteRegExps.join( "|"),
@ -559,6 +591,7 @@ void FullTextSearchDialog::itemClicked( const QModelIndex & idx )
} }
else else
reg = searchRegExp; reg = searchRegExp;
#endif
emit showTranslationFor( headword, results[ idx.row() ].dictIDs, reg, ignoreDiacritics ); emit showTranslationFor( headword, results[ idx.row() ].dictIDs, reg, ignoreDiacritics );
} }
} }

View file

@ -6,7 +6,7 @@
<rect> <rect>
<x>0</x> <x>0</x>
<y>0</y> <y>0</y>
<width>492</width> <width>562</width>
<height>593</height> <height>593</height>
</rect> </rect>
</property> </property>
@ -27,10 +27,33 @@
</property> </property>
<layout class="QVBoxLayout" name="verticalLayout"> <layout class="QVBoxLayout" name="verticalLayout">
<item> <item>
<widget class="QLineEdit" name="searchLine"/> <layout class="QHBoxLayout" name="horizontalLayout_5">
<item>
<widget class="QLineEdit" name="searchLine"/>
</item>
<item>
<widget class="QLabel" name="label_8">
<property name="text">
<string>Mode:</string>
</property>
</widget>
</item>
<item>
<widget class="QComboBox" name="searchMode"/>
</item>
</layout>
</item> </item>
<item> <item>
<layout class="QGridLayout" name="gridLayout_2"> <layout class="QGridLayout" name="fulltext_option_container">
<property name="sizeConstraint">
<enum>QLayout::SetMinAndMaxSize</enum>
</property>
<item row="1" column="1">
<widget class="QSpinBox" name="articlesPerDictionary"/>
</item>
<item row="0" column="1">
<widget class="QSpinBox" name="distanceBetweenWords"/>
</item>
<item row="1" column="2"> <item row="1" column="2">
<widget class="QCheckBox" name="matchCase"> <widget class="QCheckBox" name="matchCase">
<property name="text"> <property name="text">
@ -38,26 +61,6 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="0" column="2">
<layout class="QHBoxLayout" name="horizontalLayout_2">
<item>
<widget class="QLabel" name="label_8">
<property name="text">
<string>Mode:</string>
</property>
</widget>
</item>
<item>
<widget class="QComboBox" name="searchMode"/>
</item>
</layout>
</item>
<item row="1" column="1">
<widget class="QSpinBox" name="articlesPerDictionary"/>
</item>
<item row="0" column="1">
<widget class="QSpinBox" name="distanceBetweenWords"/>
</item>
<item row="0" column="0"> <item row="0" column="0">
<widget class="QCheckBox" name="checkBoxDistanceBetweenWords"> <widget class="QCheckBox" name="checkBoxDistanceBetweenWords">
<property name="text"> <property name="text">
@ -72,18 +75,14 @@
</property> </property>
</widget> </widget>
</item> </item>
</layout> <item row="2" column="0">
</item>
<item>
<layout class="QHBoxLayout" name="horizontalLayout_4">
<item>
<widget class="QCheckBox" name="checkBoxIgnoreWordOrder"> <widget class="QCheckBox" name="checkBoxIgnoreWordOrder">
<property name="text"> <property name="text">
<string>Ignore words order</string> <string>Ignore words order</string>
</property> </property>
</widget> </widget>
</item> </item>
<item> <item row="2" column="1">
<widget class="QCheckBox" name="checkBoxIgnoreDiacritics"> <widget class="QCheckBox" name="checkBoxIgnoreDiacritics">
<property name="text"> <property name="text">
<string>Ignore diacritics</string> <string>Ignore diacritics</string>
@ -272,11 +271,9 @@
</layout> </layout>
</widget> </widget>
<tabstops> <tabstops>
<tabstop>searchLine</tabstop>
<tabstop>headwordsView</tabstop> <tabstop>headwordsView</tabstop>
<tabstop>checkBoxDistanceBetweenWords</tabstop> <tabstop>checkBoxDistanceBetweenWords</tabstop>
<tabstop>distanceBetweenWords</tabstop> <tabstop>distanceBetweenWords</tabstop>
<tabstop>searchMode</tabstop>
<tabstop>checkBoxArticlesPerDictionary</tabstop> <tabstop>checkBoxArticlesPerDictionary</tabstop>
<tabstop>articlesPerDictionary</tabstop> <tabstop>articlesPerDictionary</tabstop>
<tabstop>matchCase</tabstop> <tabstop>matchCase</tabstop>

View file

@ -56,6 +56,11 @@ DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x050F00
DEFINES += MAKE_FFMPEG_PLAYER DEFINES += MAKE_FFMPEG_PLAYER
} }
CONFIG( use_xapian ) {
DEFINES += USE_XAPIAN
LIBS+= -lxapian
}
CONFIG += exceptions \ CONFIG += exceptions \
rtti \ rtti \
stl \ stl \