From 6a34804df230b2f8c4434206633855a5ba186698 Mon Sep 17 00:00:00 2001 From: Xiao YiFang Date: Sat, 1 Oct 2022 21:57:55 +0800 Subject: [PATCH] opt: add xapian support opt: add xapian fullindex support --- dictionary.cc | 13 ++- file.cc | 1 + ftshelpers.cc | 213 +++++++++++++++++++++++++++++++++++++++++++++- ftshelpers.hh | 8 +- fulltextsearch.cc | 33 +++++++ fulltextsearch.ui | 59 ++++++------- goldendict.pro | 5 ++ 7 files changed, 295 insertions(+), 37 deletions(-) diff --git a/dictionary.cc b/dictionary.cc index 3a638a76..a00de8e3 100644 --- a/dictionary.cc +++ b/dictionary.cc @@ -567,13 +567,24 @@ bool needToRebuildIndex( vector< string > const & dictionaryFiles, if ( ts > lastModified ) lastModified = ts; } - +#ifndef USE_XAPIAN + QDir d(FsEncoding::decode( indexFile.c_str() )); + if(d.exists()){ + d.removeRecursively(); + } QFileInfo fileInfo( FsEncoding::decode( indexFile.c_str() ) ); if ( !fileInfo.exists() ) return true; return fileInfo.lastModified().toSecsSinceEpoch() < lastModified; +#else + QDir d(FsEncoding::decode( indexFile.c_str() )); + if(!d.exists()){ + return true; + } + return false; +#endif } QString generateRandomDictionaryId() diff --git a/file.cc b/file.cc index dabc8853..a0790654 100644 --- a/file.cc +++ b/file.cc @@ -114,6 +114,7 @@ void Class::open( char const * filename, char const * mode ) f.setFileName( FsEncoding::decode( filename ) ); + //maybe directory, the xapian use directory to store the index. if ( !f.open( openMode ) ) throw exCantOpen( std::string( filename ) + ": " + f.errorString().toUtf8().data() ); } diff --git a/ftshelpers.cc b/ftshelpers.cc index 76eb07ce..9b52c5c4 100644 --- a/ftshelpers.cc +++ b/ftshelpers.cc @@ -1,6 +1,9 @@ /* This file is (c) 2014 Abs62 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ - +#ifdef USE_XAPIAN +#include "xapian.h" +#include +#endif #include "fulltextsearch.hh" #include "ftshelpers.hh" #include "wstring_qt.hh" @@ -33,6 +36,25 @@ namespace FtsHelpers bool ftsIndexIsOldOrBad( string const & indexFile, BtreeIndexing::BtreeDictionary * dict ) { +#ifdef USE_XAPIAN + try + { + Xapian::WritableDatabase db( dict->ftsIndexName() ); + } + catch( const Xapian::Error & e ) + { + qWarning() << e.get_description().c_str(); + //the file is corrupted,remove it. + QFile::remove(QString::fromStdString(dict->ftsIndexName())); + return true; + } + catch( ... ) + { + return true; + } + return false; +#endif + File::Class idx( indexFile, "rb" ); FtsIdxHeader header; @@ -321,8 +343,6 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText, words[ word ].push_back( articleAddress );*/ } } - - } { @@ -337,6 +357,10 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText, void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled ) { +#ifdef USE_XAPIAN + return makeFTSIndexXapian(dict,isCancelled); +#endif + Mutex::Lock _( dict->getFtsMutex() ); if( Utils::AtomicInt::loadAcquire( isCancelled ) ) @@ -466,6 +490,83 @@ void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancell ftsIdx.writeRecords( &ftsIdxHeader, sizeof(ftsIdxHeader), 1 ); } +// use xapian to create the index +#ifdef USE_XAPIAN +void makeFTSIndexXapian( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled ) +{ + Mutex::Lock _( dict->getFtsMutex() ); + + try { + if( Utils::AtomicInt::loadAcquire( isCancelled ) ) + throw exUserAbort(); + + // Open the database for update, creating a new database if necessary. + Xapian::WritableDatabase db(dict->ftsIndexName(), Xapian::DB_CREATE_OR_OPEN); + + Xapian::TermGenerator indexer; + Xapian::Stem stemmer("english"); + indexer.set_stemmer(stemmer); + indexer.set_stemming_strategy(indexer.STEM_SOME_FULL_POS); + + BtreeIndexing::IndexedWords indexedWords; + + QSet< uint32_t > setOfOffsets; + setOfOffsets.reserve( dict->getArticleCount() ); + + dict->findArticleLinks( 0, &setOfOffsets, 0, &isCancelled ); + + if( Utils::AtomicInt::loadAcquire( isCancelled ) ) + throw exUserAbort(); + + QVector< uint32_t > offsets; + offsets.resize( setOfOffsets.size() ); + uint32_t * ptr = &offsets.front(); + + for( QSet< uint32_t >::ConstIterator it = setOfOffsets.constBegin(); + it != setOfOffsets.constEnd(); ++it ) + { + *ptr = *it; + ptr++; + } + + // Free memory + setOfOffsets.clear(); + + if( Utils::AtomicInt::loadAcquire( isCancelled ) ) + throw exUserAbort(); + + dict->sortArticlesOffsetsForFTS( offsets, isCancelled ); + + for( auto & address : offsets ) + { + if( Utils::AtomicInt::loadAcquire( isCancelled ) ) + { + return; + } + + QString headword, articleStr; + + dict->getArticleText( address, headword, articleStr ); + + Xapian::Document doc; + + indexer.set_document( doc ); + indexer.index_text( articleStr.toStdString() ); + doc.add_boolean_term( std::to_string( address ) ); + doc.set_data( std::to_string( address ) ); + // Add the document to the database. + db.add_document( doc ); + } + // Free memory + offsets.clear(); + + db.commit(); + } catch (Xapian::Error & e) { + qWarning()< offsetsForHeadwords; + for( Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i ) + { + qDebug() << i.get_rank() + 1 << ": " << i.get_weight() << " docid=" << *i << " [" + << i.get_document().get_data().c_str() << "]"; + offsetsForHeadwords.append( atoi( i.get_document().get_data().c_str() ) ); + } + + if( !offsetsForHeadwords.isEmpty() ) + { + QVector< QString > headwords; + Mutex::Lock _( dataMutex ); + QString id = QString::fromUtf8( dict.getId().c_str() ); + dict.getHeadwordsFromOffsets( offsetsForHeadwords, headwords, &isCancelled ); + for( int x = 0; x < headwords.size(); x++ ) + { + foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ), id, QStringList(), matchCase ) ); + } + } + } + else + { + QStringList indexWords, searchWords; + QRegExp searchRegExp; + if( !FtsHelpers::parseSearchString( searchString, indexWords, searchWords, searchRegExp, + searchMode, matchCase, distanceBetweenWords, hasCJK, ignoreWordsOrder ) ) + { + finish(); + return; + } + fullSearch( searchWords, searchRegExp ); + } + + if( foundHeadwords && foundHeadwords->size() > 0 ) + { + Mutex::Lock _( dataMutex ); + data.resize( sizeof( foundHeadwords ) ); + memcpy( &data.front(), &foundHeadwords, sizeof( foundHeadwords ) ); + foundHeadwords = 0; + hasAnyData = true; + } + } + catch (const Xapian::Error &e) { + qWarning() << e.get_description().c_str(); + } + catch( std::exception &ex ) + { + gdWarning( "FTS: Failed full-text search for \"%s\", reason: %s\n", + dict.getName().c_str(), ex.what() ); + // Results not loaded -- we don't set the hasAnyData flag then + } + + finish(); +} +#endif + } // namespace diff --git a/ftshelpers.hh b/ftshelpers.hh index cf02a06c..d91833b0 100644 --- a/ftshelpers.hh +++ b/ftshelpers.hh @@ -64,7 +64,9 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText, bool handleRoundBrackets = false ); void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled ); - +#ifdef USE_XAPIAN +void makeFTSIndexXapian( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled ); +#endif bool isCJKChar( ushort ch ); class FTSResultsRequest : public Dictionary::DataRequest @@ -142,7 +144,9 @@ public: } void run(); - + #ifdef USE_XAPIAN + void runXapian(); + #endif virtual void cancel() { isCancelled.ref(); diff --git a/fulltextsearch.cc b/fulltextsearch.cc index 002f5f95..dde15853 100644 --- a/fulltextsearch.cc +++ b/fulltextsearch.cc @@ -225,7 +225,19 @@ FullTextSearchDialog::FullTextSearchDialog( QWidget * parent, ui.searchMode->addItem( tr( "Whole words" ), WholeWords ); ui.searchMode->addItem( tr( "Plain text"), PlainText ); ui.searchMode->addItem( tr( "Wildcards" ), Wildcards ); +#ifndef USE_XAPIAN ui.searchMode->addItem( tr( "RegExp" ), RegExp ); +#else + ui.matchCase->hide(); + ui.articlesPerDictionary->hide(); + ui.checkBoxArticlesPerDictionary->hide(); + ui.checkBoxIgnoreDiacritics->hide(); + ui.checkBoxDistanceBetweenWords->hide(); + ui.distanceBetweenWords->hide(); + ui.checkBoxIgnoreWordOrder->hide(); + + ui.searchLine->setToolTip(tr("support xapian search syntax,such as AND OR +/- etc")); +#endif ui.searchMode->setCurrentIndex( cfg.preferences.fts.searchMode ); ui.searchProgressBar->hide(); @@ -550,6 +562,26 @@ void FullTextSearchDialog::itemClicked( const QModelIndex & idx ) { QString headword = results[ idx.row() ].headword; QRegExp reg; +#ifdef USE_XAPIAN + auto searchText = ui.searchLine->text(); + searchText.replace( + QRegularExpression( "[\\*\\?\\+\\\"]|\\bAnd\\b|\\bOR\\b", QRegularExpression::CaseInsensitiveOption ), + " " ); + auto parts = searchText.split( QRegularExpression( "\\s" ), Qt::SkipEmptyParts ); + QString firstAvailbeItem; + for( auto & p : parts ) + { + if( p.startsWith( '-' ) ) + continue; + firstAvailbeItem = p; + break; + } + if( !firstAvailbeItem.isEmpty() ) + { + reg = QRegExp( firstAvailbeItem, Qt::CaseInsensitive, QRegExp::RegExp2 ); + reg.setMinimal( true ); + } +#else if( !results[ idx.row() ].foundHiliteRegExps.isEmpty() ) { reg = QRegExp( results[ idx.row() ].foundHiliteRegExps.join( "|"), @@ -559,6 +591,7 @@ void FullTextSearchDialog::itemClicked( const QModelIndex & idx ) } else reg = searchRegExp; +#endif emit showTranslationFor( headword, results[ idx.row() ].dictIDs, reg, ignoreDiacritics ); } } diff --git a/fulltextsearch.ui b/fulltextsearch.ui index ffc9908e..1a574365 100644 --- a/fulltextsearch.ui +++ b/fulltextsearch.ui @@ -6,7 +6,7 @@ 0 0 - 492 + 562 593 @@ -27,10 +27,33 @@ - + + + + + + + + Mode: + + + + + + + - + + + QLayout::SetMinAndMaxSize + + + + + + + @@ -38,26 +61,6 @@ - - - - - - Mode: - - - - - - - - - - - - - - @@ -72,18 +75,14 @@ - - - - - + Ignore words order - + Ignore diacritics @@ -272,11 +271,9 @@ - searchLine headwordsView checkBoxDistanceBetweenWords distanceBetweenWords - searchMode checkBoxArticlesPerDictionary articlesPerDictionary matchCase diff --git a/goldendict.pro b/goldendict.pro index 43a26419..9ae1d694 100644 --- a/goldendict.pro +++ b/goldendict.pro @@ -56,6 +56,11 @@ DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x050F00 DEFINES += MAKE_FFMPEG_PLAYER } +CONFIG( use_xapian ) { + DEFINES += USE_XAPIAN + LIBS+= -lxapian +} + CONFIG += exceptions \ rtti \ stl \