opt: use xapian as headword index for mdx dictionary

This commit is contained in:
Xiao Yi Fang 2024-07-18 17:09:01 +08:00
parent bf19b960fd
commit 1a75bc3e86
3 changed files with 46 additions and 2 deletions

View file

@ -1,6 +1,6 @@
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org> /* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "xapian.h"
#include "btreeidx.hh" #include "btreeidx.hh"
#include "folding.hh" #include "folding.hh"
#include "utf8.hh" #include "utf8.hh"
@ -1020,6 +1020,46 @@ IndexInfo buildIndex( IndexedWords const & indexedWords, File::Index & file )
return IndexInfo( btreeMaxElements, rootOffset ); return IndexInfo( btreeMaxElements, rootOffset );
} }
/// Builds a Xapian headword index from @p indexedWords in the database
/// directory @p file.
///
/// One Xapian document is added per (headword, article link) pair: the
/// headword text is fed through a Xapian::TermGenerator (with CJK n-gram
/// generation enabled) and the link's articleOffset is stored, as decimal
/// text, in the document data.
///
/// The database is first written to "<file>_temp", then compacted into
/// @p file, after which the temporary directory is removed.
///
/// Xapian errors are logged via qWarning() and are not propagated to the
/// caller; in that case the temporary directory is removed as well.
///
/// This is a free function in the enclosing namespace, matching its
/// declaration in the header and the way callers invoke it.
void buildXapianIndex( IndexedWords const & indexedWords, string file )
{
  const string tempPath = file + "_temp";

  try {
    // Always start from an empty scratch database: a leftover "<file>_temp"
    // from an interrupted run must not be appended to, otherwise every
    // headword would be indexed twice.
    Xapian::WritableDatabase db( tempPath, Xapian::DB_CREATE_OR_OVERWRITE );

    // No stemmer is configured on the term generator; only CJK n-gram
    // generation is switched on.
    Xapian::TermGenerator indexer;
    indexer.set_flags( Xapian::TermGenerator::FLAG_CJK_NGRAM );

    for ( const auto & [ word, articleLinks ] : indexedWords ) {
      for ( const auto & articleLink : articleLinks ) {
        Xapian::Document doc;
        indexer.set_document( doc );
        indexer.index_text( word );
        // The document payload is the article offset, so a search hit can be
        // mapped back to its article.
        doc.set_data( std::to_string( articleLink.articleOffset ) );
        db.add_document( doc );
      }
    }

    db.commit();
    // Write the compacted result to its final location, then release the
    // scratch database before deleting its directory.
    db.compact( file );
    db.close();
    Utils::Fs::removeDirectory( tempPath );
  }
  catch ( const Xapian::Error & e ) {
    qWarning() << "create xapian headword index:" << QString::fromStdString( e.get_description() );
    // The database object has already been destroyed by stack unwinding, so
    // the half-written scratch directory can be cleaned up here.
    Utils::Fs::removeDirectory( tempPath );
  }
}
void BtreeIndex::getAllHeadwords( QSet< QString > & headwords ) void BtreeIndex::getAllHeadwords( QSet< QString > & headwords )
{ {
if ( !idxFile ) if ( !idxFile )

View file

@ -269,6 +269,8 @@ struct IndexedWords: public map< string, vector< WordArticleLink > >
/// position. /// position.
IndexInfo buildIndex( IndexedWords const &, File::Index & file ); IndexInfo buildIndex( IndexedWords const &, File::Index & file );
void buildXapianIndex( IndexedWords const &, string file );
} // namespace BtreeIndexing } // namespace BtreeIndexing
#endif #endif

View file

@ -57,7 +57,7 @@ using namespace Mdict;
enum { enum {
kSignature = 0x4349444d, // MDIC kSignature = 0x4349444d, // MDIC
kCurrentFormatVersion = 11 + BtreeIndexing::FormatVersion + Folding::Version kCurrentFormatVersion = 12 + BtreeIndexing::FormatVersion + Folding::Version
}; };
DEF_EX( exCorruptDictionary, "dictionary file was tampered or corrupted", std::exception ) DEF_EX( exCorruptDictionary, "dictionary file was tampered or corrupted", std::exception )
@ -1342,6 +1342,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
} }
File::Index idx( indexFile, "wb" ); File::Index idx( indexFile, "wb" );
auto headIndexFile = indexFile+".head";
IdxHeader idxHeader; IdxHeader idxHeader;
memset( &idxHeader, 0, sizeof( idxHeader ) ); memset( &idxHeader, 0, sizeof( idxHeader ) );
// We write a dummy header first. At the end of the process the header // We write a dummy header first. At the end of the process the header
@ -1411,6 +1412,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
GD_DPRINTF( "Writing index...\n" ); GD_DPRINTF( "Writing index...\n" );
BtreeIndexing::buildXapianIndex( indexedWords, headIndexFile );
// Good. Now build the index // Good. Now build the index
IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx ); IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements; idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;