mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-27 15:24:05 +00:00
opt: add xapian support
opt: add xapian fullindex support
This commit is contained in:
parent
0ce1ff8566
commit
6a34804df2
|
@ -567,13 +567,24 @@ bool needToRebuildIndex( vector< string > const & dictionaryFiles,
|
|||
if ( ts > lastModified )
|
||||
lastModified = ts;
|
||||
}
|
||||
|
||||
#ifndef USE_XAPIAN
|
||||
QDir d(FsEncoding::decode( indexFile.c_str() ));
|
||||
if(d.exists()){
|
||||
d.removeRecursively();
|
||||
}
|
||||
QFileInfo fileInfo( FsEncoding::decode( indexFile.c_str() ) );
|
||||
|
||||
if ( !fileInfo.exists() )
|
||||
return true;
|
||||
|
||||
return fileInfo.lastModified().toSecsSinceEpoch() < lastModified;
|
||||
#else
|
||||
QDir d(FsEncoding::decode( indexFile.c_str() ));
|
||||
if(!d.exists()){
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
QString generateRandomDictionaryId()
|
||||
|
|
1
file.cc
1
file.cc
|
@ -114,6 +114,7 @@ void Class::open( char const * filename, char const * mode )
|
|||
|
||||
f.setFileName( FsEncoding::decode( filename ) );
|
||||
|
||||
//maybe directory, the xapian use directory to store the index.
|
||||
if ( !f.open( openMode ) )
|
||||
throw exCantOpen( std::string( filename ) + ": " + f.errorString().toUtf8().data() );
|
||||
}
|
||||
|
|
213
ftshelpers.cc
213
ftshelpers.cc
|
@ -1,6 +1,9 @@
|
|||
/* This file is (c) 2014 Abs62
|
||||
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
||||
|
||||
#ifdef USE_XAPIAN
|
||||
#include "xapian.h"
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
#include "fulltextsearch.hh"
|
||||
#include "ftshelpers.hh"
|
||||
#include "wstring_qt.hh"
|
||||
|
@ -33,6 +36,25 @@ namespace FtsHelpers
|
|||
bool ftsIndexIsOldOrBad( string const & indexFile,
|
||||
BtreeIndexing::BtreeDictionary * dict )
|
||||
{
|
||||
#ifdef USE_XAPIAN
|
||||
try
|
||||
{
|
||||
Xapian::WritableDatabase db( dict->ftsIndexName() );
|
||||
}
|
||||
catch( const Xapian::Error & e )
|
||||
{
|
||||
qWarning() << e.get_description().c_str();
|
||||
//the file is corrupted,remove it.
|
||||
QFile::remove(QString::fromStdString(dict->ftsIndexName()));
|
||||
return true;
|
||||
}
|
||||
catch( ... )
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
#endif
|
||||
|
||||
File::Class idx( indexFile, "rb" );
|
||||
|
||||
FtsIdxHeader header;
|
||||
|
@ -321,8 +343,6 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
|
|||
words[ word ].push_back( articleAddress );*/
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
{
|
||||
|
@ -337,6 +357,10 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
|
|||
|
||||
void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled )
|
||||
{
|
||||
#ifdef USE_XAPIAN
|
||||
return makeFTSIndexXapian(dict,isCancelled);
|
||||
#endif
|
||||
|
||||
Mutex::Lock _( dict->getFtsMutex() );
|
||||
|
||||
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
||||
|
@ -466,6 +490,83 @@ void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancell
|
|||
ftsIdx.writeRecords( &ftsIdxHeader, sizeof(ftsIdxHeader), 1 );
|
||||
}
|
||||
|
||||
// use xapian to create the index
|
||||
#ifdef USE_XAPIAN
|
||||
/// Builds the Xapian full-text index for @p dict.
///
/// Collects every article offset of the dictionary, sorts the offsets with
/// BtreeDictionary::sortArticlesOffsetsForFTS() and adds one Xapian document
/// per article. Each document holds the indexed article text, a boolean term
/// with the article offset and the same offset (as decimal text) as its data.
///
/// @param dict        dictionary to index; its FTS mutex is held for the whole call.
/// @param isCancelled cancellation flag polled before and during indexing.
/// @throws exUserAbort when cancellation is detected before the per-article loop.
///
/// Xapian errors are logged with qWarning() and not propagated.
void makeFTSIndexXapian( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled )
{
  Mutex::Lock _( dict->getFtsMutex() );

  try
  {
    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      throw exUserAbort();

    // Open the database for update, creating a new database if necessary.
    Xapian::WritableDatabase db( dict->ftsIndexName(), Xapian::DB_CREATE_OR_OPEN );

    Xapian::TermGenerator indexer;
    Xapian::Stem stemmer( "english" );
    indexer.set_stemmer( stemmer );
    indexer.set_stemming_strategy( indexer.STEM_SOME_FULL_POS );

    QSet< uint32_t > setOfOffsets;
    setOfOffsets.reserve( dict->getArticleCount() );

    dict->findArticleLinks( 0, &setOfOffsets, 0, &isCancelled );

    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      throw exUserAbort();

    // Copy the offsets into a vector. Appending (instead of writing through
    // &offsets.front()) stays valid when the dictionary yields no offsets.
    QVector< uint32_t > offsets;
    offsets.reserve( setOfOffsets.size() );

    for( QSet< uint32_t >::ConstIterator it = setOfOffsets.constBegin();
         it != setOfOffsets.constEnd(); ++it )
    {
      offsets.append( *it );
    }

    // Free memory
    setOfOffsets.clear();

    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      throw exUserAbort();

    dict->sortArticlesOffsetsForFTS( offsets, isCancelled );

    for( auto & address : offsets )
    {
      if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      {
        // NOTE(review): documents added so far stay in `db`; per the Xapian
        // documentation a WritableDatabase commits pending changes when it is
        // destroyed, so a partial index may be left behind -- confirm whether
        // callers discard it.
        return;
      }

      QString headword, articleStr;

      dict->getArticleText( address, headword, articleStr );

      // The article offset identifies the document, both as a boolean term
      // and as the document data read back by the search code.
      std::string const addressText = std::to_string( address );

      Xapian::Document doc;

      indexer.set_document( doc );
      indexer.index_text( articleStr.toStdString() );
      doc.add_boolean_term( addressText );
      doc.set_data( addressText );

      // Add the document to the database.
      db.add_document( doc );
    }

    // Free memory
    offsets.clear();

    db.commit();
  }
  catch( const Xapian::Error & e )
  {
    qWarning() << QString::fromStdString( e.get_description() );
  }
}
|
||||
#endif
|
||||
|
||||
bool isCJKChar( ushort ch )
|
||||
{
|
||||
return Utils::isCJKChar(ch);
|
||||
|
@ -1056,6 +1157,10 @@ void FTSResultsRequest::fullSearch( QStringList & searchWords, QRegExp & regexp
|
|||
|
||||
void FTSResultsRequest::run()
|
||||
{
|
||||
#ifdef USE_XAPIAN
|
||||
return runXapian();
|
||||
#endif
|
||||
|
||||
if ( dict.ensureInitDone().size() )
|
||||
{
|
||||
setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
|
||||
|
@ -1131,5 +1236,107 @@ void FTSResultsRequest::run()
|
|||
finish();
|
||||
}
|
||||
|
||||
#ifdef USE_XAPIAN
|
||||
void FTSResultsRequest::runXapian()
|
||||
{
|
||||
if ( dict.ensureInitDone().size() )
|
||||
{
|
||||
setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
|
||||
finish();
|
||||
return;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
if( dict.haveFTSIndex() )
|
||||
{
|
||||
//no need to parse the search string, use xapian directly.
|
||||
//if the search mode is wildcard, change xapian search query flag?
|
||||
// Open the database for searching.
|
||||
Xapian::Database db(dict.ftsIndexName());
|
||||
|
||||
// Start an enquire session.
|
||||
Xapian::Enquire enquire( db );
|
||||
|
||||
// Combine the rest of the command line arguments with spaces between
|
||||
// them, so that simple queries don't have to be quoted at the shell
|
||||
// level.
|
||||
string query_string( searchString.toStdString() );
|
||||
|
||||
// Parse the query string to produce a Xapian::Query object.
|
||||
Xapian::QueryParser qp;
|
||||
Xapian::Stem stemmer( "english" );
|
||||
qp.set_stemmer( stemmer );
|
||||
qp.set_database( db );
|
||||
qp.set_stemming_strategy( Xapian::QueryParser::STEM_SOME );
|
||||
Xapian::QueryParser::feature_flag flag = Xapian::QueryParser::FLAG_DEFAULT;
|
||||
if( searchMode == FTS::Wildcards )
|
||||
flag = Xapian::QueryParser::FLAG_WILDCARD;
|
||||
Xapian::Query query = qp.parse_query( query_string, flag );
|
||||
qDebug() << "Parsed query is: " << query.get_description().c_str();
|
||||
|
||||
// Find the top 100 results for the query.
|
||||
enquire.set_query( query );
|
||||
Xapian::MSet matches = enquire.get_mset( 0, 100 );
|
||||
|
||||
// Display the results.
|
||||
qDebug() << matches.get_matches_estimated() << " results found.\n";
|
||||
qDebug() << "Matches 1-" << matches.size() << ":\n\n";
|
||||
QList< uint32_t > offsetsForHeadwords;
|
||||
for( Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i )
|
||||
{
|
||||
qDebug() << i.get_rank() + 1 << ": " << i.get_weight() << " docid=" << *i << " ["
|
||||
<< i.get_document().get_data().c_str() << "]";
|
||||
offsetsForHeadwords.append( atoi( i.get_document().get_data().c_str() ) );
|
||||
}
|
||||
|
||||
if( !offsetsForHeadwords.isEmpty() )
|
||||
{
|
||||
QVector< QString > headwords;
|
||||
Mutex::Lock _( dataMutex );
|
||||
QString id = QString::fromUtf8( dict.getId().c_str() );
|
||||
dict.getHeadwordsFromOffsets( offsetsForHeadwords, headwords, &isCancelled );
|
||||
for( int x = 0; x < headwords.size(); x++ )
|
||||
{
|
||||
foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ), id, QStringList(), matchCase ) );
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
QStringList indexWords, searchWords;
|
||||
QRegExp searchRegExp;
|
||||
if( !FtsHelpers::parseSearchString( searchString, indexWords, searchWords, searchRegExp,
|
||||
searchMode, matchCase, distanceBetweenWords, hasCJK, ignoreWordsOrder ) )
|
||||
{
|
||||
finish();
|
||||
return;
|
||||
}
|
||||
fullSearch( searchWords, searchRegExp );
|
||||
}
|
||||
|
||||
if( foundHeadwords && foundHeadwords->size() > 0 )
|
||||
{
|
||||
Mutex::Lock _( dataMutex );
|
||||
data.resize( sizeof( foundHeadwords ) );
|
||||
memcpy( &data.front(), &foundHeadwords, sizeof( foundHeadwords ) );
|
||||
foundHeadwords = 0;
|
||||
hasAnyData = true;
|
||||
}
|
||||
}
|
||||
catch (const Xapian::Error &e) {
|
||||
qWarning() << e.get_description().c_str();
|
||||
}
|
||||
catch( std::exception &ex )
|
||||
{
|
||||
gdWarning( "FTS: Failed full-text search for \"%s\", reason: %s\n",
|
||||
dict.getName().c_str(), ex.what() );
|
||||
// Results not loaded -- we don't set the hasAnyData flag then
|
||||
}
|
||||
|
||||
finish();
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace
|
||||
|
||||
|
|
|
@ -64,7 +64,9 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
|
|||
bool handleRoundBrackets = false );
|
||||
|
||||
void makeFTSIndex( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled );
|
||||
|
||||
#ifdef USE_XAPIAN
|
||||
void makeFTSIndexXapian( BtreeIndexing::BtreeDictionary * dict, QAtomicInt & isCancelled );
|
||||
#endif
|
||||
bool isCJKChar( ushort ch );
|
||||
|
||||
class FTSResultsRequest : public Dictionary::DataRequest
|
||||
|
@ -142,7 +144,9 @@ public:
|
|||
}
|
||||
|
||||
void run();
|
||||
|
||||
#ifdef USE_XAPIAN
|
||||
void runXapian();
|
||||
#endif
|
||||
virtual void cancel()
|
||||
{
|
||||
isCancelled.ref();
|
||||
|
|
|
@ -225,7 +225,19 @@ FullTextSearchDialog::FullTextSearchDialog( QWidget * parent,
|
|||
ui.searchMode->addItem( tr( "Whole words" ), WholeWords );
|
||||
ui.searchMode->addItem( tr( "Plain text"), PlainText );
|
||||
ui.searchMode->addItem( tr( "Wildcards" ), Wildcards );
|
||||
#ifndef USE_XAPIAN
|
||||
ui.searchMode->addItem( tr( "RegExp" ), RegExp );
|
||||
#else
|
||||
ui.matchCase->hide();
|
||||
ui.articlesPerDictionary->hide();
|
||||
ui.checkBoxArticlesPerDictionary->hide();
|
||||
ui.checkBoxIgnoreDiacritics->hide();
|
||||
ui.checkBoxDistanceBetweenWords->hide();
|
||||
ui.distanceBetweenWords->hide();
|
||||
ui.checkBoxIgnoreWordOrder->hide();
|
||||
|
||||
ui.searchLine->setToolTip(tr("support xapian search syntax,such as AND OR +/- etc"));
|
||||
#endif
|
||||
ui.searchMode->setCurrentIndex( cfg.preferences.fts.searchMode );
|
||||
|
||||
ui.searchProgressBar->hide();
|
||||
|
@ -550,6 +562,26 @@ void FullTextSearchDialog::itemClicked( const QModelIndex & idx )
|
|||
{
|
||||
QString headword = results[ idx.row() ].headword;
|
||||
QRegExp reg;
|
||||
#ifdef USE_XAPIAN
|
||||
auto searchText = ui.searchLine->text();
|
||||
searchText.replace(
|
||||
QRegularExpression( "[\\*\\?\\+\\\"]|\\bAnd\\b|\\bOR\\b", QRegularExpression::CaseInsensitiveOption ),
|
||||
" " );
|
||||
auto parts = searchText.split( QRegularExpression( "\\s" ), Qt::SkipEmptyParts );
|
||||
QString firstAvailbeItem;
|
||||
for( auto & p : parts )
|
||||
{
|
||||
if( p.startsWith( '-' ) )
|
||||
continue;
|
||||
firstAvailbeItem = p;
|
||||
break;
|
||||
}
|
||||
if( !firstAvailbeItem.isEmpty() )
|
||||
{
|
||||
reg = QRegExp( firstAvailbeItem, Qt::CaseInsensitive, QRegExp::RegExp2 );
|
||||
reg.setMinimal( true );
|
||||
}
|
||||
#else
|
||||
if( !results[ idx.row() ].foundHiliteRegExps.isEmpty() )
|
||||
{
|
||||
reg = QRegExp( results[ idx.row() ].foundHiliteRegExps.join( "|"),
|
||||
|
@ -559,6 +591,7 @@ void FullTextSearchDialog::itemClicked( const QModelIndex & idx )
|
|||
}
|
||||
else
|
||||
reg = searchRegExp;
|
||||
#endif
|
||||
emit showTranslationFor( headword, results[ idx.row() ].dictIDs, reg, ignoreDiacritics );
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<rect>
|
||||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>492</width>
|
||||
<width>562</width>
|
||||
<height>593</height>
|
||||
</rect>
|
||||
</property>
|
||||
|
@ -26,20 +26,11 @@
|
|||
<string>Search</string>
|
||||
</property>
|
||||
<layout class="QVBoxLayout" name="verticalLayout">
|
||||
<item>
|
||||
<layout class="QHBoxLayout" name="horizontalLayout_5">
|
||||
<item>
|
||||
<widget class="QLineEdit" name="searchLine"/>
|
||||
</item>
|
||||
<item>
|
||||
<layout class="QGridLayout" name="gridLayout_2">
|
||||
<item row="1" column="2">
|
||||
<widget class="QCheckBox" name="matchCase">
|
||||
<property name="text">
|
||||
<string>Match case</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="0" column="2">
|
||||
<layout class="QHBoxLayout" name="horizontalLayout_2">
|
||||
<item>
|
||||
<widget class="QLabel" name="label_8">
|
||||
<property name="text">
|
||||
|
@ -52,12 +43,24 @@
|
|||
</item>
|
||||
</layout>
|
||||
</item>
|
||||
<item>
|
||||
<layout class="QGridLayout" name="fulltext_option_container">
|
||||
<property name="sizeConstraint">
|
||||
<enum>QLayout::SetMinAndMaxSize</enum>
|
||||
</property>
|
||||
<item row="1" column="1">
|
||||
<widget class="QSpinBox" name="articlesPerDictionary"/>
|
||||
</item>
|
||||
<item row="0" column="1">
|
||||
<widget class="QSpinBox" name="distanceBetweenWords"/>
|
||||
</item>
|
||||
<item row="1" column="2">
|
||||
<widget class="QCheckBox" name="matchCase">
|
||||
<property name="text">
|
||||
<string>Match case</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="0" column="0">
|
||||
<widget class="QCheckBox" name="checkBoxDistanceBetweenWords">
|
||||
<property name="text">
|
||||
|
@ -72,18 +75,14 @@
|
|||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</item>
|
||||
<item>
|
||||
<layout class="QHBoxLayout" name="horizontalLayout_4">
|
||||
<item>
|
||||
<item row="2" column="0">
|
||||
<widget class="QCheckBox" name="checkBoxIgnoreWordOrder">
|
||||
<property name="text">
|
||||
<string>Ignore words order</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item>
|
||||
<item row="2" column="1">
|
||||
<widget class="QCheckBox" name="checkBoxIgnoreDiacritics">
|
||||
<property name="text">
|
||||
<string>Ignore diacritics</string>
|
||||
|
@ -272,11 +271,9 @@
|
|||
</layout>
|
||||
</widget>
|
||||
<tabstops>
|
||||
<tabstop>searchLine</tabstop>
|
||||
<tabstop>headwordsView</tabstop>
|
||||
<tabstop>checkBoxDistanceBetweenWords</tabstop>
|
||||
<tabstop>distanceBetweenWords</tabstop>
|
||||
<tabstop>searchMode</tabstop>
|
||||
<tabstop>checkBoxArticlesPerDictionary</tabstop>
|
||||
<tabstop>articlesPerDictionary</tabstop>
|
||||
<tabstop>matchCase</tabstop>
|
||||
|
|
|
@ -56,6 +56,11 @@ DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x050F00
|
|||
DEFINES += MAKE_FFMPEG_PLAYER
|
||||
}
|
||||
|
||||
# Optional Xapian backend: when `use_xapian` is present in CONFIG, define
# USE_XAPIAN for the sources and link against the xapian library.
CONFIG( use_xapian ) {
|
||||
DEFINES += USE_XAPIAN
|
||||
LIBS+= -lxapian
|
||||
}
|
||||
|
||||
CONFIG += exceptions \
|
||||
rtti \
|
||||
stl \
|
||||
|
|
Loading…
Reference in a new issue