From 0b6f36479dd4f5f11bbf1557b4645b52fbfb552e Mon Sep 17 00:00:00 2001 From: Abs62 Date: Wed, 25 Oct 2017 17:37:39 +0300 Subject: [PATCH] Add config file parameter to limit headwords number to expand multi-word headwords while indexing (issue #914) --- aard.cc | 9 +++++++-- aard.hh | 3 ++- config.cc | 7 +++++++ config.hh | 5 ++++- loaddictionaries.cc | 11 ++++++----- loaddictionaries.hh | 1 + slob.cc | 8 ++++++-- slob.hh | 3 ++- stardict.cc | 19 +++++++++++++------ stardict.hh | 3 ++- zim.cc | 13 ++++++++++--- zim.hh | 3 ++- 12 files changed, 62 insertions(+), 23 deletions(-) diff --git a/aard.cc b/aard.cc index 0583b5ab..14b9a710 100644 --- a/aard.cc +++ b/aard.cc @@ -823,7 +823,8 @@ sptr< Dictionary::DataRequest > AardDictionary::getArticle( wstring const & word vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, string const & indicesDir, - Dictionary::Initializing & initializing ) + Dictionary::Initializing & initializing, + unsigned maxHeadwordsToExpand ) throw( std::exception ) { vector< sptr< Dictionary::Class > > dictionaries; @@ -995,7 +996,11 @@ vector< sptr< Dictionary::Class > > makeDictionaries( articleOffsets.insert( articleOffset ); // Insert new entry - indexedWords.addWord( Utf8::decode( string( data.data(), wordSize ) ), articleOffset); + wstring word = Utf8::decode( string( data.data(), wordSize ) ); + if( maxHeadwordsToExpand && dictHeader.wordsCount >= maxHeadwordsToExpand ) + indexedWords.addSingleWord( word, articleOffset); + else + indexedWords.addWord( word, articleOffset); pos += has64bitIndex ? 
sizeof( IndexElement64 ) : sizeof( IndexElement ); } diff --git a/aard.hh b/aard.hh index c9aabbcb..db9ee22d 100644 --- a/aard.hh +++ b/aard.hh @@ -15,7 +15,8 @@ using std::string; vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, string const & indicesDir, - Dictionary::Initializing & ) + Dictionary::Initializing &, + unsigned maxHeadwordsToExpand ) throw( std::exception ); } diff --git a/config.cc b/config.cc index 6ccde30d..7f58cd98 100644 --- a/config.cc +++ b/config.cc @@ -1009,6 +1009,9 @@ Class load() throw( exError ) } } + if ( !root.namedItem( "maxHeadwordsToExpand" ).isNull() ) + c.maxHeadwordsToExpand = root.namedItem( "maxHeadwordsToExpand" ).toElement().text().toUInt(); + QDomNode headwordsDialog = root.namedItem( "headwordsDialog" ); if ( !headwordsDialog.isNull() ) @@ -1976,6 +1979,10 @@ void save( Class const & c ) throw( exError ) opt = dd.createElement( "maxHeadwordSize" ); opt.appendChild( dd.createTextNode( QString::number( c.maxHeadwordSize ) ) ); root.appendChild( opt ); + + opt = dd.createElement( "maxHeadwordsToExpand" ); + opt.appendChild( dd.createTextNode( QString::number( c.maxHeadwordsToExpand ) ) ); + root.appendChild( opt ); } { diff --git a/config.hh b/config.hh index 7046b43d..2c4e9197 100644 --- a/config.hh +++ b/config.hh @@ -598,6 +598,8 @@ struct Class /// Bigger headwords won't be indexed. For now, only in DSL. 
unsigned int maxHeadwordSize; + unsigned int maxHeadwordsToExpand; + HeadwordsDialog headwordsDialog; #ifdef Q_OS_WIN @@ -610,7 +612,8 @@ struct Class Class(): lastMainGroupId( 0 ), lastPopupGroupId( 0 ), pinPopupWindow( false ), showingDictBarNames( false ), usingSmallIconsInToolbars( false ), - maxPictureWidth( 0 ), maxHeadwordSize ( 256U ) + maxPictureWidth( 0 ), maxHeadwordSize ( 256U ), + maxHeadwordsToExpand( 0 ) {} Group * getGroup( unsigned id ); Group const * getGroup( unsigned id ) const; diff --git a/loaddictionaries.cc b/loaddictionaries.cc index ea4a349c..51c3398d 100644 --- a/loaddictionaries.cc +++ b/loaddictionaries.cc @@ -55,7 +55,8 @@ LoadDictionaries::LoadDictionaries( Config::Class const & cfg ): transliteration( cfg.transliteration ), exceptionText( "Load did not finish" ), // Will be cleared upon success maxPictureWidth( cfg.maxPictureWidth ), - maxHeadwordSize( cfg.maxHeadwordSize ) + maxHeadwordSize( cfg.maxHeadwordSize ), + maxHeadwordToExpand( cfg.maxHeadwordsToExpand ) { // Populate name filters @@ -140,7 +141,7 @@ void LoadDictionaries::handlePath( Config::Path const & path ) { vector< sptr< Dictionary::Class > > stardictDictionaries = - Stardict::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this ); + Stardict::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this, maxHeadwordToExpand ); dictionaries.insert( dictionaries.end(), stardictDictionaries.begin(), stardictDictionaries.end() ); @@ -186,7 +187,7 @@ void LoadDictionaries::handlePath( Config::Path const & path ) } { vector< sptr< Dictionary::Class > > aardDictionaries = - Aard::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this ); + Aard::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this, maxHeadwordToExpand ); dictionaries.insert( dictionaries.end(), aardDictionaries.begin(), aardDictionaries.end() ); @@ -215,14 +216,14 @@ void LoadDictionaries::handlePath( Config::Path 
const & path ) #ifdef MAKE_ZIM_SUPPORT { vector< sptr< Dictionary::Class > > zimDictionaries = - Zim::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this ); + Zim::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this, maxHeadwordToExpand ); dictionaries.insert( dictionaries.end(), zimDictionaries.begin(), zimDictionaries.end() ); } { vector< sptr< Dictionary::Class > > slobDictionaries = - Slob::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this ); + Slob::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this, maxHeadwordToExpand ); dictionaries.insert( dictionaries.end(), slobDictionaries.begin(), slobDictionaries.end() ); diff --git a/loaddictionaries.hh b/loaddictionaries.hh index e540970c..1b332a5a 100644 --- a/loaddictionaries.hh +++ b/loaddictionaries.hh @@ -25,6 +25,7 @@ class LoadDictionaries: public QThread, public Dictionary::Initializing std::string exceptionText; int maxPictureWidth; unsigned int maxHeadwordSize; + unsigned int maxHeadwordToExpand; public: diff --git a/slob.cc b/slob.cc index a9e57fb7..f18f0666 100644 --- a/slob.cc +++ b/slob.cc @@ -1518,7 +1518,8 @@ sptr< Dictionary::DataRequest > SlobDictionary::getResource( string const & name vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, string const & indicesDir, - Dictionary::Initializing & initializing ) + Dictionary::Initializing & initializing, + unsigned maxHeadwordsToExpand ) throw( std::exception ) { vector< sptr< Dictionary::Class > > dictionaries; @@ -1583,7 +1584,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries( || contentType.startsWith( "text/plain", Qt::CaseInsensitive ) ) { //Article - indexedWords.addWord( gd::toWString( refEntry.key ), i ); + if( maxHeadwordsToExpand && entries > maxHeadwordsToExpand ) + indexedWords.addSingleWord( gd::toWString( refEntry.key ), i ); + else + indexedWords.addWord( gd::toWString( refEntry.key 
), i ); wordCount += 1; diff --git a/slob.hh b/slob.hh index 9fdbde94..ac72589d 100644 --- a/slob.hh +++ b/slob.hh @@ -14,7 +14,8 @@ using std::string; vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, string const & indicesDir, - Dictionary::Initializing & ) + Dictionary::Initializing &, + unsigned maxHeadwordsToExpand ) throw( std::exception ); } diff --git a/stardict.cc b/stardict.cc index 34c4b15b..5f80bcf1 100644 --- a/stardict.cc +++ b/stardict.cc @@ -1804,7 +1804,7 @@ static void handleIdxSynFile( string const & fileName, IndexedWords & indexedWords, ChunkedStorage::Writer & chunks, vector< uint32_t > * articleOffsets, - bool isSynFile ) + bool isSynFile, bool parseHeadwords ) { gzFile stardictIdx = gd_gzopen( fileName.c_str() ); if ( !stardictIdx ) @@ -1927,7 +1927,10 @@ static void handleIdxSynFile( string const & fileName, // Insert new entry into an index - indexedWords.addWord( Utf8::decode( word ), offset ); + if( parseHeadwords ) + indexedWords.addWord( Utf8::decode( word ), offset ); + else + indexedWords.addSingleWord( Utf8::decode( word ), offset ); } GD_DPRINTF( "%u entires made\n", (unsigned) indexedWords.size() ); @@ -1937,7 +1940,8 @@ static void handleIdxSynFile( string const & fileName, vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, string const & indicesDir, - Dictionary::Initializing & initializing ) + Dictionary::Initializing & initializing, + unsigned maxHeadwordsToExpand ) throw( std::exception ) { vector< sptr< Dictionary::Class > > dictionaries; @@ -2036,7 +2040,8 @@ vector< sptr< Dictionary::Class > > makeDictionaries( // Load indices if ( !ifo.synwordcount ) - handleIdxSynFile( idxFileName, indexedWords, chunks, 0, false ); + handleIdxSynFile( idxFileName, indexedWords, chunks, 0, false, + !maxHeadwordsToExpand || ifo.wordcount < maxHeadwordsToExpand ); else { vector< uint32_t > articleOffsets; @@ -2044,10 +2049,12 @@ vector< sptr< Dictionary::Class > 
> makeDictionaries( articleOffsets.reserve( ifo.wordcount ); handleIdxSynFile( idxFileName, indexedWords, chunks, &articleOffsets, - false ); + false, + !maxHeadwordsToExpand || ( ifo.wordcount + ifo.synwordcount ) < maxHeadwordsToExpand ); handleIdxSynFile( synFileName, indexedWords, chunks, &articleOffsets, - true ); + true, + !maxHeadwordsToExpand || ( ifo.wordcount + ifo.synwordcount ) < maxHeadwordsToExpand ); } // Finish with the chunks diff --git a/stardict.hh b/stardict.hh index 052c8d88..034e9fd3 100644 --- a/stardict.hh +++ b/stardict.hh @@ -15,7 +15,8 @@ using std::string; vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, string const & indicesDir, - Dictionary::Initializing & ) + Dictionary::Initializing &, + unsigned maxHeadwordsToExpand ) throw( std::exception ); } diff --git a/zim.cc b/zim.cc index 7afcc36d..f26755e9 100644 --- a/zim.cc +++ b/zim.cc @@ -1184,7 +1184,8 @@ sptr< Dictionary::DataRequest > ZimDictionary::getResource( string const & name vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, string const & indicesDir, - Dictionary::Initializing & initializing ) + Dictionary::Initializing & initializing, + unsigned maxHeadwordsToExpand ) throw( std::exception ) { vector< sptr< Dictionary::Class > > dictionaries; @@ -1316,10 +1317,16 @@ vector< sptr< Dictionary::Class > > makeDictionaries( if( nameSpace == 'A' ) { + wstring word; if( !title.empty() ) - indexedWords.addWord( Utf8::decode( title ), n ); + word = Utf8::decode( title ); else - indexedWords.addWord( Utf8::decode( url ), n ); + word = Utf8::decode( url ); + + if( maxHeadwordsToExpand && zh.articleCount >= maxHeadwordsToExpand ) + indexedWords.addSingleWord( word, n ); + else + indexedWords.addWord( word, n ); wordCount++; } else diff --git a/zim.hh b/zim.hh index 2717cd06..d451a43e 100644 --- a/zim.hh +++ b/zim.hh @@ -14,7 +14,8 @@ using std::string; vector< sptr< Dictionary::Class > > makeDictionaries( 
vector< string > const & fileNames, string const & indicesDir, - Dictionary::Initializing & ) + Dictionary::Initializing &, + unsigned maxHeadwordsToExpand ) throw( std::exception ); }