Add a config file parameter that limits the number of headwords up to which multi-word headwords are expanded while indexing (issue #914)

This commit is contained in:
Abs62 2017-10-25 17:37:39 +03:00
parent 1824d9ab02
commit 0b6f36479d
12 changed files with 62 additions and 23 deletions

View file

@ -823,7 +823,8 @@ sptr< Dictionary::DataRequest > AardDictionary::getArticle( wstring const & word
vector< sptr< Dictionary::Class > > makeDictionaries( vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames, vector< string > const & fileNames,
string const & indicesDir, string const & indicesDir,
Dictionary::Initializing & initializing ) Dictionary::Initializing & initializing,
unsigned maxHeadwordsToExpand )
throw( std::exception ) throw( std::exception )
{ {
vector< sptr< Dictionary::Class > > dictionaries; vector< sptr< Dictionary::Class > > dictionaries;
@ -995,7 +996,11 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
articleOffsets.insert( articleOffset ); articleOffsets.insert( articleOffset );
// Insert new entry // Insert new entry
indexedWords.addWord( Utf8::decode( string( data.data(), wordSize ) ), articleOffset); wstring word = Utf8::decode( string( data.data(), wordSize ) );
if( maxHeadwordsToExpand && dictHeader.wordsCount >= maxHeadwordsToExpand )
indexedWords.addSingleWord( word, articleOffset);
else
indexedWords.addWord( word, articleOffset);
pos += has64bitIndex ? sizeof( IndexElement64 ) : sizeof( IndexElement ); pos += has64bitIndex ? sizeof( IndexElement64 ) : sizeof( IndexElement );
} }

View file

@ -15,7 +15,8 @@ using std::string;
vector< sptr< Dictionary::Class > > makeDictionaries( vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames, vector< string > const & fileNames,
string const & indicesDir, string const & indicesDir,
Dictionary::Initializing & ) Dictionary::Initializing &,
unsigned maxHeadwordsToExpand )
throw( std::exception ); throw( std::exception );
} }

View file

@ -1009,6 +1009,9 @@ Class load() throw( exError )
} }
} }
if ( !root.namedItem( "maxHeadwordsToExpand" ).isNull() )
c.maxHeadwordsToExpand = root.namedItem( "maxHeadwordsToExpand" ).toElement().text().toUInt();
QDomNode headwordsDialog = root.namedItem( "headwordsDialog" ); QDomNode headwordsDialog = root.namedItem( "headwordsDialog" );
if ( !headwordsDialog.isNull() ) if ( !headwordsDialog.isNull() )
@ -1976,6 +1979,10 @@ void save( Class const & c ) throw( exError )
opt = dd.createElement( "maxHeadwordSize" ); opt = dd.createElement( "maxHeadwordSize" );
opt.appendChild( dd.createTextNode( QString::number( c.maxHeadwordSize ) ) ); opt.appendChild( dd.createTextNode( QString::number( c.maxHeadwordSize ) ) );
root.appendChild( opt ); root.appendChild( opt );
opt = dd.createElement( "maxHeadwordsToExpand" );
opt.appendChild( dd.createTextNode( QString::number( c.maxHeadwordsToExpand ) ) );
root.appendChild( opt );
} }
{ {

View file

@ -598,6 +598,8 @@ struct Class
/// Bigger headwords won't be indexed. For now, only in DSL. /// Bigger headwords won't be indexed. For now, only in DSL.
unsigned int maxHeadwordSize; unsigned int maxHeadwordSize;
unsigned int maxHeadwordsToExpand;
HeadwordsDialog headwordsDialog; HeadwordsDialog headwordsDialog;
#ifdef Q_OS_WIN #ifdef Q_OS_WIN
@ -610,7 +612,8 @@ struct Class
Class(): lastMainGroupId( 0 ), lastPopupGroupId( 0 ), Class(): lastMainGroupId( 0 ), lastPopupGroupId( 0 ),
pinPopupWindow( false ), showingDictBarNames( false ), pinPopupWindow( false ), showingDictBarNames( false ),
usingSmallIconsInToolbars( false ), usingSmallIconsInToolbars( false ),
maxPictureWidth( 0 ), maxHeadwordSize ( 256U ) maxPictureWidth( 0 ), maxHeadwordSize ( 256U ),
maxHeadwordsToExpand( 0 )
{} {}
Group * getGroup( unsigned id ); Group * getGroup( unsigned id );
Group const * getGroup( unsigned id ) const; Group const * getGroup( unsigned id ) const;

View file

@ -55,7 +55,8 @@ LoadDictionaries::LoadDictionaries( Config::Class const & cfg ):
transliteration( cfg.transliteration ), transliteration( cfg.transliteration ),
exceptionText( "Load did not finish" ), // Will be cleared upon success exceptionText( "Load did not finish" ), // Will be cleared upon success
maxPictureWidth( cfg.maxPictureWidth ), maxPictureWidth( cfg.maxPictureWidth ),
maxHeadwordSize( cfg.maxHeadwordSize ) maxHeadwordSize( cfg.maxHeadwordSize ),
maxHeadwordToExpand( cfg.maxHeadwordsToExpand )
{ {
// Populate name filters // Populate name filters
@ -140,7 +141,7 @@ void LoadDictionaries::handlePath( Config::Path const & path )
{ {
vector< sptr< Dictionary::Class > > stardictDictionaries = vector< sptr< Dictionary::Class > > stardictDictionaries =
Stardict::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this ); Stardict::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this, maxHeadwordToExpand );
dictionaries.insert( dictionaries.end(), stardictDictionaries.begin(), dictionaries.insert( dictionaries.end(), stardictDictionaries.begin(),
stardictDictionaries.end() ); stardictDictionaries.end() );
@ -186,7 +187,7 @@ void LoadDictionaries::handlePath( Config::Path const & path )
} }
{ {
vector< sptr< Dictionary::Class > > aardDictionaries = vector< sptr< Dictionary::Class > > aardDictionaries =
Aard::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this ); Aard::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this, maxHeadwordToExpand );
dictionaries.insert( dictionaries.end(), aardDictionaries.begin(), dictionaries.insert( dictionaries.end(), aardDictionaries.begin(),
aardDictionaries.end() ); aardDictionaries.end() );
@ -215,14 +216,14 @@ void LoadDictionaries::handlePath( Config::Path const & path )
#ifdef MAKE_ZIM_SUPPORT #ifdef MAKE_ZIM_SUPPORT
{ {
vector< sptr< Dictionary::Class > > zimDictionaries = vector< sptr< Dictionary::Class > > zimDictionaries =
Zim::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this ); Zim::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this, maxHeadwordToExpand );
dictionaries.insert( dictionaries.end(), zimDictionaries.begin(), dictionaries.insert( dictionaries.end(), zimDictionaries.begin(),
zimDictionaries.end() ); zimDictionaries.end() );
} }
{ {
vector< sptr< Dictionary::Class > > slobDictionaries = vector< sptr< Dictionary::Class > > slobDictionaries =
Slob::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this ); Slob::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this, maxHeadwordToExpand );
dictionaries.insert( dictionaries.end(), slobDictionaries.begin(), dictionaries.insert( dictionaries.end(), slobDictionaries.begin(),
slobDictionaries.end() ); slobDictionaries.end() );

View file

@ -25,6 +25,7 @@ class LoadDictionaries: public QThread, public Dictionary::Initializing
std::string exceptionText; std::string exceptionText;
int maxPictureWidth; int maxPictureWidth;
unsigned int maxHeadwordSize; unsigned int maxHeadwordSize;
unsigned int maxHeadwordToExpand;
public: public:

View file

@ -1518,7 +1518,8 @@ sptr< Dictionary::DataRequest > SlobDictionary::getResource( string const & name
vector< sptr< Dictionary::Class > > makeDictionaries( vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames, vector< string > const & fileNames,
string const & indicesDir, string const & indicesDir,
Dictionary::Initializing & initializing ) Dictionary::Initializing & initializing,
unsigned maxHeadwordsToExpand )
throw( std::exception ) throw( std::exception )
{ {
vector< sptr< Dictionary::Class > > dictionaries; vector< sptr< Dictionary::Class > > dictionaries;
@ -1583,7 +1584,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
|| contentType.startsWith( "text/plain", Qt::CaseInsensitive ) ) || contentType.startsWith( "text/plain", Qt::CaseInsensitive ) )
{ {
//Article //Article
indexedWords.addWord( gd::toWString( refEntry.key ), i ); if( maxHeadwordsToExpand && entries > maxHeadwordsToExpand )
indexedWords.addSingleWord( gd::toWString( refEntry.key ), i );
else
indexedWords.addWord( gd::toWString( refEntry.key ), i );
wordCount += 1; wordCount += 1;

View file

@ -14,7 +14,8 @@ using std::string;
vector< sptr< Dictionary::Class > > makeDictionaries( vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames, vector< string > const & fileNames,
string const & indicesDir, string const & indicesDir,
Dictionary::Initializing & ) Dictionary::Initializing &,
unsigned maxHeadwordsToExpand )
throw( std::exception ); throw( std::exception );
} }

View file

@ -1804,7 +1804,7 @@ static void handleIdxSynFile( string const & fileName,
IndexedWords & indexedWords, IndexedWords & indexedWords,
ChunkedStorage::Writer & chunks, ChunkedStorage::Writer & chunks,
vector< uint32_t > * articleOffsets, vector< uint32_t > * articleOffsets,
bool isSynFile ) bool isSynFile, bool parseHeadwords )
{ {
gzFile stardictIdx = gd_gzopen( fileName.c_str() ); gzFile stardictIdx = gd_gzopen( fileName.c_str() );
if ( !stardictIdx ) if ( !stardictIdx )
@ -1927,7 +1927,10 @@ static void handleIdxSynFile( string const & fileName,
// Insert new entry into an index // Insert new entry into an index
indexedWords.addWord( Utf8::decode( word ), offset ); if( parseHeadwords )
indexedWords.addWord( Utf8::decode( word ), offset );
else
indexedWords.addSingleWord( Utf8::decode( word ), offset );
} }
GD_DPRINTF( "%u entires made\n", (unsigned) indexedWords.size() ); GD_DPRINTF( "%u entires made\n", (unsigned) indexedWords.size() );
@ -1937,7 +1940,8 @@ static void handleIdxSynFile( string const & fileName,
vector< sptr< Dictionary::Class > > makeDictionaries( vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames, vector< string > const & fileNames,
string const & indicesDir, string const & indicesDir,
Dictionary::Initializing & initializing ) Dictionary::Initializing & initializing,
unsigned maxHeadwordsToExpand )
throw( std::exception ) throw( std::exception )
{ {
vector< sptr< Dictionary::Class > > dictionaries; vector< sptr< Dictionary::Class > > dictionaries;
@ -2036,7 +2040,8 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
// Load indices // Load indices
if ( !ifo.synwordcount ) if ( !ifo.synwordcount )
handleIdxSynFile( idxFileName, indexedWords, chunks, 0, false ); handleIdxSynFile( idxFileName, indexedWords, chunks, 0, false,
!maxHeadwordsToExpand || ifo.wordcount < maxHeadwordsToExpand );
else else
{ {
vector< uint32_t > articleOffsets; vector< uint32_t > articleOffsets;
@ -2044,10 +2049,12 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
articleOffsets.reserve( ifo.wordcount ); articleOffsets.reserve( ifo.wordcount );
handleIdxSynFile( idxFileName, indexedWords, chunks, &articleOffsets, handleIdxSynFile( idxFileName, indexedWords, chunks, &articleOffsets,
false ); false,
!maxHeadwordsToExpand || ( ifo.wordcount + ifo.synwordcount ) < maxHeadwordsToExpand );
handleIdxSynFile( synFileName, indexedWords, chunks, &articleOffsets, handleIdxSynFile( synFileName, indexedWords, chunks, &articleOffsets,
true ); true,
!maxHeadwordsToExpand || ( ifo.wordcount + ifo.synwordcount ) < maxHeadwordsToExpand );
} }
// Finish with the chunks // Finish with the chunks

View file

@ -15,7 +15,8 @@ using std::string;
vector< sptr< Dictionary::Class > > makeDictionaries( vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames, vector< string > const & fileNames,
string const & indicesDir, string const & indicesDir,
Dictionary::Initializing & ) Dictionary::Initializing &,
unsigned maxHeadwordsToExpand )
throw( std::exception ); throw( std::exception );
} }

13
zim.cc
View file

@ -1184,7 +1184,8 @@ sptr< Dictionary::DataRequest > ZimDictionary::getResource( string const & name
vector< sptr< Dictionary::Class > > makeDictionaries( vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames, vector< string > const & fileNames,
string const & indicesDir, string const & indicesDir,
Dictionary::Initializing & initializing ) Dictionary::Initializing & initializing,
unsigned maxHeadwordsToExpand )
throw( std::exception ) throw( std::exception )
{ {
vector< sptr< Dictionary::Class > > dictionaries; vector< sptr< Dictionary::Class > > dictionaries;
@ -1316,10 +1317,16 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
if( nameSpace == 'A' ) if( nameSpace == 'A' )
{ {
wstring word;
if( !title.empty() ) if( !title.empty() )
indexedWords.addWord( Utf8::decode( title ), n ); word = Utf8::decode( title );
else else
indexedWords.addWord( Utf8::decode( url ), n ); word = Utf8::decode( url );
if( maxHeadwordsToExpand && zh.articleCount >= maxHeadwordsToExpand )
indexedWords.addSingleWord( word, n );
else
indexedWords.addWord( word, n );
wordCount++; wordCount++;
} }
else else

3
zim.hh
View file

@ -14,7 +14,8 @@ using std::string;
vector< sptr< Dictionary::Class > > makeDictionaries( vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames, vector< string > const & fileNames,
string const & indicesDir, string const & indicesDir,
Dictionary::Initializing & ) Dictionary::Initializing &,
unsigned maxHeadwordsToExpand )
throw( std::exception ); throw( std::exception );
} }