Add config file parameter to limit the number of headwords for expanding multi-word headwords while indexing (issue #914)

This commit is contained in:
Abs62 2017-10-25 17:37:39 +03:00
parent 1824d9ab02
commit 0b6f36479d
12 changed files with 62 additions and 23 deletions

View file

@ -823,7 +823,8 @@ sptr< Dictionary::DataRequest > AardDictionary::getArticle( wstring const & word
vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames,
string const & indicesDir,
Dictionary::Initializing & initializing )
Dictionary::Initializing & initializing,
unsigned maxHeadwordsToExpand )
throw( std::exception )
{
vector< sptr< Dictionary::Class > > dictionaries;
@ -995,7 +996,11 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
articleOffsets.insert( articleOffset );
// Insert new entry
indexedWords.addWord( Utf8::decode( string( data.data(), wordSize ) ), articleOffset);
wstring word = Utf8::decode( string( data.data(), wordSize ) );
if( maxHeadwordsToExpand && dictHeader.wordsCount >= maxHeadwordsToExpand )
indexedWords.addSingleWord( word, articleOffset);
else
indexedWords.addWord( word, articleOffset);
pos += has64bitIndex ? sizeof( IndexElement64 ) : sizeof( IndexElement );
}

View file

@ -15,7 +15,8 @@ using std::string;
vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames,
string const & indicesDir,
Dictionary::Initializing & )
Dictionary::Initializing &,
unsigned maxHeadwordsToExpand )
throw( std::exception );
}

View file

@ -1009,6 +1009,9 @@ Class load() throw( exError )
}
}
if ( !root.namedItem( "maxHeadwordsToExpand" ).isNull() )
c.maxHeadwordsToExpand = root.namedItem( "maxHeadwordsToExpand" ).toElement().text().toUInt();
QDomNode headwordsDialog = root.namedItem( "headwordsDialog" );
if ( !headwordsDialog.isNull() )
@ -1976,6 +1979,10 @@ void save( Class const & c ) throw( exError )
opt = dd.createElement( "maxHeadwordSize" );
opt.appendChild( dd.createTextNode( QString::number( c.maxHeadwordSize ) ) );
root.appendChild( opt );
opt = dd.createElement( "maxHeadwordsToExpand" );
opt.appendChild( dd.createTextNode( QString::number( c.maxHeadwordsToExpand ) ) );
root.appendChild( opt );
}
{

View file

@ -598,6 +598,8 @@ struct Class
/// Bigger headwords won't be indexed. For now, only in DSL.
unsigned int maxHeadwordSize;
unsigned int maxHeadwordsToExpand;
HeadwordsDialog headwordsDialog;
#ifdef Q_OS_WIN
@ -610,7 +612,8 @@ struct Class
Class(): lastMainGroupId( 0 ), lastPopupGroupId( 0 ),
pinPopupWindow( false ), showingDictBarNames( false ),
usingSmallIconsInToolbars( false ),
maxPictureWidth( 0 ), maxHeadwordSize ( 256U )
maxPictureWidth( 0 ), maxHeadwordSize ( 256U ),
maxHeadwordsToExpand( 0 )
{}
Group * getGroup( unsigned id );
Group const * getGroup( unsigned id ) const;

View file

@ -55,7 +55,8 @@ LoadDictionaries::LoadDictionaries( Config::Class const & cfg ):
transliteration( cfg.transliteration ),
exceptionText( "Load did not finish" ), // Will be cleared upon success
maxPictureWidth( cfg.maxPictureWidth ),
maxHeadwordSize( cfg.maxHeadwordSize )
maxHeadwordSize( cfg.maxHeadwordSize ),
maxHeadwordToExpand( cfg.maxHeadwordsToExpand )
{
// Populate name filters
@ -140,7 +141,7 @@ void LoadDictionaries::handlePath( Config::Path const & path )
{
vector< sptr< Dictionary::Class > > stardictDictionaries =
Stardict::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this );
Stardict::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this, maxHeadwordToExpand );
dictionaries.insert( dictionaries.end(), stardictDictionaries.begin(),
stardictDictionaries.end() );
@ -186,7 +187,7 @@ void LoadDictionaries::handlePath( Config::Path const & path )
}
{
vector< sptr< Dictionary::Class > > aardDictionaries =
Aard::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this );
Aard::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this, maxHeadwordToExpand );
dictionaries.insert( dictionaries.end(), aardDictionaries.begin(),
aardDictionaries.end() );
@ -215,14 +216,14 @@ void LoadDictionaries::handlePath( Config::Path const & path )
#ifdef MAKE_ZIM_SUPPORT
{
vector< sptr< Dictionary::Class > > zimDictionaries =
Zim::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this );
Zim::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this, maxHeadwordToExpand );
dictionaries.insert( dictionaries.end(), zimDictionaries.begin(),
zimDictionaries.end() );
}
{
vector< sptr< Dictionary::Class > > slobDictionaries =
Slob::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this );
Slob::makeDictionaries( allFiles, FsEncoding::encode( Config::getIndexDir() ), *this, maxHeadwordToExpand );
dictionaries.insert( dictionaries.end(), slobDictionaries.begin(),
slobDictionaries.end() );

View file

@ -25,6 +25,7 @@ class LoadDictionaries: public QThread, public Dictionary::Initializing
std::string exceptionText;
int maxPictureWidth;
unsigned int maxHeadwordSize;
unsigned int maxHeadwordToExpand;
public:

View file

@ -1518,7 +1518,8 @@ sptr< Dictionary::DataRequest > SlobDictionary::getResource( string const & name
vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames,
string const & indicesDir,
Dictionary::Initializing & initializing )
Dictionary::Initializing & initializing,
unsigned maxHeadwordsToExpand )
throw( std::exception )
{
vector< sptr< Dictionary::Class > > dictionaries;
@ -1583,7 +1584,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
|| contentType.startsWith( "text/plain", Qt::CaseInsensitive ) )
{
//Article
indexedWords.addWord( gd::toWString( refEntry.key ), i );
if( maxHeadwordsToExpand && entries > maxHeadwordsToExpand )
indexedWords.addSingleWord( gd::toWString( refEntry.key ), i );
else
indexedWords.addWord( gd::toWString( refEntry.key ), i );
wordCount += 1;

View file

@ -14,7 +14,8 @@ using std::string;
vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames,
string const & indicesDir,
Dictionary::Initializing & )
Dictionary::Initializing &,
unsigned maxHeadwordsToExpand )
throw( std::exception );
}

View file

@ -1804,7 +1804,7 @@ static void handleIdxSynFile( string const & fileName,
IndexedWords & indexedWords,
ChunkedStorage::Writer & chunks,
vector< uint32_t > * articleOffsets,
bool isSynFile )
bool isSynFile, bool parseHeadwords )
{
gzFile stardictIdx = gd_gzopen( fileName.c_str() );
if ( !stardictIdx )
@ -1927,7 +1927,10 @@ static void handleIdxSynFile( string const & fileName,
// Insert new entry into an index
indexedWords.addWord( Utf8::decode( word ), offset );
if( parseHeadwords )
indexedWords.addWord( Utf8::decode( word ), offset );
else
indexedWords.addSingleWord( Utf8::decode( word ), offset );
}
GD_DPRINTF( "%u entires made\n", (unsigned) indexedWords.size() );
@ -1937,7 +1940,8 @@ static void handleIdxSynFile( string const & fileName,
vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames,
string const & indicesDir,
Dictionary::Initializing & initializing )
Dictionary::Initializing & initializing,
unsigned maxHeadwordsToExpand )
throw( std::exception )
{
vector< sptr< Dictionary::Class > > dictionaries;
@ -2036,7 +2040,8 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
// Load indices
if ( !ifo.synwordcount )
handleIdxSynFile( idxFileName, indexedWords, chunks, 0, false );
handleIdxSynFile( idxFileName, indexedWords, chunks, 0, false,
!maxHeadwordsToExpand || ifo.wordcount < maxHeadwordsToExpand );
else
{
vector< uint32_t > articleOffsets;
@ -2044,10 +2049,12 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
articleOffsets.reserve( ifo.wordcount );
handleIdxSynFile( idxFileName, indexedWords, chunks, &articleOffsets,
false );
false,
!maxHeadwordsToExpand || ( ifo.wordcount + ifo.synwordcount ) < maxHeadwordsToExpand );
handleIdxSynFile( synFileName, indexedWords, chunks, &articleOffsets,
true );
true,
!maxHeadwordsToExpand || ( ifo.wordcount + ifo.synwordcount ) < maxHeadwordsToExpand );
}
// Finish with the chunks

View file

@ -15,7 +15,8 @@ using std::string;
vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames,
string const & indicesDir,
Dictionary::Initializing & )
Dictionary::Initializing &,
unsigned maxHeadwordsToExpand )
throw( std::exception );
}

13
zim.cc
View file

@ -1184,7 +1184,8 @@ sptr< Dictionary::DataRequest > ZimDictionary::getResource( string const & name
vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames,
string const & indicesDir,
Dictionary::Initializing & initializing )
Dictionary::Initializing & initializing,
unsigned maxHeadwordsToExpand )
throw( std::exception )
{
vector< sptr< Dictionary::Class > > dictionaries;
@ -1316,10 +1317,16 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
if( nameSpace == 'A' )
{
wstring word;
if( !title.empty() )
indexedWords.addWord( Utf8::decode( title ), n );
word = Utf8::decode( title );
else
indexedWords.addWord( Utf8::decode( url ), n );
word = Utf8::decode( url );
if( maxHeadwordsToExpand && zh.articleCount >= maxHeadwordsToExpand )
indexedWords.addSingleWord( word, n );
else
indexedWords.addWord( word, n );
wordCount++;
}
else

3
zim.hh
View file

@ -14,7 +14,8 @@ using std::string;
vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames,
string const & indicesDir,
Dictionary::Initializing & )
Dictionary::Initializing &,
unsigned maxHeadwordsToExpand )
throw( std::exception );
}