From 769ef7d24be32eae868e43e380147c80fabe7a9d Mon Sep 17 00:00:00 2001 From: Xiao YiFang Date: Tue, 4 Oct 2022 15:07:04 +0800 Subject: [PATCH] fix: epwing reference process logic --- epwing.cc | 233 +++++++++++++++++++++++++++---------------------- epwing.hh | 13 ++- epwing_book.cc | 135 ++++++++++++---------------- epwing_book.hh | 10 ++- 4 files changed, 204 insertions(+), 187 deletions(-) diff --git a/epwing.cc b/epwing.cc index e1e5647e..d61ad57d 100644 --- a/epwing.cc +++ b/epwing.cc @@ -169,8 +169,12 @@ protected: private: /// Loads the article. - void loadArticle( quint32 address, string & articleHeadword, - string & articleText, int & articlePage, int & articleOffset ); + void loadArticle( quint32 address, + string & articleHeadword, + string & articleText, + int & articlePage, + int & articleOffset, + QString word = 0 ); void loadArticle( int articlePage, int articleOffset, string & articleHeadword, string & articleText ); @@ -284,10 +288,15 @@ void EpwingDictionary::removeDirectory( QString const & directory ) } void EpwingDictionary::loadArticle( quint32 address, + string & articleHeadword, + string & articleText, + int & articlePage, - int & articleOffset ) + + int & articleOffset, + QString word) { vector< char > chunk; @@ -307,7 +316,7 @@ void EpwingDictionary::loadArticle( quint32 address, try { Mutex::Lock _( eBook.getLibMutex() ); - eBook.getArticle( headword, text, articlePage, articleOffset, false ); + eBook.getArticle( headword, text, articlePage, articleOffset, false, word ); } catch( std::exception & e ) { @@ -521,8 +530,16 @@ void EpwingArticleRequest::run() try { - dict.loadArticle( chain[ x ].articleOffset, headword, articleText, - articlePage, articleOffset ); + dict.loadArticle( chain[ x ].articleOffset, + + headword, + + articleText, + + articlePage, + + articleOffset, + gd::toQString(word) ); } catch(...) { @@ -939,6 +956,107 @@ sptr< Dictionary::WordSearchRequest > EpwingDictionary::stemmedMatch( } // anonymous namespace +void addWordToChunks( Epwing::Book::EpwingHeadword & head, + ChunkedStorage::Writer & chunks, + BtreeIndexing::IndexedWords & indexedWords, + int & wordCount, + int & articleCount ) +{ + if( !head.headword.isEmpty() ) + { + uint32_t offset = chunks.startNewBlock(); + chunks.addToBlock( &head.page, sizeof( head.page ) ); + chunks.addToBlock( &head.offset, sizeof( head.offset ) ); + + wstring hw = gd::toWString( head.headword ); + + indexedWords.addWord( hw, offset ); + wordCount++; + articleCount++; + + vector< wstring > words; + + // Parse combined kanji/katakana/hiragana headwords + + int w_prev = 0; + wstring word; + for( wstring::size_type n = 0; n < hw.size(); n++ ) + { + gd::wchar ch = hw[ n ]; + + if( Folding::isPunct( ch ) || Folding::isWhitespace( ch ) || EpwingDictionary::isSign( ch ) + || EpwingDictionary::isJapanesePunctiation( ch ) ) + continue; + + int w = EpwingDictionary::japaneseWriting( ch ); + + if( w > 0 ) + { + // Store only separated words + gd::wchar ch_prev = 0; + if( n ) + ch_prev = hw[ n - 1 ]; + bool needStore = ( n == 0 || Folding::isPunct( ch_prev ) || Folding::isWhitespace( ch_prev ) + || EpwingDictionary::isJapanesePunctiation( ch ) ); + + word.push_back( ch ); + w_prev = w; + wstring::size_type i; + for( i = n + 1; i < hw.size(); i++ ) + { + ch = hw[ i ]; + if( Folding::isPunct( ch ) || Folding::isWhitespace( ch ) || EpwingDictionary::isJapanesePunctiation( ch ) ) + break; + w = EpwingDictionary::japaneseWriting( ch ); + if( w != w_prev ) + break; + word.push_back( ch ); + } + + if( needStore ) + { + if( i >= hw.size() || Folding::isPunct( ch ) || Folding::isWhitespace( ch ) + || EpwingDictionary::isJapanesePunctiation( ch ) ) + words.push_back( word ); + } + word.clear(); + + if( i < hw.size() ) + n = i; + else + break; + } + } + + if( words.size() > 1 ) + { + // Allow only one word in every charset + + size_t n; + int writings[ 4 ]; + memset( writings, 0, sizeof( writings ) ); + + for( n = 0; n < words.size(); n++ ) + { + int w = EpwingDictionary::japaneseWriting( words[ n ][ 0 ] ); + if( writings[ w ] ) + break; + else + writings[ w ] = 1; + } + + if( n >= words.size() ) + { + for( n = 0; n < words.size(); n++ ) + { + indexedWords.addWord( words[ n ], offset ); + wordCount++; + } + } + } + } +} + vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, string const & indicesDir, @@ -1045,107 +1163,16 @@ vector< sptr< Dictionary::Class > > makeDictionaries( for( ; ; ) { - if( !head.headword.isEmpty() ) - { - uint32_t offset = chunks.startNewBlock(); - chunks.addToBlock( &head.page, sizeof( head.page ) ); - chunks.addToBlock( &head.offset, sizeof( head.offset ) ); - - wstring hw = gd::toWString( head.headword ); - - indexedWords.addWord( hw, offset ); - wordCount++; - articleCount++; - - vector< wstring > words; - - // Parse combined kanji/katakana/hiragana headwords - - int w_prev = 0; - wstring word; - for( wstring::size_type n = 0; n < hw.size(); n++ ) - { - gd::wchar ch = hw[ n ]; - - if( Folding::isPunct( ch ) || Folding::isWhitespace( ch ) - || EpwingDictionary::isSign( ch ) || EpwingDictionary::isJapanesePunctiation( ch ) ) - continue; - - int w = EpwingDictionary::japaneseWriting( ch ); - - if( w > 0 ) - { - // Store only separated words - gd::wchar ch_prev = 0; - if( n ) - ch_prev = hw[ n - 1 ]; - bool needStore = ( n == 0 - || Folding::isPunct( ch_prev ) - || Folding::isWhitespace( ch_prev ) - || EpwingDictionary::isJapanesePunctiation( ch ) ); - - word.push_back( ch ); - w_prev = w; - wstring::size_type i; - for( i = n + 1; i < hw.size(); i++ ) - { - ch = hw[ i ]; - if( Folding::isPunct( ch ) || Folding::isWhitespace( ch ) - || EpwingDictionary::isJapanesePunctiation( ch ) ) - break; - w = EpwingDictionary::japaneseWriting( ch ); - if( w != w_prev ) - break; - word.push_back( ch ); - } - - if( needStore ) - { - if( i >= hw.size() || Folding::isPunct( ch ) || Folding::isWhitespace( ch ) - || EpwingDictionary::isJapanesePunctiation( ch ) ) - words.push_back( word ); - } - word.clear(); - - if( i < hw.size() ) - n = i; - else - break; - } - } - - if( words.size() > 1 ) - { - // Allow only one word in every charset - - size_t n; - int writings[ 4 ]; - memset( writings, 0, sizeof(writings) ); - - for( n = 0; n < words.size(); n++ ) - { - int w = EpwingDictionary::japaneseWriting( words[ n ][ 0 ] ); - if( writings[ w ] ) - break; - else - writings[ w ] = 1; - } - - if( n >= words.size() ) - { - for( n = 0; n < words.size(); n++ ) - { - indexedWords.addWord( words[ n ], offset ); - wordCount++; - } - } - } - - } + addWordToChunks( head, chunks, indexedWords, wordCount, articleCount ); if( !dict.getNextHeadword( head ) ) break; } + while( dict.processRef( head ) ) + { + addWordToChunks( head, chunks, indexedWords, wordCount, articleCount ); + } + dict.clearBuffers(); // Finish with the chunks diff --git a/epwing.hh b/epwing.hh index 8ce85e26..ae46d1e0 100644 --- a/epwing.hh +++ b/epwing.hh @@ -2,14 +2,23 @@ #define __EPWING_HH__INCLUDED__ #include "dictionary.hh" - +#include "epwing_book.hh" +#include "btreeidx.hh" +#include "chunkedstorage.hh" /// Support for the Epwing dictionaries. namespace Epwing { using std::vector; using std::string; -vector< sptr< Dictionary::Class > > makeDictionaries( +void addWordToChunks( Epwing::Book::EpwingHeadword & head, + ChunkedStorage::Writer & chunks, + BtreeIndexing::IndexedWords & indexedWords, + int & wordCount, + int & articleCount ); + +vector< sptr< Dictionary::Class > > +makeDictionaries( vector< string > const & fileNames, string const & indicesDir, Dictionary::Initializing & ) diff --git a/epwing_book.cc b/epwing_book.cc index 3abe85b4..f6eed69c 100644 --- a/epwing_book.cc +++ b/epwing_book.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include "gddebug.hh" #include "fsencoding.hh" #include "audiolink.hh" @@ -732,7 +733,7 @@ QString EpwingBook::getText( int page, int offset, bool text_only ) } QString text = QString::fromUtf8( buf.data(), buf.size() ).trimmed(); - finalizeText( text ); + finalizeText( text); return text; } @@ -776,7 +777,9 @@ void EpwingBook::getReferencesFromText( int page, int offset ) } for( int x = 0; x < refPages.size(); x++ ) + { LinksQueue.push_back( EWPos( refPages[ x ], refOffsets[ x ] ) ); + } } EB_Error_Code EpwingBook::forwardText( EB_Position & startPos ) @@ -850,60 +853,13 @@ void EpwingBook::getFirstHeadword( EpwingHeadword & head ) fixHeadword( head.headword ); EWPos epos( pos.page, pos.offset ); - allHeadwordPositions[ head.headword ] << epos; + allHeadwordPositions[ ((uint64_t)pos.page)<<32|(pos.offset>>2) ] =true; } bool EpwingBook::getNextHeadword( EpwingHeadword & head ) { EB_Position pos; - - QRegularExpression badLinks( "#(v|n)\\d", QRegularExpression::UseUnicodePropertiesOption); - - // At first we check references queue - while( !LinksQueue.isEmpty() ) - { - EWPos epos = LinksQueue.last(); - LinksQueue.pop_back(); - - pos.page = epos.first; - pos.offset = epos.second; - - if( readHeadword( pos, head.headword, true ) ) - { - if( head.headword.isEmpty() - || head.headword.contains( badLinks ) ) - continue; - - fixHeadword( head.headword ); - - head.page = pos.page; - head.offset = pos.offset; - - if( allHeadwordPositions.contains( head.headword ) ) - { - // existed position - bool existed = false; - foreach( EWPos epos, allHeadwordPositions[ head.headword ] ) - { - if( pos.page == epos.first && abs( pos.offset - epos.second ) <= 4 ) - { - existed = true; - break; - } - } - if( !existed ) - { - allHeadwordPositions[ head.headword ] << EWPos( pos.page, pos.offset ); - return true; - } - } - else - { - allHeadwordPositions[ head.headword ] << EWPos( pos.page, pos.offset ); - return true; - } - } - } + // No queued positions - forward to next article @@ -934,13 +890,7 @@ bool EpwingBook::getNextHeadword( EpwingHeadword & head ) indexHeadwordsPosition = pos; - try - { - getReferencesFromText( pos.page, pos.offset ); - } - catch( std::exception & ) - { - } + head.page = pos.page; head.offset = pos.offset; @@ -953,27 +903,17 @@ bool EpwingBook::getNextHeadword( EpwingHeadword & head ) fixHeadword( head.headword ); - if( allHeadwordPositions.contains( head.headword ) ) + try { - // existed position - bool existed = false; - foreach( EWPos epos, allHeadwordPositions[ head.headword ] ) - { - if( pos.page == epos.first && abs( pos.offset - epos.second ) <= 4 ) - { - existed = true; - break; - } - } - if( !existed ) - { - allHeadwordPositions[ head.headword ] << EWPos( pos.page, pos.offset ); - return true; - } + getReferencesFromText( pos.page, pos.offset); } - else + catch( std::exception & ) { - allHeadwordPositions[ head.headword ] << EWPos( pos.page, pos.offset ); + } + + if( !allHeadwordPositions.contains( ((uint64_t)pos.page) << 32 | ( pos.offset / 4 ) ) ) + { + allHeadwordPositions[ ((uint64_t)pos.page) << 32 | ( pos.offset / 4 ) ] = true; return true; } } @@ -981,6 +921,43 @@ bool EpwingBook::getNextHeadword( EpwingHeadword & head ) return true; } +bool EpwingBook::processRef( EpwingHeadword & head) +{ + EB_Position pos; + + QRegularExpression badLinks( "#(v|n)\\d", QRegularExpression::UseUnicodePropertiesOption ); + while( !LinksQueue.isEmpty() ) + { + EWPos epos = LinksQueue.last(); + LinksQueue.pop_back(); + + pos.page = epos.first; + pos.offset = epos.second; + + if( readHeadword( pos, head.headword, true ) ) + { + if( head.headword.isEmpty() || head.headword.contains( badLinks ) ) + continue; + + fixHeadword( head.headword ); + + head.page = pos.page; + head.offset = pos.offset; + auto key = ( (uint64_t)pos.page ) << 32 | ( pos.offset >> 2 ); + if( !allRefPositions.contains( key ) ) + { + // fixed the reference headword ,to avoid the headword collision with other entry . + //if(!allHeadwordPositions.contains(key)) + head.headword = QString( "r%1At%2" ).arg( pos.page ).arg( pos.offset ); + + allRefPositions[ key ] = true; + return true; + } + } + } + return false; +} + bool EpwingBook::readHeadword( EB_Position const& pos, QString & headword, bool text_only ) @@ -1094,7 +1071,7 @@ void EpwingBook::fixHeadword( QString & headword ) } void EpwingBook::getArticle( QString & headword, QString & articleText, - int page, int offset, bool text_only) + int page, int offset, bool text_only, QString word) { error_string.clear(); char buffer[ TextBufferSize + 1 ]; @@ -1126,12 +1103,12 @@ void EpwingBook::getArticle( QString & headword, QString & articleText, } headword = QString::fromUtf8( buffer, length ); - finalizeText( headword ); + finalizeText( headword); if( text_only ) fixHeadword( headword ); - articleText = getText( pos.page, pos.offset, text_only ); + articleText = getText( pos.page, pos.offset, text_only); } const char * EpwingBook::beginDecoration( unsigned int code ) @@ -1268,7 +1245,7 @@ void EpwingBook::finalizeText( QString & text ) { QString headword = QString::fromUtf8( buf, length ); fixHeadword( headword ); - url.setPath( Utils::Url::ensureLeadingSlash( headword ) ); + url.setPath( Utils::Url::ensureLeadingSlash( QString( "r%1At%2" ).arg( ebpos.page ).arg(ebpos.offset) ) ); } QString link = ""; diff --git a/epwing_book.hh b/epwing_book.hh index 558d4684..e5b00f0f 100644 --- a/epwing_book.hh +++ b/epwing_book.hh @@ -78,7 +78,8 @@ class EpwingBook QStringList imageCacheList, soundsCacheList, moviesCacheList, fontsCacheList; QMap< QString, QString > baseFontsMap, customFontsMap; QVector< int > refPages, refOffsets; - QMap< QString, QList< EWPos > > allHeadwordPositions; + QMap< uint64_t,bool > allHeadwordPositions; + QMap< uint64_t, bool > allRefPositions; QVector< EWPos > LinksQueue; int refOpenCount, refCloseCount; static Mutex libMutex; @@ -98,7 +99,7 @@ class EpwingBook EB_Error_Code forwardText( EB_Position & startPos ); // Retrieve article text from dictionary - QString getText( int page, int offset, bool text_only ); + QString getText( int page, int offset, bool text_only); unsigned int normalizeDecorationCode( unsigned int code ); @@ -151,6 +152,7 @@ public: void clearBuffers() { allHeadwordPositions.clear(); + allRefPositions.clear(); LinksQueue.clear(); } @@ -181,6 +183,8 @@ public: // Find next headword and article position bool getNextHeadword( EpwingHeadword & head ); + bool processRef( EpwingHeadword & head ); + bool readHeadword( EB_Position const & pos, QString & headword, bool text_only ); @@ -191,7 +195,7 @@ public: // Retrieve article from dictionary void getArticle( QString & headword, QString & articleText, - int page, int offset, bool text_only ); + int page, int offset, bool text_only, QString word=0 ); const char * beginDecoration( unsigned int code ); const char * endDecoration( unsigned int code );