From 2dd04207d1320e9eb0e0b8fe73608e81894240e6 Mon Sep 17 00:00:00 2001 From: xiaoyifang <105986+xiaoyifang@users.noreply.github.com> Date: Wed, 30 Aug 2023 19:12:59 +0800 Subject: [PATCH] fix: skip invalid headword in some epwing dictionaries (#1096) * fix: for some epwing dictionaries , skip invalid headword fix #1095 * [autofix.ci] apply automated fixes * fix: epwing detect next text block * [autofix.ci] apply automated fixes * fix: epwing getFirstHeadword do not need forwardtext * fix: epwing ,if epwing subbook does not contain headword ,use menu instead * [autofix.ci] apply automated fixes * fix: if epwing subbook does not contain headword ,use menu instead * [autofix.ci] apply automated fixes * fix:code smells --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- src/dict/epwing.cc | 26 +++++--- src/dict/epwing_book.cc | 132 +++++++++++++++++++++++++++++++++------- src/dict/epwing_book.hh | 8 ++- 3 files changed, 136 insertions(+), 30 deletions(-) diff --git a/src/dict/epwing.cc b/src/dict/epwing.cc index 9422b4d0..a5e2d647 100644 --- a/src/dict/epwing.cc +++ b/src/dict/epwing.cc @@ -1220,16 +1220,26 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f ChunkedStorage::Writer chunks( idx ); Epwing::Book::EpwingHeadword head; - - dict.getFirstHeadword( head ); - int wordCount = 0; int articleCount = 0; - - for ( ;; ) { - addWordToChunks( head, chunks, indexedWords, wordCount, articleCount ); - if ( !dict.getNextHeadword( head ) ) - break; + if ( dict.getFirstHeadword( head ) ) { + for ( ;; ) { + addWordToChunks( head, chunks, indexedWords, wordCount, articleCount ); + if ( !dict.getNextHeadword( head ) ) + break; + } + } + else { + //the book does not contain text,use menu instead if any. + if ( dict.getMenu( head ) ) { + auto candidateItems = dict.candidate( head.page, head.offset ); + for ( Epwing::Book::EpwingHeadword word : candidateItems ) { + addWordToChunks( word, chunks, indexedWords, wordCount, articleCount ); + } + } + else { + throw exEbLibrary( dict.errorString().toUtf8().data() ); + } } dict.clearBuffers(); diff --git a/src/dict/epwing_book.cc b/src/dict/epwing_book.cc index 2b1c54e8..46b12528 100644 --- a/src/dict/epwing_book.cc +++ b/src/dict/epwing_book.cc @@ -61,6 +61,7 @@ HookFunc( hook_mpeg ); HookFunc( hook_narrow_font ); HookFunc( hook_wide_font ); HookFunc( hook_reference ); +HookFunc( hook_candidate ); const EB_Hook hooks[] = { { EB_HOOK_NEWLINE, hook_newline }, { EB_HOOK_ISO8859_1, hook_iso8859_1 }, @@ -89,6 +90,7 @@ const EB_Hook hooks[] = { { EB_HOOK_NEWLINE, hook_newline }, { EB_HOOK_WIDE_FONT, hook_wide_font }, { EB_HOOK_BEGIN_REFERENCE, hook_reference }, { EB_HOOK_END_REFERENCE, hook_reference }, + { EB_HOOK_END_CANDIDATE_GROUP, hook_candidate }, { EB_HOOK_NULL, NULL } }; const EB_Hook refHooks[] = { @@ -377,6 +379,21 @@ hook_reference( EB_Book * book, EB_Appendix *, void * container, EB_Hook_Code co return EB_SUCCESS; } +EB_Error_Code +hook_candidate( EB_Book * book, EB_Appendix *, void * container, EB_Hook_Code code, int, const unsigned int * argv ) +{ + EContainer * cn = static_cast< EContainer * >( container ); + + if ( cn->textOnly ) + return EB_SUCCESS; + + QByteArray str = cn->book->handleCandidate( code, argv ); + if ( !str.isEmpty() ) + eb_write_text( book, str.data(), str.size() ); + + return EB_SUCCESS; +} + // EpwingBook class EpwingBook::EpwingBook(): @@ -564,7 +581,7 @@ QString EpwingBook::createCacheDir( QString const & dirName ) if ( !info.exists() || !info.isDir() ) { if ( !dir.mkdir( mainCacheDir ) ) { gdWarning( "Epwing: can't create cache directory \"%s\"", mainCacheDir.toUtf8().data() ); - return QString(); + return {}; } } @@ -573,7 +590,7 @@ QString EpwingBook::createCacheDir( QString const & dirName ) if ( !info.exists() || !info.isDir() ) { if ( !dir.mkdir( cacheDir ) ) { gdWarning( "Epwing: can't create cache directory \"%s\"", cacheDir.toUtf8().data() ); - return QString(); + return {}; } } return cacheDir; @@ -649,7 +666,7 @@ QString EpwingBook::title() if ( codec_Euc ) return codec_Euc->toUnicode( buf ); - return QString(); + return {}; } QString EpwingBook::copyright() @@ -657,7 +674,7 @@ QString EpwingBook::copyright() error_string.clear(); if ( !eb_have_copyright( &book ) ) - return QString(); + return {}; EB_Position position; EB_Error_Code ret = eb_copyright( &book, &position ); @@ -671,9 +688,18 @@ QString EpwingBook::copyright() return getText( position.page, position.offset, true ); } +QList< EpwingHeadword > EpwingBook::candidate( int page, int offset ) +{ + //clear candidateItems in getText; + candidateItems.clear(); + getText( page, offset, false ); + return candidateItems; +} + QString EpwingBook::getText( int page, int offset, bool text_only ) { error_string.clear(); + candidateItems.clear(); seekBookThrow( page, offset ); @@ -701,7 +727,7 @@ QString EpwingBook::getText( int page, int offset, bool text_only ) if ( buf.length() > TextSizeLimit ) { error_string = "Data too large"; currentPosition.page = 0; - return QString(); + return {}; } } @@ -822,7 +848,6 @@ QString EpwingBook::getPreviousTextWithLength( int page, int offset, int total, return text; } - void EpwingBook::getReferencesFromText( int page, int offset ) { error_string.clear(); @@ -872,15 +897,14 @@ EB_Error_Code EpwingBook::forwardText( EB_Position & startPos ) } ret = eb_forward_text( &book, &appendix ); - while ( ret == EB_ERR_END_OF_CONTENT ) { - ret = eb_tell_text( &book, &startPos ); - if ( ret != EB_SUCCESS ) - break; + while ( ret != EB_SUCCESS ) { if ( startPos.page >= book.subbook_current->text.end_page ) return EB_ERR_END_OF_CONTENT; - startPos.offset += 2; + const auto offset = startPos.offset + 2; + startPos.offset = offset % EB_SIZE_PAGE; + startPos.page += offset / EB_SIZE_PAGE; currentPosition = startPos; ret = eb_seek_text( &book, &startPos ); @@ -891,7 +915,7 @@ EB_Error_Code EpwingBook::forwardText( EB_Position & startPos ) return ret; } -void EpwingBook::getFirstHeadword( EpwingHeadword & head ) +bool EpwingBook::getFirstHeadword( EpwingHeadword & head ) { error_string.clear(); @@ -900,13 +924,15 @@ void EpwingBook::getFirstHeadword( EpwingHeadword & head ) EB_Error_Code ret = eb_text( &book, &pos ); if ( ret != EB_SUCCESS ) { setErrorString( "eb_text", ret ); - throw exEbLibrary( error_string.toUtf8().data() ); + qWarning() << error_string; + return false; } ret = forwardText( pos ); if ( ret != EB_SUCCESS ) { - setErrorString( "forwardText", ret ); - throw exEbLibrary( error_string.toUtf8().data() ); + setErrorString( "getFirstHeadword", ret ); + qWarning() << error_string; + return false; } eb_backward_text( &book, &appendix ); @@ -914,7 +940,49 @@ void EpwingBook::getFirstHeadword( EpwingHeadword & head ) ret = eb_tell_text( &book, &pos ); if ( ret != EB_SUCCESS ) { setErrorString( "eb_tell_text", ret ); - throw exEbLibrary( error_string.toUtf8().data() ); + qWarning() << error_string; + return false; + } + + currentPosition = pos; + indexHeadwordsPosition = pos; + + head.page = pos.page; + head.offset = pos.offset; + + if ( !readHeadword( pos, head.headword, true ) ) { + qWarning() << error_string; + return false; + } + + fixHeadword( head.headword ); + + allHeadwordPositions[ ( (uint64_t)pos.page ) << 32 | ( pos.offset ) ] = true; + return true; +} + +bool EpwingBook::haveMenu() +{ + error_string.clear(); + + int ret = eb_have_menu( &book ); + return ret == 1; +} + +bool EpwingBook::getMenu( EpwingHeadword & head ) +{ + error_string.clear(); + + if ( !haveMenu() ) { + return false; + } + + EB_Position pos; + + EB_Error_Code ret = eb_menu( &book, &pos ); + if ( ret != EB_SUCCESS ) { + setErrorString( "getMenu", ret ); + return false; } currentPosition = pos; @@ -924,19 +992,18 @@ void EpwingBook::getFirstHeadword( EpwingHeadword & head ) head.offset = pos.offset; if ( !readHeadword( pos, head.headword, true ) ) - throw exEbLibrary( error_string.toUtf8().data() ); + return false; fixHeadword( head.headword ); - EWPos epos( pos.page, pos.offset ); allHeadwordPositions[ ( (uint64_t)pos.page ) << 32 | ( pos.offset ) ] = true; + return true; } bool EpwingBook::getNextHeadword( EpwingHeadword & head ) { EB_Position pos; - // No queued positions - forward to next article error_string.clear(); @@ -965,8 +1032,10 @@ bool EpwingBook::getNextHeadword( EpwingHeadword & head ) head.page = pos.page; head.offset = pos.offset; - if ( !readHeadword( pos, head.headword, true ) ) - throw exEbLibrary( error_string.toUtf8().data() ); + if ( !readHeadword( pos, head.headword, true ) ) { + qDebug() << "Epwing: ignore the following error=> " << error_string; + continue; + } if ( head.headword.isEmpty() ) continue; @@ -1766,6 +1835,27 @@ QByteArray EpwingBook::handleReference( EB_Hook_Code code, const unsigned int * return str.toUtf8(); } +QByteArray EpwingBook::handleCandidate( EB_Hook_Code code, const unsigned int * argv ) +{ + EpwingHeadword w_headword; + w_headword.headword = currentCandidate(); + w_headword.page = argv[ 1 ]; + w_headword.offset = argv[ 2 ]; + + candidateItems << w_headword; + return QByteArray{}; +} + +QString EpwingBook::currentCandidate() +{ + const char * s = eb_current_candidate( &book ); + if ( book.character_code == EB_CHARCODE_ISO8859_1 ) + return QString::fromLatin1( s ); + if ( codec_Euc ) + return codec_Euc->toUnicode( s ); + return QString{}; +} + bool EpwingBook::getMatches( QString word, QVector< QString > & matches ) { QByteArray bword, bword2; diff --git a/src/dict/epwing_book.hh b/src/dict/epwing_book.hh index fcef9c16..baeefa20 100644 --- a/src/dict/epwing_book.hh +++ b/src/dict/epwing_book.hh @@ -85,6 +85,7 @@ class EpwingBook QVector< EWPos > LinksQueue; int refOpenCount, refCloseCount; static QMutex libMutex; + QList< EpwingHeadword > candidateItems; QString createCacheDir( QString const & dir ); @@ -184,6 +185,8 @@ public: // Make name for resource QString makeFName( QString const & ext, int page, int offset ) const; + QByteArray handleCandidate( EB_Hook_Code code, const unsigned * argv ); + QString currentCandidate(); // Store all files in Epwing folder static void collectFilenames( QString const & directory, vector< string > & files ); @@ -199,10 +202,13 @@ public: QString getCurrentSubBookDirectory(); QString copyright(); + QList< EpwingHeadword > candidate( int page, int offset ); QString title(); // Seek to first article - void getFirstHeadword( EpwingHeadword & head ); + bool getFirstHeadword( EpwingHeadword & head ); + bool haveMenu(); + bool getMenu( EpwingHeadword & head ); // Find next headword and article position bool getNextHeadword( EpwingHeadword & head );