From c4674a246f3cf7f9ee41ca764e07161208ac373a Mon Sep 17 00:00:00 2001 From: Xiao YiFang Date: Sun, 12 Mar 2023 15:04:49 +0800 Subject: [PATCH] feature: epwing dictionary reference navigation improved. feature: add branch method to handle the reference navigation loadArticle function feature: refactor epwing loadArticle method epwing: add previous and next page link --- base/globalregex.cc | 3 + base/globalregex.hh | 5 ++ epwing.cc | 110 +++++++++++++++++++++-- epwing_book.cc | 209 +++++++++++++++++++++++++++++++++++++------- epwing_book.hh | 7 ++ 5 files changed, 298 insertions(+), 36 deletions(-) diff --git a/base/globalregex.cc b/base/globalregex.cc index d20b5e2e..44cbf6cc 100644 --- a/base/globalregex.cc +++ b/base/globalregex.cc @@ -66,3 +66,6 @@ QRegularExpression Mdx::styleElment( R"((]*>)([\w\W]*?)(<\/style>))", QRegularExpression Zim::linkSpecialChar("[\\.\\/]"); + + +/* Whole-word, case-insensitive pattern for a page-reference word: r or p, page digits, the letters at, offset digits. Capture 1 is the page, capture 2 the offset. */ QRegularExpression Epwing::refWord(R"(^[rp](\d+)at(\d+)$)", QRegularExpression::CaseInsensitiveOption); diff --git a/base/globalregex.hh b/base/globalregex.hh index fa8f8003..8db5b93b 100644 --- a/base/globalregex.hh +++ b/base/globalregex.hh @@ -50,6 +50,11 @@ class Zim{ static QRegularExpression linkSpecialChar; }; +class Epwing{ + public: + static QRegularExpression refWord; +}; + } // namespace RX #endif // GLOBALREGEX_HH diff --git a/epwing.cc b/epwing.cc index d426ef09..bcb3f448 100644 --- a/epwing.cc +++ b/epwing.cc @@ -13,6 +13,7 @@ #include #include #include +#include #include "btreeidx.hh" #include "folding.hh" @@ -24,6 +25,7 @@ #include "utf8.hh" #include "filetype.hh" #include "ftshelpers.hh" +#include "base/globalregex.hh" namespace Epwing { @@ -175,6 +177,9 @@ private: int & articlePage, int & articleOffset ); + void loadArticleNextPage( string & articleHeadword, string & articleText, int & articlePage, int & articleOffset ); + void loadArticlePreviousPage( string & articleHeadword, string & articleText, int & articlePage, int & articleOffset ); + void loadArticle( int 
articlePage, int articleOffset, string & articleHeadword, string & articleText ); @@ -194,6 +199,8 @@ private: friend class EpwingArticleRequest; friend class EpwingResourceRequest; friend class EpwingWordSearchRequest; + string epwing_previous_button(int& articlePage, int& articleOffset); /* returns the Previous Page link text; parameter order (page, then offset) now matches the definition and its callers */ + string epwing_next_button(int& articlePage, int& articleOffset); /* returns the Next Page link text; same parameter order */ }; @@ -323,6 +330,77 @@ void EpwingDictionary::loadArticle( articleText = prefix + articleText + ""; } +string Epwing::EpwingDictionary::epwing_previous_button(int& articlePage, int& articleOffset) +{ + QString previousButton = QString( "p%1At%2" ).arg( articlePage ).arg( articleOffset ); /* reference word of the form pPAGEAtOFFSET */ + string previousLink = "

" + QObject::tr( "Previous Page" ).toStdString() + "

"; + + return previousLink; +} + +/* Loads the text that follows (articlePage, articleOffset) and wraps it with a Previous Page link for the requested position and a Next Page link for the position returned by the book. A read error is rendered into the article text instead of being rethrown. */ void EpwingDictionary::loadArticleNextPage(string & articleHeadword, string & articleText, int & articlePage, int & articleOffset ) +{ + QString headword, text; + EB_Position pos; pos.page = articlePage; pos.offset = articleOffset; /* fall back to the requested position so the Next Page link never reads an indeterminate value when the read throws */ + try + { + Mutex::Lock _( eBook.getLibMutex() ); + pos = eBook.getArticleNextPage( headword, text, articlePage, articleOffset, false ); + } + catch( std::exception & e ) + { + text = QString( "Article reading error: %1") + .arg( QString::fromUtf8( e.what() ) ); + } + + articleHeadword = string( headword.toUtf8().data() ); + articleText = string( text.toUtf8().data() ); + + string prefix( "
" ); + string previousLink = epwing_previous_button(articlePage, articleOffset); + + articleText = prefix + previousLink + articleText; + string nextLink = epwing_next_button(pos.page, pos.offset); + articleText = articleText + nextLink; + articleText = articleText + "
"; +} + +string Epwing::EpwingDictionary::epwing_next_button(int& articlePage, int& articleOffset ) +{ + QString refLink = QString( "r%1At%2" ).arg( articlePage ).arg( articleOffset ); /* reference word of the form rPAGEAtOFFSET */ + string nextLink = "

" + + QObject::tr( "Next Page" ).toStdString() + "

"; + + return nextLink; +} + +/* Loads the text that precedes (articlePage, articleOffset) and wraps it with a Previous Page link for the position returned by the book and a Next Page link for the requested position. A read error is rendered into the article text instead of being rethrown. */ void EpwingDictionary::loadArticlePreviousPage( + string & articleHeadword, string & articleText, int & articlePage, int & articleOffset ) +{ + QString headword, text; + EB_Position pos; pos.page = articlePage; pos.offset = articleOffset; /* fall back to the requested position so the Previous Page link never reads an indeterminate value when the read throws */ + try + { + Mutex::Lock _( eBook.getLibMutex() ); + pos = eBook.getArticlePreviousPage( headword, text, articlePage, articleOffset, false ); + } catch( std::exception & e ) + { + text = QString( "Article reading error: %1" ).arg( QString::fromUtf8( e.what() ) ); + } + + articleHeadword = string( headword.toUtf8().data() ); + articleText = string( text.toUtf8().data() ); + + string prefix( "
" ); + + string previousLink = epwing_previous_button(pos.page, pos.offset ); + articleText = prefix + previousLink + articleText; + string nextLink = epwing_next_button( articlePage, articleOffset ); + articleText = articleText + nextLink; + articleText = articleText + "
"; +} + void EpwingDictionary::loadArticle( int articlePage, int articleOffset, string & articleHeadword, @@ -521,11 +599,7 @@ void EpwingArticleRequest::run() try { - dict.loadArticle( chain[ x ].articleOffset, - headword, - articleText, - articlePage, - articleOffset ); + dict.loadArticle( chain[ x ].articleOffset, headword, articleText, articlePage, articleOffset ); } catch(...) { @@ -562,7 +636,11 @@ void EpwingArticleRequest::run() getBuiltInArticle( alts[ x ], pages, offsets, alternateArticles ); } - if ( mainArticles.empty() && alternateArticles.empty() ) + + QRegularExpressionMatch m = RX::Epwing::refWord.match( gd::toQString( word ) ); + bool ref = m.hasMatch(); + + if ( mainArticles.empty() && alternateArticles.empty() && !ref) { // No such word finish(); @@ -589,6 +667,26 @@ void EpwingArticleRequest::run() result += i->second.second; } + { + QRegularExpressionMatch m = RX::Epwing::refWord.match( gd::toQString( word ) ); + if( m.hasMatch() ) + { + string headword, articleText; + int articlePage = m.captured( 1 ).toInt(); + int articleOffset = m.captured( 2 ).toInt(); + if( m.captured( 0 ).startsWith( QLatin1Char( 'r' ), Qt::CaseInsensitive ) ) /* the pattern is case-insensitive, so decide from the matched text rather than from a case-sensitive test of word[ 0 ] */ + dict.loadArticleNextPage( headword, articleText, articlePage, articleOffset ); + else + { + //starts with p + dict.loadArticlePreviousPage( headword, articleText, articlePage, articleOffset ); + } + + result += articleText; + + } + } + result += ""; Mutex::Lock _( dataMutex ); diff --git a/epwing_book.cc b/epwing_book.cc index 16a84917..58e3adea 100644 --- a/epwing_book.cc +++ b/epwing_book.cc @@ -14,7 +14,7 @@ #include "wstring_qt.hh" #include "folding.hh" #include "epwing_charmap.hh" - +#include "htmlescape.hh" #if defined( Q_OS_WIN32 ) || defined( Q_OS_MAC ) #define _FILE_OFFSET_BITS 64 #endif @@ -687,18 +687,7 @@ QString EpwingBook::getText( int page, int offset, bool text_only ) { error_string.clear(); - EB_Position pos; - pos.page = page; - pos.offset = offset; - currentPosition = pos; - - EB_Error_Code ret = eb_seek_text(&book, &pos); - if( ret != EB_SUCCESS 
) - { - setErrorString( "eb_seek_text", ret ); - currentPosition.page = 0; - throw exEbLibrary( error_string.toUtf8().data() ); - } + seekBookThrow( page, offset ); QByteArray buf; char buffer[ TextBufferSize + 1 ]; @@ -710,7 +699,7 @@ QString EpwingBook::getText( int page, int offset, bool text_only ) for( ; ; ) { - ret = eb_read_text( &book, &appendix, &hookSet, &container, + EB_Error_Code ret = eb_read_text( &book, &appendix, &hookSet, &container, TextBufferSize, buffer, &buffer_length ); if( ret != EB_SUCCESS ) @@ -737,6 +726,129 @@ QString EpwingBook::getText( int page, int offset, bool text_only ) return text; } +/* Records (page, offset) as currentPosition and seeks the book there; on failure stores the error string, zeroes currentPosition.page and throws exEbLibrary. */ void EpwingBook::seekBookThrow( int page, int offset ) +{ + EB_Position pos; + pos.page = page; + pos.offset = offset; + currentPosition = pos; + + EB_Error_Code ret = eb_seek_text( &book, &pos ); + if( ret != EB_SUCCESS ) + { + setErrorString( "eb_seek_text", ret ); + currentPosition.page = 0; + throw exEbLibrary( error_string.toUtf8().data() ); + } +} + + /* Reads forward from (page, offset) until more than total bytes were collected or a read returns no data, and reports the stop position through pos. Returns an empty string when the collected data exceeds TextSizeLimit; throws exEbLibrary when seeking or forwarding fails. */ QString EpwingBook::getTextWithLength( int page, int offset, int total, EB_Position & pos ) +{ + error_string.clear(); + int currentLength = 0; pos.page = page; pos.offset = offset; /* give the out-parameter a defined value for the early return below, which is taken before eb_tell_text runs */ + + seekBookThrow( page, offset ); + + QByteArray buf; + char buffer[ TextBufferSize + 1 ]; + ssize_t buffer_length; + EContainer container( this, false ); + + prepareToRead(); + + for( ;; ) + { + EB_Error_Code ret = eb_read_text( &book, &appendix, &hookSet, &container, TextBufferSize, buffer, &buffer_length ); + + if( ret != EB_SUCCESS ) + { + setErrorString( "eb_read_text", ret ); + break; + } + + buf += QByteArray( buffer, buffer_length ); + currentLength += buffer_length; + + if( currentLength > total || buffer_length == 0 ) + break; + + if( buf.length() > TextSizeLimit ) + { + error_string = "Data too large"; + currentPosition.page = 0; + return QString(); + } + + ret = eb_forward_text( &book, &appendix ); + if( ret != EB_SUCCESS ) + { + setErrorString( "eb_forward_text", ret ); /* name the call that actually failed */ + currentPosition.page = 0; + throw exEbLibrary( 
error_string.toUtf8().data() ); + } + } + + eb_tell_text( &book, &pos ); + QString text = QString::fromUtf8( buf.data(), buf.size() ).trimmed(); + finalizeText( text ); + return text; +} + +/* Collects text located before (page, offset). Each pass seeks to the current position, calls eb_backward_text, stores the resulting position in pos (which also becomes the start of the next pass), reads one buffer with eb_read_text and prepends it. The loop stops once more than total bytes were collected or a read returns no data. Returns an empty string when the collected data exceeds TextSizeLimit; throws exEbLibrary when seeking or stepping backwards fails. */ QString EpwingBook::getPreviousTextWithLength( int page, int offset, int total, EB_Position & pos ) +{ + error_string.clear(); + int currentLength = 0; + + QByteArray buf; + char buffer[ TextBufferSize + 1 ]; + ssize_t buffer_length; + EContainer container( this, false ); + + prepareToRead(); + + for( ;; ) + { + seekBookThrow( page, offset ); + EB_Error_Code ret = eb_backward_text( &book, &appendix ); + if( ret != EB_SUCCESS ) + { + setErrorString( "eb_backward_text", ret ); + currentPosition.page = 0; + throw exEbLibrary( error_string.toUtf8().data() ); + } + eb_tell_text( &book, &pos ); + page = pos.page; + offset = pos.offset; + + ret = eb_read_text( &book, &appendix, &hookSet, &container, TextBufferSize, buffer, &buffer_length ); + + if( ret != EB_SUCCESS ) + { + setErrorString( "eb_read_text", ret ); + break; + } + + buf.prepend( QByteArray( buffer, buffer_length )); + currentLength += buffer_length; + + if( currentLength > total || buffer_length == 0 ) + break; + + if( buf.length() > TextSizeLimit ) + { + error_string = "Data too large"; + currentPosition.page = 0; + return QString(); + } + } + + QString text = QString::fromUtf8( buf.data(), buf.size() ).trimmed(); + finalizeText( text ); + return text; +} + + + void EpwingBook::getReferencesFromText( int page, int offset ) { error_string.clear(); @@ -934,6 +1046,7 @@ bool EpwingBook::processRef( EpwingHeadword & head) pos.page = epos.first; pos.offset = epos.second; + // EPWING books navigate through reference links; the headword read at such a position is usually not meaningful. 
if( readHeadword( pos, head.headword, true ) ) { if( head.headword.isEmpty() || head.headword.contains( badLinks ) ) @@ -944,13 +1057,16 @@ bool EpwingBook::processRef( EpwingHeadword & head) head.page = pos.page; head.offset = pos.offset; auto key = ( (uint64_t)pos.page ) << 32 | ( pos.offset ); - if( !allRefPositions.contains( key ) ) + // Only a reference position that is already present in allRefPositions gets an extra synthetic headword (rPAGEAtOFFSET) in the headword list. + // This makes it easier for article loading to recognise a real reference link. + + if( allRefPositions.contains( key ) ) { // fixed the reference headword ,to avoid the headword collision with other entry . //if(!allHeadwordPositions.contains(key)) head.headword = QString( "r%1At%2" ).arg( pos.page ).arg( pos.offset ); - allRefPositions[ key ] = true; + //allRefPositions[ key ] = true;  NOTE(review): the key is no longer inserted here -- confirm allRefPositions is populated elsewhere. try { @@ -1080,6 +1196,10 @@ void EpwingBook::fixHeadword( QString & headword ) //if( isHeadwordCorrect( fixed ) ) // headword = fixed; + // Remove leading digits and whitespace. 
+ static const QRegularExpression leadingNumAndSpace( R"(^[\d\s]+\b)" ); /* built once instead of on every fixHeadword call */ + fixed.remove( leadingNumAndSpace ); + headword = fixed; } @@ -1087,28 +1207,30 @@ void EpwingBook::getArticle( QString & headword, QString & articleText, int page, int offset, bool text_only) { error_string.clear(); - char buffer[ TextBufferSize + 1 ]; - EB_Position pos; - pos.page = page; - pos.offset = offset; + seekBookThrow( page, offset ); - currentPosition = pos; + readHeadword( headword, text_only ); - EB_Error_Code ret = eb_seek_text( &book, &pos ); - if( ret != EB_SUCCESS ) - { - setErrorString( "eb_seek_text", ret ); - throw exEbLibrary( error_string.toUtf8().data() ); - } + QString hw = Html::unescape( headword, true ); + fixHeadword( hw ); + auto parts = hw.split( QChar::Space, Qt::SkipEmptyParts ); /* NOTE(review): hw and parts are not used after this line in getArticle -- confirm whether this is leftover code */ + articleText = getText( page, offset, text_only ); + +} + +/* Reads the heading at the current book position into headword; when text_only is set the result is additionally passed through fixHeadword. */ void EpwingBook::readHeadword(QString & headword, bool text_only) +{ EContainer container( this, text_only ); ssize_t length; prepareToRead(); - ret = eb_read_heading( &book, &appendix, &hookSet, &container, - TextBufferSize, buffer, &length ); + char buffer[ TextBufferSize + 1 ]; + + EB_Error_Code ret = eb_read_heading( &book, &appendix, &hookSet, &container, + TextBufferSize, buffer, &length ); if( ret != EB_SUCCESS ) { setErrorString( "eb_read_heading", ret ); @@ -1120,8 +1242,35 @@ void EpwingBook::getArticle( QString & headword, QString & articleText, if( text_only ) fixHeadword( headword ); +} - articleText = getText( pos.page, pos.offset, text_only); +/* Reads the headword at (page, offset), then up to roughly 4000 bytes of following text via getTextWithLength, and returns the position reported by that call. */ EB_Position EpwingBook::getArticleNextPage( + QString & headword, QString & articleText, + int page, int offset, bool text_only) +{ + error_string.clear(); + + seekBookThrow( page, offset ); + + readHeadword( headword, text_only ); + + EB_Position pos; pos.page = page; pos.offset = offset; /* defined start value: getTextWithLength can return early without reporting a position */ + articleText = getTextWithLength( page, offset, 4000, pos); + return pos; +} + +/* Reads the headword at (page, offset), then roughly 4000 bytes of preceding text via getPreviousTextWithLength, and returns the position reported by that call. */ EB_Position EpwingBook::getArticlePreviousPage( + QString & headword, QString & articleText, int page, int offset, bool text_only ) +{ + 
error_string.clear(); + + seekBookThrow( page, offset ); + + readHeadword( headword, text_only ); + + EB_Position pos; pos.page = page; pos.offset = offset; /* defined start value; getPreviousTextWithLength overwrites it through eb_tell_text */ + articleText = getPreviousTextWithLength( page, offset, 4000, pos ); + return pos; } const char * EpwingBook::beginDecoration( unsigned int code ) diff --git a/epwing_book.hh b/epwing_book.hh index 25b29c92..e0a8054d 100644 --- a/epwing_book.hh +++ b/epwing_book.hh @@ -100,6 +100,9 @@ class EpwingBook // Retrieve article text from dictionary QString getText( int page, int offset, bool text_only); + void seekBookThrow( int page, int offset ); + QString getTextWithLength( int page, int offset, int total, EB_Position & pos ); + QString getPreviousTextWithLength( int page, int offset, int total, EB_Position & pos ); unsigned int normalizeDecorationCode( unsigned int code ); @@ -196,7 +199,11 @@ public: // Retrieve article from dictionary void getArticle( QString & headword, QString & articleText, int page, int offset, bool text_only ); + void readHeadword( QString & headword, bool text_only); + EB_Position getArticleNextPage( QString & headword, QString & articleText, + int page, int offset, bool text_only ); + EB_Position getArticlePreviousPage( QString & headword, QString & articleText, int page, int offset, bool text_only ); const char * beginDecoration( unsigned int code ); const char * endDecoration( unsigned int code );