fix: skip invalid headword in some epwing dictionaries (#1096)

* fix: for some epwing dictionaries, skip invalid headwords

fix #1095

* [autofix.ci] apply automated fixes

* fix: epwing detect next text block

* [autofix.ci] apply automated fixes

* fix: epwing getFirstHeadword does not need forwardText

* fix: epwing, if an epwing subbook does not contain headwords, use the menu instead

* [autofix.ci] apply automated fixes

* fix: if an epwing subbook does not contain headwords, use the menu instead

* [autofix.ci] apply automated fixes

* fix: code smells

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
xiaoyifang 2023-08-30 19:12:59 +08:00 committed by GitHub
parent 18c25b36ee
commit 2dd04207d1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 136 additions and 30 deletions

View file

@ -1220,16 +1220,26 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
ChunkedStorage::Writer chunks( idx );
Epwing::Book::EpwingHeadword head;
dict.getFirstHeadword( head );
int wordCount = 0;
int articleCount = 0;
for ( ;; ) {
addWordToChunks( head, chunks, indexedWords, wordCount, articleCount );
if ( !dict.getNextHeadword( head ) )
break;
if ( dict.getFirstHeadword( head ) ) {
for ( ;; ) {
addWordToChunks( head, chunks, indexedWords, wordCount, articleCount );
if ( !dict.getNextHeadword( head ) )
break;
}
}
else {
// The book does not contain text; use the menu instead, if any.
if ( dict.getMenu( head ) ) {
auto candidateItems = dict.candidate( head.page, head.offset );
for ( Epwing::Book::EpwingHeadword word : candidateItems ) {
addWordToChunks( word, chunks, indexedWords, wordCount, articleCount );
}
}
else {
throw exEbLibrary( dict.errorString().toUtf8().data() );
}
}
dict.clearBuffers();

View file

@ -61,6 +61,7 @@ HookFunc( hook_mpeg );
HookFunc( hook_narrow_font );
HookFunc( hook_wide_font );
HookFunc( hook_reference );
HookFunc( hook_candidate );
const EB_Hook hooks[] = { { EB_HOOK_NEWLINE, hook_newline },
{ EB_HOOK_ISO8859_1, hook_iso8859_1 },
@ -89,6 +90,7 @@ const EB_Hook hooks[] = { { EB_HOOK_NEWLINE, hook_newline },
{ EB_HOOK_WIDE_FONT, hook_wide_font },
{ EB_HOOK_BEGIN_REFERENCE, hook_reference },
{ EB_HOOK_END_REFERENCE, hook_reference },
{ EB_HOOK_END_CANDIDATE_GROUP, hook_candidate },
{ EB_HOOK_NULL, NULL } };
const EB_Hook refHooks[] = {
@ -377,6 +379,21 @@ hook_reference( EB_Book * book, EB_Appendix *, void * container, EB_Hook_Code co
return EB_SUCCESS;
}
/// EB text hook registered for EB_HOOK_END_CANDIDATE_GROUP in the hooks table.
///
/// @param book       EB book handle that receives any text produced by the handler.
/// @param container  Opaque pointer that is cast to the EContainer for this render.
/// @param code       Hook code, forwarded unchanged to EpwingBook::handleCandidate().
/// @param argv       Hook arguments, forwarded unchanged to EpwingBook::handleCandidate().
/// @return Always EB_SUCCESS.
EB_Error_Code
hook_candidate( EB_Book * book, EB_Appendix *, void * container, EB_Hook_Code code, int, const unsigned int * argv )
{
  auto * ctx = static_cast< EContainer * >( container );

  // In text-only mode the handler is not invoked at all.
  if ( !ctx->textOnly ) {
    QByteArray replacement = ctx->book->handleCandidate( code, argv );

    // Only emit text when the handler actually produced some.
    if ( !replacement.isEmpty() ) {
      eb_write_text( book, replacement.data(), replacement.size() );
    }
  }

  return EB_SUCCESS;
}
// EpwingBook class
EpwingBook::EpwingBook():
@ -564,7 +581,7 @@ QString EpwingBook::createCacheDir( QString const & dirName )
if ( !info.exists() || !info.isDir() ) {
if ( !dir.mkdir( mainCacheDir ) ) {
gdWarning( "Epwing: can't create cache directory \"%s\"", mainCacheDir.toUtf8().data() );
return QString();
return {};
}
}
@ -573,7 +590,7 @@ QString EpwingBook::createCacheDir( QString const & dirName )
if ( !info.exists() || !info.isDir() ) {
if ( !dir.mkdir( cacheDir ) ) {
gdWarning( "Epwing: can't create cache directory \"%s\"", cacheDir.toUtf8().data() );
return QString();
return {};
}
}
return cacheDir;
@ -649,7 +666,7 @@ QString EpwingBook::title()
if ( codec_Euc )
return codec_Euc->toUnicode( buf );
return QString();
return {};
}
QString EpwingBook::copyright()
@ -657,7 +674,7 @@ QString EpwingBook::copyright()
error_string.clear();
if ( !eb_have_copyright( &book ) )
return QString();
return {};
EB_Position position;
EB_Error_Code ret = eb_copyright( &book, &position );
@ -671,9 +688,18 @@ QString EpwingBook::copyright()
return getText( position.page, position.offset, true );
}
/// Returns the candidate entries accumulated in candidateItems while the text
/// at the given position is processed.
///
/// @param page    Page of the text to process.
/// @param offset  Offset of the text within that page.
/// @return A copy of candidateItems after getText() has run.
QList< EpwingHeadword > EpwingBook::candidate( int page, int offset )
{
  // Start from an empty list (getText() clears candidateItems on entry as well).
  candidateItems.clear();

  // The returned text is discarded; the call is made for its effect on
  // candidateItems, so text-only mode is explicitly disabled.
  const bool textOnly = false;
  getText( page, offset, textOnly );

  return candidateItems;
}
QString EpwingBook::getText( int page, int offset, bool text_only )
{
error_string.clear();
candidateItems.clear();
seekBookThrow( page, offset );
@ -701,7 +727,7 @@ QString EpwingBook::getText( int page, int offset, bool text_only )
if ( buf.length() > TextSizeLimit ) {
error_string = "Data too large";
currentPosition.page = 0;
return QString();
return {};
}
}
@ -822,7 +848,6 @@ QString EpwingBook::getPreviousTextWithLength( int page, int offset, int total,
return text;
}
void EpwingBook::getReferencesFromText( int page, int offset )
{
error_string.clear();
@ -872,15 +897,14 @@ EB_Error_Code EpwingBook::forwardText( EB_Position & startPos )
}
ret = eb_forward_text( &book, &appendix );
while ( ret == EB_ERR_END_OF_CONTENT ) {
ret = eb_tell_text( &book, &startPos );
if ( ret != EB_SUCCESS )
break;
while ( ret != EB_SUCCESS ) {
if ( startPos.page >= book.subbook_current->text.end_page )
return EB_ERR_END_OF_CONTENT;
startPos.offset += 2;
const auto offset = startPos.offset + 2;
startPos.offset = offset % EB_SIZE_PAGE;
startPos.page += offset / EB_SIZE_PAGE;
currentPosition = startPos;
ret = eb_seek_text( &book, &startPos );
@ -891,7 +915,7 @@ EB_Error_Code EpwingBook::forwardText( EB_Position & startPos )
return ret;
}
void EpwingBook::getFirstHeadword( EpwingHeadword & head )
bool EpwingBook::getFirstHeadword( EpwingHeadword & head )
{
error_string.clear();
@ -900,13 +924,15 @@ void EpwingBook::getFirstHeadword( EpwingHeadword & head )
EB_Error_Code ret = eb_text( &book, &pos );
if ( ret != EB_SUCCESS ) {
setErrorString( "eb_text", ret );
throw exEbLibrary( error_string.toUtf8().data() );
qWarning() << error_string;
return false;
}
ret = forwardText( pos );
if ( ret != EB_SUCCESS ) {
setErrorString( "forwardText", ret );
throw exEbLibrary( error_string.toUtf8().data() );
setErrorString( "getFirstHeadword", ret );
qWarning() << error_string;
return false;
}
eb_backward_text( &book, &appendix );
@ -914,7 +940,49 @@ void EpwingBook::getFirstHeadword( EpwingHeadword & head )
ret = eb_tell_text( &book, &pos );
if ( ret != EB_SUCCESS ) {
setErrorString( "eb_tell_text", ret );
throw exEbLibrary( error_string.toUtf8().data() );
qWarning() << error_string;
return false;
}
currentPosition = pos;
indexHeadwordsPosition = pos;
head.page = pos.page;
head.offset = pos.offset;
if ( !readHeadword( pos, head.headword, true ) ) {
qWarning() << error_string;
return false;
}
fixHeadword( head.headword );
allHeadwordPositions[ ( (uint64_t)pos.page ) << 32 | ( pos.offset ) ] = true;
return true;
}
/// Reports whether eb_have_menu() returns 1 for the current book.
/// Clears error_string before querying the library.
bool EpwingBook::haveMenu()
{
  error_string.clear();

  return eb_have_menu( &book ) == 1;
}
bool EpwingBook::getMenu( EpwingHeadword & head )
{
error_string.clear();
if ( !haveMenu() ) {
return false;
}
EB_Position pos;
EB_Error_Code ret = eb_menu( &book, &pos );
if ( ret != EB_SUCCESS ) {
setErrorString( "getMenu", ret );
return false;
}
currentPosition = pos;
@ -924,19 +992,18 @@ void EpwingBook::getFirstHeadword( EpwingHeadword & head )
head.offset = pos.offset;
if ( !readHeadword( pos, head.headword, true ) )
throw exEbLibrary( error_string.toUtf8().data() );
return false;
fixHeadword( head.headword );
EWPos epos( pos.page, pos.offset );
allHeadwordPositions[ ( (uint64_t)pos.page ) << 32 | ( pos.offset ) ] = true;
return true;
}
bool EpwingBook::getNextHeadword( EpwingHeadword & head )
{
EB_Position pos;
// No queued positions - forward to next article
error_string.clear();
@ -965,8 +1032,10 @@ bool EpwingBook::getNextHeadword( EpwingHeadword & head )
head.page = pos.page;
head.offset = pos.offset;
if ( !readHeadword( pos, head.headword, true ) )
throw exEbLibrary( error_string.toUtf8().data() );
if ( !readHeadword( pos, head.headword, true ) ) {
qDebug() << "Epwing: ignore the following error=> " << error_string;
continue;
}
if ( head.headword.isEmpty() )
continue;
@ -1766,6 +1835,27 @@ QByteArray EpwingBook::handleReference( EB_Hook_Code code, const unsigned int *
return str.toUtf8();
}
/// Records one candidate entry in candidateItems.
///
/// The entry's headword comes from currentCandidate(); its page and offset are
/// read from argv[ 1 ] and argv[ 2 ] respectively.
///
/// @param code  Hook code; not used by this handler.
/// @param argv  Hook arguments; elements 1 and 2 are read.
/// @return Always an empty QByteArray, so the caller writes no text.
QByteArray EpwingBook::handleCandidate( EB_Hook_Code code, const unsigned int * argv )
{
  EpwingHeadword entry;
  entry.page     = argv[ 1 ];
  entry.offset   = argv[ 2 ];
  entry.headword = currentCandidate();

  candidateItems.append( entry );

  return {};
}
/// Returns the string reported by eb_current_candidate() decoded to a QString.
///
/// Books whose character code is EB_CHARCODE_ISO8859_1 are decoded as Latin-1.
/// Otherwise codec_Euc is used when it is set, and an empty string is returned
/// when it is not.
QString EpwingBook::currentCandidate()
{
  const char * raw = eb_current_candidate( &book );

  if ( book.character_code != EB_CHARCODE_ISO8859_1 ) {
    // Non-Latin-1 book: decoding is only possible when the codec is available.
    return codec_Euc ? codec_Euc->toUnicode( raw ) : QString{};
  }

  return QString::fromLatin1( raw );
}
bool EpwingBook::getMatches( QString word, QVector< QString > & matches )
{
QByteArray bword, bword2;

View file

@ -85,6 +85,7 @@ class EpwingBook
QVector< EWPos > LinksQueue;
int refOpenCount, refCloseCount;
static QMutex libMutex;
QList< EpwingHeadword > candidateItems;
QString createCacheDir( QString const & dir );
@ -184,6 +185,8 @@ public:
// Make name for resource
QString makeFName( QString const & ext, int page, int offset ) const;
QByteArray handleCandidate( EB_Hook_Code code, const unsigned * argv );
QString currentCandidate();
// Store all files in Epwing folder
static void collectFilenames( QString const & directory, vector< string > & files );
@ -199,10 +202,13 @@ public:
QString getCurrentSubBookDirectory();
QString copyright();
QList< EpwingHeadword > candidate( int page, int offset );
QString title();
// Seek to first article
void getFirstHeadword( EpwingHeadword & head );
bool getFirstHeadword( EpwingHeadword & head );
bool haveMenu();
bool getMenu( EpwingHeadword & head );
// Find next headword and article position
bool getNextHeadword( EpwingHeadword & head );