epwing: skip reference link processing when parsing the dictionary

This commit is contained in:
Xiao YiFang 2023-03-22 22:52:49 +08:00 committed by xiaoyifang
parent 22f9d5d832
commit ad087b1031
3 changed files with 24 additions and 59 deletions

View file

@ -203,6 +203,7 @@ private:
friend class EpwingWordSearchRequest;
string epwing_previous_button(int& articleOffset, int& articlePage);
string epwing_next_button(int& articleOffset, int& articlePage);
bool readHeadword( EB_Position & pos, QString & headword );
};
@ -1043,6 +1044,20 @@ sptr< Dictionary::WordSearchRequest > EpwingDictionary::stemmedMatch(
return std::make_shared<EpwingWordSearchRequest>( *this, str, minLength, (int)maxSuffixVariation,
false, maxResults );
}
// Reads the heading located at `pos` into `headword`, normalizes it with
// eBook.fixHeadword() and reports whether the result passes
// eBook.isHeadwordCorrect().
//
// pos      - position passed through to eBook.readHeadword().
// headword - receives the heading text; it may have been modified even when
//            this function returns false.
//
// Returns false when the read reports failure, when the fixed headword is
// rejected by isHeadwordCorrect(), or when any std::exception is thrown along
// the way. No exception derived from std::exception escapes this function.
bool Epwing::EpwingDictionary::readHeadword( EB_Position & pos, QString & headword )
{
  try
  {
    // All e-book calls below are made while holding the book's library mutex
    // (Mutex::Lock is presumably a scoped lock released on scope exit).
    Mutex::Lock _( eBook.getLibMutex() );

    // The third argument is the `text_only` flag. Honor the result: if the
    // read did not succeed there is nothing meaningful to fix or validate.
    if( !eBook.readHeadword( pos, headword, true ) )
      return false;

    eBook.fixHeadword( headword );
    return eBook.isHeadwordCorrect( headword );
  }
  catch( std::exception const & )
  {
    // Best effort: an unreadable heading is simply reported as "no headword".
    return false;
  }
}
} // anonymous namespace
@ -1258,11 +1273,6 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
break;
}
while( dict.processRef( head ) )
{
addWordToChunks( head, chunks, indexedWords, wordCount, articleCount );
}
dict.clearBuffers();
// Finish with the chunks

View file

@ -1002,8 +1002,6 @@ bool EpwingBook::getNextHeadword( EpwingHeadword & head )
indexHeadwordsPosition = pos;
head.page = pos.page;
head.offset = pos.offset;
@ -1033,54 +1031,6 @@ bool EpwingBook::getNextHeadword( EpwingHeadword & head )
return true;
}
// Consumes queued reference-link positions from the back of LinksQueue until
// one of them points at a position already recorded in allHeadwordPositions.
//
// For such a position, `head` is filled with the page/offset and a synthetic
// headword of the form "r<page>at<offset>", the references found in the text
// at that position are collected via getReferencesFromText(), and true is
// returned. Positions whose heading cannot be read, is empty, or matches the
// "bad link" pattern are dropped. Returns false once the queue is exhausted.
//
// Note: `head` may be partially overwritten (headword, page, offset) by
// candidates that end up being rejected.
bool EpwingBook::processRef( EpwingHeadword & head)
{
  QRegularExpression badLinks( "#(v|n)\\d", QRegularExpression::UseUnicodePropertiesOption );
  EB_Position pos;

  while( !LinksQueue.isEmpty() )
  {
    // Take the most recently queued link target.
    const EWPos target = LinksQueue.last();
    LinksQueue.pop_back();

    pos.page   = target.first;
    pos.offset = target.second;

    // EPWING books navigate through reference links; the heading found at a
    // link target is usually not a meaningful headword by itself.
    if( !readHeadword( pos, head.headword, true ) )
      continue;

    if( head.headword.isEmpty() || head.headword.contains( badLinks ) )
      continue;

    fixHeadword( head.headword );
    head.page   = pos.page;
    head.offset = pos.offset;

    // Page in the upper 32 bits, offset OR-ed into the lower part.
    const uint64_t key = ( static_cast< uint64_t >( pos.page ) << 32 ) | ( pos.offset );

    // Only reference targets that coincide with an already known headword
    // position are emitted; this keeps the "is this a real reference link"
    // decision simple when the article is loaded later.
    if( !allHeadwordPositions.contains( key ) )
      continue;

    // Use a synthetic name so the reference entry cannot collide with the
    // headword of a regular entry.
    head.headword = QString( "r%1at%2" ).arg( pos.page ).arg( pos.offset );

    try
    {
      getReferencesFromText( pos.page, pos.offset);
    }
    catch( std::exception & )
    {
      // Failing to collect further references is deliberately ignored.
    }

    return true;
  }

  return false;
}
bool EpwingBook::readHeadword( EB_Position const& pos,
QString & headword,
bool text_only )
@ -1199,6 +1149,13 @@ void EpwingBook::fixHeadword( QString & headword )
QRegularExpression leadingNumAndSpace( R"(^[\d\s]+\b)" );
fixed.remove( leadingNumAndSpace );
auto parts = fixed.split( ' ', Qt::SkipEmptyParts );
if( parts.size() > 2 ) {
// keep only the first two parts to avoid duplicated headwords
headword = QString( "%1 %2" ).arg( parts[ 0 ], parts[ 1 ] );
return;
}
headword = fixed;
}

View file

@ -186,8 +186,6 @@ public:
// Find next headword and article position
bool getNextHeadword( EpwingHeadword & head );
bool processRef( EpwingHeadword & head );
bool readHeadword( EB_Position const & pos,
QString & headword,
bool text_only );