Merge pull request #162 from xiaoyifang/fix/epwing-2

fix: epwing reference process logic
This commit is contained in:
xiaoyifang 2022-10-04 21:10:15 +08:00 committed by GitHub
commit a9b3a64f05
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 204 additions and 187 deletions

233
epwing.cc
View file

@ -169,8 +169,12 @@ protected:
private:
/// Loads the article.
void loadArticle( quint32 address, string & articleHeadword,
string & articleText, int & articlePage, int & articleOffset );
void loadArticle( quint32 address,
string & articleHeadword,
string & articleText,
int & articlePage,
int & articleOffset,
QString word = 0 );
void loadArticle( int articlePage, int articleOffset, string & articleHeadword,
string & articleText );
@ -284,10 +288,15 @@ void EpwingDictionary::removeDirectory( QString const & directory )
}
void EpwingDictionary::loadArticle( quint32 address,
string & articleHeadword,
string & articleText,
int & articlePage,
int & articleOffset )
int & articleOffset,
QString word)
{
vector< char > chunk;
@ -307,7 +316,7 @@ void EpwingDictionary::loadArticle( quint32 address,
try
{
Mutex::Lock _( eBook.getLibMutex() );
eBook.getArticle( headword, text, articlePage, articleOffset, false );
eBook.getArticle( headword, text, articlePage, articleOffset, false, word );
}
catch( std::exception & e )
{
@ -521,8 +530,16 @@ void EpwingArticleRequest::run()
try
{
dict.loadArticle( chain[ x ].articleOffset, headword, articleText,
articlePage, articleOffset );
dict.loadArticle( chain[ x ].articleOffset,
headword,
articleText,
articlePage,
articleOffset,
gd::toQString(word) );
}
catch(...)
{
@ -939,6 +956,107 @@ sptr< Dictionary::WordSearchRequest > EpwingDictionary::stemmedMatch(
} // anonymous namespace
/// Stores one headword's article position in chunked storage and indexes it.
///
/// Does nothing when `head.headword` is empty. Otherwise:
///  - starts a new chunk block containing `head.page` followed by
///    `head.offset`;
///  - indexes the complete headword against that block's offset and
///    increments both `wordCount` and `articleCount`;
///  - scans the headword for runs of characters that share the same positive
///    EpwingDictionary::japaneseWriting() value (combined
///    kanji/katakana/hiragana headwords). If more than one delimited run is
///    found and no two runs start with the same writing value, every run is
///    indexed as well (incrementing `wordCount` only).
///
/// @param head          Headword text plus its page/offset position.
/// @param chunks        Chunk writer receiving the page/offset pair.
/// @param indexedWords  Index receiving the headword and its sub-words.
/// @param wordCount     Incremented once per indexed word.
/// @param articleCount  Incremented once per non-empty headword.
void addWordToChunks( Epwing::Book::EpwingHeadword & head,
                      ChunkedStorage::Writer & chunks,
                      BtreeIndexing::IndexedWords & indexedWords,
                      int & wordCount,
                      int & articleCount )
{
  if( head.headword.isEmpty() )
    return;

  uint32_t offset = chunks.startNewBlock();
  chunks.addToBlock( &head.page, sizeof( head.page ) );
  chunks.addToBlock( &head.offset, sizeof( head.offset ) );

  wstring hw = gd::toWString( head.headword );

  indexedWords.addWord( hw, offset );
  wordCount++;
  articleCount++;

  vector< wstring > words;

  // Parse combined kanji/katakana/hiragana headwords
  int w_prev = 0;
  wstring word;
  for( wstring::size_type n = 0; n < hw.size(); n++ )
  {
    gd::wchar ch = hw[ n ];

    // Separators and signs never start a word.
    if( Folding::isPunct( ch ) || Folding::isWhitespace( ch ) || EpwingDictionary::isSign( ch )
        || EpwingDictionary::isJapanesePunctiation( ch ) )
      continue;

    int w = EpwingDictionary::japaneseWriting( ch );

    if( w > 0 )
    {
      // Store only separated words: the run must begin at the start of the
      // headword or directly after a separator. The Japanese-punctuation
      // test has to look at the previous character; `ch` itself can never be
      // punctuation here because of the filter above.
      gd::wchar ch_prev = 0;
      if( n )
        ch_prev = hw[ n - 1 ];
      bool needStore = ( n == 0 || Folding::isPunct( ch_prev ) || Folding::isWhitespace( ch_prev )
                         || EpwingDictionary::isJapanesePunctiation( ch_prev ) );

      word.push_back( ch );
      w_prev = w;

      // Extend the run while the writing value stays the same and no
      // separator is met.
      wstring::size_type i;
      for( i = n + 1; i < hw.size(); i++ )
      {
        ch = hw[ i ];
        if( Folding::isPunct( ch ) || Folding::isWhitespace( ch ) || EpwingDictionary::isJapanesePunctiation( ch ) )
          break;
        w = EpwingDictionary::japaneseWriting( ch );
        if( w != w_prev )
          break;
        word.push_back( ch );
      }

      if( needStore )
      {
        // The run must also end at the end of the headword or at a separator
        // (not merely at a change of writing value).
        if( i >= hw.size() || Folding::isPunct( ch ) || Folding::isWhitespace( ch )
            || EpwingDictionary::isJapanesePunctiation( ch ) )
          words.push_back( word );
      }

      word.clear();

      // Resume after the character that ended the run; the loop increment
      // steps past hw[ i ] itself.
      if( i < hw.size() )
        n = i;
      else
        break;
    }
  }

  if( words.size() > 1 )
  {
    // Allow only one word in every charset
    // NOTE(review): assumes japaneseWriting() only returns values in [0, 3];
    // confirm against its definition, since the value indexes this array.
    size_t n;
    int writings[ 4 ] = {};

    for( n = 0; n < words.size(); n++ )
    {
      int w = EpwingDictionary::japaneseWriting( words[ n ][ 0 ] );
      if( writings[ w ] )
        break;
      else
        writings[ w ] = 1;
    }

    // n reaches words.size() only when no writing value was seen twice.
    if( n >= words.size() )
    {
      for( n = 0; n < words.size(); n++ )
      {
        indexedWords.addWord( words[ n ], offset );
        wordCount++;
      }
    }
  }
}
vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames,
string const & indicesDir,
@ -1045,107 +1163,16 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
for( ; ; )
{
if( !head.headword.isEmpty() )
{
uint32_t offset = chunks.startNewBlock();
chunks.addToBlock( &head.page, sizeof( head.page ) );
chunks.addToBlock( &head.offset, sizeof( head.offset ) );
wstring hw = gd::toWString( head.headword );
indexedWords.addWord( hw, offset );
wordCount++;
articleCount++;
vector< wstring > words;
// Parse combined kanji/katakana/hiragana headwords
int w_prev = 0;
wstring word;
for( wstring::size_type n = 0; n < hw.size(); n++ )
{
gd::wchar ch = hw[ n ];
if( Folding::isPunct( ch ) || Folding::isWhitespace( ch )
|| EpwingDictionary::isSign( ch ) || EpwingDictionary::isJapanesePunctiation( ch ) )
continue;
int w = EpwingDictionary::japaneseWriting( ch );
if( w > 0 )
{
// Store only separated words
gd::wchar ch_prev = 0;
if( n )
ch_prev = hw[ n - 1 ];
bool needStore = ( n == 0
|| Folding::isPunct( ch_prev )
|| Folding::isWhitespace( ch_prev )
|| EpwingDictionary::isJapanesePunctiation( ch ) );
word.push_back( ch );
w_prev = w;
wstring::size_type i;
for( i = n + 1; i < hw.size(); i++ )
{
ch = hw[ i ];
if( Folding::isPunct( ch ) || Folding::isWhitespace( ch )
|| EpwingDictionary::isJapanesePunctiation( ch ) )
break;
w = EpwingDictionary::japaneseWriting( ch );
if( w != w_prev )
break;
word.push_back( ch );
}
if( needStore )
{
if( i >= hw.size() || Folding::isPunct( ch ) || Folding::isWhitespace( ch )
|| EpwingDictionary::isJapanesePunctiation( ch ) )
words.push_back( word );
}
word.clear();
if( i < hw.size() )
n = i;
else
break;
}
}
if( words.size() > 1 )
{
// Allow only one word in every charset
size_t n;
int writings[ 4 ];
memset( writings, 0, sizeof(writings) );
for( n = 0; n < words.size(); n++ )
{
int w = EpwingDictionary::japaneseWriting( words[ n ][ 0 ] );
if( writings[ w ] )
break;
else
writings[ w ] = 1;
}
if( n >= words.size() )
{
for( n = 0; n < words.size(); n++ )
{
indexedWords.addWord( words[ n ], offset );
wordCount++;
}
}
}
}
addWordToChunks( head, chunks, indexedWords, wordCount, articleCount );
if( !dict.getNextHeadword( head ) )
break;
}
while( dict.processRef( head ) )
{
addWordToChunks( head, chunks, indexedWords, wordCount, articleCount );
}
dict.clearBuffers();
// Finish with the chunks

View file

@ -2,14 +2,23 @@
#define __EPWING_HH__INCLUDED__
#include "dictionary.hh"
#include "epwing_book.hh"
#include "btreeidx.hh"
#include "chunkedstorage.hh"
/// Support for the Epwing dictionaries.
namespace Epwing {
using std::vector;
using std::string;
vector< sptr< Dictionary::Class > > makeDictionaries(
void addWordToChunks( Epwing::Book::EpwingHeadword & head,
ChunkedStorage::Writer & chunks,
BtreeIndexing::IndexedWords & indexedWords,
int & wordCount,
int & articleCount );
vector< sptr< Dictionary::Class > >
makeDictionaries(
vector< string > const & fileNames,
string const & indicesDir,
Dictionary::Initializing & )

View file

@ -6,6 +6,7 @@
#include <QDir>
#include <QTextStream>
#include <QTextDocumentFragment>
#include <QHash>
#include "gddebug.hh"
#include "fsencoding.hh"
#include "audiolink.hh"
@ -732,7 +733,7 @@ QString EpwingBook::getText( int page, int offset, bool text_only )
}
QString text = QString::fromUtf8( buf.data(), buf.size() ).trimmed();
finalizeText( text );
finalizeText( text);
return text;
}
@ -776,7 +777,9 @@ void EpwingBook::getReferencesFromText( int page, int offset )
}
for( int x = 0; x < refPages.size(); x++ )
{
LinksQueue.push_back( EWPos( refPages[ x ], refOffsets[ x ] ) );
}
}
EB_Error_Code EpwingBook::forwardText( EB_Position & startPos )
@ -850,60 +853,13 @@ void EpwingBook::getFirstHeadword( EpwingHeadword & head )
fixHeadword( head.headword );
EWPos epos( pos.page, pos.offset );
allHeadwordPositions[ head.headword ] << epos;
allHeadwordPositions[ ((uint64_t)pos.page)<<32|(pos.offset>>2) ] =true;
}
bool EpwingBook::getNextHeadword( EpwingHeadword & head )
{
EB_Position pos;
QRegularExpression badLinks( "#(v|n)\\d", QRegularExpression::UseUnicodePropertiesOption);
// At first we check references queue
while( !LinksQueue.isEmpty() )
{
EWPos epos = LinksQueue.last();
LinksQueue.pop_back();
pos.page = epos.first;
pos.offset = epos.second;
if( readHeadword( pos, head.headword, true ) )
{
if( head.headword.isEmpty()
|| head.headword.contains( badLinks ) )
continue;
fixHeadword( head.headword );
head.page = pos.page;
head.offset = pos.offset;
if( allHeadwordPositions.contains( head.headword ) )
{
// existed position
bool existed = false;
foreach( EWPos epos, allHeadwordPositions[ head.headword ] )
{
if( pos.page == epos.first && abs( pos.offset - epos.second ) <= 4 )
{
existed = true;
break;
}
}
if( !existed )
{
allHeadwordPositions[ head.headword ] << EWPos( pos.page, pos.offset );
return true;
}
}
else
{
allHeadwordPositions[ head.headword ] << EWPos( pos.page, pos.offset );
return true;
}
}
}
// No queued positions - forward to next article
@ -934,13 +890,7 @@ bool EpwingBook::getNextHeadword( EpwingHeadword & head )
indexHeadwordsPosition = pos;
try
{
getReferencesFromText( pos.page, pos.offset );
}
catch( std::exception & )
{
}
head.page = pos.page;
head.offset = pos.offset;
@ -953,27 +903,17 @@ bool EpwingBook::getNextHeadword( EpwingHeadword & head )
fixHeadword( head.headword );
if( allHeadwordPositions.contains( head.headword ) )
try
{
// existed position
bool existed = false;
foreach( EWPos epos, allHeadwordPositions[ head.headword ] )
{
if( pos.page == epos.first && abs( pos.offset - epos.second ) <= 4 )
{
existed = true;
break;
}
}
if( !existed )
{
allHeadwordPositions[ head.headword ] << EWPos( pos.page, pos.offset );
return true;
}
getReferencesFromText( pos.page, pos.offset);
}
else
catch( std::exception & )
{
allHeadwordPositions[ head.headword ] << EWPos( pos.page, pos.offset );
}
if( !allHeadwordPositions.contains( ((uint64_t)pos.page) << 32 | ( pos.offset / 4 ) ) )
{
allHeadwordPositions[ ((uint64_t)pos.page) << 32 | ( pos.offset / 4 ) ] = true;
return true;
}
}
@ -981,6 +921,43 @@ bool EpwingBook::getNextHeadword( EpwingHeadword & head )
return true;
}
/// Takes queued reference positions off the back of LinksQueue until one
/// produces a usable entry that has not been returned before.
///
/// A queued position is skipped when readHeadword() fails for it, when the
/// headword it yields is empty or matches the "#(v|n)\d" pattern, or when its
/// position key is already present in allRefPositions. The key combines the
/// page (upper 32 bits) with the offset shifted right by two, so offsets on
/// the same page that differ only in their two lowest bits share a key.
///
/// On success `head.page` / `head.offset` hold the position and
/// `head.headword` is replaced by the synthetic name "r<page>At<offset>".
///
/// @param head  Receives the reference entry. It may be partially
///              overwritten even when the function returns false.
/// @return true when a new reference entry was stored in `head`, false once
///         the queue has been emptied without finding one.
bool EpwingBook::processRef( EpwingHeadword & head )
{
  // Built once: the pattern is constant and this function is called once per
  // reference while indexing.
  static const QRegularExpression badLinks( "#(v|n)\\d", QRegularExpression::UseUnicodePropertiesOption );

  EB_Position pos;

  while( !LinksQueue.isEmpty() )
  {
    EWPos epos = LinksQueue.last();
    LinksQueue.pop_back();

    pos.page = epos.first;
    pos.offset = epos.second;

    if( !readHeadword( pos, head.headword, true ) )
      continue;

    if( head.headword.isEmpty() || head.headword.contains( badLinks ) )
      continue;

    fixHeadword( head.headword );

    head.page = pos.page;
    head.offset = pos.offset;

    const uint64_t key = static_cast< uint64_t >( pos.page ) << 32 | ( pos.offset >> 2 );
    if( allRefPositions.contains( key ) )
      continue;

    // Use a position-derived headword for the reference so that it cannot
    // collide with the headword of another entry.
    head.headword = QString( "r%1At%2" ).arg( pos.page ).arg( pos.offset );
    allRefPositions[ key ] = true;
    return true;
  }

  return false;
}
bool EpwingBook::readHeadword( EB_Position const& pos,
QString & headword,
bool text_only )
@ -1094,7 +1071,7 @@ void EpwingBook::fixHeadword( QString & headword )
}
void EpwingBook::getArticle( QString & headword, QString & articleText,
int page, int offset, bool text_only)
int page, int offset, bool text_only, QString word)
{
error_string.clear();
char buffer[ TextBufferSize + 1 ];
@ -1126,12 +1103,12 @@ void EpwingBook::getArticle( QString & headword, QString & articleText,
}
headword = QString::fromUtf8( buffer, length );
finalizeText( headword );
finalizeText( headword);
if( text_only )
fixHeadword( headword );
articleText = getText( pos.page, pos.offset, text_only );
articleText = getText( pos.page, pos.offset, text_only);
}
const char * EpwingBook::beginDecoration( unsigned int code )
@ -1268,7 +1245,7 @@ void EpwingBook::finalizeText( QString & text )
{
QString headword = QString::fromUtf8( buf, length );
fixHeadword( headword );
url.setPath( Utils::Url::ensureLeadingSlash( headword ) );
url.setPath( Utils::Url::ensureLeadingSlash( QString( "r%1At%2" ).arg( ebpos.page ).arg(ebpos.offset) ) );
}
QString link = "<a href=\"" + url.toEncoded() + "\">";

View file

@ -78,7 +78,8 @@ class EpwingBook
QStringList imageCacheList, soundsCacheList, moviesCacheList, fontsCacheList;
QMap< QString, QString > baseFontsMap, customFontsMap;
QVector< int > refPages, refOffsets;
QMap< QString, QList< EWPos > > allHeadwordPositions;
QMap< uint64_t,bool > allHeadwordPositions;
QMap< uint64_t, bool > allRefPositions;
QVector< EWPos > LinksQueue;
int refOpenCount, refCloseCount;
static Mutex libMutex;
@ -98,7 +99,7 @@ class EpwingBook
EB_Error_Code forwardText( EB_Position & startPos );
// Retrieve article text from dictionary
QString getText( int page, int offset, bool text_only );
QString getText( int page, int offset, bool text_only);
unsigned int normalizeDecorationCode( unsigned int code );
@ -151,6 +152,7 @@ public:
/// Empties the pending reference queue and both position-tracking maps.
void clearBuffers()
{
  LinksQueue.clear();
  allRefPositions.clear();
  allHeadwordPositions.clear();
}
@ -181,6 +183,8 @@ public:
// Find next headword and article position
bool getNextHeadword( EpwingHeadword & head );
bool processRef( EpwingHeadword & head );
bool readHeadword( EB_Position const & pos,
QString & headword,
bool text_only );
@ -191,7 +195,7 @@ public:
// Retrieve article from dictionary
void getArticle( QString & headword, QString & articleText,
int page, int offset, bool text_only );
int page, int offset, bool text_only, QString word=0 );
const char * beginDecoration( unsigned int code );
const char * endDecoration( unsigned int code );