mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-23 20:14:05 +00:00
fix: epwing reference process logic
This commit is contained in:
parent
98769b6dd2
commit
769ef7d24b
233
epwing.cc
233
epwing.cc
|
@ -169,8 +169,12 @@ protected:
|
|||
private:
|
||||
|
||||
/// Loads the article.
|
||||
void loadArticle( quint32 address, string & articleHeadword,
|
||||
string & articleText, int & articlePage, int & articleOffset );
|
||||
/// Loads the article stored at @p address.
///
/// @param address          position of the article record (callers pass the
///                         articleOffset taken from the index chain).
/// @param articleHeadword  receives the article headword.
/// @param articleText      receives the article text.
/// @param articlePage      receives the page of the article.
/// @param articleOffset    receives the offset of the article inside the page.
/// @param word             optional word that triggered the lookup; it is
///                         forwarded to EpwingBook::getArticle(). Defaults to
///                         a null QString.
void loadArticle( quint32 address,
                  string & articleHeadword,
                  string & articleText,
                  int & articlePage,
                  int & articleOffset,
                  QString word = QString() );
|
||||
|
||||
void loadArticle( int articlePage, int articleOffset, string & articleHeadword,
|
||||
string & articleText );
|
||||
|
@ -284,10 +288,15 @@ void EpwingDictionary::removeDirectory( QString const & directory )
|
|||
}
|
||||
|
||||
void EpwingDictionary::loadArticle( quint32 address,
|
||||
|
||||
string & articleHeadword,
|
||||
|
||||
string & articleText,
|
||||
|
||||
int & articlePage,
|
||||
int & articleOffset )
|
||||
|
||||
int & articleOffset,
|
||||
QString word)
|
||||
{
|
||||
vector< char > chunk;
|
||||
|
||||
|
@ -307,7 +316,7 @@ void EpwingDictionary::loadArticle( quint32 address,
|
|||
try
|
||||
{
|
||||
Mutex::Lock _( eBook.getLibMutex() );
|
||||
eBook.getArticle( headword, text, articlePage, articleOffset, false );
|
||||
eBook.getArticle( headword, text, articlePage, articleOffset, false, word );
|
||||
}
|
||||
catch( std::exception & e )
|
||||
{
|
||||
|
@ -521,8 +530,16 @@ void EpwingArticleRequest::run()
|
|||
|
||||
try
|
||||
{
|
||||
dict.loadArticle( chain[ x ].articleOffset, headword, articleText,
|
||||
articlePage, articleOffset );
|
||||
dict.loadArticle( chain[ x ].articleOffset,
|
||||
|
||||
headword,
|
||||
|
||||
articleText,
|
||||
|
||||
articlePage,
|
||||
|
||||
articleOffset,
|
||||
gd::toQString(word) );
|
||||
}
|
||||
catch(...)
|
||||
{
|
||||
|
@ -939,6 +956,107 @@ sptr< Dictionary::WordSearchRequest > EpwingDictionary::stemmedMatch(
|
|||
|
||||
} // anonymous namespace
|
||||
|
||||
/// Stores one headword in the chunked storage and registers it in the index.
///
/// Nothing happens when head.headword is empty. Otherwise a new block holding
/// head.page and head.offset is started in @p chunks, the complete headword is
/// added to @p indexedWords pointing at that block, and both counters are
/// incremented.
///
/// The headword is then scanned for runs of characters that share the same
/// positive EpwingDictionary::japaneseWriting() value. Runs that are delimited
/// on both sides (start/end of the headword, punctuation or whitespace) are
/// collected; if there is more than one such run and every run uses a
/// different writing, each run is indexed as an additional word for the same
/// block (only @p wordCount is incremented for those).
///
/// @param head          headword text together with its page/offset.
/// @param chunks        storage that receives the page/offset record.
/// @param indexedWords  index that receives the headword and its parts.
/// @param wordCount     incremented once per word added to the index.
/// @param articleCount  incremented once per non-empty headword.
void addWordToChunks( Epwing::Book::EpwingHeadword & head,
                      ChunkedStorage::Writer & chunks,
                      BtreeIndexing::IndexedWords & indexedWords,
                      int & wordCount,
                      int & articleCount )
{
  if( head.headword.isEmpty() )
    return;

  uint32_t offset = chunks.startNewBlock();
  chunks.addToBlock( &head.page, sizeof( head.page ) );
  chunks.addToBlock( &head.offset, sizeof( head.offset ) );

  wstring hw = gd::toWString( head.headword );

  indexedWords.addWord( hw, offset );
  wordCount++;
  articleCount++;

  // Parse combined kanji/katakana/hiragana headwords

  vector< wstring > words;
  wstring word;

  for( wstring::size_type n = 0; n < hw.size(); n++ )
  {
    gd::wchar ch = hw[ n ];

    if( Folding::isPunct( ch ) || Folding::isWhitespace( ch ) || EpwingDictionary::isSign( ch )
        || EpwingDictionary::isJapanesePunctiation( ch ) )
      continue;

    int w = EpwingDictionary::japaneseWriting( ch );
    if( w <= 0 )
      continue;

    // Store only separated words: the run must begin at the start of the
    // headword or directly after a delimiter. The delimiter test has to look
    // at the PREVIOUS character; the current one has already been checked
    // above and is known not to be punctuation.
    bool needStore = true;
    if( n > 0 )
    {
      gd::wchar ch_prev = hw[ n - 1 ];
      needStore = Folding::isPunct( ch_prev ) || Folding::isWhitespace( ch_prev )
        || EpwingDictionary::isJapanesePunctiation( ch_prev );
    }

    // Collect the run of characters written in the same writing as hw[ n ].
    const int runWriting = w;
    word.push_back( ch );

    wstring::size_type i;
    for( i = n + 1; i < hw.size(); i++ )
    {
      ch = hw[ i ];
      if( Folding::isPunct( ch ) || Folding::isWhitespace( ch ) || EpwingDictionary::isJapanesePunctiation( ch ) )
        break;
      if( EpwingDictionary::japaneseWriting( ch ) != runWriting )
        break;
      word.push_back( ch );
    }

    // Keep the run only if it also ends at the end of the headword or at a
    // delimiter (ch holds the character that terminated the run).
    if( needStore
        && ( i >= hw.size() || Folding::isPunct( ch ) || Folding::isWhitespace( ch )
             || EpwingDictionary::isJapanesePunctiation( ch ) ) )
      words.push_back( word );

    word.clear();

    if( i >= hw.size() )
      break;

    // Continue behind the character that ended the run; the loop increment
    // steps past hw[ i ] itself.
    n = i;
  }

  if( words.size() <= 1 )
    return;

  // Allow only one word in every charset. The writings seen so far are kept
  // in a small list instead of a fixed-size table so that no assumption about
  // the range of japaneseWriting() is needed.
  vector< int > seenWritings;
  bool everyWritingUnique = true;

  for( size_t k = 0; k < words.size() && everyWritingUnique; k++ )
  {
    int w = EpwingDictionary::japaneseWriting( words[ k ][ 0 ] );

    for( size_t s = 0; s < seenWritings.size(); s++ )
    {
      if( seenWritings[ s ] == w )
      {
        everyWritingUnique = false;
        break;
      }
    }

    if( everyWritingUnique )
      seenWritings.push_back( w );
  }

  if( !everyWritingUnique )
    return;

  for( size_t k = 0; k < words.size(); k++ )
  {
    indexedWords.addWord( words[ k ], offset );
    wordCount++;
  }
}
|
||||
|
||||
vector< sptr< Dictionary::Class > > makeDictionaries(
|
||||
vector< string > const & fileNames,
|
||||
string const & indicesDir,
|
||||
|
@ -1045,107 +1163,16 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
|
|||
|
||||
for( ; ; )
|
||||
{
|
||||
if( !head.headword.isEmpty() )
|
||||
{
|
||||
uint32_t offset = chunks.startNewBlock();
|
||||
chunks.addToBlock( &head.page, sizeof( head.page ) );
|
||||
chunks.addToBlock( &head.offset, sizeof( head.offset ) );
|
||||
|
||||
wstring hw = gd::toWString( head.headword );
|
||||
|
||||
indexedWords.addWord( hw, offset );
|
||||
wordCount++;
|
||||
articleCount++;
|
||||
|
||||
vector< wstring > words;
|
||||
|
||||
// Parse combined kanji/katakana/hiragana headwords
|
||||
|
||||
int w_prev = 0;
|
||||
wstring word;
|
||||
for( wstring::size_type n = 0; n < hw.size(); n++ )
|
||||
{
|
||||
gd::wchar ch = hw[ n ];
|
||||
|
||||
if( Folding::isPunct( ch ) || Folding::isWhitespace( ch )
|
||||
|| EpwingDictionary::isSign( ch ) || EpwingDictionary::isJapanesePunctiation( ch ) )
|
||||
continue;
|
||||
|
||||
int w = EpwingDictionary::japaneseWriting( ch );
|
||||
|
||||
if( w > 0 )
|
||||
{
|
||||
// Store only separated words
|
||||
gd::wchar ch_prev = 0;
|
||||
if( n )
|
||||
ch_prev = hw[ n - 1 ];
|
||||
bool needStore = ( n == 0
|
||||
|| Folding::isPunct( ch_prev )
|
||||
|| Folding::isWhitespace( ch_prev )
|
||||
|| EpwingDictionary::isJapanesePunctiation( ch ) );
|
||||
|
||||
word.push_back( ch );
|
||||
w_prev = w;
|
||||
wstring::size_type i;
|
||||
for( i = n + 1; i < hw.size(); i++ )
|
||||
{
|
||||
ch = hw[ i ];
|
||||
if( Folding::isPunct( ch ) || Folding::isWhitespace( ch )
|
||||
|| EpwingDictionary::isJapanesePunctiation( ch ) )
|
||||
break;
|
||||
w = EpwingDictionary::japaneseWriting( ch );
|
||||
if( w != w_prev )
|
||||
break;
|
||||
word.push_back( ch );
|
||||
}
|
||||
|
||||
if( needStore )
|
||||
{
|
||||
if( i >= hw.size() || Folding::isPunct( ch ) || Folding::isWhitespace( ch )
|
||||
|| EpwingDictionary::isJapanesePunctiation( ch ) )
|
||||
words.push_back( word );
|
||||
}
|
||||
word.clear();
|
||||
|
||||
if( i < hw.size() )
|
||||
n = i;
|
||||
else
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if( words.size() > 1 )
|
||||
{
|
||||
// Allow only one word in every charset
|
||||
|
||||
size_t n;
|
||||
int writings[ 4 ];
|
||||
memset( writings, 0, sizeof(writings) );
|
||||
|
||||
for( n = 0; n < words.size(); n++ )
|
||||
{
|
||||
int w = EpwingDictionary::japaneseWriting( words[ n ][ 0 ] );
|
||||
if( writings[ w ] )
|
||||
break;
|
||||
else
|
||||
writings[ w ] = 1;
|
||||
}
|
||||
|
||||
if( n >= words.size() )
|
||||
{
|
||||
for( n = 0; n < words.size(); n++ )
|
||||
{
|
||||
indexedWords.addWord( words[ n ], offset );
|
||||
wordCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
addWordToChunks( head, chunks, indexedWords, wordCount, articleCount );
|
||||
if( !dict.getNextHeadword( head ) )
|
||||
break;
|
||||
}
|
||||
|
||||
while( dict.processRef( head ) )
|
||||
{
|
||||
addWordToChunks( head, chunks, indexedWords, wordCount, articleCount );
|
||||
}
|
||||
|
||||
dict.clearBuffers();
|
||||
|
||||
// Finish with the chunks
|
||||
|
|
13
epwing.hh
13
epwing.hh
|
@ -2,14 +2,23 @@
|
|||
#define __EPWING_HH__INCLUDED__
|
||||
|
||||
#include "dictionary.hh"
|
||||
|
||||
#include "epwing_book.hh"
|
||||
#include "btreeidx.hh"
|
||||
#include "chunkedstorage.hh"
|
||||
/// Support for the Epwing dictionaries.
|
||||
namespace Epwing {
|
||||
|
||||
using std::vector;
|
||||
using std::string;
|
||||
|
||||
vector< sptr< Dictionary::Class > > makeDictionaries(
|
||||
/// Writes the page/offset of @p head into @p chunks and adds its headword
/// (plus separately indexable parts of it) to @p indexedWords, updating
/// @p wordCount and @p articleCount. Empty headwords are ignored.
void addWordToChunks( Epwing::Book::EpwingHeadword & head, ChunkedStorage::Writer & chunks,
                      BtreeIndexing::IndexedWords & indexedWords, int & wordCount, int & articleCount );
|
||||
|
||||
vector< sptr< Dictionary::Class > >
|
||||
makeDictionaries(
|
||||
vector< string > const & fileNames,
|
||||
string const & indicesDir,
|
||||
Dictionary::Initializing & )
|
||||
|
|
129
epwing_book.cc
129
epwing_book.cc
|
@ -6,6 +6,7 @@
|
|||
#include <QDir>
|
||||
#include <QTextStream>
|
||||
#include <QTextDocumentFragment>
|
||||
#include <QHash>
|
||||
#include "gddebug.hh"
|
||||
#include "fsencoding.hh"
|
||||
#include "audiolink.hh"
|
||||
|
@ -732,7 +733,7 @@ QString EpwingBook::getText( int page, int offset, bool text_only )
|
|||
}
|
||||
|
||||
QString text = QString::fromUtf8( buf.data(), buf.size() ).trimmed();
|
||||
finalizeText( text );
|
||||
finalizeText( text);
|
||||
return text;
|
||||
}
|
||||
|
||||
|
@ -776,7 +777,9 @@ void EpwingBook::getReferencesFromText( int page, int offset )
|
|||
}
|
||||
|
||||
for( int x = 0; x < refPages.size(); x++ )
|
||||
{
|
||||
LinksQueue.push_back( EWPos( refPages[ x ], refOffsets[ x ] ) );
|
||||
}
|
||||
}
|
||||
|
||||
EB_Error_Code EpwingBook::forwardText( EB_Position & startPos )
|
||||
|
@ -850,60 +853,13 @@ void EpwingBook::getFirstHeadword( EpwingHeadword & head )
|
|||
fixHeadword( head.headword );
|
||||
|
||||
EWPos epos( pos.page, pos.offset );
|
||||
allHeadwordPositions[ head.headword ] << epos;
|
||||
allHeadwordPositions[ ((uint64_t)pos.page)<<32|(pos.offset>>2) ] =true;
|
||||
}
|
||||
|
||||
bool EpwingBook::getNextHeadword( EpwingHeadword & head )
|
||||
{
|
||||
EB_Position pos;
|
||||
|
||||
QRegularExpression badLinks( "#(v|n)\\d", QRegularExpression::UseUnicodePropertiesOption);
|
||||
|
||||
// At first we check references queue
|
||||
while( !LinksQueue.isEmpty() )
|
||||
{
|
||||
EWPos epos = LinksQueue.last();
|
||||
LinksQueue.pop_back();
|
||||
|
||||
pos.page = epos.first;
|
||||
pos.offset = epos.second;
|
||||
|
||||
if( readHeadword( pos, head.headword, true ) )
|
||||
{
|
||||
if( head.headword.isEmpty()
|
||||
|| head.headword.contains( badLinks ) )
|
||||
continue;
|
||||
|
||||
fixHeadword( head.headword );
|
||||
|
||||
head.page = pos.page;
|
||||
head.offset = pos.offset;
|
||||
|
||||
if( allHeadwordPositions.contains( head.headword ) )
|
||||
{
|
||||
// existed position
|
||||
bool existed = false;
|
||||
foreach( EWPos epos, allHeadwordPositions[ head.headword ] )
|
||||
{
|
||||
if( pos.page == epos.first && abs( pos.offset - epos.second ) <= 4 )
|
||||
{
|
||||
existed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( !existed )
|
||||
{
|
||||
allHeadwordPositions[ head.headword ] << EWPos( pos.page, pos.offset );
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
allHeadwordPositions[ head.headword ] << EWPos( pos.page, pos.offset );
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// No queued positions - forward to next article
|
||||
|
||||
|
@ -934,13 +890,7 @@ bool EpwingBook::getNextHeadword( EpwingHeadword & head )
|
|||
|
||||
indexHeadwordsPosition = pos;
|
||||
|
||||
try
|
||||
{
|
||||
getReferencesFromText( pos.page, pos.offset );
|
||||
}
|
||||
catch( std::exception & )
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
head.page = pos.page;
|
||||
head.offset = pos.offset;
|
||||
|
@ -953,27 +903,17 @@ bool EpwingBook::getNextHeadword( EpwingHeadword & head )
|
|||
|
||||
fixHeadword( head.headword );
|
||||
|
||||
if( allHeadwordPositions.contains( head.headword ) )
|
||||
try
|
||||
{
|
||||
// existed position
|
||||
bool existed = false;
|
||||
foreach( EWPos epos, allHeadwordPositions[ head.headword ] )
|
||||
{
|
||||
if( pos.page == epos.first && abs( pos.offset - epos.second ) <= 4 )
|
||||
{
|
||||
existed = true;
|
||||
break;
|
||||
getReferencesFromText( pos.page, pos.offset);
|
||||
}
|
||||
}
|
||||
if( !existed )
|
||||
catch( std::exception & )
|
||||
{
|
||||
allHeadwordPositions[ head.headword ] << EWPos( pos.page, pos.offset );
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else
|
||||
|
||||
if( !allHeadwordPositions.contains( ((uint64_t)pos.page) << 32 | ( pos.offset / 4 ) ) )
|
||||
{
|
||||
allHeadwordPositions[ head.headword ] << EWPos( pos.page, pos.offset );
|
||||
allHeadwordPositions[ ((uint64_t)pos.page) << 32 | ( pos.offset / 4 ) ] = true;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -981,6 +921,43 @@ bool EpwingBook::getNextHeadword( EpwingHeadword & head )
|
|||
return true;
|
||||
}
|
||||
|
||||
/// Takes queued reference positions until one that was not seen before is found.
///
/// Positions are popped from the back of LinksQueue. A position is skipped
/// when readHeadword() fails for it, when the headword read there is empty or
/// matches the "#(v|n)\d" pattern, or when its key is already recorded in
/// allRefPositions. The key combines the page with the offset shifted right by
/// two bits, so offsets differing only in their two low bits count as the same
/// position.
///
/// For an accepted position head.page / head.offset are set and head.headword
/// is replaced by the synthetic name "r<page>At<offset>".
///
/// @param head  receives the reference entry; it may be modified even when
///              the function returns false.
/// @return true if @p head describes a new reference, false once the queue
///         is empty.
bool EpwingBook::processRef( EpwingHeadword & head )
{
  // Compiled once: this function is called in a loop while indexing and the
  // pattern never changes.
  static const QRegularExpression badLinks( "#(v|n)\\d", QRegularExpression::UseUnicodePropertiesOption );

  EB_Position pos;

  while( !LinksQueue.isEmpty() )
  {
    EWPos epos = LinksQueue.last();
    LinksQueue.pop_back();

    pos.page   = epos.first;
    pos.offset = epos.second;

    if( !readHeadword( pos, head.headword, true ) )
      continue;

    if( head.headword.isEmpty() || head.headword.contains( badLinks ) )
      continue;

    fixHeadword( head.headword );

    head.page   = pos.page;
    head.offset = pos.offset;

    const uint64_t key = ( (uint64_t)pos.page ) << 32 | ( pos.offset >> 2 );
    if( allRefPositions.contains( key ) )
      continue;

    // Give the reference a synthetic headword built from its position, to
    // avoid collisions with the headwords of other entries.
    head.headword = QString( "r%1At%2" ).arg( pos.page ).arg( pos.offset );

    allRefPositions[ key ] = true;
    return true;
  }

  return false;
}
|
||||
|
||||
bool EpwingBook::readHeadword( EB_Position const& pos,
|
||||
QString & headword,
|
||||
bool text_only )
|
||||
|
@ -1094,7 +1071,7 @@ void EpwingBook::fixHeadword( QString & headword )
|
|||
}
|
||||
|
||||
void EpwingBook::getArticle( QString & headword, QString & articleText,
|
||||
int page, int offset, bool text_only)
|
||||
int page, int offset, bool text_only, QString word)
|
||||
{
|
||||
error_string.clear();
|
||||
char buffer[ TextBufferSize + 1 ];
|
||||
|
@ -1126,12 +1103,12 @@ void EpwingBook::getArticle( QString & headword, QString & articleText,
|
|||
}
|
||||
|
||||
headword = QString::fromUtf8( buffer, length );
|
||||
finalizeText( headword );
|
||||
finalizeText( headword);
|
||||
|
||||
if( text_only )
|
||||
fixHeadword( headword );
|
||||
|
||||
articleText = getText( pos.page, pos.offset, text_only );
|
||||
articleText = getText( pos.page, pos.offset, text_only);
|
||||
}
|
||||
|
||||
const char * EpwingBook::beginDecoration( unsigned int code )
|
||||
|
@ -1268,7 +1245,7 @@ void EpwingBook::finalizeText( QString & text )
|
|||
{
|
||||
QString headword = QString::fromUtf8( buf, length );
|
||||
fixHeadword( headword );
|
||||
url.setPath( Utils::Url::ensureLeadingSlash( headword ) );
|
||||
url.setPath( Utils::Url::ensureLeadingSlash( QString( "r%1At%2" ).arg( ebpos.page ).arg(ebpos.offset) ) );
|
||||
}
|
||||
|
||||
QString link = "<a href=\"" + url.toEncoded() + "\">";
|
||||
|
|
|
@ -78,7 +78,8 @@ class EpwingBook
|
|||
QStringList imageCacheList, soundsCacheList, moviesCacheList, fontsCacheList;
|
||||
QMap< QString, QString > baseFontsMap, customFontsMap;
|
||||
QVector< int > refPages, refOffsets;
|
||||
QMap< QString, QList< EWPos > > allHeadwordPositions;
|
||||
QMap< uint64_t,bool > allHeadwordPositions;
|
||||
QMap< uint64_t, bool > allRefPositions;
|
||||
QVector< EWPos > LinksQueue;
|
||||
int refOpenCount, refCloseCount;
|
||||
static Mutex libMutex;
|
||||
|
@ -98,7 +99,7 @@ class EpwingBook
|
|||
EB_Error_Code forwardText( EB_Position & startPos );
|
||||
|
||||
// Retrieve article text from dictionary
|
||||
QString getText( int page, int offset, bool text_only );
|
||||
QString getText( int page, int offset, bool text_only);
|
||||
|
||||
unsigned int normalizeDecorationCode( unsigned int code );
|
||||
|
||||
|
@ -151,6 +152,7 @@ public:
|
|||
void clearBuffers()
|
||||
{
|
||||
allHeadwordPositions.clear();
|
||||
allRefPositions.clear();
|
||||
LinksQueue.clear();
|
||||
}
|
||||
|
||||
|
@ -181,6 +183,8 @@ public:
|
|||
// Find next headword and article position
|
||||
bool getNextHeadword( EpwingHeadword & head );
|
||||
|
||||
bool processRef( EpwingHeadword & head );
|
||||
|
||||
bool readHeadword( EB_Position const & pos,
|
||||
QString & headword,
|
||||
bool text_only );
|
||||
|
@ -191,7 +195,7 @@ public:
|
|||
|
||||
// Retrieve article from dictionary
|
||||
// @p headword and @p articleText receive the result for the article at
// @p page / @p offset. @p word is optional (callers such as
// EpwingDictionary::loadArticle pass the looked-up word) and defaults to a
// null QString.
void getArticle( QString & headword, QString & articleText,
                 int page, int offset, bool text_only, QString word = QString() );
|
||||
|
||||
const char * beginDecoration( unsigned int code );
|
||||
const char * endDecoration( unsigned int code );
|
||||
|
|
Loading…
Reference in a new issue