Epwing: Create additional headwords separated by charset (issue #504)

This commit is contained in:
Abs62 2014-07-08 18:56:24 +04:00
parent d866dfd68f
commit c7844655d5

147
epwing.cc
View file

@ -42,7 +42,7 @@ namespace {
enum enum
{ {
Signature = 0x58575045, // EPWX on little-endian, XWPE on big-endian Signature = 0x58575045, // EPWX on little-endian, XWPE on big-endian
CurrentFormatVersion = 4 + BtreeIndexing::FormatVersion + Folding::Version CurrentFormatVersion = 5 + BtreeIndexing::FormatVersion + Folding::Version
}; };
struct IdxHeader struct IdxHeader
@ -140,6 +140,12 @@ public:
&& ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize ); && ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
} }
static int japaneseWriting( gd::wchar ch );
static bool isSign( gd::wchar ch );
static bool isJapanesePunctiation( gd::wchar ch );
protected: protected:
void loadIcon() throw(); void loadIcon() throw();
@ -691,6 +697,52 @@ sptr< Dictionary::DataRequest > EpwingDictionary::getSearchResults( QString cons
return new FtsHelpers::FTSResultsRequest( *this, searchString,searchMode, matchCase, distanceBetweenWords, maxResults ); return new FtsHelpers::FTSResultsRequest( *this, searchString,searchMode, matchCase, distanceBetweenWords, maxResults );
} }
// Classifies a single code point by the Japanese script it belongs to.
// Returns 1 for katakana, 2 for hiragana, 3 for kanji, and 0 for any
// other character. The ranges below do not overlap, so the order of the
// checks does not influence the result.
int EpwingDictionary::japaneseWriting( gd::wchar ch )
{
  enum { Other = 0, Katakana = 1, Hiragana = 2, Kanji = 3 };

  // Isolated supplementary-plane code points first.
  if( ch == 0x1B000 )
    return Katakana;
  if( ch == 0x1B001 )
    return Hiragana;

  // Ranges counted as katakana (this includes the enclosed CJK letters
  // and the halfwidth/fullwidth forms ranges).
  if( ch >= 0x30A0 && ch <= 0x30FF )
    return Katakana;
  if( ch >= 0x31F0 && ch <= 0x31FF )
    return Katakana;
  if( ch >= 0x3200 && ch <= 0x32FF )
    return Katakana;
  if( ch >= 0xFF00 && ch <= 0xFFEF )
    return Katakana;

  // Hiragana range.
  if( ch >= 0x3040 && ch <= 0x309F )
    return Hiragana;

  // Kanji: the two ideograph ranges.
  if( ch >= 0x3400 && ch <= 0x4DBF )
    return Kanji;
  if( ch >= 0x4E00 && ch <= 0x9FAF )
    return Kanji;

  return Other;
}
// Reports whether ch is one of the mathematical sign characters
// '+', '<', '=', '>' or the NOT SIGN, in either the basic form or the
// fullwidth form.
bool EpwingDictionary::isSign( gd::wchar ch )
{
  // PLUS SIGN, LESS-THAN .. GREATER-THAN (three consecutive code points),
  // NOT SIGN
  const bool basicForm = ( ch == 0x002B )
                         || ( ch >= 0x003C && ch <= 0x003E )
                         || ( ch == 0x00AC );

  // FULLWIDTH PLUS SIGN, FULLWIDTH LESS-THAN .. FULLWIDTH GREATER-THAN,
  // FULLWIDTH NOT SIGN
  const bool fullwidthForm = ( ch == 0xFF0B )
                             || ( ch >= 0xFF1C && ch <= 0xFF1E )
                             || ( ch == 0xFFE2 );

  return basicForm || fullwidthForm;
}
// Reports whether ch lies in U+3000..U+303F (the CJK Symbols and
// Punctuation range).
bool EpwingDictionary::isJapanesePunctiation( gd::wchar ch )
{
  if( ch < 0x3000 || ch > 0x303F )
    return false;

  return true;
}
} // anonymous namespace } // anonymous namespace
vector< sptr< Dictionary::Class > > makeDictionaries( vector< sptr< Dictionary::Class > > makeDictionaries(
@ -788,6 +840,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
dict.getFirstHeadword( head ); dict.getFirstHeadword( head );
int wordCount = 0; int wordCount = 0;
int articleCount = 0;
for( ; ; ) for( ; ; )
{ {
@ -797,11 +850,97 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
chunks.addToBlock( &head.page, sizeof( head.page ) ); chunks.addToBlock( &head.page, sizeof( head.page ) );
chunks.addToBlock( &head.offset, sizeof( head.offset ) ); chunks.addToBlock( &head.offset, sizeof( head.offset ) );
indexedWords.addWord( gd::toWString( head.headword ), offset ); wstring hw = gd::toWString( head.headword );
indexedWords.addWord( hw, offset );
wordCount++; wordCount++;
} articleCount++;
// Extra headwords derived from hw: each entry is a run of characters that
// all share one japaneseWriting() class and is delimited on both sides.
vector< wstring > words;
// Parse combined kanji/katakana/hiragana headwords
int w_prev = 0;
wstring word;
for( wstring::size_type n = 0; n < hw.size(); n++ )
{
gd::wchar ch = hw[ n ];
// Delimiters and sign characters never start a run
if( Folding::isPunct( ch ) || Folding::isWhitespace( ch )
|| EpwingDictionary::isSign( ch ) || EpwingDictionary::isJapanesePunctiation( ch ) )
continue;
int w = EpwingDictionary::japaneseWriting( ch );
// Characters classified as 0 (not katakana/hiragana/kanji) are passed over
if( w > 0 )
{
// Store only separated words
gd::wchar ch_prev = 0;
if( n )
ch_prev = hw[ n - 1 ];
// A run qualifies only if it begins at the start of hw or right after a
// delimiter.
// NOTE(review): the last operand tests ch, which has already passed the
// isJapanesePunctiation() filter above and is therefore always false here;
// ch_prev may have been intended - confirm.
bool needStore = ( n == 0
|| Folding::isPunct( ch_prev )
|| Folding::isWhitespace( ch_prev )
|| EpwingDictionary::isJapanesePunctiation( ch ) );
word.push_back( ch );
w_prev = w;
wstring::size_type i;
// Extend the run until a delimiter, a change of writing class (including a
// change to class 0), or the end of hw
for( i = n + 1; i < hw.size(); i++ )
{
ch = hw[ i ];
if( Folding::isPunct( ch ) || Folding::isWhitespace( ch )
|| EpwingDictionary::isJapanesePunctiation( ch ) )
break;
w = EpwingDictionary::japaneseWriting( ch );
if( w != w_prev )
break;
word.push_back( ch );
}
if( needStore )
{
// Keep the run only if it also ends at the end of hw or at a delimiter.
// When the loop above stopped early, ch is the character that stopped it.
if( i >= hw.size() || Folding::isPunct( ch ) || Folding::isWhitespace( ch )
|| EpwingDictionary::isJapanesePunctiation( ch ) )
words.push_back( word );
}
word.clear();
// Resume after the run; the outer loop's n++ then steps past hw[ i ],
// the character that terminated it
if( i < hw.size() )
n = i;
else
break;
}
}
// A single extracted word is not indexed separately; only headwords that
// yielded several words get additional index entries
if( words.size() > 1 )
{
// Allow only one word in every charset
size_t n;
// Seen-flags indexed by the japaneseWriting() result of each word's first
// character
int writings[ 4 ];
memset( writings, 0, sizeof(writings) );
for( n = 0; n < words.size(); n++ )
{
int w = EpwingDictionary::japaneseWriting( words[ n ][ 0 ] );
if( writings[ w ] )
break;
else
writings[ w ] = 1;
}
// n reaches words.size() only when no writing class occurred twice
if( n >= words.size() )
{
for( n = 0; n < words.size(); n++ )
{
// Each extra headword is indexed at the same offset as hw and counted
// in wordCount
indexedWords.addWord( words[ n ], offset );
wordCount++;
}
}
}
}
if( !dict.getNextHeadword( head ) ) if( !dict.getNextHeadword( head ) )
break; break;
} }
@ -827,7 +966,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
idxHeader.formatVersion = CurrentFormatVersion; idxHeader.formatVersion = CurrentFormatVersion;
idxHeader.wordCount = wordCount; idxHeader.wordCount = wordCount;
idxHeader.articleCount = wordCount; idxHeader.articleCount = articleCount;
idx.rewind(); idx.rewind();