Handle big entries in bgl

This commit is contained in:
Abs62 2013-01-18 15:50:49 +04:00
parent 77ded2c328
commit 160db20a04
2 changed files with 77 additions and 6 deletions

2
bgl.cc
View file

@ -50,7 +50,7 @@ namespace
enum enum
{ {
Signature = 0x584c4742, // BGLX on little-endian, XLGB on big-endian Signature = 0x584c4742, // BGLX on little-endian, XLGB on big-endian
CurrentFormatVersion = 18 + BtreeIndexing::FormatVersion CurrentFormatVersion = 19 + BtreeIndexing::FormatVersion
}; };
struct IdxHeader struct IdxHeader

View file

@ -208,6 +208,7 @@ bool Babylon::read(std::string &source_charset, std::string &target_charset)
case 1: case 1:
case 7: case 7:
case 10: case 10:
case 11:
// Only count entries // Only count entries
m_numEntries++; m_numEntries++;
break; break;
@ -323,6 +324,7 @@ bgl_entry Babylon::readEntry( ResourceHandler * resourceHandler )
bgl_block block; bgl_block block;
unsigned int len, pos; unsigned int len, pos;
unsigned int alts_num;
std::string headword, displayedHeadword; std::string headword, displayedHeadword;
std::string definition; std::string definition;
std::string temp; std::string temp;
@ -359,6 +361,7 @@ bgl_entry Babylon::readEntry( ResourceHandler * resourceHandler )
case 1: case 1:
case 7: case 7:
case 10: case 10:
case 11:
alternate.clear(); alternate.clear();
headword.clear(); headword.clear();
displayedHeadword.clear(); displayedHeadword.clear();
@ -368,8 +371,23 @@ bgl_entry Babylon::readEntry( ResourceHandler * resourceHandler )
pos = 0; pos = 0;
// Headword // Headword
len = 0; if( block.type == 11 )
len = (unsigned char)block.data[pos++]; {
pos = 1;
len = 0;
if( pos + 4 > block.length )
break;
for( int i = 0; i < 4; i++ )
{
len = len << 8;
len |= (unsigned char)block.data[ pos++ ];
}
}
else
{
len = (unsigned char)block.data[pos++];
}
if( pos + len > block.length ) if( pos + len > block.length )
break; break;
@ -383,10 +401,63 @@ bgl_entry Babylon::readEntry( ResourceHandler * resourceHandler )
if( headword.find( "&#" ) != string::npos ) if( headword.find( "&#" ) != string::npos )
headword = Html::unescapeUtf8( headword ); headword = Html::unescapeUtf8( headword );
if( block.type == 11 )
{
// Alternate forms
if( pos + 4 >= block.length )
break;
alts_num = 0;
for( int i = 0; i < 4; i++ )
{
alts_num = alts_num << 8;
alts_num |= (unsigned char)block.data[ pos++ ];
}
for( unsigned j = 0; j < alts_num; j++ )
{
len = 0;
if( pos + 4 > block.length )
break;
for( int i = 0; i < 4; i++ )
{
len = len << 8;
len |= (unsigned char)block.data[ pos++ ];
}
if( pos + len >= block.length )
break;
alternate.reserve( len );
for(unsigned int a=0;a<len;a++) alternate += block.data[pos++];
convertToUtf8( alternate, SOURCE_CHARSET );
// Try to repair malformed forms
if( alternate.find( "&#" ) != string::npos )
alternate = Html::unescapeUtf8( alternate );
alternates.push_back( alternate );
alternate.clear();
}
}
// Definition // Definition
len = 0;
len = (unsigned char)block.data[pos++] << 8; if( block.type == 11 )
len |= (unsigned char)block.data[pos++]; {
len = 0;
if( pos + 4 > block.length )
break;
for( int i = 0; i < 4; i++ )
{
len = len << 8;
len |= (unsigned char)block.data[ pos++ ];
}
}
else
{
len = (unsigned char)block.data[pos++] << 8;
len |= (unsigned char)block.data[pos++];
}
if( pos + len > block.length ) if( pos + len > block.length )
break; break;