More work with Babylon.

1) Support 2-byte sized displayed headwords in a more general way. This version
should catch more of them.
2) Only add displayed headword as a headword to the index if the original one
contain two digits, not one, as it was previously.
This commit is contained in:
Konstantin Isakov 2010-05-28 19:02:37 +04:00
parent 57e416820e
commit 9980863bd8
2 changed files with 29 additions and 51 deletions

View file

@ -306,6 +306,7 @@ bgl_entry Babylon::readEntry( ResourceHandler * resourceHandler )
std::vector<std::string> alternates;
std::string alternate;
std::string root;
bool defBodyEnded = false;
while( readBlock( block ) )
{
@ -403,6 +404,27 @@ bgl_entry Babylon::readEntry( ResourceHandler * resourceHandler )
pos += length + 2;
a += length + 1;
}
else
if ( block.data[ pos ] == 0x28 && defBodyEnded &&
len - a >= 3 )
{
// 2-byte sized displayed headword
unsigned length = (unsigned char)block.data[ pos + 1 ];
length <<= 8;
length += (unsigned char)block.data[ pos + 2 ];
if ( length > len - a - 3 )
{
fprintf( stderr, "2-byte sized displayed headword for %s is too large\n", headword.c_str() );
pos += len - a;
break;
}
displayedHeadword = std::string( block.data + pos + 3, length );
pos += length + 3;
a += length + 2;
}
else if ( (unsigned char)block.data[pos] == 0x50 && len - a - 1 >= 2 &&
(unsigned char)block.data[pos + 1 ] == 0x1B )
{
@ -487,52 +509,8 @@ bgl_entry Babylon::readEntry( ResourceHandler * resourceHandler )
//break;
}
else
if ( block.data[ pos ] == 0x14 && len - a >= 5 &&
block.data[ pos + 1 ] == 0x06 &&
block.data[ pos + 2 ] == 0x5A &&
block.data[ pos + 3 ] == 0x18 )
{
// 1-byte sized displayed headword
// note that this probably overrides the handlers above
unsigned length = (unsigned char)block.data[ pos + 4 ];
if ( length > len - a - 5 )
{
fprintf( stderr, "1-byte sized displayed headword for %s is too large\n", headword.c_str() );
pos += len - a;
break;
}
displayedHeadword = std::string( block.data + pos + 5, length );
pos += length + 5;
a += length + 4;
}
else
if ( block.data[ pos ] == 0x14 && len - a >= 6 &&
block.data[ pos + 1 ] == 0x06 &&
block.data[ pos + 2 ] == 0x5A &&
block.data[ pos + 3 ] == 0x28 )
{
// 2-byte sized displayed headword
unsigned length = (unsigned char)block.data[ pos + 4 ];
length <<= 8;
length += (unsigned char)block.data[ pos + 5 ];
if ( length > len - a - 6 )
{
fprintf( stderr, "2-byte sized displayed headword for %s is too large\n", headword.c_str() );
pos += len - a;
break;
}
displayedHeadword = std::string( block.data + pos + 6, length );
pos += length + 6;
a += length + 5;
}
else
if (block.data[pos] == 0x14) {
defBodyEnded = true; // Presumably
pos++;
} else if ((unsigned char)block.data[pos] == 0x1A){
unsigned length = (unsigned char)block.data[ pos + 1 ];
@ -573,19 +551,19 @@ bgl_entry Babylon::readEntry( ResourceHandler * resourceHandler )
// Try adding displayed headword to the list of alts
if ( headword != displayedHeadword )
{
// Only add displayed headword if the normal one has digits.
// Only add displayed headword if the normal one has two or more digits.
// This would indicate some irregularity in it (like e.g. if it serves
// as some kind of an identifier instead of being an actual headword)
bool hasDigits = false;
int totalDigits = 0;
for( char const * p = headword.c_str(); *p; ++p )
if ( *p >= '0' && *p <= '9' )
{
hasDigits = true;
if ( ++totalDigits > 1 )
break;
}
if ( hasDigits )
if ( totalDigits > 1 )
{
// Ok, let's add it.

View file

@ -193,7 +193,7 @@ public:
enum
{
ParserVersion = 11
ParserVersion = 12
};
private: