+ Ignore abbreviation files based on the _abrv suffix in their names.

*! Properly identify source and target languages.
This commit is contained in:
Konstantin Isakov 2009-04-23 11:43:20 +00:00
parent c08805f728
commit bde25bb8a3
3 changed files with 58 additions and 11 deletions

View file

@ -63,7 +63,7 @@ DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex )
enum
{
Signature = 0x584c5344, // DSLX on little-endian, XLSD on big-endian
CurrentFormatVersion = 8 + BtreeIndexing::FormatVersion + Folding::Version
CurrentFormatVersion = 9 + BtreeIndexing::FormatVersion + Folding::Version
};
struct IdxHeader
@ -1147,12 +1147,23 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
{
// Try .dsl and .dsl.dz suffixes
if ( ( i->size() < 4 ||
strcasecmp( i->c_str() + ( i->size() - 4 ), ".dsl" ) != 0 ) &&
bool uncompressedDsl = ( i->size() >= 4 &&
strcasecmp( i->c_str() + ( i->size() - 4 ), ".dsl" ) == 0 );
if ( !uncompressedDsl &&
( i->size() < 7 ||
strcasecmp( i->c_str() + ( i->size() - 7 ), ".dsl.dz" ) != 0 ) )
continue;
// Make sure it's not an abbreviation file
int extSize = ( uncompressedDsl ? 4 : 7 );
if ( i->size() - extSize >= 5 &&
strncasecmp( i->c_str() + i->size() - extSize - 5, "_abrv", 5 ) == 0 )
{
// It is, skip it
continue;
}
try
{
vector< string > dictFiles( 1, *i );
@ -1403,8 +1414,8 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
idxHeader.articleCount = articleCount;
idxHeader.wordCount = wordCount;
idxHeader.langFrom = LangCoder::code3toInt( scanner.getLangFrom() );
idxHeader.langTo = LangCoder::code3toInt( scanner.getLangTo() );
idxHeader.langFrom = dslLanguageToId( scanner.getLangFrom() );
idxHeader.langTo = dslLanguageToId( scanner.getLangTo() );
idx.rewind();

View file

@ -3,7 +3,7 @@
#include "dsl_details.hh"
#include "folding.hh"
#include "utf8.hh"
#include "langcoder.hh"
#include <wctype.h>
#include <stdio.h>
@ -440,9 +440,9 @@ DslScanner::DslScanner( string const & fileName ) throw( Ex, Iconv::Ex ):
if ( isName )
dictionaryName = arg;
else if ( isLangFrom )
langFrom = Utf8::encode(arg);
langFrom = arg;
else if ( isLangTo )
langTo = Utf8::encode(arg);
langTo = arg;
else
{
// The encoding
@ -802,5 +802,37 @@ void unescapeDsl( wstring & str )
str.erase( x, 1 ); // ++x would skip the next char without processing it
}
namespace
{
/// Removes 'ending' from the tail of 'where', in place, if 'where' ends with
/// it. A string which is not strictly longer than the ending is never
/// modified, so a successful cut always leaves something behind.
void cutEnding( wstring & where, wstring const & ending )
{
  if ( where.size() <= ending.size() )
    return; // Too short to carry the ending and still have a remainder

  // Offset at which the ending would have to begin
  wstring::size_type const tailPos = where.size() - ending.size();

  if ( where.compare( tailPos, wstring::npos, ending ) == 0 )
    where.erase( tailPos );
}
}
/// Maps a language name taken from a Dsl header to a language id. The name is
/// folded, the known variant endings are cut from its tail one after another,
/// and whatever remains is looked up through LangCoder::findIdForLanguage().
quint32 dslLanguageToId( wstring const & name )
{
  // Endings to be removed, listed in the order in which they are tried
  static wstring const variantEndings[] =
  {
    wstring( GD_NATIVE_TO_WS( L"newspelling" ) ),
    wstring( GD_NATIVE_TO_WS( L"standard" ) ),
    wstring( GD_NATIVE_TO_WS( L"modernsort" ) ),
    wstring( GD_NATIVE_TO_WS( L"traditionalsort" ) ),
    wstring( GD_NATIVE_TO_WS( L"prc" ) )
  };

  wstring folded = Folding::apply( name );

  for ( size_t x = 0; x < sizeof( variantEndings ) / sizeof( *variantEndings ); ++x )
    cutEnding( folded, variantEndings[ x ] );

  return LangCoder::findIdForLanguage( folded );
}
}
}

View file

@ -97,7 +97,7 @@ class DslScanner
DslEncoding encoding;
DslIconv iconv;
wstring dictionaryName;
string langFrom, langTo;
wstring langFrom, langTo;
char readBuffer[ 65536 ];
char * readBufferPtr;
size_t readBufferLeft;
@ -124,11 +124,11 @@ public:
{ return dictionaryName; }
/// Returns the dictionary's source language, as was read from file's headers.
string const & getLangFrom() const
wstring const & getLangFrom() const
{ return langFrom; }
/// Returns the dictionary's target language, as was read from file's headers.
string const & getLangTo() const
wstring const & getLangTo() const
{ return langTo; }
/// Reads next line from the file. Returns true if reading succeeded --
@ -175,6 +175,10 @@ inline size_t DslScanner::distanceToBytes( size_t x ) const
}
}
/// Converts the given language name taken from Dsl header (i.e. getLangFrom(),
/// getLangTo()) to its proper language id. The name is folded, and the endings
/// "newspelling", "standard", "modernsort", "traditionalsort" and "prc" are
/// cut from its tail before the id is looked up through
/// LangCoder::findIdForLanguage().
quint32 dslLanguageToId( wstring const & name );
}
}