diff --git a/src/dict/bgl_babylon.cc b/src/dict/bgl_babylon.cc index 8e0d6c2b..4f455f43 100644 --- a/src/dict/bgl_babylon.cc +++ b/src/dict/bgl_babylon.cc @@ -52,10 +52,13 @@ DEF_EX( exUserAbort, "User abort", Dictionary::Ex ) DEF_EX( exIconv, "Iconv library error", Dictionary::Ex ) DEF_EX( exAllocation, "Error memory allocation", Dictionary::Ex ) -Babylon::Babylon( std::string filename ) : -m_filename( filename ) +Babylon::Babylon( const std::string & filename ): + m_filename( filename ), + m_sourceLang( 0 ), + m_targetLang( 0 ), + m_numEntries( 0 ) { - file = NULL; + file = nullptr; } @@ -67,15 +70,13 @@ Babylon::~Babylon() bool Babylon::open() { - FILE *f; - unsigned char buf[6]; - int i; + unsigned char buf[ 6 ]; - f = gd_fopen( m_filename.c_str(), "rb" ); - if( f == NULL ) + FILE * f = gd_fopen( m_filename.c_str(), "rb" ); + if ( f == nullptr ) return false; - i = fread( buf, 1, 6, f ); + int i = fread( buf, 1, 6, f ); /* First four bytes: BGL signature 0x12340001 or 0x12340002 (big-endian) */ if( i < 6 || memcmp( buf, "\x12\x34\x00", 3 ) || buf[3] == 0 || buf[3] > 2 ) @@ -112,7 +113,7 @@ bool Babylon::open() fclose( f ); - if( file == NULL ) + if ( file == nullptr ) return false; return true; @@ -124,14 +125,14 @@ void Babylon::close() if ( file ) { gzclose( file ); - file = 0; + file = nullptr; } } bool Babylon::readBlock( bgl_block &block ) { - if ( file == NULL || gzeof( file ) ) + if ( file == nullptr || gzeof( file ) ) return false; block.length = bgl_readnum( 1 ); @@ -164,12 +165,10 @@ unsigned int Babylon::bgl_readnum( int bytes ) unsigned char buf[4]; unsigned val = 0; - if ( bytes < 1 || bytes > 4 ) return (0); + if ( bytes < 1 || bytes > 4 ) + return 0; - int res = gzread( file, buf, bytes ); - - if( res != bytes ) - { + if ( const int res = gzread( file, buf, bytes ); res != bytes ) { gzclearerr( file ); return 4; // Read error - return end of file marker } @@ -179,9 +178,10 @@ unsigned int Babylon::bgl_readnum( int bytes ) } -bool 
Babylon::read(std::string &source_charset, std::string &target_charset) +bool Babylon::read( const std::string & source_charset, const std::string & target_charset ) { - if( file == NULL ) return false; + if ( file == nullptr ) + return false; bgl_block block; unsigned int pos; @@ -251,8 +251,7 @@ bool Babylon::read(std::string &source_charset, std::string &target_charset) //m_sourceLang = headword; break; case 8: - m_targetLang = bgl_language[(unsigned char)(block.data[5])]; - //m_targetLang = headword; + m_targetLang = bgl_language[ (unsigned char)( block.data[ 5 ] ) ]; break; case 9: headword.reserve( block.length - 2 ); @@ -302,9 +301,7 @@ bool Babylon::read(std::string &source_charset, std::string &target_charset) } gzseek( file, 0, SEEK_SET ); - if ( isUtf8File ) - { - //FDPRINTF( stderr, "%s: utf8 file.\n", m_title.c_str() ); + if ( isUtf8File ) { m_defaultCharset = "UTF-8"; m_sourceCharset = "UTF-8"; m_targetCharset = "UTF-8"; @@ -324,8 +321,7 @@ bgl_entry Babylon::readEntry( ResourceHandler * resourceHandler ) { bgl_entry entry; - if( file == NULL ) - { + if ( file == nullptr ) { entry.headword = ""; return entry; } @@ -355,7 +351,6 @@ bgl_entry Babylon::readEntry( ResourceHandler * resourceHandler ) if ( pos + len > block.length ) break; std::string filename( block.data + pos, len ); - //if (filename != "8EAF66FD.bmp" && filename != "C2EEF3F6.html") { pos += len; if ( resourceHandler ) resourceHandler->handleBabylonResource( filename, block.data + pos, block.length - pos ); @@ -569,9 +564,8 @@ bgl_entry Babylon::readEntry( ResourceHandler * resourceHandler ) { transcription = Iconv::toUtf8( "Windows-1252", block.data + pos + 4, length ); } - catch( Iconv::Ex & e ) - { - qWarning( "Bgl: charset conversion error, no trancription processing's done: %s\n", e.what() ); + catch ( Iconv::Ex & e ) { + qWarning( "Bgl: charset conversion error, no transcription processing's done: %s\n", e.what() ); transcription = std::string( block.data + pos + 4, length ); } } @@ 
-622,29 +616,36 @@ bgl_entry Babylon::readEntry( ResourceHandler * resourceHandler ) //pos += len - a; //break; } - else - if (block.data[pos] == 0x14) { - defBodyEnded = true; // Presumably - pos++; - } else if ((unsigned char)block.data[pos] == 0x1A){ + else { + if ( block.data[ pos ] == 0x14 ) { + defBodyEnded = true; // Presumably + pos++; + } + else if ( (unsigned char)block.data[ pos ] == 0x1A ) { unsigned length = (unsigned char)block.data[ pos + 1 ]; - if (length <= 10){// 0x1A identifies two different data types. - // data about the Hebrew root should be shorter then - // 10 bytes, and in the other data type the byte - // after 0x1A is > 10 (at least it is in Bybylon's - // Hebrew dictionaries). - root = std::string( block.data + pos + 2, length ); - std::reverse(root.begin(),root.end()); - definition += " (" + root + ")"; - pos += length + 2; - a += length + 1; - } - else - pos++; - } else { - definition += block.data[pos++]; + if ( length <= 10 ) { // 0x1A identifies two different data types. + // data about the Hebrew root should be shorter than + // 10 bytes, and in the other data type the byte + // after 0x1A is > 10 (at least it is in Babylon's + // Hebrew dictionaries). 
+ root = std::string( block.data + pos + 2, length ); + std::reverse( root.begin(), root.end() ); + definition += " (" + root + ")"; + pos += length + 2; + a += length + 1; + } + else { + pos++; + } + } + else { + definition += block.data[ pos++ ]; + } } - }else definition += block.data[pos++]; + } + else { + definition += block.data[ pos++ ]; + } } convertToUtf8( definition, BGL_TARGET_CHARSET ); if( !transcription.empty() ) diff --git a/src/dict/bgl_babylon.hh b/src/dict/bgl_babylon.hh index e1694be6..c397749c 100644 --- a/src/dict/bgl_babylon.hh +++ b/src/dict/bgl_babylon.hh @@ -28,119 +28,91 @@ #include #include #include +#include -//const std::string bgl_language[] = { #ifndef blgCode2Int #define blgCode2Int( index, code0, code1 ) (((uint32_t)index) << 16 ) + (((uint32_t)code1) << 8 ) + (uint32_t)code0 #endif -const quint32 bgl_language[] = { - blgCode2Int( 0, 'e', 'n' ),// "English", - blgCode2Int( 0, 'f', 'r' ),//"French", - blgCode2Int( 0, 'i', 't' ),//"Italian", - blgCode2Int( 0, 'e', 's' ),//"Spanish", - blgCode2Int( 0, 'n', 'l' ),//"Dutch", - blgCode2Int( 0, 'p', 't' ),//"Portuguese", - blgCode2Int( 0, 'd', 'e' ),//"German", - blgCode2Int( 0, 'r', 'u' ),//"Russian", - blgCode2Int( 0, 'j', 'a' ),//"Japanese", - blgCode2Int( 1, 'z', 'h' ),//"\x01",//"Traditional Chinese", - blgCode2Int( 2, 'z', 'h' ),//"\x02",//"Simplified Chinese", - blgCode2Int( 0, 'e', 'l' ),//"Greek", - blgCode2Int( 0, 'k', 'o' ),//"Korean", - blgCode2Int( 0, 't', 'r' ),//"Turkish", - blgCode2Int( 0, 'h', 'e' ),//"Hebrew", - blgCode2Int( 0, 'a', 'r' ),//"Arabic", - blgCode2Int( 0, 't', 'h' ),//"Thai", - blgCode2Int( 3, 0, 0 ),//"\x03",//"Other", - blgCode2Int( 4, 'z', 'h' ),//"\x04",//"Other Simplified Chinese dialects", - blgCode2Int( 5, 'z', 'h' ),//"\x05",//Other Traditional Chinese dialects", - blgCode2Int( 6, 0, 0 ),//"\x06",//Other Eastern-European languages", - blgCode2Int( 7, 0, 0 ),//"\x07",//Other Western-European languages", - blgCode2Int( 8, 'r', 'u' ),//"\x08",//Other 
Russian languages", - blgCode2Int( 9, 'j', 'a' ),//"\x09",//Other Japanese languages", - blgCode2Int( 10, 0, 0 ),//"\x0A",//"Other Baltic languages", - blgCode2Int( 11, 'e', 'l' ),//"\x0B",//Other Greek languages", - blgCode2Int( 12, 'k', 'o' ),//"\x0C",//"Other Korean dialects", - blgCode2Int( 13, 't', 'r' ),//"\x0D",//Other Turkish dialects", - blgCode2Int( 14, 't', 'h' ),//"\x0E",//"Other Thai dialects", - blgCode2Int( 0, 'p', 'l' ),//"Polish", - blgCode2Int( 0, 'h', 'u' ),//"Hungarian", - blgCode2Int( 0, 'c', 's' ),//"Czech", - blgCode2Int( 0, 'l', 't' ),//"Lithuanian", - blgCode2Int( 0, 'l', 'v' ),//"Latvian", - blgCode2Int( 0, 'c', 'a' ),//"Catalan", - blgCode2Int( 0, 'h', 'r' ),//"Croatian", - blgCode2Int( 0, 's', 'r' ),//"Serbian", - blgCode2Int( 0, 's', 'k' ),//"Slovak", - blgCode2Int( 0, 's', 'q' ),//"Albanian", - blgCode2Int( 0, 'u', 'r' ),//"Urdu", - blgCode2Int( 0, 's', 'l' ),//"Slovenian", - blgCode2Int( 0, 'e', 't' ),//"Estonian", - blgCode2Int( 0, 'b', 'g' ),//"Bulgarian", - blgCode2Int( 0, 'd', 'a' ),//"Danish", - blgCode2Int( 0, 'f', 'i' ),//"Finnish", - blgCode2Int( 0, 'i', 's' ),//"Icelandic", - blgCode2Int( 0, 'n', 'o' ),//"Norwegian", - blgCode2Int( 0, 'r', 'o' ),//"Romanian", - blgCode2Int( 0, 's', 'v' ),//"Swedish", - blgCode2Int( 0, 'u', 'k' ),//"Ukrainian", - blgCode2Int( 0, 'b', 'e' ),//"Belarusian", - blgCode2Int( 0, 'f', 'a' ),//"Farsi"=Persian, - blgCode2Int( 0, 'e', 'u' ),//"Basque", - blgCode2Int( 0, 'm', 'k' ),//"Macedonian", - blgCode2Int( 0, 'a', 'f' ),//"Afrikaans", - blgCode2Int( 0, 'f', 'o' ),//"Faeroese"=Faroese, - blgCode2Int( 0, 'l', 'a' ),//"Latin", - blgCode2Int( 0, 'e', 'o' ),//"Esperanto", - blgCode2Int( 15, 0, 0 ),//"Tamazight", - blgCode2Int( 0, 'h', 'y' )//"Armenian" +const std::array< quint32, 60 > bgl_language = { + blgCode2Int( 0, 'e', 'n' ), // "English", + blgCode2Int( 0, 'f', 'r' ), //"French", + blgCode2Int( 0, 'i', 't' ), //"Italian", + blgCode2Int( 0, 'e', 's' ), //"Spanish", + blgCode2Int( 0, 'n', 'l' ), 
//"Dutch", + blgCode2Int( 0, 'p', 't' ), //"Portuguese", + blgCode2Int( 0, 'd', 'e' ), //"German", + blgCode2Int( 0, 'r', 'u' ), //"Russian", + blgCode2Int( 0, 'j', 'a' ), //"Japanese", + blgCode2Int( 1, 'z', 'h' ), //"\x01",//"Traditional Chinese", + blgCode2Int( 2, 'z', 'h' ), //"\x02",//"Simplified Chinese", + blgCode2Int( 0, 'e', 'l' ), //"Greek", + blgCode2Int( 0, 'k', 'o' ), //"Korean", + blgCode2Int( 0, 't', 'r' ), //"Turkish", + blgCode2Int( 0, 'h', 'e' ), //"Hebrew", + blgCode2Int( 0, 'a', 'r' ), //"Arabic", + blgCode2Int( 0, 't', 'h' ), //"Thai", + blgCode2Int( 3, 0, 0 ), //"\x03",//"Other", + blgCode2Int( 4, 'z', 'h' ), //"\x04",//"Other Simplified Chinese dialects", + blgCode2Int( 5, 'z', 'h' ), //"\x05",//Other Traditional Chinese dialects", + blgCode2Int( 6, 0, 0 ), //"\x06",//Other Eastern-European languages", + blgCode2Int( 7, 0, 0 ), //"\x07",//Other Western-European languages", + blgCode2Int( 8, 'r', 'u' ), //"\x08",//Other Russian languages", + blgCode2Int( 9, 'j', 'a' ), //"\x09",//Other Japanese languages", + blgCode2Int( 10, 0, 0 ), //"\x0A",//"Other Baltic languages", + blgCode2Int( 11, 'e', 'l' ), //"\x0B",//Other Greek languages", + blgCode2Int( 12, 'k', 'o' ), //"\x0C",//"Other Korean dialects", + blgCode2Int( 13, 't', 'r' ), //"\x0D",//Other Turkish dialects", + blgCode2Int( 14, 't', 'h' ), //"\x0E",//"Other Thai dialects", + blgCode2Int( 0, 'p', 'l' ), //"Polish", + blgCode2Int( 0, 'h', 'u' ), //"Hungarian", + blgCode2Int( 0, 'c', 's' ), //"Czech", + blgCode2Int( 0, 'l', 't' ), //"Lithuanian", + blgCode2Int( 0, 'l', 'v' ), //"Latvian", + blgCode2Int( 0, 'c', 'a' ), //"Catalan", + blgCode2Int( 0, 'h', 'r' ), //"Croatian", + blgCode2Int( 0, 's', 'r' ), //"Serbian", + blgCode2Int( 0, 's', 'k' ), //"Slovak", + blgCode2Int( 0, 's', 'q' ), //"Albanian", + blgCode2Int( 0, 'u', 'r' ), //"Urdu", + blgCode2Int( 0, 's', 'l' ), //"Slovenian", + blgCode2Int( 0, 'e', 't' ), //"Estonian", + blgCode2Int( 0, 'b', 'g' ), //"Bulgarian", + blgCode2Int( 0, 
'd', 'a' ), //"Danish", + blgCode2Int( 0, 'f', 'i' ), //"Finnish", + blgCode2Int( 0, 'i', 's' ), //"Icelandic", + blgCode2Int( 0, 'n', 'o' ), //"Norwegian", + blgCode2Int( 0, 'r', 'o' ), //"Romanian", + blgCode2Int( 0, 's', 'v' ), //"Swedish", + blgCode2Int( 0, 'u', 'k' ), //"Ukrainian", + blgCode2Int( 0, 'b', 'e' ), //"Belarusian", + blgCode2Int( 0, 'f', 'a' ), //"Farsi"=Persian, + blgCode2Int( 0, 'e', 'u' ), //"Basque", + blgCode2Int( 0, 'm', 'k' ), //"Macedonian", + blgCode2Int( 0, 'a', 'f' ), //"Afrikaans", + blgCode2Int( 0, 'f', 'o' ), //"Faeroese"=Faroese, + blgCode2Int( 0, 'l', 'a' ), //"Latin", + blgCode2Int( 0, 'e', 'o' ), //"Esperanto", + blgCode2Int( 15, 0, 0 ), //"Tamazight", + blgCode2Int( 0, 'h', 'y' ) //"Armenian" }; +const std::vector< std::string > bgl_charset = { "WINDOWS-1252", /*Default*/ + "WINDOWS-1252", /*Latin*/ + "WINDOWS-1250", /*Eastern European*/ + "WINDOWS-1251", /*Cyrillic*/ + "CP932", /*Japanese*/ + "BIG5", /*Traditional Chinese*/ + "GB18030", /*Simplified Chinese*/ + "CP1257", /*Baltic*/ + "CP1253", /*Greek*/ + "EUC-KR", /*Korean*/ + "ISO-8859-9", /*Turkish*/ + "WINDOWS-1255", /*Hebrew*/ + "CP1256", /*Arabic*/ + "CP874" /*Thai*/ }; -const std::string bgl_charsetname[] = { - "Default" , - "Latin", - "Eastern European", - "Cyrillic", - "Japanese", - "Traditional Chinese", - "Simplified Chinese", - "Baltic", - "Greek", - "Korean", - "Turkish", - "Hebrew", - "Arabic", - "Thai" }; - -const std::string bgl_charset[] = { - "WINDOWS-1252", /*Default*/ - "WINDOWS-1252", /*Latin*/ - "WINDOWS-1250", /*Eastern European*/ - "WINDOWS-1251", /*Cyriilic*/ - "CP932", /*Japanese*/ - "BIG5", /*Traditional Chinese*/ - "GB18030", /*Simplified Chinese*/ - "CP1257", /*Baltic*/ - "CP1253", /*Greek*/ - "EUC-KR", /*Korean*/ - "ISO-8859-9", /*Turkish*/ - "WINDOWS-1255", /*Hebrew*/ - "CP1256", /*Arabic*/ - "CP874" /*Thai*/ }; - -const std::string partOfSpeech[] = { - "n.", - "adj.", - "v.", - "adv.", - "interj.", - "pron.", - "prep.", - "conj.", - "suff.", - 
"pref.", - "art." }; +const std::array< std::string, 11 > partOfSpeech = { + "n.", "adj.", "v.", "adv.", "interj.", "pron.", "prep.", "conj.", "suff.", "pref.", "art." }; typedef struct { unsigned type; @@ -158,20 +130,18 @@ typedef struct { class Babylon { public: - Babylon( std::string ); - ~Babylon(); + Babylon( const std::string & ); + ~Babylon(); - // Subclass this to store resources - class ResourceHandler - { - public: + // Subclass this to store resources + class ResourceHandler + { + public: - virtual void handleBabylonResource( std::string const & filename, - char const * data, size_t size )=0; + virtual void handleBabylonResource( std::string const & filename, char const * data, size_t size ) = 0; - virtual ~ResourceHandler() - {} - }; + virtual ~ResourceHandler() {} + }; /// Sets a prefix string to append to each resource reference in hyperlinks. void setResourcePrefix( std::string const & prefix ) @@ -179,31 +149,55 @@ public: bool open(); void close(); - bool readBlock( bgl_block& ); - bool read(std::string &source_charset, std::string &target_charset); + bool readBlock( bgl_block & ); + bool read( const std::string & source_charset, const std::string & target_charset ); bgl_entry readEntry( ResourceHandler * = 0 ); - inline std::string title() const { return m_title; } - inline std::string author() const { return m_author; } - inline std::string email() const { return m_email; } - inline std::string description() const { return m_description; } - inline std::string copyright() const { return m_copyright; } - inline quint32 sourceLang() const { return m_sourceLang; }//std::string sourceLang() const { return m_sourceLang; } - inline quint32 targetLang() const { return m_targetLang; }//inline std::string targetLang() const { return m_targetLang; } - inline unsigned int numEntries() const { return m_numEntries; } - inline std::string charset() const { return m_defaultCharset; } + inline std::string title() const + { + return m_title; + } + inline 
std::string author() const + { + return m_author; + } + inline std::string email() const + { + return m_email; + } + inline std::string description() const + { + return m_description; + } + inline std::string copyright() const + { + return m_copyright; + } + inline quint32 sourceLang() const + { + return m_sourceLang; + } + inline quint32 targetLang() const + { + return m_targetLang; + } - inline std::string filename() const { return m_filename; } + inline std::string filename() const + { + return m_filename; + } std::vector< char > const & getIcon() const - { return icon; } + { + return icon; + } enum { ParserVersion = 17 }; -private: + private: unsigned int bgl_readnum( int ); void convertToUtf8( std::string &, unsigned int = 0 ); @@ -215,8 +209,8 @@ private: std::string m_email; std::string m_description; std::string m_copyright; - quint32 m_sourceLang; //std::string m_sourceLang; - quint32 m_targetLang;//std::string m_targetLang; + quint32 m_sourceLang; + quint32 m_targetLang; unsigned int m_numEntries; std::string m_defaultCharset; std::string m_sourceCharset; @@ -225,7 +219,11 @@ private: std::string m_resourcePrefix; - enum CHARSET { BGL_DEFAULT_CHARSET, BGL_SOURCE_CHARSET, BGL_TARGET_CHARSET }; + enum CHARSET { + BGL_DEFAULT_CHARSET, + BGL_SOURCE_CHARSET, + BGL_TARGET_CHARSET + }; }; #endif // BABYLON_H