/*************************************************************************** * Copyright (C) 2007 by Raul Fernandes and Karl Grill * * rgbr@yahoo.com.br * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * ***************************************************************************/ /* Various improvements were made by Konstantin Isakov for the GoldenDict * program. 
 *
 * NOTE(review): in this copy the original line breaks have been collapsed
 * and text that sat between angle brackets appears to be missing (the bare
 * "#include" directives, "std::vector alternates", several "for" loop
 * headers and a few string literals that are now empty). Restore those
 * spans from a pristine copy before relying on the code below.
 *
 * Overview of what is visible here:
 *  - Babylon::open() reads a 6-byte header, accepts it only when it starts
 *    with the bytes 0x12 0x34 0x00 followed by 0x01 or 0x02, takes the next
 *    two bytes as a big-endian offset (rejected when below 6), moves to
 *    that offset and hands a duplicate of the descriptor to gzdopen().
 *  - Babylon::close() calls gzclose() on the handle, if any, and resets it.
 *  - Babylon::readBlock() reads one block: the low nibble of the first byte
 *    is the block type (type 4 ends the stream), the high nibble encodes
 *    the payload length. A non-empty payload is malloc()ed into block.data
 *    and is released by the caller with free(); exAllocation is thrown when
 *    malloc() fails.
 *  - Babylon::readEntry() returns the next entry; a result whose headword
 *    is empty means no further entry could be read, which is why an entry
 *    with a genuinely empty headword is returned as a single space.
 *  - Babylon::convertToUtf8() converts a string in place with iconv. It
 *    returns early for an empty string, for a type above 2 and when the
 *    selected charset is already "UTF-8"; it throws exIconv when
 *    iconv_open() fails and exAllocation when the output buffer cannot be
 *    allocated. The string is replaced only when all input was consumed.
 *
 * NOTE(review): points worth checking while repairing the file:
 *  - readBlock() calls gzeof( file ) before comparing file against NULL.
 *  - open() does not close the duplicated descriptor on the path where
 *    gzdopen() returns NULL.
 *  - readBlock() leaves block.data unset when block.length is 0, and
 *    readEntry() reads block.data[0] for block types 2, 1, 7 and 10 before
 *    comparing the position against block.length.
 *  - bgl_readnum() returns 4 on a short read; readBlock() only treats that
 *    value as an end marker for the first byte, so a short read of the
 *    extended length is used as a length of 4.
*/ #include "bgl_babylon.hh" #include #include #include #include #include #include #include "gddebug.hh" #include "ufile.hh" #include "iconv.hh" #include "htmlescape.hh" #include #include #include "dictionary.hh" #ifdef _WIN32 #include #define DUP _dup #else #define DUP dup #endif using std::string; DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex ) DEF_EX( exUserAbort, "User abort", Dictionary::Ex ) DEF_EX( exIconv, "Iconv library error", Dictionary::Ex ) DEF_EX( exAllocation, "Error memory allocation", Dictionary::Ex ) Babylon::Babylon( std::string filename ) : m_filename( filename ) { file = NULL; } Babylon::~Babylon() { close(); } bool Babylon::open() { FILE *f; unsigned char buf[6]; int i; f = gd_fopen( m_filename.c_str(), "rb" ); if( f == NULL ) return false; i = fread( buf, 1, 6, f ); /* First four bytes: BGL signature 0x12340001 or 0x12340002 (big-endian) */ if( i < 6 || memcmp( buf, "\x12\x34\x00", 3 ) || buf[3] == 0 || buf[3] > 2 ) { fclose( f ); return false; } /* Calculate position of gz header */ i = buf[4] << 8 | buf[5]; if( i < 6 ) { fclose( f ); return false; } if( fseek( f, i, SEEK_SET ) ) /* can't seek - emulate */ for(int j=0;j < i - 6;j++) fgetc( f ); if( ferror( f ) || feof( f ) ) { fclose( f ); return false; } /* we need to flush the file because otherwise some nfs mounts don't seem * to properly update the file position for the following reopen */ fflush( f ); #ifdef Q_OS_MAC /* Under Mac OS X the above technique don't set reopen position properly */ int fn = DUP( fileno( f ) ); lseek( fn, i, SEEK_SET ); file = gzdopen( fn, "r" ); #else file = gzdopen( DUP( fileno( f ) ), "r" ); #endif fclose( f ); if( file == NULL ) return false; return true; } void Babylon::close() { if ( file ) { gzclose( file ); file = 0; } } bool Babylon::readBlock( bgl_block &block ) { if( gzeof( file ) || file == NULL ) return false; block.length = bgl_readnum( 1 ); block.type = block.length & 0xf; if( block.type == 4 ) return false; // end of file marker 
// Length decoding: the high nibble of the first byte is examined next.
// Values 0-3 mean the length is stored in the following (value + 1) bytes;
// larger values carry the length directly as (value - 4).
block.length >>= 4; block.length = block.length < 4 ? bgl_readnum( block.length + 1 ) : block.length - 4 ; if( block.length ) { block.data = (char *)malloc( block.length ); if( !block.data ) throw exAllocation(); unsigned res = gzread( file, block.data, block.length ); if( block.length != res ) { free( block.data ); block.length = 0; gzclearerr( file ); return false; } } return true; } unsigned int Babylon::bgl_readnum( int bytes ) { unsigned char buf[4]; unsigned val = 0; if ( bytes < 1 || bytes > 4 ) return (0); int res = gzread( file, buf, bytes ); if( res != bytes ) { gzclearerr( file ); return 4; // Read error - return end of file marker } for(int i=0;i 64 ) type -= 65; if ( type >= 14 ) type = 0; m_defaultCharset = bgl_charset[type]; break; default: break; } break; case 1: case 7: case 10: case 11: // Only count entries m_numEntries++; break; case 3: pos = 2; switch( block.data[1] ) { case 1: headword.reserve( block.length - 2 ); for(unsigned int a=0;a= 5 && ( (unsigned char) block.data[ 4 ] & 0x80 ) != 0 ) isUtf8File = true; break; case 26: type = (unsigned int)block.data[2]; if( type > 64 ) type -= 65; if ( type >= 14 ) type = 0; if (m_sourceCharset.empty()) m_sourceCharset = bgl_charset[type]; break; case 27: type = (unsigned int)block.data[2]; if( type > 64 ) type -= 65; if ( type >= 14 ) type = 0; if (m_targetCharset.empty()) m_targetCharset = bgl_charset[type]; break; default: break; } break; default: ; } if( block.length ) free( block.data ); } gzseek( file, 0, SEEK_SET ); if ( isUtf8File ) { //FDPRINTF( stderr, "%s: utf8 file.\n", m_title.c_str() ); m_defaultCharset = "UTF-8"; m_sourceCharset = "UTF-8"; m_targetCharset = "UTF-8"; } convertToUtf8( m_title, BGL_TARGET_CHARSET ); convertToUtf8( m_author, BGL_TARGET_CHARSET ); convertToUtf8( m_email, BGL_TARGET_CHARSET ); convertToUtf8( m_copyright, BGL_TARGET_CHARSET ); convertToUtf8( m_description, BGL_TARGET_CHARSET ); GD_DPRINTF("Default charset: %s\nSource Charset: %s\nTargetCharset: %s\n", 
m_defaultCharset.c_str(), m_sourceCharset.c_str(), m_targetCharset.c_str()); return true; } bgl_entry Babylon::readEntry( ResourceHandler * resourceHandler ) { bgl_entry entry; if( file == NULL ) { entry.headword = ""; return entry; } bgl_block block; unsigned int len, pos; unsigned int alts_num; std::string headword, displayedHeadword; std::string definition; std::string temp; std::vector alternates; std::string alternate; std::string root; bool defBodyEnded = false; std::string transcription; while( readBlock( block ) ) { switch( block.type ) { case 2: { pos = 0; len = (unsigned char)block.data[pos++]; if( pos + len > block.length ) break; std::string filename( block.data+pos, len ); //if (filename != "8EAF66FD.bmp" && filename != "C2EEF3F6.html") { pos += len; if ( resourceHandler ) resourceHandler->handleBabylonResource( filename, block.data + pos, block.length - pos ); #if 0 FILE *ifile = gd_fopen(filename.c_str(), "w"); fwrite(block.data + pos, 1, block.length -pos, ifile); fclose(ifile); #endif break; } case 1: case 7: case 10: case 11: alternate.clear(); headword.clear(); displayedHeadword.clear(); root.clear(); definition.clear(); temp.clear(); pos = 0; // Headword if( block.type == 11 ) { pos = 1; if( pos + 4 > block.length ) break; len = qFromBigEndian( *reinterpret_cast< quint32 * >( block.data + pos ) ); pos += 4; } else { len = (unsigned char)block.data[pos++]; } if( pos + len > block.length ) break; headword.reserve( len ); for(unsigned int a=0;a= block.length ) break; alts_num = qFromBigEndian( *reinterpret_cast< quint32 * >( block.data + pos ) ); pos += 4; for( unsigned j = 0; j < alts_num; j++ ) { if( pos + 4 > block.length ) break; len = qFromBigEndian( *reinterpret_cast< quint32 * >( block.data + pos ) ); pos += 4; if( pos + len >= block.length ) break; alternate.reserve( len ); for(unsigned int a=0;a block.length ) break; len = qFromBigEndian( *reinterpret_cast< quint32 * >( block.data + pos ) ); pos += 4; } else { len = qFromBigEndian( 
*reinterpret_cast< quint16 * >( block.data + pos ) ); pos += 2; } if( pos + len > block.length ) break; definition.reserve( len ); for(unsigned int a=0;a= 0x40 && len - a >= 2 && (unsigned char)block.data[pos + 1 ] == 0x18 ) { // Hidden displayed headword (a displayed headword which // contains some garbage and shouldn't probably be visible). unsigned length = (unsigned char)block.data[ pos ] - 0x3F; if ( length > len - a - 2 ) { GD_FDPRINTF( stderr, "Hidden displayed headword is too large %s\n", headword.c_str() ); pos += len - a; break; } pos += length + 2; a += length + 1; } else if ( (unsigned char)block.data[pos] == 0x18 ) { // Displayed headword unsigned length = (unsigned char)block.data[ pos + 1 ]; if ( length > len - a - 2 ) { GD_FDPRINTF( stderr, "Displayed headword's length is too large for headword %s\n", headword.c_str() ); pos += len - a; break; } displayedHeadword = std::string( block.data + pos + 2, length ); pos += length + 2; a += length + 1; } else if ( block.data[ pos ] == 0x28 && defBodyEnded && len - a >= 3 ) { // 2-byte sized displayed headword unsigned length = qFromBigEndian( *reinterpret_cast< quint16 * >( block.data + pos + 1 ) ); if ( length > len - a - 3 ) { GD_FDPRINTF( stderr, "2-byte sized displayed headword for %s is too large\n", headword.c_str() ); pos += len - a; break; } displayedHeadword = std::string( block.data + pos + 3, length ); pos += length + 3; a += length + 2; } else if ( (unsigned char)block.data[pos] == 0x50 && len - a - 1 >= 2 && (unsigned char)block.data[pos + 1 ] == 0x1B ) { // 1-byte-sized transcription unsigned length = (unsigned char)block.data[pos + 2 ]; if ( length > len - a - 3 ) { GD_FDPRINTF( stderr, "1-byte-sized transcription's length is too large for headword %s\n", headword.c_str() ); pos += len - a; break; } if( m_targetCharset.compare( "UTF-8" ) != 0 ) { try { transcription = Iconv::toUtf8( "CP1252", block.data + pos + 3, length ); } catch( Iconv::Ex & e ) { qWarning( "Bgl: charset convertion error, 
no trancription processing's done: %s\n", e.what() ); transcription = std::string( block.data + pos + 3, length ); } } else transcription = std::string( block.data + pos + 3, length ); pos += length + 3; a += length + 2; } else if ( (unsigned char)block.data[pos] == 0x60 && len - a - 1 >= 3 && (unsigned char)block.data[pos + 1 ] == 0x1B ) { // 2-byte-sized transcription unsigned length = qFromBigEndian( *reinterpret_cast< quint16 * >( block.data + pos + 2 ) ); if ( length > len - a - 4) { GD_FDPRINTF( stderr, "2-byte-sized transcription's length is too large for headword %s\n", headword.c_str() ); pos += len - a; break; } if( m_targetCharset.compare( "UTF-8" ) != 0 ) { try { transcription = Iconv::toUtf8( "CP1252", block.data + pos + 4, length ); } catch( Iconv::Ex & e ) { qWarning( "Bgl: charset convertion error, no trancription processing's done: %s\n", e.what() ); transcription = std::string( block.data + pos + 4, length ); } } else transcription = std::string( block.data + pos + 4, length ); pos += length + 4; a += length + 3; } else if ( (unsigned char)block.data[pos] >= 0x40 && len - a >= 2 && (unsigned char)block.data[pos + 1 ] == 0x1B ) { // Hidden transcription (a transcription which is usually the same // as the headword and shouldn't probably be visible). 
unsigned length = (unsigned char)block.data[ pos ] - 0x3F; if ( length > len - a - 2 ) { GD_FDPRINTF( stderr, "Hidden transcription is too large %s\n", headword.c_str() ); pos += len - a; break; } pos += length + 2; a += length + 1; } else if ( (unsigned char)block.data[pos] == 0x1E ) { // Resource reference begin marker definition += m_resourcePrefix; ++pos; } else if ( (unsigned char)block.data[pos] == 0x1F ) { // Resource reference end marker ++pos; } else if( (unsigned char)block.data[pos] < 0x20 ) { if( a <= len - 3 && block.data[pos] == 0x14 && block.data[pos+1] == 0x02 ) { int index = (unsigned char)block.data[pos+2] - 0x30; if (index >= 0 && index <= 10) { definition = "" + partOfSpeech[index] + " " + definition; } pos += 3; a += 2; //pos += len - a; //break; } else if (block.data[pos] == 0x14) { defBodyEnded = true; // Presumably pos++; } else if ((unsigned char)block.data[pos] == 0x1A){ unsigned length = (unsigned char)block.data[ pos + 1 ]; if (length <= 10){// 0x1A identifies two different data types. // data about the Hebrew root should be shorter then // 10 bytes, and in the other data type the byte // after 0x1A is > 10 (at least it is in Bybylon's // Hebrew dictionaries). root = std::string( block.data + pos + 2, length ); std::reverse(root.begin(),root.end()); definition += " (" + root + ")"; pos += length + 2; a += length + 1; } else pos++; } else { definition += block.data[pos++]; } }else definition += block.data[pos++]; } convertToUtf8( definition, BGL_TARGET_CHARSET ); if( !transcription.empty() ) definition = std::string( "" ) + transcription + "" + definition; if ( displayedHeadword.size() ) convertToUtf8( displayedHeadword, BGL_TARGET_CHARSET ); // Alternate forms while( pos < block.length ) { len = (unsigned char)block.data[pos++]; if( pos + len > block.length ) break; alternate.reserve( len ); for(unsigned int a=0;a= '0' && *p <= '9' ) { if ( ++totalDigits > 1 ) break; } if ( totalDigits > 1 ) { // Ok, let's add it. 
// Does it contain HTML? If it does, we need to strip it if ( displayedHeadword.find( '<' ) != string::npos || displayedHeadword.find( '&' ) != string::npos ) { string result = Html::unescapeUtf8( displayedHeadword ); if ( result != headword ) alternates.push_back( result ); } else alternates.push_back(displayedHeadword); } } entry.headword = headword; entry.displayedHeadword = displayedHeadword; entry.definition = definition; entry.alternates = alternates; if( block.length ) free( block.data ); // Some dictionaries can in fact have an empty headword, so we // make it non-empty here to differentiate between the end of entries. if ( entry.headword.empty() ) entry.headword += ' '; return entry; break; default: ; } if( block.length ) free( block.data ); } entry.headword = ""; return entry; } void Babylon::convertToUtf8( std::string &s, unsigned int type ) { if( s.size() < 1 ) return; if( type > 2 ) return; if( s.compare( 0, 13, "") == 0 ) return; std::string charset; switch( type ) { case BGL_DEFAULT_CHARSET: if(!m_defaultCharset.empty()) charset = m_defaultCharset; else charset = m_sourceCharset; break; case BGL_SOURCE_CHARSET: if(!m_sourceCharset.empty()) charset = m_sourceCharset; else charset = m_defaultCharset; break; case BGL_TARGET_CHARSET: if(!m_targetCharset.empty()) charset = m_targetCharset; else charset = m_defaultCharset; break; default: ; } if( charset == "UTF-8" ) return; iconv_t cd = iconv_open( "UTF-8", charset.c_str() ); if( cd == (iconv_t)(-1) ) throw exIconv(); char *outbuf, *defbuf; size_t inbufbytes, outbufbytes; inbufbytes = s.size(); outbufbytes = s.size() * 6; char *inbuf; inbuf = (char *)s.data(); outbuf = (char*)malloc( outbufbytes + 1 ); if( !outbuf ) { iconv_close( cd ); throw exAllocation(); } memset( outbuf, '\0', outbufbytes + 1 ); defbuf = outbuf; while (inbufbytes) { if (iconv(cd, &inbuf, &inbufbytes, &outbuf, &outbufbytes) == (size_t)-1) { gdWarning( "\"%s\" - error in iconv conversion (%s)\n", inbuf, strerror( errno ) ); break; // 
inbuf++; // inbufbytes--; } } // Flush the state. This fixes CP1255 problems. iconv( cd, 0, 0, &outbuf, &outbufbytes ); if( inbufbytes == 0 ) s = std::string( defbuf ); free( defbuf ); iconv_close( cd ); }