/* This file is (c) 2008-2009 Konstantin Isakov
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */

#include "dictdfiles.hh"
#include "btreeidx.hh"
#include "folding.hh"
#include "utf8.hh"
#include "dictzip.h"
#include "htmlescape.hh"
#include "fsencoding.hh"
// NOTE(review): the system header names were missing from the include lines of
// this file as received. The list below is reconstructed from the standard
// names the code actually uses -- confirm against the upstream file.
#include <map>
#include <set>
#include <string>
#include <vector>
#include <list>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

namespace DictdFiles {

using std::map;
using std::multimap;
using std::pair;
using std::set;
using std::string;
using std::wstring;
using std::vector;
using std::list;

using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;

namespace {

DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex )
DEF_EX( exFailedToReadLineFromIndex, "Failed to read line from index file", Dictionary::Ex )
DEF_EX( exMalformedIndexFileLine, "Malformed index file line encountered", Dictionary::Ex )
DEF_EX( exInvalidBase64, "Invalid base64 sequence encountered", Dictionary::Ex )

enum
{
  Signature = 0x58444344, // DCDX on little-endian, XDCD on big-endian
  CurrentFormatVersion = 2 + BtreeIndexing::FormatVersion + Folding::Version
};

/// Header stored at the very beginning of the generated index file.
struct IdxHeader
{
  uint32_t signature; // First comes the signature, DCDX
  uint32_t formatVersion; // File format version (CurrentFormatVersion)
  uint32_t wordCount; // Total number of words
  uint32_t indexOffset; // The offset of the index in the file
} __attribute__((packed));

/// Returns true if the header of the given generated index file can't be read
/// in full, or if its signature or format version differ from the ones this
/// code writes (Signature, CurrentFormatVersion).
bool indexIsOldOrBad( string const & indexFile )
{
  File::Class idx( indexFile, "rb" );

  IdxHeader header;

  return idx.readRecords( &header, sizeof( header ), 1 ) != 1 ||
         header.signature != Signature ||
         header.formatVersion != CurrentFormatVersion;
}

/// Owns a buffer that must be released with free(), and releases it when the
/// object goes out of scope. Used to make sure article bodies don't leak when
/// an exception is thrown while they are being processed.
class MallocedBuffer
{
  char * ptr;

  // Not copyable -- there must be exactly one owner
  MallocedBuffer( MallocedBuffer const & );
  MallocedBuffer & operator = ( MallocedBuffer const & );

public:

  explicit MallocedBuffer( char * ptr_ ): ptr( ptr_ )
  {}

  ~MallocedBuffer()
  { free( ptr ); }

  char * get() const
  { return ptr; }
};

/// A dictionary backed by a dictd .index file and a .dict (or .dict.dz) file,
/// looked up through a btree index kept in a separate generated index file.
class DictdDictionary: public BtreeIndexing::BtreeDictionary
{
  Mutex idxMutex;
  File::Class idx, indexFile; // The latter is the .index file
  IdxHeader idxHeader; // Read from idx, so it must be declared after it
  dictData * dz;

public:

  /// @param id the dictionary id
  /// @param indexFile name of the generated index file (the one starting
  ///        with IdxHeader)
  /// @param dictionaryFiles element 0 is the dictd .index file, element 1 is
  ///        the .dict/.dict.dz file
  /// Throws exCantReadFile if the .dict file can't be opened.
  DictdDictionary( string const & id, string const & indexFile,
                   vector< string > const & dictionaryFiles );

  ~DictdDictionary();

  /// The name is derived from the .index file name, see nameFromFileName().
  virtual string getName() throw();

  /// No properties are provided -- the map returned is always empty.
  virtual map< Dictionary::Property, string > getProperties() throw()
  { return map< Dictionary::Property, string >(); }

  /// Both counts report the number of words stored in the index header.
  virtual unsigned long getArticleCount() throw()
  { return idxHeader.wordCount; }

  virtual unsigned long getWordCount() throw()
  { return idxHeader.wordCount; }

  /// Looks up the word and all of its alternate forms and returns the
  /// articles found as a single piece of text. See the definition for details.
  virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
                                                      vector< wstring > const & alts )
    throw( std::exception );
};

DictdDictionary::DictdDictionary( string const & id,
                                  string const & indexFile,
                                  vector< string > const & dictionaryFiles ):
  BtreeDictionary( id, dictionaryFiles ),
  idx( indexFile, "rb" ),
  indexFile( dictionaryFiles[ 0 ], "rb" ),
  idxHeader( idx.read< IdxHeader >() ),
  dz( 0 )
{
  // Open the .dict file

  dz = dict_data_open( dictionaryFiles[ 1 ].c_str(), 0 );

  if ( !dz )
    throw exCantReadFile( dictionaryFiles[ 1 ] );

  // Initialize the index. Should that fail, the destructor won't be run for
  // the partially constructed object, so the .dict file has to be closed here.

  try
  {
    idx.seek( idxHeader.indexOffset );

    openIndex( idx, idxMutex );
  }
  catch( ... )
  {
    dict_data_close( dz );
    throw;
  }
}

DictdDictionary::~DictdDictionary()
{
  if ( dz )
    dict_data_close( dz );
}

/// Makes a dictionary name out of the given file name: takes the part after
/// the last FsEncoding::separator() (or the whole string if there is none),
/// cuts it at its last dot (if there is one), and returns the result
/// converted via FsEncoding::decode() and Utf8::encode(). An empty file name
/// gives an empty name.
string nameFromFileName( string const & indexFileName )
{
  if ( indexFileName.empty() )
    return string();

  char const * begin = indexFileName.c_str();

  char const * sep = strrchr( begin, FsEncoding::separator() );

  // With no separator present the name starts right at the beginning. It
  // must not be advanced by one in that case, or the first character is lost.
  char const * nameStart = sep ? sep + 1 : begin;

  char const * dot = strrchr( nameStart, '.' );

  char const * nameEnd = dot ? dot : begin + indexFileName.size();

  return Utf8::encode( FsEncoding::decode( string( nameStart, nameEnd ) ) );
}

string DictdDictionary::getName() throw()
{
  return nameFromFileName( getDictionaryFilenames()[ 0 ] );
}

/// Decodes a number written as a sequence of base64 digits, most significant
/// digit first. An empty string decodes to 0.
/// Throws exInvalidBase64 if a character which isn't a base64 digit is met, or
/// if the value doesn't fit into 32 bits (previously it wrapped around
/// silently, which resulted in a bogus offset or size).
uint32_t decodeBase64( string const & str )
{
  static char const digits[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

  uint32_t number = 0;

  for( char const * next = str.c_str(); *next; ++next )
  {
    char const * d = strchr( digits, *next );

    if ( !d )
      throw exInvalidBase64();

    uint32_t const digit = static_cast< uint32_t >( d - digits );

    // Would number * 64 + digit overflow?
    if ( number > ( 0xFFFFFFFFu - digit ) / 64 )
      throw exInvalidBase64();

    number = number * 64 + digit;
  }

  return number;
}

/// Finds the articles for the given word and for each of its alternate forms.
/// Every article is included only once. The articles whose case-folded
/// headword equals the case-folded word come first, all the others follow.
/// Returns a request holding the resulting text, a request constructed with
/// 'false' if nothing was found, or -- if any exception derived from
/// std::exception was thrown in the process -- a request constructed from the
/// exception's message.
sptr< Dictionary::DataRequest > DictdDictionary::getArticle( wstring const & word,
                                                             vector< wstring > const & alts )
  throw( std::exception )
{
  try
  {
    vector< WordArticleLink > chain = findArticles( word );

    for( unsigned x = 0; x < alts.size(); ++x )
    {
      /// Make an additional query for each alt

      vector< WordArticleLink > altChain = findArticles( alts[ x ] );

      chain.insert( chain.end(), altChain.begin(), altChain.end() );
    }

    multimap< wstring, string > mainArticles, alternateArticles;

    set< uint32_t > articlesIncluded; // Some synonyms make it that the articles
                                      // appear several times. We combat this
                                      // by only allowing them to appear once.

    wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );

    char buf[ 16384 ];

    for( unsigned x = 0; x < chain.size(); ++x )
    {
      // Any failure below aborts the whole request, so it is fine to mark the
      // article as included before it is actually loaded.
      if ( !articlesIncluded.insert( chain[ x ].articleOffset ).second )
        continue; // We already have this article in the body.

      // Now load that article. The offset stored in the btree index is the
      // offset of the word's line within the .index file.

      indexFile.seek( chain[ x ].articleOffset );

      if ( !indexFile.gets( buf, sizeof( buf ), true ) )
        throw exFailedToReadLineFromIndex();

      char * tab1 = strchr( buf, '\t' );

      if ( !tab1 )
        throw exMalformedIndexFileLine();

      char * tab2 = strchr( tab1 + 1, '\t' );

      if ( !tab2 )
        throw exMalformedIndexFileLine();

      // After tab1 should be article offset, after tab2 -- article size

      uint32_t articleOffset = decodeBase64( string( tab1 + 1, tab2 - tab1 - 1 ) );
      uint32_t articleSize = decodeBase64( tab2 + 1 );

      // The guard frees the body even if preformatting it throws
      MallocedBuffer articleBody( dict_data_read_( dz, articleOffset, articleSize, 0, 0 ) );

      if ( !articleBody.get() )
        throw exCantReadFile( getDictionaryFilenames()[ 1 ] );

      // NOTE(review): in this file as received both literals around the
      // article consisted of nothing but raw line breaks, which is what the
      // escapes below reproduce. That looks like markup which got stripped
      // at some point -- confirm against the upstream file.
      string articleText = string( "\n\n" ) +
                           Html::preformat( articleBody.get() ) +
                           "\n\n";

      // Ok. Now, does it go to main articles, or to alternate ones? We list
      // main ones first, and alternates after.

      // We do the case-folded comparison here.

      wstring headwordStripped =
        Folding::applySimpleCaseOnly( Utf8::decode( chain[ x ].word ) );

      multimap< wstring, string > & mapToUse =
        ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles;

      mapToUse.insert( pair< wstring, string >( headwordStripped, articleText ) );
    }

    if ( mainArticles.empty() && alternateArticles.empty() )
      return new Dictionary::DataRequestInstant( false );

    string result;

    multimap< wstring, string >::const_iterator i;

    for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
      result += i->second;

    for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
      result += i->second;

    sptr< Dictionary::DataRequestInstant > ret =
      new Dictionary::DataRequestInstant( true );

    ret->getData().resize( result.size() );

    // front() must not be called on an empty container
    if ( !result.empty() )
      memcpy( &( ret->getData().front() ), result.data(), result.size() );

    return ret;
  }
  catch( std::exception & e )
  {
    return new Dictionary::DataRequestInstant( QString( e.what() ) );
  }
}

} // anonymous namespace

/// Tries opening the file with the given name for reading. If that works out,
/// copies the name to copyTo and returns true. If any exception is thrown,
/// returns false and leaves copyTo alone.
static bool tryPossibleName( string const & name, string & copyTo )
{
  try
  {
    File::Class f( name, "rb" );

    copyTo = name;

    return true;
  }
  catch( ... )
  {
    // Deliberately swallowed: failing to open just means "no such file"
    return false;
  }
}

/// Generates the index file named indexFileName for the dictd .index file
/// named dictdIndexFileName: a header, followed by whatever
/// BtreeIndexing::buildIndex() writes for the words collected. Each word is
/// associated with the offset of its line within the .index file. The lines
/// which don't have exactly two tabs in them are reported and skipped.
static void buildIndexFile( string const & dictdIndexFileName,
                            string const & indexFileName )
{
  File::Class idx( indexFileName, "wb" );

  IdxHeader idxHeader;

  memset( &idxHeader, 0, sizeof( idxHeader ) );

  // We write a dummy header first. At the end of the process the header
  // will be rewritten with the right values.

  idx.write( idxHeader );

  IndexedWords indexedWords;

  File::Class dictdIndex( dictdIndexFileName, "rb" );

  // Read words from index until none's left.

  char buf[ 16384 ];

  do
  {
    uint32_t curOffset = dictdIndex.tell();

    if ( !dictdIndex.gets( buf, sizeof( buf ), true ) )
      break;

    // Check that there are exactly two tabs in the record.

    char * firstTab = strchr( buf, '\t' );
    char * secondTab = firstTab ? strchr( firstTab + 1, '\t' ) : 0;

    if ( !secondTab || strchr( secondTab + 1, '\t' ) )
    {
      printf( "Warning: incorrect amount of tabs in a line, skipping: %s\n", buf );
      continue;
    }

    // The headword is everything up to the first tab

    indexedWords.addWord( Utf8::decode( string( buf, firstTab - buf ) ), curOffset );

    ++idxHeader.wordCount;
  }
  while( !dictdIndex.eof() );

  // Build index

  idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx );

  // That concludes it. Update the header.

  idxHeader.signature = Signature;
  idxHeader.formatVersion = CurrentFormatVersion;

  idx.rewind();

  idx.write( &idxHeader, sizeof( idxHeader ) );
}

/// Makes a dictionary out of each file name ending with ".index" (compared
/// case-insensitively) which has a corresponding ".dict" or ".dict.dz" file.
/// The generated index is kept in indicesDir under the name of the dictionary
/// id, and is (re)built when Dictionary::needToRebuildIndex() or
/// indexIsOldOrBad() say so; 'initializing' is notified before that happens.
/// A dictionary which fails with an exception derived from std::exception is
/// reported to stderr and skipped, the others are still processed.
vector< sptr< Dictionary::Class > > makeDictionaries(
                                      vector< string > const & fileNames,
                                      string const & indicesDir,
                                      Dictionary::Initializing & initializing )
  throw( std::exception )
{
  vector< sptr< Dictionary::Class > > dictionaries;

  for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
       ++i )
  {
    // Only allow .index suffixes

    if ( i->size() < 6 ||
         strcasecmp( i->c_str() + ( i->size() - 6 ), ".index" ) != 0 )
      continue;

    try
    {
      vector< string > dictFiles( 1, *i );

      // Check if there is a corresponding .dict or .dict.dz file present.
      // Only "index" is cut off here, so the base name ends with a dot.

      string baseName( *i, 0, i->size() - 5 );

      dictFiles.push_back( string() );

      if ( !tryPossibleName( baseName + "dict", dictFiles[ 1 ] ) &&
           !tryPossibleName( baseName + "dict.dz", dictFiles[ 1 ] ) )
      {
        // No corresponding .dict file, skipping
        continue;
      }

      string dictId = Dictionary::makeDictionaryId( dictFiles );

      string indexFile = indicesDir + dictId;

      if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
           indexIsOldOrBad( indexFile ) )
      {
        // Building the index

        initializing.indexingDictionary( nameFromFileName( dictFiles[ 0 ] ) );

        buildIndexFile( dictFiles[ 0 ], indexFile );
      }

      dictionaries.push_back( new DictdDictionary( dictId, indexFile, dictFiles ) );
    }
    catch( std::exception & e )
    {
      fprintf( stderr, "Dictd dictionary reading failed: %s, error: %s\n",
               i->c_str(), e.what() );
    }
  }

  return dictionaries;
}

}