diff --git a/src/article-style.css b/src/article-style.css index 0da099be..2f90a3bb 100644 --- a/src/article-style.css +++ b/src/article-style.css @@ -246,6 +246,13 @@ div.sdct_x margin-top: 1em; } +/************* Dictd articles *****************/ +.dictd_article +{ + /* Add some vertical space before the article */ + margin-top: 1em; +} + /************* MediaWiki articles ***************** The following consist of excerpts from different .css files edited with a .mwiki prepended to each record. diff --git a/src/dictdfiles.cc b/src/dictdfiles.cc new file mode 100644 index 00000000..a23dc244 --- /dev/null +++ b/src/dictdfiles.cc @@ -0,0 +1,409 @@ +/* This file is (c) 2008-2009 Konstantin Isakov + * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ + +#include "dictdfiles.hh" +#include "btreeidx.hh" +#include "folding.hh" +#include "utf8.hh" +#include "dictzip.h" +#include "htmlescape.hh" +#include "fsencoding.hh" +#include +#include +#include +#include +#include +#include +#include + +namespace DictdFiles { + +using std::map; +using std::multimap; +using std::pair; +using std::set; +using std::string; +using std::wstring; +using std::vector; +using std::list; + +using BtreeIndexing::WordArticleLink; +using BtreeIndexing::IndexedWords; + +namespace { + +DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex ) +DEF_EX( exFailedToReadLineFromIndex, "Failed to read line from index file", Dictionary::Ex ) +DEF_EX( exMalformedIndexFileLine, "Malformed index file line encountered", Dictionary::Ex ) +DEF_EX( exInvalidBase64, "Invalid base64 sequence encountered", Dictionary::Ex ) + +enum +{ + Signature = 0x58444344, // DCDX on little-endian, XDCD on big-endian + CurrentFormatVersion = 1 + BtreeIndexing::FormatVersion + Folding::Version +}; + +struct IdxHeader +{ + uint32_t signature; // First comes the signature, DCDX + uint32_t formatVersion; // File format version (CurrentFormatVersion) + uint32_t wordCount; // Total number of words + uint32_t indexOffset; // The offset of the index in the file +} __attribute__((packed)); + +bool indexIsOldOrBad( string const & indexFile ) +{ + File::Class idx( indexFile, "rb" ); + + IdxHeader header; + + return idx.readRecords( &header, sizeof( header ), 1 ) != 1 || + header.signature != Signature || + header.formatVersion != CurrentFormatVersion; +} + +class DictdDictionary: public BtreeIndexing::BtreeDictionary +{ + Mutex idxMutex; + File::Class idx, indexFile; // The later is .index file + IdxHeader idxHeader; + dictData * dz; + +public: + + DictdDictionary( string const & id, string const & indexFile, + vector< string > const & dictionaryFiles ); + + ~DictdDictionary(); + + virtual string getName() throw(); + + virtual map< Dictionary::Property, string > getProperties() throw() + { return map< Dictionary::Property, string >(); } + + virtual unsigned long getArticleCount() throw() + { return idxHeader.wordCount; } + + virtual unsigned long getWordCount() throw() + { return idxHeader.wordCount; } + + virtual sptr< Dictionary::DataRequest > getArticle( wstring const &, + vector< wstring > const & alts ) + throw( std::exception ); +}; + +DictdDictionary::DictdDictionary( string const & id, + string const & indexFile, + vector< string > const & dictionaryFiles ): + BtreeDictionary( id, dictionaryFiles ), + idx( indexFile, "rb" ), + indexFile( dictionaryFiles[ 0 ], "rb" ), + idxHeader( idx.read< IdxHeader >() ) +{ + // Open the .dict file + + dz = dict_data_open( dictionaryFiles[ 1 ].c_str(), 0 ); + + if ( !dz ) + throw exCantReadFile( dictionaryFiles[ 1 ] ); + + // Initialize the index + + idx.seek( idxHeader.indexOffset ); + + openIndex( idx, idxMutex ); +} + +DictdDictionary::~DictdDictionary() +{ + if ( dz ) + dict_data_close( dz ); +} + +string nameFromFileName( string const & indexFileName ) +{ + if ( indexFileName.empty() ) + return string(); + + char const * sep = strrchr( indexFileName.c_str(), FsEncoding::separator() ); + + if ( !sep ) + sep = indexFileName.c_str(); + + char const * dot = strrchr( sep, '.' ); + + if ( !dot ) + dot = indexFileName.c_str() + indexFileName.size(); + + return Utf8::encode( FsEncoding::decode( string( sep + 1, dot - sep - 1 ) ) ); +} + +string DictdDictionary::getName() throw() +{ + return nameFromFileName( getDictionaryFilenames()[ 0 ] ); +} + +uint32_t decodeBase64( string const & str ) +{ + static char const digits[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + + uint32_t number = 0; + + for( char const * next = str.c_str(); *next; ++next ) + { + char const * d = strchr( digits, *next ); + + if ( !d ) + throw exInvalidBase64(); + + number = number * 64 + ( d - digits ); + } + + return number; +} + +sptr< Dictionary::DataRequest > DictdDictionary::getArticle( wstring const & word, + vector< wstring > const & alts ) + throw( std::exception ) +{ + try + { + vector< WordArticleLink > chain = findArticles( word ); + + for( unsigned x = 0; x < alts.size(); ++x ) + { + /// Make an additional query for each alt + + vector< WordArticleLink > altChain = findArticles( alts[ x ] ); + + chain.insert( chain.end(), altChain.begin(), altChain.end() ); + } + + multimap< wstring, string > mainArticles, alternateArticles; + + set< uint32_t > articlesIncluded; // Some synonyms make it that the articles + // appear several times. We combat this + // by only allowing them to appear once. + + wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); + + char buf[ 16384 ]; + + for( unsigned x = 0; x < chain.size(); ++x ) + { + if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() ) + continue; // We already have this article in the body. + + // Now load that article + + indexFile.seek( chain[ x ].articleOffset ); + + if ( !indexFile.gets( buf, sizeof( buf ), true ) ) + throw exFailedToReadLineFromIndex(); + + char * tab1 = strchr( buf, '\t' ); + + if ( !tab1 ) + throw exMalformedIndexFileLine(); + + char * tab2 = strchr( tab1 + 1, '\t' ); + + if ( !tab2 ) + throw exMalformedIndexFileLine(); + + // After tab1 should be article offset, after tab2 -- article size + + uint32_t articleOffset = decodeBase64( string( tab1 + 1, tab2 - tab1 - 1 ) ); + uint32_t articleSize = decodeBase64( tab2 + 1 ); + + char * articleBody = dict_data_read_( dz, articleOffset, articleSize, 0, 0 ); + + if ( !articleBody ) + throw exCantReadFile( getDictionaryFilenames()[ 1 ] ); + + //sprintf( buf, "Offset: %u, Size: %u\n", articleOffset, articleSize ); + + string articleText = string( "
" ) + + Html::preformat( articleBody ) + "
"; + + free( articleBody ); + + // Ok. Now, does it go to main articles, or to alternate ones? We list + // main ones first, and alternates after. + + // We do the case-folded comparison here. + + wstring headwordStripped = + Folding::applySimpleCaseOnly( Utf8::decode( chain[ x ].word ) ); + + multimap< wstring, string > & mapToUse = + ( wordCaseFolded == headwordStripped ) ? + mainArticles : alternateArticles; + + mapToUse.insert( pair< wstring, string >( + Folding::applySimpleCaseOnly( Utf8::decode( chain[ x ].word ) ), + articleText ) ); + + articlesIncluded.insert( chain[ x ].articleOffset ); + } + + if ( mainArticles.empty() && alternateArticles.empty() ) + return new Dictionary::DataRequestInstant( false ); + + string result; + + multimap< wstring, string >::const_iterator i; + + for( i = mainArticles.begin(); i != mainArticles.end(); ++i ) + result += i->second; + + for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i ) + result += i->second; + + sptr< Dictionary::DataRequestInstant > ret = + new Dictionary::DataRequestInstant( true ); + + ret->getData().resize( result.size() ); + + memcpy( &(ret->getData().front()), result.data(), result.size() ); + + return ret; + } + catch( std::exception & e ) + { + return new Dictionary::DataRequestInstant( QString( e.what() ) ); + } +} + +} // anonymous namespace + +static bool tryPossibleName( string const & name, string & copyTo ) +{ + try + { + File::Class f( name, "rb" ); + + copyTo = name; + + return true; + } + catch( ... ) + { + return false; + } +} + +vector< sptr< Dictionary::Class > > makeDictionaries( + vector< string > const & fileNames, + string const & indicesDir, + Dictionary::Initializing & initializing ) + throw( std::exception ) +{ + vector< sptr< Dictionary::Class > > dictionaries; + + for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end(); + ++i ) + { + // Only allow .index suffixes + + if ( i->size() < 6 || + strcasecmp( i->c_str() + ( i->size() - 6 ), ".index" ) != 0 ) + continue; + + try + { + vector< string > dictFiles( 1, *i ); + + // Check if there is an 'abrv' file present + string baseName( *i, 0, i->size() - 5 ); + + dictFiles.push_back( string() ); + + if ( !tryPossibleName( baseName + "dict", dictFiles[ 1 ] ) && + !tryPossibleName( baseName + "dict.dz", dictFiles[ 1 ] ) ) + { + // No corresponding .dict file, skipping + continue; + } + + string dictId = Dictionary::makeDictionaryId( dictFiles ); + + string indexFile = indicesDir + dictId; + + if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) || + indexIsOldOrBad( indexFile ) ) + { + // Building the index + initializing.indexingDictionary( nameFromFileName( dictFiles[ 0 ] ) ); + + File::Class idx( indexFile, "wb" ); + + IdxHeader idxHeader; + + memset( &idxHeader, 0, sizeof( idxHeader ) ); + + // We write a dummy header first. At the end of the process the header + // will be rewritten with the right values. + + idx.write( idxHeader ); + + IndexedWords indexedWords; + + File::Class indexFile( dictFiles[ 0 ], "r" ); + + // Read words from index until none's left. + + char buf[ 16384 ]; + + do + { + uint32_t curOffset = indexFile.tell(); + + if ( !indexFile.gets( buf, sizeof( buf ), true ) ) + break; + + // Check that there are exactly two tabs in the record. + + char * tab = strchr( buf, '\t' ); + + if ( !tab || ! ( tab = strchr( tab + 1, '\t' ) ) || strchr( tab + 1, '\t' ) ) + { + printf( "Warning: incorrect amount of tabs in a line, skipping: %s\n", buf ); + continue; + } + + indexedWords.addWord( Utf8::decode( string( buf, strchr( buf, '\t' ) - buf ) ), curOffset ); + + ++idxHeader.wordCount; + + } while( !indexFile.eof() ); + + // Build index + + idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx ); + + // That concludes it. Update the header. + + idxHeader.signature = Signature; + idxHeader.formatVersion = CurrentFormatVersion; + + idx.rewind(); + + idx.write( &idxHeader, sizeof( idxHeader ) ); + } + + dictionaries.push_back( new DictdDictionary( dictId, + indexFile, + dictFiles ) ); + } + catch( std::exception & e ) + { + fprintf( stderr, "Dictd dictionary reading failed: %s, error: %s\n", + i->c_str(), e.what() ); + } + } + + return dictionaries; +} + +} diff --git a/src/dictdfiles.hh b/src/dictdfiles.hh new file mode 100644 index 00000000..d32203b7 --- /dev/null +++ b/src/dictdfiles.hh @@ -0,0 +1,23 @@ +/* This file is (c) 2008-2009 Konstantin Isakov + * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ + +#ifndef __DICTDFILES_HH_INCLUDED__ +#define __DICTDFILES_HH_INCLUDED__ + +#include "dictionary.hh" + +/// Support for the dictd (.index/dict.dz) files. +namespace DictdFiles { + +using std::vector; +using std::string; + +vector< sptr< Dictionary::Class > > makeDictionaries( + vector< string > const & fileNames, + string const & indicesDir, + Dictionary::Initializing & ) + throw( std::exception ); + +} + +#endif diff --git a/src/dictionary.hh b/src/dictionary.hh index 7888abf3..6ee75fcf 100644 --- a/src/dictionary.hh +++ b/src/dictionary.hh @@ -206,6 +206,9 @@ public: DataRequestInstant( bool succeeded ) { hasAnyData = succeeded; finish(); } + DataRequestInstant( QString const & errorString ) + { setErrorString( errorString ); finish(); } + virtual void cancel() {} diff --git a/src/fsencoding.cc b/src/fsencoding.cc index ac422678..517a7869 100644 --- a/src/fsencoding.cc +++ b/src/fsencoding.cc @@ -18,6 +18,11 @@ string encode( string const & str ) return string( QString::fromUtf8( str.c_str() ).toLocal8Bit().data() ); } +wstring decode( string const & str ) +{ + return QString::fromLocal8Bit( str.c_str() ).toStdWString(); +} + char separator() { return QDir::separator().toAscii(); diff --git a/src/fsencoding.hh b/src/fsencoding.hh index 0afd873b..a4468c90 100644 --- a/src/fsencoding.hh +++ b/src/fsencoding.hh @@ -20,6 +20,9 @@ string encode( wstring const & ); /// Encodes the given string in utf8 to the system 8bit encoding. string encode( string const & ); +/// Decodes the given 8bit-encoded string to a wide string. +wstring decode( string const & str ); + /// Returns the filesystem separator (/ on Unix and clones, \ on Windows). char separator(); diff --git a/src/goldendict.pro b/src/goldendict.pro index e303adc9..29379d4d 100644 --- a/src/goldendict.pro +++ b/src/goldendict.pro @@ -68,7 +68,8 @@ HEADERS += folding.hh \ mutex.hh \ mediawiki.hh \ sounddir.hh \ - hunspell.hh + hunspell.hh \ + dictdfiles.hh FORMS += groups.ui dictgroupwidget.ui mainwindow.ui sources.ui initializing.ui\ @@ -83,7 +84,7 @@ SOURCES += folding.cc main.cc dictionary.cc config.cc sources.cc \ groups_widgets.cc instances.cc article_maker.cc scanpopup.cc \ articleview.cc externalviewer.cc wordfinder.cc \ groupcombobox.cc keyboardstate.cc mouseover.cc preferences.cc \ - mutex.cc mediawiki.cc sounddir.cc hunspell.cc + mutex.cc mediawiki.cc sounddir.cc hunspell.cc dictdfiles.cc win32 { SOURCES += mouseover_win32/ThTypes.c diff --git a/src/htmlescape.cc b/src/htmlescape.cc index e7dd0622..9b795aeb 100644 --- a/src/htmlescape.cc +++ b/src/htmlescape.cc @@ -39,4 +39,47 @@ string escape( string const & str ) return result; } +string preformat( string const & str ) +{ + string escaped = escape( str ), result; + + result.reserve( escaped.size() ); + + bool leading = true; + + for( char const * nextChar = escaped.c_str(); *nextChar; ++nextChar ) + { + if ( leading ) + { + if ( *nextChar == ' ' ) + { + result += " "; + continue; + } + else + if ( *nextChar == '\t' ) + { + result += "    "; + continue; + } + } + + if ( *nextChar == '\n' ) + { + result += "
"; + leading = true; + continue; + } + + if ( *nextChar == '\r' ) + continue; // Just skip all \r + + result.push_back( *nextChar ); + + leading = false; + } + + return result; +} + } diff --git a/src/htmlescape.hh b/src/htmlescape.hh index 792d0007..9bed5014 100644 --- a/src/htmlescape.hh +++ b/src/htmlescape.hh @@ -15,6 +15,10 @@ using std::string; // to make the result suitable for inserting as attributes' values. string escape( string const & ); +// Converts the given preformatted text to html. Each end of line is replaced by +//
, each leading space is converted to  . +string preformat( string const & ); + } #endif diff --git a/src/mainwindow.cc b/src/mainwindow.cc index 5dd71efe..acc2112c 100644 --- a/src/mainwindow.cc +++ b/src/mainwindow.cc @@ -12,6 +12,7 @@ #include "mediawiki.hh" #include "sounddir.hh" #include "hunspell.hh" +#include "dictdfiles.hh" #include "ui_about.h" #include #include @@ -270,6 +271,14 @@ void LoadDictionaries::handlePath( Config::Path const & path ) dictionaries.insert( dictionaries.end(), dslDictionaries.begin(), dslDictionaries.end() ); } + + { + vector< sptr< Dictionary::Class > > dictdDictionaries = + DictdFiles::makeDictionaries( allFiles, Config::getIndexDir().toLocal8Bit().data(), *this ); + + dictionaries.insert( dictionaries.end(), dictdDictionaries.begin(), + dictdDictionaries.end() ); + } } void LoadDictionaries::indexingDictionary( string const & dictionaryName ) throw()