+ Support for dictd files (.index/.dict(.dz)) added, along with other small

accompanying changes.
2024-11-27 15:24:05 +00:00 · 2009-04-09 18:50:49 +00:00 · 2009-04-09 18:50:49 +00:00 · 9df2db4011
parent ccd235e9c3
commit 9df2db4011
10 changed files with 509 additions and 2 deletions
--- a/src/article-style.css
+++ b/src/article-style.css
@ -246,6 +246,13 @@ div.sdct_x
  margin-top: 1em;
 }

+/************* Dictd articles *****************/
+.dictd_article
+{
+  /* Leave some vertical space above each dictd article */
+  margin-top: 1em;
+}
+
 /************* MediaWiki articles *****************
 The following consist of excerpts from different .css files edited
 with a .mwiki prepended to each record.
--- a/src/dictdfiles.cc
+++ b/src/dictdfiles.cc
@ -0,0 +1,409 @@
+/* This file is (c) 2008-2009 Konstantin Isakov <ikm@users.berlios.de>
+ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
+
+#include "dictdfiles.hh"
+#include "btreeidx.hh"
+#include "folding.hh"
+#include "utf8.hh"
+#include "dictzip.h"
+#include "htmlescape.hh"
+#include "fsencoding.hh"
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+#include <list>
+#include <wctype.h>
+#include <stdlib.h>
+
+namespace DictdFiles {
+
+using std::map;
+using std::multimap;
+using std::pair;
+using std::set;
+using std::string;
+using std::wstring;
+using std::vector;
+using std::list;
+
+using BtreeIndexing::WordArticleLink;
+using BtreeIndexing::IndexedWords;
+
+namespace {
+
+DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex ) // Thrown with the .dict file name when it can't be opened or read
+DEF_EX( exFailedToReadLineFromIndex, "Failed to read line from index file", Dictionary::Ex ) // Thrown by getArticle() when a .index line can't be read
+DEF_EX( exMalformedIndexFileLine, "Malformed index file line encountered", Dictionary::Ex ) // Thrown by getArticle() when a .index line lacks its two tabs
+DEF_EX( exInvalidBase64, "Invalid base64 sequence encountered", Dictionary::Ex ) // Thrown by decodeBase64() on a character outside its alphabet
+
+enum
+{
+  Signature = 0x58444344, // Spells DCDX on little-endian hosts, XDCD on big-endian ones
+  CurrentFormatVersion = 1 + BtreeIndexing::FormatVersion + Folding::Version // Also changes whenever the btree or folding version does
+};
+
+struct IdxHeader
+{
+  uint32_t signature; // First comes the signature, DCDX (see Signature)
+  uint32_t formatVersion; // File format version; an index whose value differs from CurrentFormatVersion gets rebuilt
+  uint32_t wordCount; // Total number of words, i.e. of .index lines accepted while indexing
+  uint32_t indexOffset; // The offset of the index in the file, as returned by BtreeIndexing::buildIndex()
+} __attribute__((packed));
+
+/// Tells whether the index file at the given path has to be regenerated:
+/// either its header can't be read in full, or the signature or the format
+/// version stored there don't match the ones this code produces.
+bool indexIsOldOrBad( string const & indexFile )
+{
+  File::Class idx( indexFile, "rb" );
+
+  IdxHeader storedHeader;
+
+  // A short read means the file is truncated or empty
+  if ( idx.readRecords( &storedHeader, sizeof( storedHeader ), 1 ) != 1 )
+    return true;
+
+  // Not one of our index files at all
+  if ( storedHeader.signature != Signature )
+    return true;
+
+  // Ours, but possibly written by a different format version
+  return storedHeader.formatVersion != CurrentFormatVersion;
+}
+
+/// A dictionary backed by a dictd .index file and its .dict(.dz) data file.
+/// Headwords are looked up through the btree index stored in a separate
+/// index file.
+class DictdDictionary: public BtreeIndexing::BtreeDictionary
+{
+  Mutex idxMutex;
+  File::Class idx; // The btree index file
+  File::Class indexFile; // The latter is the dictd .index file
+  IdxHeader idxHeader; // Header read from the start of idx
+  dictData * dz; // Handle of the opened .dict(.dz) file
+
+public:
+
+  DictdDictionary( string const & id, string const & indexFile,
+                   vector< string > const & dictionaryFiles );
+
+  ~DictdDictionary();
+
+  virtual string getName() throw();
+
+  /// No properties are reported for dictd dictionaries.
+  virtual map< Dictionary::Property, string > getProperties() throw()
+  {
+    return map< Dictionary::Property, string >();
+  }
+
+  /// Reported as equal to the word count stored in the index header.
+  virtual unsigned long getArticleCount() throw()
+  {
+    return idxHeader.wordCount;
+  }
+
+  /// The word count stored in the index header.
+  virtual unsigned long getWordCount() throw()
+  {
+    return idxHeader.wordCount;
+  }
+
+  virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
+                                                      vector< wstring > const & alts )
+    throw( std::exception );
+};
+
+/// Opens the btree index file, the dictd .index file (dictionaryFiles[ 0 ])
+/// and the .dict(.dz) file (dictionaryFiles[ 1 ]), then attaches the btree
+/// index. Throws exCantReadFile if the .dict file can't be opened; whatever
+/// the file and index operations throw is propagated too.
+DictdDictionary::DictdDictionary( string const & id,
+                                  string const & indexFile,
+                                  vector< string > const & dictionaryFiles ):
+  BtreeDictionary( id, dictionaryFiles ),
+  idx( indexFile, "rb" ),
+  indexFile( dictionaryFiles[ 0 ], "rb" ),
+  idxHeader( idx.read< IdxHeader >() )
+{
+  // Open the .dict file
+
+  dz = dict_data_open( dictionaryFiles[ 1 ].c_str(), 0 );
+
+  if ( !dz )
+    throw exCantReadFile( dictionaryFiles[ 1 ] );
+
+  // Initialize the index. The destructor doesn't run when a constructor
+  // throws, so dz has to be released here should anything below fail.
+
+  try
+  {
+    idx.seek( idxHeader.indexOffset );
+
+    openIndex( idx, idxMutex );
+  }
+  catch( ... )
+  {
+    dict_data_close( dz );
+    throw;
+  }
+}
+
+/// Closes the .dict(.dz) file handle, if there is one.
+DictdDictionary::~DictdDictionary()
+{
+  if ( !dz )
+    return;
+
+  dict_data_close( dz );
+}
+
+/// Derives a dictionary name from the given file name: the directory part
+/// (everything up to and including the last filesystem separator) and the
+/// last extension (everything from the last dot on) are removed, and the
+/// remainder is converted from the filesystem encoding to utf8. An empty
+/// input gives an empty name.
+string nameFromFileName( string const & indexFileName )
+{
+  if ( indexFileName.empty() )
+    return string();
+
+  char const * const begin = indexFileName.c_str();
+
+  // The name starts right after the last separator, or at the very beginning
+  // if there is no directory part. Starting at begin rather than at the
+  // separator itself keeps the first character of separator-less names and
+  // prevents a negative length for names like ".index".
+
+  char const * const sep = strrchr( begin, FsEncoding::separator() );
+
+  char const * const nameStart = sep ? sep + 1 : begin;
+
+  char const * dot = strrchr( nameStart, '.' );
+
+  if ( !dot )
+    dot = begin + indexFileName.size(); // No extension, take it all
+
+  return Utf8::encode( FsEncoding::decode( string( nameStart, dot - nameStart ) ) );
+}
+
+/// The dictionary name is derived from the name of the first dictionary
+/// file.
+string DictdDictionary::getName() throw()
+{
+  vector< string > const & files = getDictionaryFilenames();
+
+  return nameFromFileName( files[ 0 ] );
+}
+
+/// Decodes a number written in the base64 positional notation used by dictd
+/// .index files: every character is one base-64 digit, most significant
+/// first, with no padding. An empty string decodes to 0.
+/// Throws exInvalidBase64 if a character is not in the alphabet, or if the
+/// value doesn't fit into 32 bits.
+uint32_t decodeBase64( string const & str )
+{
+  static char const digits[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+  uint32_t number = 0;
+
+  for( char const * next = str.c_str(); *next; ++next )
+  {
+    char const * d = strchr( digits, *next );
+
+    if ( !d )
+      throw exInvalidBase64();
+
+    uint32_t const digit = (uint32_t)( d - digits );
+
+    // Refuse to wrap around silently: a wrapped offset or size would make us
+    // read a wrong part of the .dict file.
+    if ( number > ( 0xFFFFFFFFu - digit ) / 64 )
+      throw exInvalidBase64();
+
+    number = number * 64 + digit;
+  }
+
+  return number;
+}
+
+/// Builds the html for all the articles found for the given word and its
+/// alternate forms. Articles whose case-folded headword equals the
+/// case-folded word go first, all the others follow. Each article is wrapped
+/// into a <div class="dictd_article">.
+/// Returns a finished request: without data if nothing was found, with the
+/// html otherwise. Any std::exception raised on the way is turned into a
+/// finished request carrying the error string.
+sptr< Dictionary::DataRequest > DictdDictionary::getArticle( wstring const & word,
+                                                             vector< wstring > const & alts )
+  throw( std::exception )
+{
+  try
+  {
+    vector< WordArticleLink > chain = findArticles( word );
+
+    for( unsigned x = 0; x < alts.size(); ++x )
+    {
+      /// Make an additional query for each alt
+
+      vector< WordArticleLink > altChain = findArticles( alts[ x ] );
+
+      chain.insert( chain.end(), altChain.begin(), altChain.end() );
+    }
+
+    multimap< wstring, string > mainArticles, alternateArticles;
+
+    set< uint32_t > articlesIncluded; // Some synonyms make it that the articles
+                                      // appear several times. We combat this
+                                      // by only allowing them to appear once.
+
+    wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
+
+    char buf[ 16384 ];
+
+    for( unsigned x = 0; x < chain.size(); ++x )
+    {
+      if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
+        continue; // We already have this article in the body.
+
+      // Now load that article. The offset stored in our index points to the
+      // word's line in the .index file, so that line gets read first.
+
+      indexFile.seek( chain[ x ].articleOffset );
+
+      if ( !indexFile.gets( buf, sizeof( buf ), true ) )
+        throw exFailedToReadLineFromIndex();
+
+      char * tab1 = strchr( buf, '\t' );
+
+      if ( !tab1 )
+        throw exMalformedIndexFileLine();
+
+      char * tab2 = strchr( tab1 + 1, '\t' );
+
+      if ( !tab2 )
+        throw exMalformedIndexFileLine();
+
+      // After tab1 should be article offset, after tab2 -- article size
+
+      uint32_t articleOffset = decodeBase64( string( tab1 + 1, tab2 - tab1 - 1 ) );
+      uint32_t articleSize = decodeBase64( tab2 + 1 );
+
+      char * articleBody = dict_data_read_( dz, articleOffset, articleSize, 0, 0 );
+
+      if ( !articleBody )
+        throw exCantReadFile( getDictionaryFilenames()[ 1 ] );
+
+      // articleBody has to be free()d by us, whether or not building the
+      // html succeeds.
+
+      string articleText;
+
+      try
+      {
+        articleText = string( "<div class=\"dictd_article\">" ) +
+          Html::preformat( articleBody ) + "</div>";
+      }
+      catch( ... )
+      {
+        free( articleBody );
+        throw;
+      }
+
+      free( articleBody );
+
+      // Ok. Now, does it go to main articles, or to alternate ones? We list
+      // main ones first, and alternates after.
+
+      // We do the case-folded comparison here.
+
+      wstring headwordStripped =
+        Folding::applySimpleCaseOnly( Utf8::decode( chain[ x ].word ) );
+
+      multimap< wstring, string > & mapToUse =
+        ( wordCaseFolded == headwordStripped ) ?
+          mainArticles : alternateArticles;
+
+      // The folded headword is the sorting key within each group
+
+      mapToUse.insert( pair< wstring, string >( headwordStripped, articleText ) );
+
+      articlesIncluded.insert( chain[ x ].articleOffset );
+    }
+
+    if ( mainArticles.empty() && alternateArticles.empty() )
+      return new Dictionary::DataRequestInstant( false );
+
+    string result;
+
+    multimap< wstring, string >::const_iterator i;
+
+    for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
+      result += i->second;
+
+    for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
+      result += i->second;
+
+    sptr< Dictionary::DataRequestInstant > ret =
+      new Dictionary::DataRequestInstant( true );
+
+    // result can't be empty here: there is at least one article, and each
+    // one carries its <div> wrapper.
+
+    ret->getData().resize( result.size() );
+
+    memcpy( &(ret->getData().front()), result.data(), result.size() );
+
+    return ret;
+  }
+  catch( std::exception & e )
+  {
+    return new Dictionary::DataRequestInstant( QString( e.what() ) );
+  }
+}
+
+} // anonymous namespace
+
+/// Checks whether a file with the given name can be opened for reading. If
+/// it can, the name is stored to copyTo and true is returned; otherwise
+/// copyTo is left alone and false is returned. Never throws.
+static bool tryPossibleName( string const & name, string & copyTo )
+{
+  bool usable = false;
+
+  try
+  {
+    {
+      // Opening the file is the whole test, it is closed again right away
+      File::Class probe( name, "rb" );
+
+      copyTo = name;
+    }
+
+    usable = true;
+  }
+  catch( ... )
+  {
+    // Any failure counts as "no such file"
+  }
+
+  return usable;
+}
+
+/// Creates a dictionary for every file in fileNames that has the .index
+/// suffix (case-insensitively) and a matching .dict or .dict.dz file next to
+/// it. The btree index of each dictionary is kept in indicesDir under the
+/// dictionary id and gets (re)built if it is missing, outdated or malformed;
+/// initializing is notified before each rebuild.
+/// A dictionary that fails to load is reported to stderr and skipped, it
+/// doesn't prevent the others from loading.
+vector< sptr< Dictionary::Class > > makeDictionaries(
+                                      vector< string > const & fileNames,
+                                      string const & indicesDir,
+                                      Dictionary::Initializing & initializing )
+  throw( std::exception )
+{
+  vector< sptr< Dictionary::Class > > dictionaries;
+
+  for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
+       ++i )
+  {
+    // Only allow .index suffixes
+
+    if ( i->size() < 6 ||
+         strcasecmp( i->c_str() + ( i->size() - 6 ), ".index" ) != 0 )
+      continue;
+
+    try
+    {
+      vector< string > dictFiles( 1, *i );
+
+      // Check if there is a corresponding .dict or .dict.dz file present.
+      // The base name keeps the trailing dot, only "index" is cut off.
+      string baseName( *i, 0, i->size() - 5 );
+
+      dictFiles.push_back( string() );
+
+      if ( !tryPossibleName( baseName + "dict", dictFiles[ 1 ] ) &&
+           !tryPossibleName( baseName + "dict.dz", dictFiles[ 1 ] ) )
+      {
+        // No corresponding .dict file, skipping
+        continue;
+      }
+
+      string dictId = Dictionary::makeDictionaryId( dictFiles );
+
+      string indexFile = indicesDir + dictId;
+
+      if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
+           indexIsOldOrBad( indexFile ) )
+      {
+        // Building the index
+        initializing.indexingDictionary( nameFromFileName( dictFiles[ 0 ] ) );
+
+        File::Class idx( indexFile, "wb" );
+
+        IdxHeader idxHeader;
+
+        memset( &idxHeader, 0, sizeof( idxHeader ) );
+
+        // We write a dummy header first. At the end of the process the header
+        // will be rewritten with the right values.
+
+        idx.write( idxHeader );
+
+        IndexedWords indexedWords;
+
+        // The .index file is opened in binary mode, the same way
+        // DictdDictionary opens it for lookups: the offsets taken with tell()
+        // here are later passed to seek() there, and text mode positions
+        // aren't guaranteed to be byte offsets.
+
+        File::Class dictdIndex( dictFiles[ 0 ], "rb" );
+
+        // Read words from index until none's left.
+
+        char buf[ 16384 ];
+
+        do
+        {
+          uint32_t curOffset = dictdIndex.tell();
+
+          if ( !dictdIndex.gets( buf, sizeof( buf ), true ) )
+            break;
+
+          // Check that there are exactly two tabs in the record.
+
+          char * firstTab = strchr( buf, '\t' );
+          char * secondTab = firstTab ? strchr( firstTab + 1, '\t' ) : 0;
+
+          if ( !secondTab || strchr( secondTab + 1, '\t' ) )
+          {
+            printf( "Warning: incorrect amount of tabs in a line, skipping: %s\n", buf );
+            continue;
+          }
+
+          // The headword is everything before the first tab. It is indexed
+          // against the offset of its line in the .index file.
+
+          indexedWords.addWord( Utf8::decode( string( buf, firstTab - buf ) ), curOffset );
+
+          ++idxHeader.wordCount;
+
+        } while( !dictdIndex.eof() );
+
+        // Build index
+
+        idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx );
+
+        // That concludes it. Update the header.
+
+        idxHeader.signature = Signature;
+        idxHeader.formatVersion = CurrentFormatVersion;
+
+        idx.rewind();
+
+        idx.write( &idxHeader, sizeof( idxHeader ) );
+      }
+
+      dictionaries.push_back( new DictdDictionary( dictId,
+                                                   indexFile,
+                                                   dictFiles ) );
+    }
+    catch( std::exception & e )
+    {
+      fprintf( stderr, "Dictd dictionary reading failed: %s, error: %s\n",
+        i->c_str(), e.what() );
+    }
+  }
+
+  return dictionaries;
+}
+
+}
--- a/src/dictdfiles.hh
+++ b/src/dictdfiles.hh
@ -0,0 +1,23 @@
+/* This file is (c) 2008-2009 Konstantin Isakov <ikm@users.berlios.de>
+ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
+
+#ifndef __DICTDFILES_HH_INCLUDED__
+#define __DICTDFILES_HH_INCLUDED__
+
+#include "dictionary.hh"
+
+/// Support for the dictd (.index/.dict(.dz)) files.
+namespace DictdFiles {
+
+using std::vector;
+using std::string;
+
+vector< sptr< Dictionary::Class > > makeDictionaries(
+                                      vector< string > const & fileNames,
+                                      string const & indicesDir,
+                                      Dictionary::Initializing & )
+    throw( std::exception );
+
+}
+
+#endif
--- a/src/dictionary.hh
+++ b/src/dictionary.hh
@ -206,6 +206,9 @@ public:
  DataRequestInstant( bool succeeded )
  { hasAnyData = succeeded; finish(); }

+  DataRequestInstant( QString const & errorString )
+  { setErrorString( errorString ); finish(); }
+
  virtual void cancel()
  {}

--- a/src/fsencoding.cc
+++ b/src/fsencoding.cc
@ -18,6 +18,11 @@ string encode( string const & str )
  return string( QString::fromUtf8( str.c_str() ).toLocal8Bit().data() );
 }

+/// Converts a string in the system 8bit encoding to a wide string, going
+/// through QString.
+wstring decode( string const & str )
+{
+  QString const unicode = QString::fromLocal8Bit( str.c_str() );
+
+  return unicode.toStdWString();
+}
+
 char separator()
 {
  return QDir::separator().toAscii();
--- a/src/fsencoding.hh
+++ b/src/fsencoding.hh
@ -20,6 +20,9 @@ string encode( wstring const & );
 /// Encodes the given string in utf8 to the system 8bit encoding.
 string encode( string const & );

+/// Decodes the given 8bit-encoded string to a wide string.
+wstring decode( string const & str );
+
 /// Returns the filesystem separator (/ on Unix and clones, \ on Windows).
 char separator();

--- a/src/goldendict.pro
+++ b/src/goldendict.pro
@ -68,7 +68,8 @@ HEADERS += folding.hh \
 	   mutex.hh \
 	   mediawiki.hh \
 	   sounddir.hh \
-           hunspell.hh
+           hunspell.hh \
+           dictdfiles.hh
 	  

 FORMS += groups.ui dictgroupwidget.ui mainwindow.ui sources.ui initializing.ui\
@ -83,7 +84,7 @@ SOURCES += folding.cc main.cc dictionary.cc config.cc sources.cc \
 	   groups_widgets.cc instances.cc article_maker.cc scanpopup.cc \
 	   articleview.cc externalviewer.cc wordfinder.cc \
 	   groupcombobox.cc keyboardstate.cc mouseover.cc preferences.cc \
-	   mutex.cc mediawiki.cc sounddir.cc hunspell.cc
+	   mutex.cc mediawiki.cc sounddir.cc hunspell.cc dictdfiles.cc

 win32 {
  SOURCES += mouseover_win32/ThTypes.c
--- a/src/htmlescape.cc
+++ b/src/htmlescape.cc
@ -39,4 +39,47 @@ string escape( string const & str )
  return result;
 }

+/// Turns preformatted plain text into html: the text is escaped first, then
+/// every line feed becomes <br/>, every space at the start of a line becomes
+/// &nbsp;, every tab at the start of a line becomes four &nbsp;s, and all
+/// carriage returns are dropped. Processing stops at the first NUL character
+/// of the escaped text.
+string preformat( string const & str )
+{
+  string const escaped = escape( str );
+
+  string result;
+
+  result.reserve( escaped.size() );
+
+  // True until the first character that is neither a space nor a tab (nor a
+  // skipped \r) is seen on the current line
+  bool atLineStart = true;
+
+  for( char const * p = escaped.c_str(); *p != '\0'; ++p )
+  {
+    char const ch = *p;
+
+    if ( ch == '\n' )
+    {
+      result += "<br/>";
+      atLineStart = true;
+    }
+    else
+    if ( ch == '\r' )
+    {
+      // Just skip all \r, they don't affect the line start tracking
+    }
+    else
+    if ( atLineStart && ch == ' ' )
+      result += "&nbsp;";
+    else
+    if ( atLineStart && ch == '\t' )
+      result += "&nbsp;&nbsp;&nbsp;&nbsp;";
+    else
+    {
+      result.push_back( ch );
+      atLineStart = false;
+    }
+  }
+
+  return result;
+}
+
 }
--- a/src/htmlescape.hh
+++ b/src/htmlescape.hh
@ -15,6 +15,10 @@ using std::string;
 // to make the result suitable for inserting as attributes' values.
 string escape( string const & );

+// Converts the given preformatted text to html. The text is html-escaped, each
+// end of line is replaced by <br/>, each leading space is converted to &nbsp;
+// and each leading tab to four &nbsp;s. All \r characters are dropped.
+string preformat( string const & );
+
 }

 #endif
--- a/src/mainwindow.cc
+++ b/src/mainwindow.cc
@ -12,6 +12,7 @@
 #include "mediawiki.hh"
 #include "sounddir.hh"
 #include "hunspell.hh"
+#include "dictdfiles.hh"
 #include "ui_about.h"
 #include <QDir>
 #include <QMessageBox>
@ -270,6 +271,14 @@ void LoadDictionaries::handlePath( Config::Path const & path )
    dictionaries.insert( dictionaries.end(), dslDictionaries.begin(),
                         dslDictionaries.end() );
  }
+
+  {
+    vector< sptr< Dictionary::Class > > dictdDictionaries =
+      DictdFiles::makeDictionaries( allFiles, Config::getIndexDir().toLocal8Bit().data(), *this );
+
+    dictionaries.insert( dictionaries.end(), dictdDictionaries.begin(),
+                         dictdDictionaries.end() );
+  }
 }

 void LoadDictionaries::indexingDictionary( string const & dictionaryName ) throw()