mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-27 19:24:08 +00:00
2c901d5bd3
See the dictfmt man page, option --index-keep-orig: When --utf-8 is specified, headwords are lowercased and non-alphanumeric characters are removed from them before they are saved to the .index file, in order to simplify the search. When the --index-keep-orig option is used, a fourth column is created (if necessary) in the .index file; it contains the original headword, which is returned by the MATCH command. This option may be useful to prevent converting "AT&T" to "ATT" or to keep proper nouns with an uppercased first letter.
543 lines
16 KiB
C++
543 lines
16 KiB
C++
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
|
|
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
|
|
|
#include "dictdfiles.hh"
|
|
#include "btreeidx.hh"
|
|
#include "folding.hh"
|
|
#include "utf8.hh"
|
|
#include "dictzip.h"
|
|
#include "htmlescape.hh"
|
|
#include "fsencoding.hh"
|
|
#include "langcoder.hh"
|
|
#include <map>
|
|
#include <set>
|
|
#include <string>
|
|
#include <vector>
|
|
#include <list>
|
|
#include <wctype.h>
|
|
#include <stdlib.h>
|
|
#include "dprintf.hh"
|
|
|
|
#ifdef _MSC_VER
|
|
#include <stub_msvc.h>
|
|
#endif
|
|
|
|
namespace DictdFiles {
|
|
|
|
using std::map;
|
|
using std::multimap;
|
|
using std::pair;
|
|
using std::set;
|
|
using std::string;
|
|
using gd::wstring;
|
|
using std::vector;
|
|
using std::list;
|
|
|
|
using BtreeIndexing::WordArticleLink;
|
|
using BtreeIndexing::IndexedWords;
|
|
using BtreeIndexing::IndexInfo;
|
|
|
|
namespace {
|
|
|
|
// Exception types used by this module. Each one is declared through the
// DEF_EX / DEF_EX_STR helper macros with Dictionary::Ex given as the base.

// Thrown when a file can't be opened/read; constructed with the file name.
DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex )

// Thrown when a line can't be read back from the .index file.
DEF_EX( exFailedToReadLineFromIndex, "Failed to read line from index file", Dictionary::Ex )

// Thrown when an .index line lacks the expected tab-separated columns.
DEF_EX( exMalformedIndexFileLine, "Malformed index file line encountered", Dictionary::Ex )

// Thrown by decodeBase64() on a character outside of the base64 alphabet.
DEF_EX( exInvalidBase64, "Invalid base64 sequence encountered", Dictionary::Ex )
|
|
|
|
enum
|
|
{
|
|
Signature = 0x58444344, // DCDX on little-endian, XDCD on big-endian
|
|
CurrentFormatVersion = 5 + BtreeIndexing::FormatVersion + Folding::Version
|
|
};
|
|
|
|
// Header placed at the very beginning of our index file. It is written and
// read back as a raw block of memory, so the field order and sizes matter.
struct IdxHeader
{
  // First comes the signature, DCDX
  uint32_t signature;
  // File format version (CurrentFormatVersion)
  uint32_t formatVersion;
  // Total number of words
  uint32_t wordCount;
  // Total number of articles
  uint32_t articleCount;
  // Two fields from IndexInfo
  uint32_t indexBtreeMaxElements;
  uint32_t indexRootOffset;
  // Source language
  uint32_t langFrom;
  // Target language
  uint32_t langTo;
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
|
|
|
|
// Tells whether the index file given must be rebuilt: returns true if its
// header can't be read in full, or if the signature or the format version
// stored there differ from the ones this build expects.
bool indexIsOldOrBad( string const & indexFile )
{
  File::Class idx( indexFile, "rb" );

  IdxHeader header;

  // A short read means there's no complete header -- nothing to inspect
  if ( idx.readRecords( &header, sizeof( header ), 1 ) != 1 )
    return true;

  if ( header.signature != Signature )
    return true;

  return header.formatVersion != CurrentFormatVersion;
}
|
|
|
|
class DictdDictionary: public BtreeIndexing::BtreeDictionary
|
|
{
|
|
Mutex idxMutex;
|
|
File::Class idx, indexFile; // The later is .index file
|
|
IdxHeader idxHeader;
|
|
dictData * dz;
|
|
string dictionaryName;
|
|
|
|
public:
|
|
|
|
DictdDictionary( string const & id, string const & indexFile,
|
|
vector< string > const & dictionaryFiles );
|
|
|
|
~DictdDictionary();
|
|
|
|
virtual string getName() throw()
|
|
{ return dictionaryName; }
|
|
|
|
virtual map< Dictionary::Property, string > getProperties() throw()
|
|
{ return map< Dictionary::Property, string >(); }
|
|
|
|
virtual unsigned long getArticleCount() throw()
|
|
{ return idxHeader.articleCount; }
|
|
|
|
virtual unsigned long getWordCount() throw()
|
|
{ return idxHeader.wordCount; }
|
|
|
|
virtual void loadIcon() throw();
|
|
|
|
inline virtual quint32 getLangFrom() const
|
|
{ return idxHeader.langFrom; }
|
|
|
|
inline virtual quint32 getLangTo() const
|
|
{ return idxHeader.langTo; }
|
|
|
|
virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
|
|
vector< wstring > const & alts,
|
|
wstring const & )
|
|
throw( std::exception );
|
|
};
|
|
|
|
// Opens an already indexed dictd dictionary.
//
// id              -- dictionary id, passed on to BtreeDictionary
// indexFile       -- path to our own index file (the one with IdxHeader)
// dictionaryFiles -- [ 0 ] is the dictd .index file, [ 1 ] is the .dict file;
//                    both entries must be present
//
// Throws exCantReadFile if the .dict file can't be opened.
DictdDictionary::DictdDictionary( string const & id,
                                  string const & indexFile,
                                  vector< string > const & dictionaryFiles ):
  BtreeDictionary( id, dictionaryFiles ),
  idx( indexFile, "rb" ),
  indexFile( dictionaryFiles[ 0 ], "rb" ),
  idxHeader( idx.read< IdxHeader >() )
{
  // Read the dictionary name. It is stored right after the header as a
  // 32-bit length followed by that many bytes.

  idx.seek( sizeof( idxHeader ) );

  vector< char > dName( idx.read< uint32_t >() );

  // The stored name may legitimately be empty. Calling front() on an empty
  // vector is undefined, so only touch the buffer when there's something in it.
  if ( !dName.empty() )
  {
    idx.read( &dName.front(), dName.size() );
    dictionaryName = string( &dName.front(), dName.size() );
  }

  // Open the .dict file

  dz = dict_data_open( dictionaryFiles[ 1 ].c_str(), 0 );

  if ( !dz )
    throw exCantReadFile( dictionaryFiles[ 1 ] );

  // Initialize the index

  openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
                        idxHeader.indexRootOffset ),
             idx, idxMutex );
}
|
|
|
|
// Closes the .dict file handle, if there is one.
DictdDictionary::~DictdDictionary()
{
  if ( !dz )
    return; // Nothing to release

  dict_data_close( dz );
}
|
|
|
|
// Derives a dictionary name from the file name given: the directory part (up
// to and including the last FsEncoding::separator()) and the last extension
// (starting at the last dot) are removed, and what's left is converted from
// the filesystem encoding to UTF-8.
//
// Returns an empty string for an empty file name.
string nameFromFileName( string const & indexFileName )
{
  if ( indexFileName.empty() )
    return string();

  char const * const begin = indexFileName.c_str();
  char const * const end = begin + indexFileName.size();

  char const * sep = strrchr( begin, FsEncoding::separator() );

  // The name starts right after the last separator. When there is no
  // separator at all it starts at the very first character -- skipping one
  // here would cut off the first letter of the name.
  char const * nameStart = sep ? sep + 1 : begin;

  char const * dot = strrchr( nameStart, '.' );

  if ( !dot )
    dot = end; // No extension -- the name runs up to the end

  return Utf8::encode( FsEncoding::decode( string( nameStart, dot - nameStart ) ) );
}
|
|
|
|
void DictdDictionary::loadIcon() throw()
|
|
{
|
|
if ( dictionaryIconLoaded )
|
|
return;
|
|
|
|
QString fileName =
|
|
QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
|
|
|
|
// Remove the extension
|
|
fileName.chop( 5 );
|
|
|
|
if( !loadIconFromFile( fileName ) )
|
|
{
|
|
// Load failed -- use default icons
|
|
dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/icon32_dictd.png");
|
|
}
|
|
|
|
dictionaryIconLoaded = true;
|
|
}
|
|
|
|
// Decodes a number written in the base64 positional notation used by the
// dictd .index files: every character is a digit worth 0..63, the most
// significant one comes first. Decoding stops at the first NUL character.
// No overflow check is made -- the result is computed modulo 2^32.
//
// Throws exInvalidBase64 on a character which is not a base64 digit.
uint32_t decodeBase64( string const & str )
{
  static char const alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

  uint32_t value = 0;

  char const * cursor = str.c_str();

  while( *cursor )
  {
    char const * found = strchr( alphabet, *cursor );

    if ( !found )
      throw exInvalidBase64();

    // The position within the alphabet is the digit's value
    value = value * 64 + static_cast< uint32_t >( found - alphabet );

    ++cursor;
  }

  return value;
}
|
|
|
|
// Returns the articles for the word given and for each of its alts.
//
// Every match found in the btree index points at a line of the dictd .index
// file; that line holds the base64-encoded offset and size of the article
// within the .dict file. Articles whose headword matches the word requested
// (case-insensitively) go first, the rest follow. Each article is included
// only once.
//
// The result is an instant data request holding the HTML. If nothing was
// found, the request is created with 'false'; if any exception is raised
// along the way, the request carries its message instead.
sptr< Dictionary::DataRequest > DictdDictionary::getArticle( wstring const & word,
                                                             vector< wstring > const & alts,
                                                             wstring const & )
  throw( std::exception )
{
  try
  {
    vector< WordArticleLink > chain = findArticles( word );

    for( unsigned x = 0; x < alts.size(); ++x )
    {
      /// Make an additional query for each alt

      vector< WordArticleLink > altChain = findArticles( alts[ x ] );

      chain.insert( chain.end(), altChain.begin(), altChain.end() );
    }

    multimap< wstring, string > mainArticles, alternateArticles;

    set< uint32_t > articlesIncluded; // Some synonyms make it that the articles
                                      // appear several times. We combat this
                                      // by only allowing them to appear once.

    wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );

    char buf[ 16384 ];

    for( unsigned x = 0; x < chain.size(); ++x )
    {
      // insert() reports whether the offset was new, so one call both checks
      // for and records the article.
      if ( !articlesIncluded.insert( chain[ x ].articleOffset ).second )
        continue; // We already have this article in the body.

      // Now load that article. The offset stored in the btree index is the
      // position of the corresponding line within the .index file.

      indexFile.seek( chain[ x ].articleOffset );

      if ( !indexFile.gets( buf, sizeof( buf ), true ) )
        throw exFailedToReadLineFromIndex();

      char * tab1 = strchr( buf, '\t' );

      if ( !tab1 )
        throw exMalformedIndexFileLine();

      char * tab2 = strchr( tab1 + 1, '\t' );

      if ( !tab2 )
        throw exMalformedIndexFileLine();

      // After tab1 should be article offset, after tab2 -- article size

      uint32_t articleOffset = decodeBase64( string( tab1 + 1, tab2 - tab1 - 1 ) );

      // There may be a fourth column (the original headword) after the
      // size; if so, the size ends at the third tab.
      char * tab3 = strchr( tab2 + 1, '\t');

      uint32_t articleSize =
        tab3 ? decodeBase64( string( tab2 + 1, tab3 - tab2 - 1 ) )
             : decodeBase64( tab2 + 1 );

      string articleText;

      char * articleBody = dict_data_read_( dz, articleOffset, articleSize, 0, 0 );

      if ( !articleBody )
      {
        // A read failure is reported in place of the article rather than
        // failing the whole request.
        articleText = string( "<div class=\"dictd_article\">DICTZIP error: " )
                      + dict_error_str( dz ) + "</div>";
      }
      else
      {
        articleText = string( "<div class=\"dictd_article\">" ) +
                      Html::preformat( articleBody ) + "</div>";

        free( articleBody );
      }

      // Ok. Now, does it go to main articles, or to alternate ones? We list
      // main ones first, and alternates after.

      // We do the case-folded comparison here. The folded headword is also
      // the sorting key, so it's computed just once.

      wstring headwordFolded =
        Folding::applySimpleCaseOnly( Utf8::decode( chain[ x ].word ) );

      multimap< wstring, string > & mapToUse =
        ( wordCaseFolded == headwordFolded ) ? mainArticles : alternateArticles;

      mapToUse.insert( pair< wstring, string >( headwordFolded, articleText ) );
    }

    if ( mainArticles.empty() && alternateArticles.empty() )
      return new Dictionary::DataRequestInstant( false );

    string result;

    multimap< wstring, string >::const_iterator i;

    for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
      result += i->second;

    for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
      result += i->second;

    sptr< Dictionary::DataRequestInstant > ret =
      new Dictionary::DataRequestInstant( true );

    ret->getData().resize( result.size() );

    memcpy( &(ret->getData().front()), result.data(), result.size() );

    return ret;
  }
  catch( std::exception & e )
  {
    return new Dictionary::DataRequestInstant( QString( e.what() ) );
  }
}
|
|
|
|
} // anonymous namespace
|
|
|
|
// Creates dictionaries out of the dictd files found among fileNames.
//
// fileNames    -- candidate files; only the ones ending with ".index"
//                 (case-insensitively) are considered, and only if a matching
//                 .dict or .dict.dz file is found
// indicesDir   -- prefix the dictionary id is appended to, giving the path of
//                 our own index file
// initializing -- gets notified about each dictionary being indexed
//
// The index file gets (re)built when needed. A failure to handle one
// dictionary is reported to stderr and that dictionary is skipped; the others
// are still processed.
vector< sptr< Dictionary::Class > > makeDictionaries(
                                      vector< string > const & fileNames,
                                      string const & indicesDir,
                                      Dictionary::Initializing & initializing )
  throw( std::exception )
{
  vector< sptr< Dictionary::Class > > dictionaries;

  for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
       ++i )
  {
    // Only allow .index suffixes

    if ( i->size() < 6 ||
         strcasecmp( i->c_str() + ( i->size() - 6 ), ".index" ) != 0 )
      continue;

    try
    {
      vector< string > dictFiles( 1, *i );

      // Check if there is a corresponding .dict file present. The base name
      // keeps the trailing dot: only "index" is cut off.
      string baseName( *i, 0, i->size() - 5 );

      dictFiles.push_back( string() );

      if ( !File::tryPossibleName( baseName + "dict", dictFiles[ 1 ] ) &&
           !File::tryPossibleName( baseName + "dict.dz", dictFiles[ 1 ] ) )
      {
        // No corresponding .dict file, skipping
        continue;
      }

      string dictId = Dictionary::makeDictionaryId( dictFiles );

      string indexFile = indicesDir + dictId;

      if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
           indexIsOldOrBad( indexFile ) )
      {
        // Building the index

        string dictionaryName = nameFromFileName( dictFiles[ 0 ] );
        initializing.indexingDictionary( dictionaryName );

        File::Class idx( indexFile, "wb" );

        IdxHeader idxHeader;

        memset( &idxHeader, 0, sizeof( idxHeader ) );

        // We write a dummy header first. At the end of the process the header
        // will be rewritten with the right values.

        idx.write( idxHeader );

        IndexedWords indexedWords;

        File::Class dictdIndex( dictFiles[ 0 ], "rb" );

        // Read words from index until none's left.

        char buf[ 16384 ];

        do
        {
          // Each word is indexed by the offset of its line in the .index file
          uint32_t curOffset = dictdIndex.tell();

          if ( !dictdIndex.gets( buf, sizeof( buf ), true ) )
            break;

          // Check that there are exactly two or three tabs in the record.

          char * tab1 = strchr( buf, '\t' );

          if ( !tab1 )
          {
            DPRINTF( "Warning: no tabs present, skipping: %s\n", buf );
            continue;
          }

          char * tab2 = strchr( tab1 + 1, '\t' );

          if ( !tab2 )
          {
            DPRINTF( "Warning: only a single tab present, skipping: %s\n", buf );
            continue;
          }

          char * tab3 = strchr( tab2 + 1, '\t');

          if ( tab3 )
          {
            if ( strchr( tab3 + 1, '\t') )
            {
              DPRINTF( "Warning: too many tabs present, skipping: %s\n", buf );
              continue;
            }

            // Handle the fourth entry, if it exists. From dictfmt man:
            // When --index-keep-orig option is used fourth column is created
            // (if necessary) in .index file.
            indexedWords.addWord( Utf8::decode( string( tab3 + 1 ) ), curOffset );
            ++idxHeader.wordCount;
          }

          indexedWords.addWord( Utf8::decode( string( buf, tab1 - buf ) ), curOffset );
          ++idxHeader.wordCount;
          ++idxHeader.articleCount;

          // Check for proper dictionary name
          if ( !strncmp( buf, "00databaseshort", 15 ) || !strncmp( buf, "00-database-short", 17 ) )
          {
            // After tab1 should be article offset, after tab2 -- article size.
            // The size column ends at the third tab when the fourth column is
            // present; decoding up to the end of line in that case would hit
            // the tab and fail the whole dictionary.
            uint32_t articleOffset = decodeBase64( string( tab1 + 1, tab2 - tab1 - 1 ) );
            uint32_t articleSize =
              tab3 ? decodeBase64( string( tab2 + 1, tab3 - tab2 - 1 ) )
                   : decodeBase64( tab2 + 1 );

            dictData * dz = dict_data_open( dictFiles[ 1 ].c_str(), 0 );

            if ( dz )
            {
              char * articleBody = dict_data_read_( dz, articleOffset, articleSize, 0, 0 );

              if ( articleBody )
              {
                char * eol = strchr( articleBody, '\n' ); // skip the first line (headword itself)

                if ( eol )
                {
                  // skip spaces; the cast keeps isspace() away from negative
                  // char values
                  while( *eol && isspace( static_cast< unsigned char >( *eol ) ) )
                    ++eol;

                  // use only the single line for the dictionary title
                  char * endEol = strchr( eol, '\n' );

                  if ( endEol )
                    *endEol = 0;

                  DPRINTF( "DICT NAME: '%s'\n", eol );
                  dictionaryName = eol;
                }

                // The buffer is ours to release, same as in getArticle()
                free( articleBody );
              }

              dict_data_close( dz );
            }
          }
        } while( !dictdIndex.eof() );

        // Write dictionary name

        idx.write( (uint32_t) dictionaryName.size() );
        idx.write( dictionaryName.data(), dictionaryName.size() );

        // Build index

        IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );

        idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
        idxHeader.indexRootOffset = idxInfo.rootOffset;

        // That concludes it. Update the header.

        idxHeader.signature = Signature;
        idxHeader.formatVersion = CurrentFormatVersion;

        // read languages
        QPair<quint32,quint32> langs =
            LangCoder::findIdsForFilename( QString::fromStdString( dictFiles[ 0 ] ) );

        // if no languages found, try dictionary's name
        if ( langs.first == 0 || langs.second == 0 )
        {
          langs =
            LangCoder::findIdsForFilename( QString::fromStdString( nameFromFileName( dictFiles[ 0 ] ) ) );
        }

        idxHeader.langFrom = langs.first;
        idxHeader.langTo = langs.second;

        idx.rewind();

        idx.write( &idxHeader, sizeof( idxHeader ) );
      }

      dictionaries.push_back( new DictdDictionary( dictId,
                                                   indexFile,
                                                   dictFiles ) );
    }
    catch( std::exception & e )
    {
      FDPRINTF( stderr, "Dictd dictionary reading failed: %s, error: %s\n",
        i->c_str(), e.what() );
    }
  }

  return dictionaries;
}
|
|
|
|
}
|