Merge branch 'mdict'

Conflicts:
	mdx.cc
This commit is contained in:
Timon Wong 2013-05-27 14:12:46 +08:00
commit 427c0df90e
4 changed files with 170 additions and 109 deletions

View file

@ -92,28 +92,30 @@ size_t MdictParser::RecordIndex::bsearch( const vector<MdictParser::RecordIndex>
return ( size_t ) ( -1 ); return ( size_t ) ( -1 );
} }
MdictParser::MdictParser( const char * filename ): filename_( QString::fromUtf8( filename ) ) MdictParser::MdictParser() :
version_( 0 ),
numHeadWordBlocks_( 0 ),
headWordBlockInfoSize_( 0 ),
headWordBlockSize_( 0 ),
headWordBlockInfoPos_( 0 ),
headWordPos_( 0 ),
totalRecordsSize_( 0 ),
recordPos_( 0 ),
wordCount_( 0 ),
numberTypeSize_( 0 ),
rtl_( false ),
bruteForce_( false ),
bruteForceEnd_( true )
{ {
version_ = 0;
numHeadWordBlocks_ = 0;
headWordBlockInfoSize_ = 0;
headWordBlockSize_ = 0;
headWordBlockInfoPos_ = 0;
headWordPos_ = 0;
totalRecordsSize_ = 0;
recordPos_ = 0;
wordCount_ = 0;
numberTypeSize_ = 0;
rtl_ = false;
bruteForce_ = false;
bruteForceEnd_ = true;
} }
bool MdictParser::open() bool MdictParser::open( const char * filename )
{ {
filename_ = QString::fromUtf8( filename );
file_ = new QFile( filename_ ); file_ = new QFile( filename_ );
qDebug() << "MdictParser: open " << filename_;
if ( file_.isNull() || !file_->exists() ) if ( file_.isNull() || !file_->exists() )
return false; return false;

View file

@ -151,11 +151,10 @@ public:
return rtl_; return rtl_;
} }
MdictParser( char const * filename ); MdictParser();
~MdictParser() {} ~MdictParser() {}
bool open(); bool open( const char * filename );
void close();
bool readNextHeadWordIndex( HeadWordIndex & headWordIndex ); bool readNextHeadWordIndex( HeadWordIndex & headWordIndex );
bool readRecordBlock( HeadWordIndex & headWordIndex, RecordHandler & recordHandler ); bool readRecordBlock( HeadWordIndex & headWordIndex, RecordHandler & recordHandler );

238
mdx.cc
View file

@ -1,4 +1,4 @@
/* This file is (c) 2013 Timon Wong <timon86.wang.gmail.com> /* This file is (c) 2013 Timon Wong <timon86.wang AT gmail DOT com>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "mdx.hh" #include "mdx.hh"
@ -53,18 +53,10 @@ using BtreeIndexing::IndexInfo;
using namespace Mdict; using namespace Mdict;
/// Checks if the given string ends with the given substring
static bool endsWith( string const & str, string const & tail )
{
return str.size() >= tail.size() &&
str.compare( str.size() - tail.size(), tail.size(), tail ) == 0;
}
enum enum
{ {
kSignature = 0x4349444d, // MDIC kSignature = 0x4349444d, // MDIC
kCurrentFormatVersion = 7 + BtreeIndexing::FormatVersion kCurrentFormatVersion = 8 + BtreeIndexing::FormatVersion
}; };
DEF_EX( exCorruptDictionary, "dictionary file was tampered or corrupted", std::exception ) DEF_EX( exCorruptDictionary, "dictionary file was tampered or corrupted", std::exception )
@ -97,9 +89,8 @@ struct IdxHeader
uint32_t langFrom; // Source language uint32_t langFrom; // Source language
uint32_t langTo; // Target language uint32_t langTo; // Target language
uint32_t hasMddFile; uint32_t mddIndexInfosOffset; // address of IndexInfos for resource files (.mdd)
uint32_t mddIndexBtreeMaxElements; uint32_t mddIndexInfosCount; // count of IndexInfos for resource files
uint32_t mddIndexRootOffset;
} }
#ifndef _MSC_VER #ifndef _MSC_VER
__attribute__( ( packed ) ) __attribute__( ( packed ) )
@ -195,7 +186,7 @@ class MdxDictionary: public BtreeIndexing::BtreeDictionary
string encoding; string encoding;
ChunkedStorage::Reader chunks; ChunkedStorage::Reader chunks;
QFile dictFile; QFile dictFile;
IndexedMdd mddResource; vector< sptr< IndexedMdd > > mddResources;
MdictParser::StyleSheets styleSheets; MdictParser::StyleSheets styleSheets;
QAtomicInt deferredInitDone; QAtomicInt deferredInitDone;
@ -276,7 +267,6 @@ MdxDictionary::MdxDictionary( string const & id, string const & indexFile,
idx( indexFile, "rb" ), idx( indexFile, "rb" ),
idxHeader( idx.read< IdxHeader >() ), idxHeader( idx.read< IdxHeader >() ),
chunks( idx, idxHeader.chunksOffset ), chunks( idx, idxHeader.chunksOffset ),
mddResource( idxMutex, chunks ),
deferredInitRunnableStarted( false ) deferredInitRunnableStarted( false )
{ {
// Read the dictionary's name // Read the dictionary's name
@ -395,20 +385,30 @@ void MdxDictionary::doDeferredInit()
openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
idxHeader.indexRootOffset ), idx, idxMutex ); idxHeader.indexRootOffset ), idx, idxMutex );
for ( vector<string>::const_iterator i = getDictionaryFilenames().begin(); vector< string > mddFileNames;
i != getDictionaryFilenames().end(); i++ ) vector< IndexInfo > mddIndexInfos;
idx.seek( idxHeader.mddIndexInfosOffset );
for ( uint32_t i = 0; i < idxHeader.mddIndexInfosCount; i++ )
{ {
if ( endsWith( *i, ".mdd" ) && File::exists( *i ) ) string::size_type sz = idx.read<string::size_type>();
{ vector< char > buf( sz );
if ( idxHeader.hasMddFile && ( idxHeader.mddIndexBtreeMaxElements || idx.read( &buf.front(), sz );
idxHeader.mddIndexRootOffset ) ) uint32_t btreeMaxElements = idx.read<uint32_t>();
{ uint32_t rootOffset = idx.read<uint32_t>();
mddResource.openIndex( IndexInfo( idxHeader.mddIndexBtreeMaxElements, mddFileNames.push_back( string( &buf.front() ) );
idxHeader.mddIndexRootOffset ), mddIndexInfos.push_back( IndexInfo( btreeMaxElements, rootOffset ) );
idx, idxMutex ); }
mddResource.open( i->c_str() );
} vector< string > const dictFiles = getDictionaryFilenames();
} for ( uint32_t i = 1; i < dictFiles.size() && i < mddFileNames.size() + 1; i++ )
{
if ( dictFiles[ i ] != mddFileNames[ i - 1 ] || !File::exists( dictFiles[ i ] ) )
continue;
sptr< IndexedMdd > mdd = new IndexedMdd( idxMutex, chunks );
mdd->openIndex( mddIndexInfos[ i - 1 ], idx, idxMutex );
mdd->open( dictFiles[ i ].c_str() );
mddResources.push_back( mdd );
} }
} }
catch ( std::exception & e ) catch ( std::exception & e )
@ -696,6 +696,12 @@ void MddResourceRequest::run()
return; return;
} }
string u8ResourceName = Utf8::encode( resourceName );
QCryptographicHash hash( QCryptographicHash::Md5 );
hash.addData( u8ResourceName.data(), u8ResourceName.size() );
if ( !resourceIncluded.insert( hash.result() ).second )
continue;
// Convert to the Windows separator // Convert to the Windows separator
std::replace( resourceName.begin(), resourceName.end(), '/', '\\' ); std::replace( resourceName.begin(), resourceName.end(), '/', '\\' );
if ( resourceName[ 0 ] != '\\' ) if ( resourceName[ 0 ] != '\\' )
@ -703,37 +709,51 @@ void MddResourceRequest::run()
resourceName.insert( 0, 1, '\\' ); resourceName.insert( 0, 1, '\\' );
} }
string u8ResourceName = Utf8::encode( resourceName );
QCryptographicHash hash( QCryptographicHash::Md5 );
hash.addData( u8ResourceName.data(), u8ResourceName.size() );
if ( !resourceIncluded.insert( hash.result() ).second )
continue;
// Get actual resource
Mutex::Lock _( dataMutex ); Mutex::Lock _( dataMutex );
data.clear(); data.clear();
if ( dict.mddResource.loadFile( resourceName, data ) )
{
// Check if this file has a redirection
// Always encoded in UTF16-LE
// L"@@@LINK="
static const char pattern[16] =
{
'@', '\0', '@', '\0', '@', '\0', 'L', '\0', 'I', '\0', 'N', '\0', 'K', '\0', '=', '\0'
};
if ( data.size() > sizeof( pattern ) ) try
{
// local file takes precedence
string fn = FsEncoding::dirname( dict.getDictionaryFilenames()[ 0 ] ) +
FsEncoding::separator() + u8ResourceName;
File::loadFromFile( fn, data );
}
catch ( File::exCantOpen & )
{
for ( vector< sptr< IndexedMdd > >::const_iterator i = dict.mddResources.begin();
i != dict.mddResources.end(); i++ )
{ {
if ( memcmp( &data.front(), pattern, sizeof( pattern ) ) == 0 ) sptr< IndexedMdd > mddResource = *i;
{ if ( mddResource->loadFile( resourceName, data ) )
data.push_back( '\0' ); break;
data.push_back( '\0' );
QString target = MdictParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ),
data.size() - sizeof( pattern ) );
resourceName = gd::toWString( target.trimmed() );
continue;
}
} }
}
// Check if this file has a redirection
// Always encoded in UTF16-LE
// L"@@@LINK="
static const char pattern[16] =
{
'@', '\0', '@', '\0', '@', '\0', 'L', '\0', 'I', '\0', 'N', '\0', 'K', '\0', '=', '\0'
};
if ( data.size() > sizeof( pattern ) )
{
if ( memcmp( &data.front(), pattern, sizeof( pattern ) ) == 0 )
{
data.push_back( '\0' );
data.push_back( '\0' );
QString target = MdictParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ),
data.size() - sizeof( pattern ) );
resourceName = gd::toWString( target.trimmed() );
continue;
}
}
if ( data.size() > 0 )
{
hasAnyData = true;
if( Filetype::isNameOfCSS( u8ResourceName ) ) if( Filetype::isNameOfCSS( u8ResourceName ) )
{ {
@ -743,8 +763,6 @@ void MddResourceRequest::run()
data.resize( bytes.size() ); data.resize( bytes.size() );
memcpy( &data.front(), bytes.constData(), bytes.size() ); memcpy( &data.front(), bytes.constData(), bytes.size() );
} }
hasAnyData = true;
} }
break; break;
@ -926,7 +944,7 @@ private:
}; };
static bool indexIsOldOrBad( string const & indexFile, bool hasMddFile ) static bool indexIsOldOrBad( vector< string > const & dictFiles, string const & indexFile )
{ {
File::Class idx( indexFile, "rb" ); File::Class idx( indexFile, "rb" );
IdxHeader header; IdxHeader header;
@ -936,7 +954,32 @@ static bool indexIsOldOrBad( string const & indexFile, bool hasMddFile )
header.formatVersion != kCurrentFormatVersion || header.formatVersion != kCurrentFormatVersion ||
header.parserVersion != MdictParser::kParserVersion || header.parserVersion != MdictParser::kParserVersion ||
header.foldingVersion != Folding::Version || header.foldingVersion != Folding::Version ||
header.hasMddFile != hasMddFile; header.mddIndexInfosCount != dictFiles.size() - 1;
}
static void findResourceFiles( string const & mdx, vector< string > & dictFiles )
{
string base( mdx, 0, mdx.size() - 4 );
// Check if there' is any file end with .mdd, which is the resource file for the dictionary
string resFile;
if ( File::tryPossibleName( base + ".mdd", resFile ) )
{
dictFiles.push_back( resFile );
// Find complementary .mdd file (volumes), like follows:
// demo.mdx <- main dictionary file
// demo.mdd <- main resource file ( 1st volume )
// demo.1.mdd <- 2nd volume
// ...
// demo.n.mdd <- nth volume
QString baseU8 = QString::fromUtf8( base.c_str() );
int vol = 1;
while ( File::tryPossibleName( string( QString( "%1.%2.mdd" ).arg( baseU8 ).arg( vol )
.toUtf8().constBegin() ), resFile ) )
{
dictFiles.push_back( resFile );
vol++;
}
}
} }
vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames,
@ -953,42 +996,39 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
continue; continue;
vector< string > dictFiles( 1, *i ); vector< string > dictFiles( 1, *i );
findResourceFiles( *i, dictFiles );
string baseName = ( ( *i )[ i->size() - 4 ] == '.' ) ?
string( *i, 0, i->size() - 4 ) : string( *i, 0, i->size() - 7 );
// Check if there' is any file end with .mdd, which is the resource file for the dictionary
string mddFileName;
if ( File::tryPossibleName( baseName + ".mdd", mddFileName ) )
dictFiles.push_back( mddFileName );
string dictId = Dictionary::makeDictionaryId( dictFiles ); string dictId = Dictionary::makeDictionaryId( dictFiles );
string indexFile = indicesDir + dictId; string indexFile = indicesDir + dictId;
if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) || if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
indexIsOldOrBad( indexFile, !mddFileName.empty() ) ) indexIsOldOrBad( dictFiles, indexFile ) )
{ {
// Building the index // Building the index
MdictParser parser( i->c_str() ); MdictParser parser;
sptr<MdictParser> mddParser = NULL; list< sptr< MdictParser > > mddParsers;
if ( !parser.open() ) if ( !parser.open( i->c_str() ) )
continue; continue;
if ( File::exists( mddFileName ) )
{
mddParser = new MdictParser( mddFileName.c_str() );
if ( !mddParser->open() )
{
FDPRINTF( stderr, "Warning: Invalid mdd (resource) file: %s\n", mddFileName.c_str() );
continue;
}
}
string title = string( parser.title().toUtf8().constData() ); string title = string( parser.title().toUtf8().constData() );
initializing.indexingDictionary( title ); initializing.indexingDictionary( title );
for ( vector< string >::const_iterator mddIter = dictFiles.begin() + 1;
mddIter != dictFiles.end(); mddIter++ )
{
if ( File::exists( *mddIter ) )
{
sptr< MdictParser > mddParser = new MdictParser();
if ( !mddParser->open( mddIter->c_str() ) )
{
FDPRINTF( stderr, "Warning: Broken mdd (resource) file: %s\n", mddIter->c_str() );
continue;
}
mddParsers.push_back( mddParser );
}
}
File::Class idx( indexFile, "wb" ); File::Class idx( indexFile, "wb" );
IdxHeader idxHeader; IdxHeader idxHeader;
memset( &idxHeader, 0, sizeof( idxHeader ) ); memset( &idxHeader, 0, sizeof( idxHeader ) );
@ -1035,16 +1075,22 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
} }
// enumerating resources if there's any // enumerating resources if there's any
sptr<IndexedWords> mddIndexedWords; vector< sptr< IndexedWords > > mddIndices;
if ( mddParser ) vector< string > mddFileNames;
while ( !mddParsers.empty() )
{ {
mddIndexedWords = new IndexedWords(); sptr< MdictParser > mddParser = mddParsers.front();
sptr< IndexedWords > mddIndexedWords = new IndexedWords();
ResourceHandler resourceHandler( chunks, *mddIndexedWords ); ResourceHandler resourceHandler( chunks, *mddIndexedWords );
while ( mddParser->readNextHeadWordIndex( headWordIndex ) ) while ( mddParser->readNextHeadWordIndex( headWordIndex ) )
{ {
mddParser->readRecordBlock( headWordIndex, resourceHandler ); mddParser->readRecordBlock( headWordIndex, resourceHandler );
} }
mddIndices.push_back( mddIndexedWords );
mddFileNames.push_back( string( mddParser->filename().toUtf8().constData() ) );
mddParsers.pop_front();
} }
// Finish with the chunks // Finish with the chunks
@ -1092,12 +1138,26 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
idxHeader.langFrom = langs.first; idxHeader.langFrom = langs.first;
idxHeader.langTo = langs.second; idxHeader.langTo = langs.second;
if ( mddParser ) // Build index info for each mdd file
vector< IndexInfo > mddIndexInfos;
for ( vector< sptr< IndexedWords > >::const_iterator mddIndexIter = mddIndices.begin();
mddIndexIter != mddIndices.end(); mddIndexIter++ )
{ {
IndexInfo resourceIdxInfo = BtreeIndexing::buildIndex( *mddIndexedWords, idx ); IndexInfo resourceIdxInfo = BtreeIndexing::buildIndex( *( *mddIndexIter ), idx );
idxHeader.hasMddFile = true; mddIndexInfos.push_back( resourceIdxInfo );
idxHeader.mddIndexBtreeMaxElements = resourceIdxInfo.btreeMaxElements; }
idxHeader.mddIndexRootOffset = resourceIdxInfo.rootOffset;
// Save address of IndexInfos for resource files
idxHeader.mddIndexInfosOffset = idx.tell();
idxHeader.mddIndexInfosCount = mddIndexInfos.size();
for ( uint32_t mi = 0; mi < mddIndexInfos.size(); mi++ )
{
const string & mddfile = mddFileNames[ mi ];
idx.write<string::size_type>( mddfile.size() + 1 );
idx.write( mddfile.c_str(), mddfile.size() + 1 );
idx.write<uint32_t>( mddIndexInfos[ mi ].btreeMaxElements );
idx.write<uint32_t>( mddIndexInfos[ mi ].rootOffset );
} }
// That concludes it. Update the header. // That concludes it. Update the header.

2
mdx.hh
View file

@ -1,4 +1,4 @@
/* This file is (c) 2013 Timon Wong <timon86.wang.gmail.com> /* This file is (c) 2013 Timon Wong <timon86.wang AT gmail DOT com>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#ifndef __MDX_HH_INCLUDED__ #ifndef __MDX_HH_INCLUDED__