MDict: Add volumes support for mdd files

This commit is contained in:
Timon Wong 2013-05-08 21:39:47 +08:00
parent 3583ac5b4a
commit 76b5b55ff0
2 changed files with 146 additions and 87 deletions

231
mdx.cc
View file

@ -1,4 +1,4 @@
/* This file is (c) 2013 Timon Wong <timon86.wang.gmail.com> /* This file is (c) 2013 Timon Wong <timon86.wang AT gmail DOT com>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "mdx.hh" #include "mdx.hh"
@ -52,18 +52,10 @@ using BtreeIndexing::IndexInfo;
using namespace Mdict; using namespace Mdict;
/// Tells whether the trailing characters of str are exactly tail.
/// An empty tail always matches; a tail longer than str never does.
static bool endsWith( string const & str, string const & tail )
{
  // A suffix cannot be longer than the string it is supposed to terminate
  if ( tail.size() > str.size() )
    return false;

  // Compare the last tail.size() characters of str against tail
  string::size_type const suffixStart = str.size() - tail.size();
  return str.compare( suffixStart, tail.size(), tail ) == 0;
}
enum enum
{ {
kSignature = 0x4349444d, // MDIC kSignature = 0x4349444d, // MDIC
kCurrentFormatVersion = 7 + BtreeIndexing::FormatVersion kCurrentFormatVersion = 8 + BtreeIndexing::FormatVersion
}; };
struct IdxHeader struct IdxHeader
@ -94,9 +86,8 @@ struct IdxHeader
uint32_t langFrom; // Source language uint32_t langFrom; // Source language
uint32_t langTo; // Target language uint32_t langTo; // Target language
uint32_t hasMddFile; uint32_t mddIndexInfosOffset; // address of IndexInfos for resource files (.mdd)
uint32_t mddIndexBtreeMaxElements; uint32_t mddIndexInfosCount; // count of IndexInfos for resource files
uint32_t mddIndexRootOffset;
} }
#ifndef _MSC_VER #ifndef _MSC_VER
__attribute__( ( packed ) ) __attribute__( ( packed ) )
@ -192,7 +183,7 @@ class MdxDictionary: public BtreeIndexing::BtreeDictionary
string encoding; string encoding;
ChunkedStorage::Reader chunks; ChunkedStorage::Reader chunks;
QFile dictFile; QFile dictFile;
IndexedMdd mddResource; vector< sptr< IndexedMdd > > mddResources;
MdictParser::StyleSheets styleSheets; MdictParser::StyleSheets styleSheets;
QAtomicInt deferredInitDone; QAtomicInt deferredInitDone;
@ -273,7 +264,6 @@ MdxDictionary::MdxDictionary( string const & id, string const & indexFile,
idx( indexFile, "rb" ), idx( indexFile, "rb" ),
idxHeader( idx.read< IdxHeader >() ), idxHeader( idx.read< IdxHeader >() ),
chunks( idx, idxHeader.chunksOffset ), chunks( idx, idxHeader.chunksOffset ),
mddResource( idxMutex, chunks ),
deferredInitRunnableStarted( false ) deferredInitRunnableStarted( false )
{ {
// Read the dictionary's name // Read the dictionary's name
@ -392,20 +382,30 @@ void MdxDictionary::doDeferredInit()
openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
idxHeader.indexRootOffset ), idx, idxMutex ); idxHeader.indexRootOffset ), idx, idxMutex );
for ( vector<string>::const_iterator i = getDictionaryFilenames().begin(); vector< string > mddFileNames;
i != getDictionaryFilenames().end(); i++ ) vector< IndexInfo > mddIndexInfos;
idx.seek( idxHeader.mddIndexInfosOffset );
for ( uint32_t i = 0; i < idxHeader.mddIndexInfosCount; i++ )
{ {
if ( endsWith( *i, ".mdd" ) && File::exists( *i ) ) string::size_type sz = idx.read<string::size_type>();
{ vector< char > buf( sz );
if ( idxHeader.hasMddFile && ( idxHeader.mddIndexBtreeMaxElements || idx.read( &buf.front(), sz );
idxHeader.mddIndexRootOffset ) ) uint32_t btreeMaxElements = idx.read<uint32_t>();
{ uint32_t rootOffset = idx.read<uint32_t>();
mddResource.openIndex( IndexInfo( idxHeader.mddIndexBtreeMaxElements, mddFileNames.push_back( string( &buf.front() ) );
idxHeader.mddIndexRootOffset ), mddIndexInfos.push_back( IndexInfo( btreeMaxElements, rootOffset ) );
idx, idxMutex ); }
mddResource.open( i->c_str() );
} vector< string > const dictFiles = getDictionaryFilenames();
} for ( uint32_t i = 1; i < dictFiles.size() && i < mddFileNames.size() + 1; i++ )
{
if ( dictFiles[ i ] != mddFileNames[ i - 1 ] || !File::exists( dictFiles[ i ] ) )
continue;
IndexedMdd * mdd = new IndexedMdd( idxMutex, chunks );
mdd->openIndex( mddIndexInfos[ i - 1 ], idx, idxMutex );
mdd->open( dictFiles[ i ].c_str() );
mddResources.push_back( mdd );
} }
} }
catch ( std::exception & e ) catch ( std::exception & e )
@ -669,6 +669,12 @@ void MddResourceRequest::run()
return; return;
} }
string u8ResourceName = Utf8::encode( resourceName );
QCryptographicHash hash( QCryptographicHash::Md5 );
hash.addData( u8ResourceName.data(), u8ResourceName.size() );
if ( !resourceIncluded.insert( hash.result() ).second )
continue;
// Convert to the Windows separator // Convert to the Windows separator
std::replace( resourceName.begin(), resourceName.end(), '/', '\\' ); std::replace( resourceName.begin(), resourceName.end(), '/', '\\' );
if ( resourceName[ 0 ] != '\\' ) if ( resourceName[ 0 ] != '\\' )
@ -676,41 +682,51 @@ void MddResourceRequest::run()
resourceName.insert( 0, 1, '\\' ); resourceName.insert( 0, 1, '\\' );
} }
string u8ResourceName = Utf8::encode( resourceName );
QCryptographicHash hash( QCryptographicHash::Md5 );
hash.addData( u8ResourceName.data(), u8ResourceName.size() );
if ( !resourceIncluded.insert( hash.result() ).second )
continue;
// Get actual resource
Mutex::Lock _( dataMutex ); Mutex::Lock _( dataMutex );
data.clear(); data.clear();
if ( dict.mddResource.loadFile( resourceName, data ) )
try
{ {
// Check if this file has a redirection // local file takes precedence
// Always encoded in UTF16-LE string fn = FsEncoding::dirname( dict.getDictionaryFilenames()[ 0 ] ) +
// L"@@@LINK=" FsEncoding::separator() + u8ResourceName;
static const char pattern[16] = File::loadFromFile( fn, data );
}
catch ( File::exCantOpen & )
{
for ( vector< sptr< IndexedMdd > >::const_iterator i = dict.mddResources.begin();
i != dict.mddResources.end(); i++ )
{ {
'@', '\0', '@', '\0', '@', '\0', 'L', '\0', 'I', '\0', 'N', '\0', 'K', '\0', '=', '\0' sptr< IndexedMdd > mddResource = *i;
};
if ( data.size() > sizeof( pattern ) ) if ( mddResource->loadFile( resourceName, data ) )
{ break;
if ( memcmp( &data.front(), pattern, sizeof( pattern ) ) == 0 )
{
data.push_back( '\0' );
data.push_back( '\0' );
QString target = MdictParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ),
data.size() - sizeof( pattern ) );
resourceName = gd::toWString( target.trimmed() );
continue;
}
} }
hasAnyData = true;
} }
// Check if this file has a redirection
// Always encoded in UTF16-LE
// L"@@@LINK="
static const char pattern[16] =
{
'@', '\0', '@', '\0', '@', '\0', 'L', '\0', 'I', '\0', 'N', '\0', 'K', '\0', '=', '\0'
};
if ( data.size() > sizeof( pattern ) )
{
if ( memcmp( &data.front(), pattern, sizeof( pattern ) ) == 0 )
{
data.push_back( '\0' );
data.push_back( '\0' );
QString target = MdictParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ),
data.size() - sizeof( pattern ) );
resourceName = gd::toWString( target.trimmed() );
continue;
}
}
if ( data.size() > 0 )
hasAnyData = true;
break; break;
} }
@ -907,7 +923,7 @@ private:
}; };
static bool indexIsOldOrBad( string const & indexFile, bool hasMddFile ) static bool indexIsOldOrBad( vector< string > const & dictFiles, string const & indexFile )
{ {
File::Class idx( indexFile, "rb" ); File::Class idx( indexFile, "rb" );
IdxHeader header; IdxHeader header;
@ -917,7 +933,32 @@ static bool indexIsOldOrBad( string const & indexFile, bool hasMddFile )
header.formatVersion != kCurrentFormatVersion || header.formatVersion != kCurrentFormatVersion ||
header.parserVersion != MdictParser::kParserVersion || header.parserVersion != MdictParser::kParserVersion ||
header.foldingVersion != Folding::Version || header.foldingVersion != Folding::Version ||
header.hasMddFile != hasMddFile; header.mddIndexInfosCount != dictFiles.size() - 1;
}
/// Appends the resource (.mdd) files that accompany the dictionary file mdx
/// to dictFiles: first "<base>.mdd", then the numbered volumes
/// "<base>.1.mdd", "<base>.2.mdd", ... for as long as each lookup succeeds.
/// Nothing is appended when the main "<base>.mdd" lookup fails, and the
/// volume search stops at the first number whose lookup fails.
/// @param mdx       name of the main dictionary file; its last four characters
///                  are dropped to form the base name
/// @param dictFiles list that receives the names reported by
///                  File::tryPossibleName(), in volume order
static void findResourceFiles( string const & mdx, vector< string > & dictFiles )
{
// NOTE(review): assumes mdx ends with a 4-character extension such as ".mdx"
// (and has at least 4 characters) -- confirm that callers guarantee this.
string base( mdx, 0, mdx.size() - 4 );
// Check if there is any file ending with .mdd, which is the resource file for the dictionary
// (File::tryPossibleName presumably stores the resolved name in resFile -- TODO confirm)
string resFile;
if ( File::tryPossibleName( base + ".mdd", resFile ) )
{
dictFiles.push_back( resFile );
// Find complementary .mdd files (volumes), laid out as follows:
// demo.mdx <- main dictionary file
// demo.mdd <- main resource file ( 1st volume )
// demo.1.mdd <- 2nd volume
// ...
// demo.n.mdd <- nth volume
QString baseU8 = QString::fromUtf8( base.c_str() );
int vol = 1;
// Build "<base>.<vol>.mdd" as UTF-8 and probe it; stop at the first miss
while ( File::tryPossibleName( string( QString( "%1.%2.mdd" ).arg( baseU8 ).arg( vol )
.toUtf8().constBegin() ), resFile ) )
{
dictFiles.push_back( resFile );
vol++;
}
}
} }
vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames,
@ -934,42 +975,39 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
continue; continue;
vector< string > dictFiles( 1, *i ); vector< string > dictFiles( 1, *i );
findResourceFiles( *i, dictFiles );
string baseName = ( ( *i )[ i->size() - 4 ] == '.' ) ?
string( *i, 0, i->size() - 4 ) : string( *i, 0, i->size() - 7 );
// Check if there' is any file end with .mdd, which is the resource file for the dictionary
string mddFileName;
if ( File::tryPossibleName( baseName + ".mdd", mddFileName ) )
dictFiles.push_back( mddFileName );
string dictId = Dictionary::makeDictionaryId( dictFiles ); string dictId = Dictionary::makeDictionaryId( dictFiles );
string indexFile = indicesDir + dictId; string indexFile = indicesDir + dictId;
if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) || if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
indexIsOldOrBad( indexFile, !mddFileName.empty() ) ) indexIsOldOrBad( dictFiles, indexFile ) )
{ {
// Building the index // Building the index
MdictParser parser( i->c_str() ); MdictParser parser( i->c_str() );
sptr<MdictParser> mddParser = NULL; list< sptr< MdictParser > > mddParsers;
if ( !parser.open() ) if ( !parser.open() )
continue; continue;
if ( File::exists( mddFileName ) )
{
mddParser = new MdictParser( mddFileName.c_str() );
if ( !mddParser->open() )
{
FDPRINTF( stderr, "Warning: Invalid mdd (resource) file: %s\n", mddFileName.c_str() );
continue;
}
}
string title = string( parser.title().toUtf8().constData() ); string title = string( parser.title().toUtf8().constData() );
initializing.indexingDictionary( title ); initializing.indexingDictionary( title );
for ( vector< string >::const_iterator mddIter = dictFiles.begin() + 1;
mddIter != dictFiles.end(); mddIter++ )
{
if ( File::exists( *mddIter ) )
{
MdictParser * mddParser = new MdictParser( mddIter->c_str() );
if ( !mddParser->open() )
{
FDPRINTF( stderr, "Warning: Broken mdd (resource) file: %s\n", mddIter->c_str() );
continue;
}
mddParsers.push_back( mddParser );
}
}
File::Class idx( indexFile, "wb" ); File::Class idx( indexFile, "wb" );
IdxHeader idxHeader; IdxHeader idxHeader;
memset( &idxHeader, 0, sizeof( idxHeader ) ); memset( &idxHeader, 0, sizeof( idxHeader ) );
@ -1016,16 +1054,23 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
} }
// enumerating resources if there's any // enumerating resources if there's any
sptr<IndexedWords> mddIndexedWords; vector< sptr< IndexedWords > > mddIndices;
if ( mddParser ) vector< string > mddFileNames;
while ( !mddParsers.empty() )
{ {
mddIndexedWords = new IndexedWords(); sptr< MdictParser > mddParser = mddParsers.front();
IndexedWords * mddIndexedWords = new IndexedWords();
ResourceHandler resourceHandler( chunks, *mddIndexedWords ); ResourceHandler resourceHandler( chunks, *mddIndexedWords );
while ( mddParser->readNextHeadWordIndex( headWordIndex ) ) while ( mddParser->readNextHeadWordIndex( headWordIndex ) )
{ {
mddParser->readRecordBlock( headWordIndex, resourceHandler ); mddParser->readRecordBlock( headWordIndex, resourceHandler );
} }
mddIndices.push_back( mddIndexedWords );
mddFileNames.push_back( string( mddParser->filename().toUtf8().constData() ) );
mddParsers.pop_front();
} }
// Finish with the chunks // Finish with the chunks
@ -1073,12 +1118,26 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
idxHeader.langFrom = langs.first; idxHeader.langFrom = langs.first;
idxHeader.langTo = langs.second; idxHeader.langTo = langs.second;
if ( mddParser ) // Build index info for each mdd file
vector< IndexInfo > mddIndexInfos;
for ( vector< sptr< IndexedWords > >::const_iterator mddIndexIter = mddIndices.begin();
mddIndexIter != mddIndices.end(); mddIndexIter++ )
{ {
IndexInfo resourceIdxInfo = BtreeIndexing::buildIndex( *mddIndexedWords, idx ); IndexInfo resourceIdxInfo = BtreeIndexing::buildIndex( *( *mddIndexIter ), idx );
idxHeader.hasMddFile = true; mddIndexInfos.push_back( resourceIdxInfo );
idxHeader.mddIndexBtreeMaxElements = resourceIdxInfo.btreeMaxElements; }
idxHeader.mddIndexRootOffset = resourceIdxInfo.rootOffset;
// Save address of IndexInfos for resource files
idxHeader.mddIndexInfosOffset = idx.tell();
idxHeader.mddIndexInfosCount = mddIndexInfos.size();
for ( uint32_t mi = 0; mi < mddIndexInfos.size(); mi++ )
{
const string & mddfile = mddFileNames[ mi ];
idx.write<string::size_type>( mddfile.size() + 1 );
idx.write( mddfile.c_str(), mddfile.size() + 1 );
idx.write<uint32_t>( mddIndexInfos[ mi ].btreeMaxElements );
idx.write<uint32_t>( mddIndexInfos[ mi ].rootOffset );
} }
// That concludes it. Update the header. // That concludes it. Update the header.

2
mdx.hh
View file

@ -1,4 +1,4 @@
/* This file is (c) 2013 Timon Wong <timon86.wang.gmail.com> /* This file is (c) 2013 Timon Wong <timon86.wang AT gmail DOT com>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#ifndef __MDX_HH_INCLUDED__ #ifndef __MDX_HH_INCLUDED__