From 76b5b55ff05a8fa797229dec0aa3574e212f1ffa Mon Sep 17 00:00:00 2001 From: Timon Wong Date: Wed, 8 May 2013 21:39:47 +0800 Subject: [PATCH 1/2] MDict: Add volumes support for mdd files --- mdx.cc | 231 ++++++++++++++++++++++++++++++++++++--------------------- mdx.hh | 2 +- 2 files changed, 146 insertions(+), 87 deletions(-) diff --git a/mdx.cc b/mdx.cc index da64e191..a9269401 100644 --- a/mdx.cc +++ b/mdx.cc @@ -1,4 +1,4 @@ -/* This file is (c) 2013 Timon Wong +/* This file is (c) 2013 Timon Wong * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ #include "mdx.hh" @@ -52,18 +52,10 @@ using BtreeIndexing::IndexInfo; using namespace Mdict; - -/// Checks if the given string ends with the given substring -static bool endsWith( string const & str, string const & tail ) -{ - return str.size() >= tail.size() && - str.compare( str.size() - tail.size(), tail.size(), tail ) == 0; -} - enum { kSignature = 0x4349444d, // MDIC - kCurrentFormatVersion = 7 + BtreeIndexing::FormatVersion + kCurrentFormatVersion = 8 + BtreeIndexing::FormatVersion }; struct IdxHeader @@ -94,9 +86,8 @@ struct IdxHeader uint32_t langFrom; // Source language uint32_t langTo; // Target language - uint32_t hasMddFile; - uint32_t mddIndexBtreeMaxElements; - uint32_t mddIndexRootOffset; + uint32_t mddIndexInfosOffset; // address of IndexInfos for resource files (.mdd) + uint32_t mddIndexInfosCount; // count of IndexInfos for resource files } #ifndef _MSC_VER __attribute__( ( packed ) ) @@ -192,7 +183,7 @@ class MdxDictionary: public BtreeIndexing::BtreeDictionary string encoding; ChunkedStorage::Reader chunks; QFile dictFile; - IndexedMdd mddResource; + vector< sptr< IndexedMdd > > mddResources; MdictParser::StyleSheets styleSheets; QAtomicInt deferredInitDone; @@ -273,7 +264,6 @@ MdxDictionary::MdxDictionary( string const & id, string const & indexFile, idx( indexFile, "rb" ), idxHeader( idx.read< IdxHeader >() ), chunks( idx, idxHeader.chunksOffset ), - mddResource( idxMutex, chunks ), deferredInitRunnableStarted( false ) { // Read the dictionary's name @@ -392,20 +382,30 @@ void MdxDictionary::doDeferredInit() openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, idxHeader.indexRootOffset ), idx, idxMutex ); - for ( vector::const_iterator i = getDictionaryFilenames().begin(); - i != getDictionaryFilenames().end(); i++ ) + vector< string > mddFileNames; + vector< IndexInfo > mddIndexInfos; + idx.seek( idxHeader.mddIndexInfosOffset ); + for ( uint32_t i = 0; i < idxHeader.mddIndexInfosCount; i++ ) { - if ( endsWith( *i, ".mdd" ) && File::exists( *i ) ) - { - if ( idxHeader.hasMddFile && ( idxHeader.mddIndexBtreeMaxElements || - idxHeader.mddIndexRootOffset ) ) - { - mddResource.openIndex( IndexInfo( idxHeader.mddIndexBtreeMaxElements, - idxHeader.mddIndexRootOffset ), - idx, idxMutex ); - mddResource.open( i->c_str() ); - } - } + string::size_type sz = idx.read(); + vector< char > buf( sz ); + idx.read( &buf.front(), sz ); + uint32_t btreeMaxElements = idx.read(); + uint32_t rootOffset = idx.read(); + mddFileNames.push_back( string( &buf.front() ) ); + mddIndexInfos.push_back( IndexInfo( btreeMaxElements, rootOffset ) ); + } + + vector< string > const dictFiles = getDictionaryFilenames(); + for ( uint32_t i = 1; i < dictFiles.size() && i < mddFileNames.size() + 1; i++ ) + { + if ( dictFiles[ i ] != mddFileNames[ i - 1 ] || !File::exists( dictFiles[ i ] ) ) + continue; + + IndexedMdd * mdd = new IndexedMdd( idxMutex, chunks ); + mdd->openIndex( mddIndexInfos[ i - 1 ], idx, idxMutex ); + mdd->open( dictFiles[ i ].c_str() ); + mddResources.push_back( mdd ); } } catch ( std::exception & e ) @@ -669,6 +669,12 @@ void MddResourceRequest::run() return; } + string u8ResourceName = Utf8::encode( resourceName ); + QCryptographicHash hash( QCryptographicHash::Md5 ); + hash.addData( u8ResourceName.data(), u8ResourceName.size() ); + if ( !resourceIncluded.insert( hash.result() ).second ) + continue; + // Convert to the Windows separator std::replace( resourceName.begin(), resourceName.end(), '/', '\\' ); if ( resourceName[ 0 ] != '\\' ) @@ -676,41 +682,51 @@ void MddResourceRequest::run() resourceName.insert( 0, 1, '\\' ); } - string u8ResourceName = Utf8::encode( resourceName ); - QCryptographicHash hash( QCryptographicHash::Md5 ); - hash.addData( u8ResourceName.data(), u8ResourceName.size() ); - if ( !resourceIncluded.insert( hash.result() ).second ) - continue; - - // Get actual resource Mutex::Lock _( dataMutex ); data.clear(); - if ( dict.mddResource.loadFile( resourceName, data ) ) + + try { - // Check if this file has a redirection - // Always encoded in UTF16-LE - // L"@@@LINK=" - static const char pattern[16] = + // local file takes precedence + string fn = FsEncoding::dirname( dict.getDictionaryFilenames()[ 0 ] ) + + FsEncoding::separator() + u8ResourceName; + File::loadFromFile( fn, data ); + } + catch ( File::exCantOpen & ) + { + for ( vector< sptr< IndexedMdd > >::const_iterator i = dict.mddResources.begin(); + i != dict.mddResources.end(); i++ ) { - '@', '\0', '@', '\0', '@', '\0', 'L', '\0', 'I', '\0', 'N', '\0', 'K', '\0', '=', '\0' - }; + sptr< IndexedMdd > mddResource = *i; - if ( data.size() > sizeof( pattern ) ) - { - if ( memcmp( &data.front(), pattern, sizeof( pattern ) ) == 0 ) - { - data.push_back( '\0' ); - data.push_back( '\0' ); - QString target = MdictParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ), - data.size() - sizeof( pattern ) ); - resourceName = gd::toWString( target.trimmed() ); - continue; - } + if ( mddResource->loadFile( resourceName, data ) ) + break; } - - hasAnyData = true; } + // Check if this file has a redirection + // Always encoded in UTF16-LE + // L"@@@LINK=" + static const char pattern[16] = + { + '@', '\0', '@', '\0', '@', '\0', 'L', '\0', 'I', '\0', 'N', '\0', 'K', '\0', '=', '\0' + }; + + if ( data.size() > sizeof( pattern ) ) + { + if ( memcmp( &data.front(), pattern, sizeof( pattern ) ) == 0 ) + { + data.push_back( '\0' ); + data.push_back( '\0' ); + QString target = MdictParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ), + data.size() - sizeof( pattern ) ); + resourceName = gd::toWString( target.trimmed() ); + continue; + } + } + + if ( data.size() > 0 ) + hasAnyData = true; break; } @@ -907,7 +923,7 @@ private: }; -static bool indexIsOldOrBad( string const & indexFile, bool hasMddFile ) +static bool indexIsOldOrBad( vector< string > const & dictFiles, string const & indexFile ) { File::Class idx( indexFile, "rb" ); IdxHeader header; @@ -917,7 +933,32 @@ static bool indexIsOldOrBad( string const & indexFile, bool hasMddFile ) header.formatVersion != kCurrentFormatVersion || header.parserVersion != MdictParser::kParserVersion || header.foldingVersion != Folding::Version || - header.hasMddFile != hasMddFile; + header.mddIndexInfosCount != dictFiles.size() - 1; +} + +static void findResourceFiles( string const & mdx, vector< string > & dictFiles ) +{ + string base( mdx, 0, mdx.size() - 4 ); + // Check if there' is any file end with .mdd, which is the resource file for the dictionary + string resFile; + if ( File::tryPossibleName( base + ".mdd", resFile ) ) + { + dictFiles.push_back( resFile ); + // Find complementary .mdd file (volumes), like follows: + // demo.mdx <- main dictionary file + // demo.mdd <- main resource file ( 1st volume ) + // demo.1.mdd <- 2nd volume + // ... + // demo.n.mdd <- nth volume + QString baseU8 = QString::fromUtf8( base.c_str() ); + int vol = 1; + while ( File::tryPossibleName( string( QString( "%1.%2.mdd" ).arg( baseU8 ).arg( vol ) + .toUtf8().constBegin() ), resFile ) ) + { + dictFiles.push_back( resFile ); + vol++; + } + } } vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, @@ -934,42 +975,39 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f continue; vector< string > dictFiles( 1, *i ); - - string baseName = ( ( *i )[ i->size() - 4 ] == '.' ) ? - string( *i, 0, i->size() - 4 ) : string( *i, 0, i->size() - 7 ); - - // Check if there' is any file end with .mdd, which is the resource file for the dictionary - string mddFileName; - if ( File::tryPossibleName( baseName + ".mdd", mddFileName ) ) - dictFiles.push_back( mddFileName ); + findResourceFiles( *i, dictFiles ); string dictId = Dictionary::makeDictionaryId( dictFiles ); - string indexFile = indicesDir + dictId; if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) || - indexIsOldOrBad( indexFile, !mddFileName.empty() ) ) + indexIsOldOrBad( dictFiles, indexFile ) ) { // Building the index MdictParser parser( i->c_str() ); - sptr mddParser = NULL; + list< sptr< MdictParser > > mddParsers; if ( !parser.open() ) continue; - if ( File::exists( mddFileName ) ) - { - mddParser = new MdictParser( mddFileName.c_str() ); - if ( !mddParser->open() ) - { - FDPRINTF( stderr, "Warning: Invalid mdd (resource) file: %s\n", mddFileName.c_str() ); - continue; - } - } - string title = string( parser.title().toUtf8().constData() ); initializing.indexingDictionary( title ); + for ( vector< string >::const_iterator mddIter = dictFiles.begin() + 1; + mddIter != dictFiles.end(); mddIter++ ) + { + if ( File::exists( *mddIter ) ) + { + MdictParser * mddParser = new MdictParser( mddIter->c_str() ); + if ( !mddParser->open() ) + { + FDPRINTF( stderr, "Warning: Broken mdd (resource) file: %s\n", mddIter->c_str() ); + continue; + } + mddParsers.push_back( mddParser ); + } + } + File::Class idx( indexFile, "wb" ); IdxHeader idxHeader; memset( &idxHeader, 0, sizeof( idxHeader ) ); @@ -1016,16 +1054,23 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f } // enumerating resources if there's any - sptr mddIndexedWords; - if ( mddParser ) + vector< sptr< IndexedWords > > mddIndices; + vector< string > mddFileNames; + while ( !mddParsers.empty() ) { - mddIndexedWords = new IndexedWords(); + sptr< MdictParser > mddParser = mddParsers.front(); + + IndexedWords * mddIndexedWords = new IndexedWords(); ResourceHandler resourceHandler( chunks, *mddIndexedWords ); while ( mddParser->readNextHeadWordIndex( headWordIndex ) ) { mddParser->readRecordBlock( headWordIndex, resourceHandler ); } + + mddIndices.push_back( mddIndexedWords ); + mddFileNames.push_back( string( mddParser->filename().toUtf8().constData() ) ); + mddParsers.pop_front(); } // Finish with the chunks @@ -1073,12 +1118,26 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f idxHeader.langFrom = langs.first; idxHeader.langTo = langs.second; - if ( mddParser ) + // Build index info for each mdd file + vector< IndexInfo > mddIndexInfos; + for ( vector< sptr< IndexedWords > >::const_iterator mddIndexIter = mddIndices.begin(); + mddIndexIter != mddIndices.end(); mddIndexIter++ ) { - IndexInfo resourceIdxInfo = BtreeIndexing::buildIndex( *mddIndexedWords, idx ); - idxHeader.hasMddFile = true; - idxHeader.mddIndexBtreeMaxElements = resourceIdxInfo.btreeMaxElements; - idxHeader.mddIndexRootOffset = resourceIdxInfo.rootOffset; + IndexInfo resourceIdxInfo = BtreeIndexing::buildIndex( *( *mddIndexIter ), idx ); + mddIndexInfos.push_back( resourceIdxInfo ); + } + + // Save address of IndexInfos for resource files + idxHeader.mddIndexInfosOffset = idx.tell(); + idxHeader.mddIndexInfosCount = mddIndexInfos.size(); + for ( uint32_t mi = 0; mi < mddIndexInfos.size(); mi++ ) + { + const string & mddfile = mddFileNames[ mi ]; + + idx.write( mddfile.size() + 1 ); + idx.write( mddfile.c_str(), mddfile.size() + 1 ); + idx.write( mddIndexInfos[ mi ].btreeMaxElements ); + idx.write( mddIndexInfos[ mi ].rootOffset ); } // That concludes it. Update the header. diff --git a/mdx.hh b/mdx.hh index 08c9026b..82d5223e 100644 --- a/mdx.hh +++ b/mdx.hh @@ -1,4 +1,4 @@ -/* This file is (c) 2013 Timon Wong +/* This file is (c) 2013 Timon Wong * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ #ifndef __MDX_HH_INCLUDED__ From 4c4feb2c5214fa3f763d9148bb47198a9c3eb60b Mon Sep 17 00:00:00 2001 From: Timon Wong Date: Sat, 11 May 2013 13:41:26 +0800 Subject: [PATCH 2/2] MDict: minor parser refactor, fix inappropriate use of sptr --- mdictparser.cc | 34 ++++++++++++++++++---------------- mdictparser.hh | 5 ++--- mdx.cc | 14 ++++++-------- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/mdictparser.cc b/mdictparser.cc index 9ef1f2f2..ea3a4b2a 100644 --- a/mdictparser.cc +++ b/mdictparser.cc @@ -92,28 +92,30 @@ size_t MdictParser::RecordIndex::bsearch( const vector return ( size_t ) ( -1 ); } -MdictParser::MdictParser( const char * filename ): filename_( QString::fromUtf8( filename ) ) +MdictParser::MdictParser() : + version_( 0 ), + numHeadWordBlocks_( 0 ), + headWordBlockInfoSize_( 0 ), + headWordBlockSize_( 0 ), + headWordBlockInfoPos_( 0 ), + headWordPos_( 0 ), + totalRecordsSize_( 0 ), + recordPos_( 0 ), + wordCount_( 0 ), + numberTypeSize_( 0 ), + rtl_( false ), + bruteForce_( false ), + bruteForceEnd_( true ) { - version_ = 0; - numHeadWordBlocks_ = 0; - headWordBlockInfoSize_ = 0; - headWordBlockSize_ = 0; - headWordBlockInfoPos_ = 0; - headWordPos_ = 0; - totalRecordsSize_ = 0; - recordPos_ = 0; - - wordCount_ = 0; - numberTypeSize_ = 0; - rtl_ = false; - bruteForce_ = false; - bruteForceEnd_ = true; } -bool MdictParser::open() +bool MdictParser::open( const char * filename ) { + filename_ = QString::fromUtf8( filename ); file_ = new QFile( filename_ ); + qDebug() << "MdictParser: open " << filename_; + if ( file_.isNull() || !file_->exists() ) return false; diff --git a/mdictparser.hh b/mdictparser.hh index 39df86da..ee042c71 100644 --- a/mdictparser.hh +++ b/mdictparser.hh @@ -151,11 +151,10 @@ public: return rtl_; } - MdictParser( char const * filename ); + MdictParser(); ~MdictParser() {} - bool open(); - void close(); + bool open( const char * filename ); bool readNextHeadWordIndex( HeadWordIndex & headWordIndex ); bool readRecordBlock( HeadWordIndex & headWordIndex, RecordHandler & recordHandler ); diff --git a/mdx.cc b/mdx.cc index a9269401..58a7db60 100644 --- a/mdx.cc +++ b/mdx.cc @@ -402,7 +402,7 @@ void MdxDictionary::doDeferredInit() if ( dictFiles[ i ] != mddFileNames[ i - 1 ] || !File::exists( dictFiles[ i ] ) ) continue; - IndexedMdd * mdd = new IndexedMdd( idxMutex, chunks ); + sptr< IndexedMdd > mdd = new IndexedMdd( idxMutex, chunks ); mdd->openIndex( mddIndexInfos[ i - 1 ], idx, idxMutex ); mdd->open( dictFiles[ i ].c_str() ); mddResources.push_back( mdd ); @@ -698,7 +698,6 @@ void MddResourceRequest::run() i != dict.mddResources.end(); i++ ) { sptr< IndexedMdd > mddResource = *i; - if ( mddResource->loadFile( resourceName, data ) ) break; } @@ -984,10 +983,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f indexIsOldOrBad( dictFiles, indexFile ) ) { // Building the index - MdictParser parser( i->c_str() ); + MdictParser parser; list< sptr< MdictParser > > mddParsers; - if ( !parser.open() ) + if ( !parser.open( i->c_str() ) ) continue; string title = string( parser.title().toUtf8().constData() ); @@ -998,8 +997,8 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f { if ( File::exists( *mddIter ) ) { - MdictParser * mddParser = new MdictParser( mddIter->c_str() ); - if ( !mddParser->open() ) + sptr< MdictParser > mddParser = new MdictParser(); + if ( !mddParser->open( mddIter->c_str() ) ) { FDPRINTF( stderr, "Warning: Broken mdd (resource) file: %s\n", mddIter->c_str() ); continue; @@ -1059,8 +1058,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f while ( !mddParsers.empty() ) { sptr< MdictParser > mddParser = mddParsers.front(); - - IndexedWords * mddIndexedWords = new IndexedWords(); + sptr< IndexedWords > mddIndexedWords = new IndexedWords(); ResourceHandler resourceHandler( chunks, *mddIndexedWords ); while ( mddParser->readNextHeadWordIndex( headWordIndex ) )