From 0101f52abd25e627d88ea4a407338139bad4f4ef Mon Sep 17 00:00:00 2001 From: Timon Wong Date: Sun, 28 Apr 2013 16:26:04 +0800 Subject: [PATCH] Support large dictionary files; indexing speed improvement --- mdictparser.cc | 97 ++++++++++----------------- mdictparser.hh | 104 +++++++++++++++------------- mdx.cc | 179 +++++++++++++++++++++++++++---------------------- 3 files changed, 192 insertions(+), 188 deletions(-) diff --git a/mdictparser.cc b/mdictparser.cc index 01569b7d..48f565f9 100644 --- a/mdictparser.cc +++ b/mdictparser.cc @@ -38,6 +38,9 @@ #include "decompress.hh" +namespace Mdict +{ + static inline int u16StrSize( const ushort * unicode ) { int size = 0; @@ -141,10 +144,12 @@ bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIn headWordIndex.clear(); - file_->seek( headWordPos_ ); - QByteArray data = file_->read( headWordBlockSize_ ); - const char * pDataStart = data.constData(); - const char * pDataEnd = pDataStart + data.size(); + ScopedMemMap mapping( *file_, headWordPos_, headWordBlockSize_ ); + if ( !mapping.startAddress() ) + return false; + + const char * pDataStart = ( const char * )mapping.startAddress(); + const char * pDataEnd = pDataStart + headWordBlockSize_; const char pattern[] = {0x02, 0x00, 0x00, 0x00}; const char * patternBegin = pattern; const char * patternEnd = pattern + 4; @@ -168,17 +173,20 @@ bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIn if ( headWordBlockInfosIter_ == headWordBlockInfos_.end() ) return false; - file_->seek( headWordPos_ ); qint64 compressedSize = headWordBlockInfosIter_->first; qint64 decompressedSize = headWordBlockInfosIter_->second; if ( compressedSize < 8 ) return false; - QByteArray compressed = file_->read( compressedSize ); - headWordPos_ = file_->pos(); + ScopedMemMap compressed( *file_, headWordPos_, compressedSize ); + if ( !compressed.startAddress() ) + return false; + + headWordPos_ += compressedSize; QByteArray decompressed; - if ( !parseCompressedBlock( compressedSize, compressed, decompressedSize, decompressed ) ) + if ( !parseCompressedBlock( compressedSize, ( char * )compressed.startAddress(), + decompressedSize, decompressed ) ) return false; headWordIndex = splitHeadWordBlock( decompressed ); @@ -381,7 +389,7 @@ bool MdictParser::readHeader( QDataStream & in ) // Read metadata rtl_ = headerAttributes.namedItem( "Left2Right" ).toAttr().value() != "Yes"; QString title = headerAttributes.namedItem( "Title" ).toAttr().value(); - if ( title == "Title (No HTML code allowed)" ) + if ( title.isEmpty() || title.length() < 5 || title == "Title (No HTML code allowed)" ) { // Use filename instead QFileInfo fi( filename_ ); @@ -587,49 +595,42 @@ MdictParser::HeadWordIndex MdictParser::splitHeadWordBlock( QByteArray const & b return index; } -bool MdxParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex, - MdxParser::ArticleHandler & articleHandler ) +bool MdictParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex, + MdictParser::RecordHandler & recordHandler ) { - size_t prevIdx = ( size_t ) ( -1 ); - QByteArray decompressed; + // cache the index, the headWordIndex is already sorted + size_t idx = 0; for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); i++ ) { - size_t idx = RecordIndex::bsearch( recordBlockInfos_, i->first ); - RecordIndex const & recordIndex = recordBlockInfos_[idx]; + if ( recordBlockInfos_[idx].endPos <= i->first ) + idx = RecordIndex::bsearch( recordBlockInfos_, i->first ); if ( idx == ( size_t )( -1 ) ) return false; - // Reload if index changes - if ( prevIdx != idx ) - { - prevIdx = idx; - file_->seek( recordPos_ + recordIndex.startPos ); - - QByteArray compressed; - compressed.resize( recordIndex.compressedSize ); - file_->read( compressed.data(), recordIndex.compressedSize ); - - if ( !parseCompressedBlock( recordIndex.compressedSize, compressed, - recordIndex.decompressedSize, decompressed ) ) - return false; - } - + RecordIndex const & recordIndex = recordBlockInfos_[idx]; HeadWordIndex::const_iterator iNext = i + 1; - size_t articleSize; + size_t recordSize; if ( iNext == headWordIndex.end() ) - articleSize = recordIndex.shadowEndPos - i->first; + recordSize = recordIndex.shadowEndPos - i->first; else - articleSize = iNext->first - i->first; - QString article = toUtf16( encoding_, decompressed.constData() + i->first - recordIndex.shadowStartPos, articleSize ); - articleHandler.handleAritcle( i->second, article ); + recordSize = iNext->first - i->first; + + RecordInfo recordInfo; + recordInfo.compressedBlockPos = recordPos_ + recordIndex.startPos; + recordInfo.recordOffset = i->first - recordIndex.shadowStartPos; + recordInfo.decompressedBlockSize = recordIndex.decompressedSize; + recordInfo.compressedBlockSize = recordIndex.compressedSize; + recordInfo.recordSize = recordSize; + + recordHandler.handleRecord( i->second, recordInfo ); } return true; } -QString & MdxParser::substituteStylesheet( QString & article, MdxParser::StyleSheets const & styleSheets ) +QString & MdictParser::substituteStylesheet( QString & article, MdictParser::StyleSheets const & styleSheets ) { QRegExp rx( "`(\\d+)`" ); QString endStyle; @@ -658,28 +659,4 @@ QString & MdxParser::substituteStylesheet( QString & article, MdxParser::StyleSh return article; } -bool MddParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex, - MddParser::ResourceHandler & resourceHandler ) -{ - for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); i++ ) - { - size_t idx = RecordIndex::bsearch( recordBlockInfos_, i->first ); - RecordIndex const & recordIndex = recordBlockInfos_[idx]; - - if ( idx == ( size_t )( -1 ) ) - return false; - - HeadWordIndex::const_iterator iNext = i + 1; - size_t resourceSize; - if ( iNext == headWordIndex.end() ) - resourceSize = recordIndex.shadowEndPos - i->first; - else - resourceSize = iNext->first - i->first; - - resourceHandler.handleResource( i->second, recordIndex.decompressedSize, - recordPos_ + recordIndex.startPos, recordIndex.compressedSize, - i->first - recordIndex.shadowStartPos, resourceSize ); - } - - return true; } diff --git a/mdictparser.hh b/mdictparser.hh index 58c46861..2f03be8d 100644 --- a/mdictparser.hh +++ b/mdictparser.hh @@ -27,18 +27,46 @@ #include #include +namespace Mdict +{ + using std::string; using std::vector; using std::pair; using std::map; +// A helper class to handle memory map for QFile +class ScopedMemMap +{ + QFile & file; + uchar * address; + +public: + ScopedMemMap( QFile & file, qint64 offset, qint64 size ) : + file( file ), + address( file.map( offset, size ) ) + { + } + + ~ScopedMemMap() + { + if ( address ) + file.unmap( address ); + } + + inline uchar * startAddress() + { + return address; + } +}; + class MdictParser { public: enum { - kParserVersion = 0x0000009 + kParserVersion = 0x000000b }; struct RecordIndex @@ -68,6 +96,22 @@ public: static size_t bsearch( vector const & offsets, qint64 val ); }; + struct RecordInfo + { + qint64 compressedBlockPos; + qint64 recordOffset; + + size_t decompressedBlockSize; + size_t compressedBlockSize; + size_t recordSize; + }; + + class RecordHandler + { + public: + virtual void handleRecord( QString const & name, RecordInfo const & recordInfo ) = 0; + }; + typedef vector< pair > BlockInfoVector; typedef vector< pair > HeadWordIndex; typedef map > StyleSheets; @@ -107,9 +151,13 @@ public: return rtl_; } + MdictParser( char const * filename ); + ~MdictParser() {} + bool open(); void close(); bool readNextHeadWordIndex( HeadWordIndex & headWordIndex ); + bool readRecordBlock( HeadWordIndex & headWordIndex, RecordHandler & recordHandler ); // helpers static QString toUtf16( const char * fromCode, const char * from, size_t fromSize ); @@ -120,11 +168,15 @@ public: static bool parseCompressedBlock( size_t compressedBlockSize, const char * compressedBlockPtr, size_t decompressedBlockSize, QByteArray & decompressedBlock ); + static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets ); + static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets ) + { + QString s = QString::fromUtf8( article.c_str() ); + substituteStylesheet( s, styleSheets ); + return string( s.toUtf8().constData() ); + } + protected: - MdictParser( char const * filename ); - - ~MdictParser() {} - qint64 readNumber( QDataStream & in ); static quint32 readU8OrU16( QDataStream & in, bool isU16 ); bool readHeader( QDataStream & in ); @@ -161,46 +213,6 @@ protected: bool bruteForceEnd_; }; -class MdxParser: public MdictParser -{ -public: - class ArticleHandler - { - public: - virtual void handleAritcle( QString const & headWord, QString const & article ) = 0; - }; - - MdxParser( const char * filename ): MdictParser( filename ) {} - ~MdxParser() {} - - bool readRecordBlock( HeadWordIndex & headWordIndex, ArticleHandler & articleHandler ); - static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets ); - static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets ) - { - QString s = QString::fromUtf8( article.c_str() ); - substituteStylesheet( s, styleSheets ); - return string( s.toUtf8().constData() ); - } -}; - -class MddParser: public MdictParser -{ -public: - class ResourceHandler - { - public: - virtual void handleResource( QString const & fileName, quint32 decompressedBlockSize, - quint32 compressedBlockPos, quint32 compressedBlockSize, - quint32 resourceOffset, quint32 resourceSize ) = 0; - }; - - MddParser( const char * filename ) : MdictParser( filename ) {} - ~MddParser() {} - - bool readRecordBlock( HeadWordIndex & headWordIndex, ResourceHandler & resourceHandler ); - -private: - -}; +} #endif // __MDICTPARSER_HH_INCLUDED__ diff --git a/mdx.cc b/mdx.cc index 8c98b0d6..56539066 100644 --- a/mdx.cc +++ b/mdx.cc @@ -13,6 +13,7 @@ #include "langcoder.hh" #include "fsencoding.hh" #include "audiolink.hh" +#include "ex.hh" #include "mdictparser.hh" #include @@ -49,22 +50,20 @@ using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; using BtreeIndexing::IndexInfo; -namespace -{ +using namespace Mdict; + /// Checks if the given string ends with the given substring -bool endsWith( string const & str, string const & tail ) +static bool endsWith( string const & str, string const & tail ) { return str.size() >= tail.size() && str.compare( str.size() - tail.size(), tail.size(), tail ) == 0; } -} - enum { kSignature = 0x4349444d, // MDIC - kCurrentFormatVersion = 4 + BtreeIndexing::FormatVersion + kCurrentFormatVersion = 7 + BtreeIndexing::FormatVersion }; struct IdxHeader @@ -104,15 +103,6 @@ __attribute__( ( packed ) ) #endif ; -struct MddIndexEntry -{ - size_t decompressedBlockSize; - size_t compressedBlockPos; - size_t compressedBlockSize; - size_t resourceOffset; - size_t resourceSize; -}; - // A helper method to read resources from .mdd file class IndexedMdd: public BtreeIndexing::BtreeIndex { @@ -167,26 +157,27 @@ public: if ( links.empty() ) return false; - MddIndexEntry indexEntry; + MdictParser::RecordInfo indexEntry; + vector< char > chunk; + Mutex::Lock _( idxMutex ); + const char * indexEntryPtr = chunks.getBlock( links[ 0 ].articleOffset, chunk ); + memcpy( &indexEntry, indexEntryPtr, sizeof( indexEntry ) ); + + ScopedMemMap compressed( mddFile, indexEntry.compressedBlockPos, indexEntry.compressedBlockSize ); + if ( !compressed.startAddress() ) { - vector< char > chunk; - Mutex::Lock _( idxMutex ); - const char * indexEntryPtr = chunks.getBlock( links[ 0 ].articleOffset, chunk ); - memcpy( &indexEntry, indexEntryPtr, sizeof( indexEntry ) ); + return false; } QByteArray decompressed; - mddFile.seek( indexEntry.compressedBlockPos ); - QByteArray compressed = mddFile.read( indexEntry.compressedBlockSize ); - if ( !MdictParser::parseCompressedBlock( compressed.size(), compressed.constData(), + if ( !MdictParser::parseCompressedBlock( indexEntry.compressedBlockSize, ( char * )compressed.startAddress(), indexEntry.decompressedBlockSize, decompressed ) ) { return false; } - compressed.clear(); - result.resize( indexEntry.resourceSize ); - memcpy( &result.front(), decompressed.constData() + indexEntry.resourceOffset, indexEntry.resourceSize ); + result.resize( indexEntry.recordSize ); + memcpy( &result.front(), decompressed.constData() + indexEntry.recordOffset, indexEntry.recordSize ); return true; } @@ -198,7 +189,9 @@ class MdxDictionary: public BtreeIndexing::BtreeDictionary File::Class idx; IdxHeader idxHeader; string dictionaryName; + string encoding; ChunkedStorage::Reader chunks; + QFile dictFile; IndexedMdd mddResource; MdictParser::StyleSheets styleSheets; @@ -263,7 +256,7 @@ private: void doDeferredInit(); /// Loads an article with the given offset, filling the given strings. - void loadArticle( uint32_t offset, string & headword, string & articleText ); + void loadArticle( uint32_t offset, string & articleText ); /// Process resource links (images, audios, etc) string filterResource( const char * articleId, const char * article ); @@ -283,14 +276,21 @@ MdxDictionary::MdxDictionary( string const & id, string const & indexFile, mddResource( idxMutex, chunks ), deferredInitRunnableStarted( false ) { - idx.seek( sizeof( idxHeader ) ); - // Read the dictionary's name + idx.seek( sizeof( idxHeader ) ); size_t len = idx.read< uint32_t >(); - vector< char > nameBuf( len ); - idx.read( &nameBuf.front(), len ); + vector< char > buf( len ); + idx.read( &buf.front(), len ); + dictionaryName = string( &buf.front(), len ); - dictionaryName = string( &nameBuf.front(), len ); + // then read the dictionary's encoding + len = idx.read< uint32_t >(); + buf.resize( len ); + idx.read( &buf.front(), len ); + encoding = string( &buf.front(), len ); + + dictFile.setFileName( QString::fromUtf8( dictionaryFiles[ 0 ].c_str() ) ); + dictFile.open( QIODevice::ReadOnly ); } MdxDictionary::~MdxDictionary() @@ -300,6 +300,8 @@ MdxDictionary::~MdxDictionary() // Wait for init runnable to complete if it was ever started if ( deferredInitRunnableStarted ) deferredInitRunnableExited.acquire(); + + dictFile.close(); } //////// MdxDictionary::deferredInit() @@ -530,10 +532,9 @@ void MdxArticleRequest::run() continue; // We already have this article in the body. // Grab that article - string headword; string articleBody; - dict.loadArticle( chain[ x ].articleOffset, headword, articleBody ); + dict.loadArticle( chain[ x ].articleOffset, articleBody ); if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() ) continue; // We already have this article in the body. @@ -700,8 +701,8 @@ void MddResourceRequest::run() { data.push_back( '\0' ); data.push_back( '\0' ); - QString target = MdxParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ), - data.size() - sizeof( pattern ) ); + QString target = MdictParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ), + data.size() - sizeof( pattern ) ); resourceName = gd::toWString( target.trimmed() ); continue; } @@ -761,21 +762,52 @@ void MdxDictionary::loadIcon() throw() dictionaryIconLoaded = true; } -void MdxDictionary::loadArticle( uint32_t offset, string & headword, string & articleText ) +DEF_EX( exCorruptDictionary, "dictionary file tampered or corrupted", std::exception ) + +void MdxDictionary::loadArticle( uint32_t offset, string & articleText ) { vector< char > chunk; Mutex::Lock _( idxMutex ); - char * articleData = chunks.getBlock( offset, chunk ); + // Load record info from index + MdictParser::RecordInfo recordInfo; + char * pRecordInfo = chunks.getBlock( offset, chunk ); + memcpy( &recordInfo, pRecordInfo, sizeof( recordInfo ) ); // Make an sub unique id for this article QString articleId; - articleId.setNum( ( quint64 )articleData, 16 ); + articleId.setNum( ( quint64 )pRecordInfo, 16 ); - headword = articleData; - articleText = string( articleData + headword.size() + 1 ); - articleText = MdxParser::substituteStylesheet( articleText, styleSheets ); - articleText = filterResource( articleId.toLatin1().constData(), articleText.c_str() ); + articleText = "Article loading error"; + + try + { + ScopedMemMap compressed( dictFile, recordInfo.compressedBlockPos, recordInfo.compressedBlockSize ); + if ( !compressed.startAddress() ) + throw exCorruptDictionary(); + + QByteArray decompressed; + if ( !MdictParser::parseCompressedBlock( recordInfo.compressedBlockSize, ( char * )compressed.startAddress(), + recordInfo.decompressedBlockSize, decompressed ) ) + return; + + QString article = MdictParser::toUtf16( encoding.c_str(), + decompressed.constData() + recordInfo.recordOffset, + recordInfo.recordSize ); + + article = MdictParser::substituteStylesheet( article, styleSheets ); + articleText = filterResource( articleId.toLatin1().constData(), article.toUtf8().constData() ); + } + catch ( std::exception & e ) + { + FDPRINTF( stderr, "MDict: load article from %s failed, error: %s\n", + getDictionaryFilenames()[ 0 ].c_str(), e.what() ); + } + catch ( ... ) + { + FDPRINTF( stderr, "MDict: load article from %s failed, error: %s\n", + getDictionaryFilenames()[ 0 ].c_str(), "unknown error" ); + } } string MdxDictionary::filterResource( const char * articleId, const char * article ) @@ -820,36 +852,20 @@ static void addEntryToIndexSingle( QString const & word, uint32_t offset, Indexe indexedWords.addSingleWord( gd::toWString( wordTrimmed ), offset ); } -class ArticleHandler: public MdxParser::ArticleHandler +class ArticleHandler: public MdictParser::RecordHandler { public: ArticleHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ) : chunks( chunks ), - indexedWords( indexedWords ), - articleCount_( 0 ) + indexedWords( indexedWords ) { } - inline size_t articleCount() + virtual void handleRecord( QString const & headWord, MdictParser::RecordInfo const & recordInfo ) { - return articleCount_; - } - - void handleAritcle( QString const & headWord, QString const & article ) - { - if ( !article.startsWith( "@@@LINK=" ) ) - { - articleCount_++; - } - - // Save the article's body itself first + // Save the article's record info uint32_t articleAddress = chunks.startNewBlock(); - string headWordU8 = string( headWord.toUtf8().constData() ); - string articleU8 = string( article.toUtf8().constData() ); - - chunks.addToBlock( headWordU8.c_str(), headWordU8.size() + 1 ); - chunks.addToBlock( articleU8.c_str(), articleU8.size() + 1 ); - + chunks.addToBlock( &recordInfo, sizeof( recordInfo ) ); // Add entries to the index addEntryToIndex( headWord, articleAddress, indexedWords ); } @@ -857,10 +873,9 @@ public: private: ChunkedStorage::Writer & chunks; IndexedWords & indexedWords; - size_t articleCount_; }; -class ResourceHandler: public MddParser::ResourceHandler +class ResourceHandler: public MdictParser::RecordHandler { public: ResourceHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ): @@ -869,18 +884,10 @@ public: { } - void handleResource( QString const & fileName, quint32 decompressedBlockSize, - quint32 compressedBlockPos, quint32 compressedBlockSize, - quint32 resourceOffset, quint32 resourceSize ) + virtual void handleRecord( QString const & fileName, MdictParser::RecordInfo const & recordInfo ) { uint32_t resourceInfoAddress = chunks.startNewBlock(); - MddIndexEntry mddIndexEntry; - mddIndexEntry.decompressedBlockSize = decompressedBlockSize; - mddIndexEntry.compressedBlockPos = compressedBlockPos; - mddIndexEntry.compressedBlockSize = compressedBlockSize; - mddIndexEntry.resourceOffset = resourceOffset; - mddIndexEntry.resourceSize = resourceSize; - chunks.addToBlock( &mddIndexEntry, sizeof( mddIndexEntry ) ); + chunks.addToBlock( &recordInfo, sizeof( recordInfo ) ); // Add entries to the index addEntryToIndexSingle( fileName, resourceInfoAddress, indexedWords ); } @@ -935,15 +942,15 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f indexIsOldOrBad( indexFile, !mddFileName.empty() ) ) { // Building the index - MdxParser parser( i->c_str() ); - sptr mddParser = NULL; + MdictParser parser( i->c_str() ); + sptr mddParser = NULL; if ( !parser.open() ) continue; if ( File::exists( mddFileName ) ) { - mddParser = new MddParser( mddFileName.c_str() ); + mddParser = new MdictParser( mddFileName.c_str() ); if ( !mddParser->open() ) { FDPRINTF( stderr, "Warning: Invalid mdd (resource) file: %s\n", mddFileName.c_str() ); @@ -960,9 +967,18 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f // We write a dummy header first. At the end of the process the header // will be rewritten with the right values. idx.write( idxHeader ); + + // Write the title first idx.write< uint32_t >( title.size() ); idx.write( title.data(), title.size() ); + // then the encoding + { + string encoding = string( parser.encoding().toUtf8().constData() ); + idx.write< uint32_t >( encoding.size() ); + idx.write( encoding.data(), encoding.size() ); + } + // This is our index data that we accumulate during the loading process. // For each new word encountered, we emit the article's body to the file // immediately, inserting the word itself and its offset in this map. @@ -976,10 +992,9 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f // Save dictionary description if there's one { string description = string( parser.description().toUtf8().constData() ); - idxHeader.descriptionSize = 0; idxHeader.descriptionAddress = chunks.startNewBlock(); chunks.addToBlock( description.c_str(), description.size() + 1 ); - idxHeader.descriptionSize += description.size() + 1; + idxHeader.descriptionSize = description.size() + 1; } ArticleHandler articleHandler( chunks, indexedWords ); @@ -1062,7 +1077,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f idxHeader.formatVersion = kCurrentFormatVersion; idxHeader.parserVersion = MdictParser::kParserVersion; idxHeader.foldingVersion = Folding::Version; - idxHeader.articleCount = articleHandler.articleCount(); + idxHeader.articleCount = parser.wordCount(); idxHeader.wordCount = parser.wordCount(); idx.rewind();