Support large dictionary files; indexing speed improvement

This commit is contained in:
Timon Wong 2013-04-28 16:26:04 +08:00
parent 1e047df679
commit 0101f52abd
3 changed files with 192 additions and 188 deletions

View file

@ -38,6 +38,9 @@
#include "decompress.hh"
namespace Mdict
{
static inline int u16StrSize( const ushort * unicode )
{
int size = 0;
@ -141,10 +144,12 @@ bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIn
headWordIndex.clear();
file_->seek( headWordPos_ );
QByteArray data = file_->read( headWordBlockSize_ );
const char * pDataStart = data.constData();
const char * pDataEnd = pDataStart + data.size();
ScopedMemMap mapping( *file_, headWordPos_, headWordBlockSize_ );
if ( !mapping.startAddress() )
return false;
const char * pDataStart = ( const char * )mapping.startAddress();
const char * pDataEnd = pDataStart + headWordBlockSize_;
const char pattern[] = {0x02, 0x00, 0x00, 0x00};
const char * patternBegin = pattern;
const char * patternEnd = pattern + 4;
@ -168,17 +173,20 @@ bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIn
if ( headWordBlockInfosIter_ == headWordBlockInfos_.end() )
return false;
file_->seek( headWordPos_ );
qint64 compressedSize = headWordBlockInfosIter_->first;
qint64 decompressedSize = headWordBlockInfosIter_->second;
if ( compressedSize < 8 )
return false;
QByteArray compressed = file_->read( compressedSize );
headWordPos_ = file_->pos();
ScopedMemMap compressed( *file_, headWordPos_, compressedSize );
if ( !compressed.startAddress() )
return false;
headWordPos_ += compressedSize;
QByteArray decompressed;
if ( !parseCompressedBlock( compressedSize, compressed, decompressedSize, decompressed ) )
if ( !parseCompressedBlock( compressedSize, ( char * )compressed.startAddress(),
decompressedSize, decompressed ) )
return false;
headWordIndex = splitHeadWordBlock( decompressed );
@ -381,7 +389,7 @@ bool MdictParser::readHeader( QDataStream & in )
// Read metadata
rtl_ = headerAttributes.namedItem( "Left2Right" ).toAttr().value() != "Yes";
QString title = headerAttributes.namedItem( "Title" ).toAttr().value();
if ( title == "Title (No HTML code allowed)" )
if ( title.isEmpty() || title.length() < 5 || title == "Title (No HTML code allowed)" )
{
// Use filename instead
QFileInfo fi( filename_ );
@ -587,49 +595,42 @@ MdictParser::HeadWordIndex MdictParser::splitHeadWordBlock( QByteArray const & b
return index;
}
bool MdxParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex,
MdxParser::ArticleHandler & articleHandler )
bool MdictParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex,
MdictParser::RecordHandler & recordHandler )
{
size_t prevIdx = ( size_t ) ( -1 );
QByteArray decompressed;
// cache the index, the headWordIndex is already sorted
size_t idx = 0;
for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); i++ )
{
size_t idx = RecordIndex::bsearch( recordBlockInfos_, i->first );
RecordIndex const & recordIndex = recordBlockInfos_[idx];
if ( recordBlockInfos_[idx].endPos <= i->first )
idx = RecordIndex::bsearch( recordBlockInfos_, i->first );
if ( idx == ( size_t )( -1 ) )
return false;
// Reload if index changes
if ( prevIdx != idx )
{
prevIdx = idx;
file_->seek( recordPos_ + recordIndex.startPos );
QByteArray compressed;
compressed.resize( recordIndex.compressedSize );
file_->read( compressed.data(), recordIndex.compressedSize );
if ( !parseCompressedBlock( recordIndex.compressedSize, compressed,
recordIndex.decompressedSize, decompressed ) )
return false;
}
RecordIndex const & recordIndex = recordBlockInfos_[idx];
HeadWordIndex::const_iterator iNext = i + 1;
size_t articleSize;
size_t recordSize;
if ( iNext == headWordIndex.end() )
articleSize = recordIndex.shadowEndPos - i->first;
recordSize = recordIndex.shadowEndPos - i->first;
else
articleSize = iNext->first - i->first;
QString article = toUtf16( encoding_, decompressed.constData() + i->first - recordIndex.shadowStartPos, articleSize );
articleHandler.handleAritcle( i->second, article );
recordSize = iNext->first - i->first;
RecordInfo recordInfo;
recordInfo.compressedBlockPos = recordPos_ + recordIndex.startPos;
recordInfo.recordOffset = i->first - recordIndex.shadowStartPos;
recordInfo.decompressedBlockSize = recordIndex.decompressedSize;
recordInfo.compressedBlockSize = recordIndex.compressedSize;
recordInfo.recordSize = recordSize;
recordHandler.handleRecord( i->second, recordInfo );
}
return true;
}
QString & MdxParser::substituteStylesheet( QString & article, MdxParser::StyleSheets const & styleSheets )
QString & MdictParser::substituteStylesheet( QString & article, MdictParser::StyleSheets const & styleSheets )
{
QRegExp rx( "`(\\d+)`" );
QString endStyle;
@ -658,28 +659,4 @@ QString & MdxParser::substituteStylesheet( QString & article, MdxParser::StyleSh
return article;
}
/// Walks the head word index and, for every entry, reports to the handler
/// where the enclosing compressed block is located and where the resource
/// sits inside the decompressed block.
///
/// @param headWordIndex    (offset, name) pairs; the size of an entry is taken
///                         from the distance to the next entry's offset.
/// @param resourceHandler  receives one handleResource() call per entry.
/// @return false as soon as an offset cannot be matched to a record block,
///         true otherwise.
bool MddParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex,
                                 MddParser::ResourceHandler & resourceHandler )
{
  for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); ++i )
  {
    size_t idx = RecordIndex::bsearch( recordBlockInfos_, i->first );

    // The lookup signals failure with (size_t)-1; this must be tested before
    // the index is used to subscript recordBlockInfos_.
    if ( idx == ( size_t )( -1 ) )
      return false;

    RecordIndex const & recordIndex = recordBlockInfos_[idx];

    // The last entry has no successor, so it extends to the end of its block.
    HeadWordIndex::const_iterator iNext = i + 1;
    size_t resourceSize;
    if ( iNext == headWordIndex.end() )
      resourceSize = recordIndex.shadowEndPos - i->first;
    else
      resourceSize = iNext->first - i->first;

    resourceHandler.handleResource( i->second, recordIndex.decompressedSize,
                                    recordPos_ + recordIndex.startPos, recordIndex.compressedSize,
                                    i->first - recordIndex.shadowStartPos, resourceSize );
  }

  return true;
}

View file

@ -27,18 +27,46 @@
#include <QPointer>
#include <QFile>
namespace Mdict
{
using std::string;
using std::vector;
using std::pair;
using std::map;
// A helper class to handle memory map for QFile.
//
// The constructor maps `size` bytes of `file` starting at `offset`; the
// destructor unmaps the region again if the mapping succeeded. Callers must
// check startAddress() for a null pointer before using the mapping.
class ScopedMemMap
{
  QFile & file;
  uchar * address;

  // Not copyable: a copy would make two objects unmap the same address.
  // Declared but intentionally left undefined.
  ScopedMemMap( ScopedMemMap const & );
  ScopedMemMap & operator=( ScopedMemMap const & );

public:
  ScopedMemMap( QFile & file, qint64 offset, qint64 size ) :
    file( file ),
    address( file.map( offset, size ) )
  {
  }

  ~ScopedMemMap()
  {
    if ( address )
      file.unmap( address );
  }

  // Start of the mapped region, or null when the mapping failed.
  inline uchar * startAddress() const
  {
    return address;
  }
};
class MdictParser
{
public:
enum
{
kParserVersion = 0x0000009
kParserVersion = 0x000000b
};
struct RecordIndex
@ -68,6 +96,22 @@ public:
static size_t bsearch( vector<RecordIndex> const & offsets, qint64 val );
};
// Describes where a single record can be found. Instances are stored in and
// loaded from the index chunks as raw bytes (addToBlock / memcpy with
// sizeof), so the order and types of the fields define the stored layout.
// NOTE(review): size_t has a platform-dependent width, so stored entries are
// presumably not interchangeable between 32-bit and 64-bit builds - confirm.
struct RecordInfo
{
// File offset of the compressed block that contains the record
qint64 compressedBlockPos;
// Offset of the record inside the decompressed block
qint64 recordOffset;
// Size of the block after decompression
size_t decompressedBlockSize;
// Size of the compressed block in the file
size_t compressedBlockSize;
// Size of the record itself, counted from recordOffset
size_t recordSize;
};
// Callback interface used by readRecordBlock(): handleRecord() is invoked
// once per head word index entry with the entry's name and the location of
// its record.
class RecordHandler
{
public:
  // Polymorphic interface: allow safe destruction through a base pointer.
  virtual ~RecordHandler() {}

  virtual void handleRecord( QString const & name, RecordInfo const & recordInfo ) = 0;
};
typedef vector< pair<qint64, qint64> > BlockInfoVector;
typedef vector< pair<qint64, QString> > HeadWordIndex;
typedef map<int, pair<QString, QString> > StyleSheets;
@ -107,9 +151,13 @@ public:
return rtl_;
}
MdictParser( char const * filename );
~MdictParser() {}
bool open();
void close();
bool readNextHeadWordIndex( HeadWordIndex & headWordIndex );
bool readRecordBlock( HeadWordIndex & headWordIndex, RecordHandler & recordHandler );
// helpers
static QString toUtf16( const char * fromCode, const char * from, size_t fromSize );
@ -120,11 +168,15 @@ public:
static bool parseCompressedBlock( size_t compressedBlockSize, const char * compressedBlockPtr,
size_t decompressedBlockSize, QByteArray & decompressedBlock );
static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets );
// Convenience overload for UTF-8 encoded std::string articles: converts to
// QString, delegates to the QString overload (which takes its argument by
// non-const reference and returns it), and converts the result back to UTF-8.
static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets )
{
  QString text = QString::fromUtf8( article.c_str() );
  QByteArray const utf8 = substituteStylesheet( text, styleSheets ).toUtf8();
  return string( utf8.constData() );
}
protected:
MdictParser( char const * filename );
~MdictParser() {}
qint64 readNumber( QDataStream & in );
static quint32 readU8OrU16( QDataStream & in, bool isU16 );
bool readHeader( QDataStream & in );
@ -161,46 +213,6 @@ protected:
bool bruteForceEnd_;
};
// MdictParser specialisation whose readRecordBlock() hands every record to
// the handler as decoded article text, plus helpers for stylesheet
// substitution.
class MdxParser: public MdictParser
{
public:
  // Callback interface receiving one (head word, article) pair per record.
  // The method name's spelling is part of the existing interface.
  class ArticleHandler
  {
  public:
    // Polymorphic interface: allow safe destruction through a base pointer.
    virtual ~ArticleHandler() {}

    virtual void handleAritcle( QString const & headWord, QString const & article ) = 0;
  };

  MdxParser( const char * filename ): MdictParser( filename ) {}
  ~MdxParser() {}

  bool readRecordBlock( HeadWordIndex & headWordIndex, ArticleHandler & articleHandler );

  static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets );

  // UTF-8 std::string convenience wrapper around the QString overload.
  static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets )
  {
    QString s = QString::fromUtf8( article.c_str() );
    substituteStylesheet( s, styleSheets );
    return string( s.toUtf8().constData() );
  }
};
// MdictParser specialisation whose readRecordBlock() reports the location of
// every resource (block position and sizes, offset and size inside the
// decompressed block) rather than its contents.
class MddParser: public MdictParser
{
public:
  // Callback interface receiving the location of one resource per call.
  class ResourceHandler
  {
  public:
    // Polymorphic interface: allow safe destruction through a base pointer.
    virtual ~ResourceHandler() {}

    virtual void handleResource( QString const & fileName, quint32 decompressedBlockSize,
                                 quint32 compressedBlockPos, quint32 compressedBlockSize,
                                 quint32 resourceOffset, quint32 resourceSize ) = 0;
  };

  MddParser( const char * filename ) : MdictParser( filename ) {}
  ~MddParser() {}

  bool readRecordBlock( HeadWordIndex & headWordIndex, ResourceHandler & resourceHandler );
};
}
#endif // __MDICTPARSER_HH_INCLUDED__

179
mdx.cc
View file

@ -13,6 +13,7 @@
#include "langcoder.hh"
#include "fsencoding.hh"
#include "audiolink.hh"
#include "ex.hh"
#include "mdictparser.hh"
#include <map>
@ -49,22 +50,20 @@ using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo;
namespace
{
using namespace Mdict;
/// Checks if the given string ends with the given substring
bool endsWith( string const & str, string const & tail )
static bool endsWith( string const & str, string const & tail )
{
  // A tail longer than the string itself can never match.
  if ( str.size() < tail.size() )
    return false;

  // Compare everything from the would-be start of the tail to the end.
  string::size_type const tailStart = str.size() - tail.size();
  return str.compare( tailStart, string::npos, tail ) == 0;
}
}
enum
{
kSignature = 0x4349444d, // MDIC
kCurrentFormatVersion = 4 + BtreeIndexing::FormatVersion
kCurrentFormatVersion = 7 + BtreeIndexing::FormatVersion
};
struct IdxHeader
@ -104,15 +103,6 @@ __attribute__( ( packed ) )
#endif
;
// Location of one resource inside the .mdd file. Entries are written to and
// read back from the index chunks as raw bytes (addToBlock / memcpy with
// sizeof), so the order and types of the fields define the stored layout.
struct MddIndexEntry
{
// Size of the enclosing block after decompression
size_t decompressedBlockSize;
// File position the compressed block is read from
size_t compressedBlockPos;
// Number of compressed bytes to read at that position
size_t compressedBlockSize;
// Offset of the resource inside the decompressed block
size_t resourceOffset;
// Size of the resource, counted from resourceOffset
size_t resourceSize;
};
// A helper method to read resources from .mdd file
class IndexedMdd: public BtreeIndexing::BtreeIndex
{
@ -167,26 +157,27 @@ public:
if ( links.empty() )
return false;
MddIndexEntry indexEntry;
MdictParser::RecordInfo indexEntry;
vector< char > chunk;
Mutex::Lock _( idxMutex );
const char * indexEntryPtr = chunks.getBlock( links[ 0 ].articleOffset, chunk );
memcpy( &indexEntry, indexEntryPtr, sizeof( indexEntry ) );
ScopedMemMap compressed( mddFile, indexEntry.compressedBlockPos, indexEntry.compressedBlockSize );
if ( !compressed.startAddress() )
{
vector< char > chunk;
Mutex::Lock _( idxMutex );
const char * indexEntryPtr = chunks.getBlock( links[ 0 ].articleOffset, chunk );
memcpy( &indexEntry, indexEntryPtr, sizeof( indexEntry ) );
return false;
}
QByteArray decompressed;
mddFile.seek( indexEntry.compressedBlockPos );
QByteArray compressed = mddFile.read( indexEntry.compressedBlockSize );
if ( !MdictParser::parseCompressedBlock( compressed.size(), compressed.constData(),
if ( !MdictParser::parseCompressedBlock( indexEntry.compressedBlockSize, ( char * )compressed.startAddress(),
indexEntry.decompressedBlockSize, decompressed ) )
{
return false;
}
compressed.clear();
result.resize( indexEntry.resourceSize );
memcpy( &result.front(), decompressed.constData() + indexEntry.resourceOffset, indexEntry.resourceSize );
result.resize( indexEntry.recordSize );
memcpy( &result.front(), decompressed.constData() + indexEntry.recordOffset, indexEntry.recordSize );
return true;
}
@ -198,7 +189,9 @@ class MdxDictionary: public BtreeIndexing::BtreeDictionary
File::Class idx;
IdxHeader idxHeader;
string dictionaryName;
string encoding;
ChunkedStorage::Reader chunks;
QFile dictFile;
IndexedMdd mddResource;
MdictParser::StyleSheets styleSheets;
@ -263,7 +256,7 @@ private:
void doDeferredInit();
/// Loads an article with the given offset, filling the given strings.
void loadArticle( uint32_t offset, string & headword, string & articleText );
void loadArticle( uint32_t offset, string & articleText );
/// Process resource links (images, audios, etc)
string filterResource( const char * articleId, const char * article );
@ -283,14 +276,21 @@ MdxDictionary::MdxDictionary( string const & id, string const & indexFile,
mddResource( idxMutex, chunks ),
deferredInitRunnableStarted( false )
{
idx.seek( sizeof( idxHeader ) );
// Read the dictionary's name
idx.seek( sizeof( idxHeader ) );
size_t len = idx.read< uint32_t >();
vector< char > nameBuf( len );
idx.read( &nameBuf.front(), len );
vector< char > buf( len );
idx.read( &buf.front(), len );
dictionaryName = string( &buf.front(), len );
dictionaryName = string( &nameBuf.front(), len );
// then read the dictionary's encoding
len = idx.read< uint32_t >();
buf.resize( len );
idx.read( &buf.front(), len );
encoding = string( &buf.front(), len );
dictFile.setFileName( QString::fromUtf8( dictionaryFiles[ 0 ].c_str() ) );
dictFile.open( QIODevice::ReadOnly );
}
MdxDictionary::~MdxDictionary()
@ -300,6 +300,8 @@ MdxDictionary::~MdxDictionary()
// Wait for init runnable to complete if it was ever started
if ( deferredInitRunnableStarted )
deferredInitRunnableExited.acquire();
dictFile.close();
}
//////// MdxDictionary::deferredInit()
@ -530,10 +532,9 @@ void MdxArticleRequest::run()
continue; // We already have this article in the body.
// Grab that article
string headword;
string articleBody;
dict.loadArticle( chain[ x ].articleOffset, headword, articleBody );
dict.loadArticle( chain[ x ].articleOffset, articleBody );
if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
continue; // We already have this article in the body.
@ -700,8 +701,8 @@ void MddResourceRequest::run()
{
data.push_back( '\0' );
data.push_back( '\0' );
QString target = MdxParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ),
data.size() - sizeof( pattern ) );
QString target = MdictParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ),
data.size() - sizeof( pattern ) );
resourceName = gd::toWString( target.trimmed() );
continue;
}
@ -761,21 +762,52 @@ void MdxDictionary::loadIcon() throw()
dictionaryIconLoaded = true;
}
void MdxDictionary::loadArticle( uint32_t offset, string & headword, string & articleText )
DEF_EX( exCorruptDictionary, "dictionary file tampered or corrupted", std::exception )
void MdxDictionary::loadArticle( uint32_t offset, string & articleText )
{
vector< char > chunk;
Mutex::Lock _( idxMutex );
char * articleData = chunks.getBlock( offset, chunk );
// Load record info from index
MdictParser::RecordInfo recordInfo;
char * pRecordInfo = chunks.getBlock( offset, chunk );
memcpy( &recordInfo, pRecordInfo, sizeof( recordInfo ) );
// Make a unique sub-id for this article
QString articleId;
articleId.setNum( ( quint64 )articleData, 16 );
articleId.setNum( ( quint64 )pRecordInfo, 16 );
headword = articleData;
articleText = string( articleData + headword.size() + 1 );
articleText = MdxParser::substituteStylesheet( articleText, styleSheets );
articleText = filterResource( articleId.toLatin1().constData(), articleText.c_str() );
articleText = "Article loading error";
try
{
ScopedMemMap compressed( dictFile, recordInfo.compressedBlockPos, recordInfo.compressedBlockSize );
if ( !compressed.startAddress() )
throw exCorruptDictionary();
QByteArray decompressed;
if ( !MdictParser::parseCompressedBlock( recordInfo.compressedBlockSize, ( char * )compressed.startAddress(),
recordInfo.decompressedBlockSize, decompressed ) )
return;
QString article = MdictParser::toUtf16( encoding.c_str(),
decompressed.constData() + recordInfo.recordOffset,
recordInfo.recordSize );
article = MdictParser::substituteStylesheet( article, styleSheets );
articleText = filterResource( articleId.toLatin1().constData(), article.toUtf8().constData() );
}
catch ( std::exception & e )
{
FDPRINTF( stderr, "MDict: load article from %s failed, error: %s\n",
getDictionaryFilenames()[ 0 ].c_str(), e.what() );
}
catch ( ... )
{
FDPRINTF( stderr, "MDict: load article from %s failed, error: %s\n",
getDictionaryFilenames()[ 0 ].c_str(), "unknown error" );
}
}
string MdxDictionary::filterResource( const char * articleId, const char * article )
@ -820,36 +852,20 @@ static void addEntryToIndexSingle( QString const & word, uint32_t offset, Indexe
indexedWords.addSingleWord( gd::toWString( wordTrimmed ), offset );
}
class ArticleHandler: public MdxParser::ArticleHandler
class ArticleHandler: public MdictParser::RecordHandler
{
public:
ArticleHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ) :
chunks( chunks ),
indexedWords( indexedWords ),
articleCount_( 0 )
indexedWords( indexedWords )
{
}
inline size_t articleCount()
virtual void handleRecord( QString const & headWord, MdictParser::RecordInfo const & recordInfo )
{
return articleCount_;
}
void handleAritcle( QString const & headWord, QString const & article )
{
if ( !article.startsWith( "@@@LINK=" ) )
{
articleCount_++;
}
// Save the article's body itself first
// Save the article's record info
uint32_t articleAddress = chunks.startNewBlock();
string headWordU8 = string( headWord.toUtf8().constData() );
string articleU8 = string( article.toUtf8().constData() );
chunks.addToBlock( headWordU8.c_str(), headWordU8.size() + 1 );
chunks.addToBlock( articleU8.c_str(), articleU8.size() + 1 );
chunks.addToBlock( &recordInfo, sizeof( recordInfo ) );
// Add entries to the index
addEntryToIndex( headWord, articleAddress, indexedWords );
}
@ -857,10 +873,9 @@ public:
private:
ChunkedStorage::Writer & chunks;
IndexedWords & indexedWords;
size_t articleCount_;
};
class ResourceHandler: public MddParser::ResourceHandler
class ResourceHandler: public MdictParser::RecordHandler
{
public:
ResourceHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ):
@ -869,18 +884,10 @@ public:
{
}
void handleResource( QString const & fileName, quint32 decompressedBlockSize,
quint32 compressedBlockPos, quint32 compressedBlockSize,
quint32 resourceOffset, quint32 resourceSize )
virtual void handleRecord( QString const & fileName, MdictParser::RecordInfo const & recordInfo )
{
uint32_t resourceInfoAddress = chunks.startNewBlock();
MddIndexEntry mddIndexEntry;
mddIndexEntry.decompressedBlockSize = decompressedBlockSize;
mddIndexEntry.compressedBlockPos = compressedBlockPos;
mddIndexEntry.compressedBlockSize = compressedBlockSize;
mddIndexEntry.resourceOffset = resourceOffset;
mddIndexEntry.resourceSize = resourceSize;
chunks.addToBlock( &mddIndexEntry, sizeof( mddIndexEntry ) );
chunks.addToBlock( &recordInfo, sizeof( recordInfo ) );
// Add entries to the index
addEntryToIndexSingle( fileName, resourceInfoAddress, indexedWords );
}
@ -935,15 +942,15 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
indexIsOldOrBad( indexFile, !mddFileName.empty() ) )
{
// Building the index
MdxParser parser( i->c_str() );
sptr<MddParser> mddParser = NULL;
MdictParser parser( i->c_str() );
sptr<MdictParser> mddParser = NULL;
if ( !parser.open() )
continue;
if ( File::exists( mddFileName ) )
{
mddParser = new MddParser( mddFileName.c_str() );
mddParser = new MdictParser( mddFileName.c_str() );
if ( !mddParser->open() )
{
FDPRINTF( stderr, "Warning: Invalid mdd (resource) file: %s\n", mddFileName.c_str() );
@ -960,9 +967,18 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
// We write a dummy header first. At the end of the process the header
// will be rewritten with the right values.
idx.write( idxHeader );
// Write the title first
idx.write< uint32_t >( title.size() );
idx.write( title.data(), title.size() );
// then the encoding
{
string encoding = string( parser.encoding().toUtf8().constData() );
idx.write< uint32_t >( encoding.size() );
idx.write( encoding.data(), encoding.size() );
}
// This is our index data that we accumulate during the loading process.
// For each new word encountered, we emit the article's body to the file
// immediately, inserting the word itself and its offset in this map.
@ -976,10 +992,9 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
// Save dictionary description if there's one
{
string description = string( parser.description().toUtf8().constData() );
idxHeader.descriptionSize = 0;
idxHeader.descriptionAddress = chunks.startNewBlock();
chunks.addToBlock( description.c_str(), description.size() + 1 );
idxHeader.descriptionSize += description.size() + 1;
idxHeader.descriptionSize = description.size() + 1;
}
ArticleHandler articleHandler( chunks, indexedWords );
@ -1062,7 +1077,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
idxHeader.formatVersion = kCurrentFormatVersion;
idxHeader.parserVersion = MdictParser::kParserVersion;
idxHeader.foldingVersion = Folding::Version;
idxHeader.articleCount = articleHandler.articleCount();
idxHeader.articleCount = parser.wordCount();
idxHeader.wordCount = parser.wordCount();
idx.rewind();