Merge pull request #278 from timonwong/mdict_large_file

MDict: Support large dictionary files; indexing speed improvement
This commit is contained in:
Tvangeste 2013-04-28 02:31:32 -07:00
commit fd45873205
3 changed files with 206 additions and 195 deletions

View file

@ -38,6 +38,9 @@
#include "decompress.hh" #include "decompress.hh"
namespace Mdict
{
static inline int u16StrSize( const ushort * unicode ) static inline int u16StrSize( const ushort * unicode )
{ {
int size = 0; int size = 0;
@ -141,10 +144,12 @@ bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIn
headWordIndex.clear(); headWordIndex.clear();
file_->seek( headWordPos_ ); ScopedMemMap mapping( *file_, headWordPos_, headWordBlockSize_ );
QByteArray data = file_->read( headWordBlockSize_ ); if ( !mapping.startAddress() )
const char * pDataStart = data.constData(); return false;
const char * pDataEnd = pDataStart + data.size();
const char * pDataStart = ( const char * )mapping.startAddress();
const char * pDataEnd = pDataStart + headWordBlockSize_;
const char pattern[] = {0x02, 0x00, 0x00, 0x00}; const char pattern[] = {0x02, 0x00, 0x00, 0x00};
const char * patternBegin = pattern; const char * patternBegin = pattern;
const char * patternEnd = pattern + 4; const char * patternEnd = pattern + 4;
@ -168,17 +173,20 @@ bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIn
if ( headWordBlockInfosIter_ == headWordBlockInfos_.end() ) if ( headWordBlockInfosIter_ == headWordBlockInfos_.end() )
return false; return false;
file_->seek( headWordPos_ );
qint64 compressedSize = headWordBlockInfosIter_->first; qint64 compressedSize = headWordBlockInfosIter_->first;
qint64 decompressedSize = headWordBlockInfosIter_->second; qint64 decompressedSize = headWordBlockInfosIter_->second;
if ( compressedSize < 8 ) if ( compressedSize < 8 )
return false; return false;
QByteArray compressed = file_->read( compressedSize ); ScopedMemMap compressed( *file_, headWordPos_, compressedSize );
headWordPos_ = file_->pos(); if ( !compressed.startAddress() )
return false;
headWordPos_ += compressedSize;
QByteArray decompressed; QByteArray decompressed;
if ( !parseCompressedBlock( compressedSize, compressed, decompressedSize, decompressed ) ) if ( !parseCompressedBlock( compressedSize, ( char * )compressed.startAddress(),
decompressedSize, decompressed ) )
return false; return false;
headWordIndex = splitHeadWordBlock( decompressed ); headWordIndex = splitHeadWordBlock( decompressed );
@ -381,7 +389,7 @@ bool MdictParser::readHeader( QDataStream & in )
// Read metadata // Read metadata
rtl_ = headerAttributes.namedItem( "Left2Right" ).toAttr().value() != "Yes"; rtl_ = headerAttributes.namedItem( "Left2Right" ).toAttr().value() != "Yes";
QString title = headerAttributes.namedItem( "Title" ).toAttr().value(); QString title = headerAttributes.namedItem( "Title" ).toAttr().value();
if ( title == "Title (No HTML code allowed)" ) if ( title.isEmpty() || title.length() < 5 || title == "Title (No HTML code allowed)" )
{ {
// Use filename instead // Use filename instead
QFileInfo fi( filename_ ); QFileInfo fi( filename_ );
@ -587,49 +595,42 @@ MdictParser::HeadWordIndex MdictParser::splitHeadWordBlock( QByteArray const & b
return index; return index;
} }
bool MdxParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex, bool MdictParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex,
MdxParser::ArticleHandler & articleHandler ) MdictParser::RecordHandler & recordHandler )
{ {
size_t prevIdx = ( size_t ) ( -1 ); // cache the index, the headWordIndex is already sorted
QByteArray decompressed; size_t idx = 0;
for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); i++ ) for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); i++ )
{ {
size_t idx = RecordIndex::bsearch( recordBlockInfos_, i->first ); if ( recordBlockInfos_[idx].endPos <= i->first )
RecordIndex const & recordIndex = recordBlockInfos_[idx]; idx = RecordIndex::bsearch( recordBlockInfos_, i->first );
if ( idx == ( size_t )( -1 ) ) if ( idx == ( size_t )( -1 ) )
return false; return false;
// Reload if index changes RecordIndex const & recordIndex = recordBlockInfos_[idx];
if ( prevIdx != idx )
{
prevIdx = idx;
file_->seek( recordPos_ + recordIndex.startPos );
QByteArray compressed;
compressed.resize( recordIndex.compressedSize );
file_->read( compressed.data(), recordIndex.compressedSize );
if ( !parseCompressedBlock( recordIndex.compressedSize, compressed,
recordIndex.decompressedSize, decompressed ) )
return false;
}
HeadWordIndex::const_iterator iNext = i + 1; HeadWordIndex::const_iterator iNext = i + 1;
size_t articleSize; size_t recordSize;
if ( iNext == headWordIndex.end() ) if ( iNext == headWordIndex.end() )
articleSize = recordIndex.shadowEndPos - i->first; recordSize = recordIndex.shadowEndPos - i->first;
else else
articleSize = iNext->first - i->first; recordSize = iNext->first - i->first;
QString article = toUtf16( encoding_, decompressed.constData() + i->first - recordIndex.shadowStartPos, articleSize );
articleHandler.handleAritcle( i->second, article ); RecordInfo recordInfo;
recordInfo.compressedBlockPos = recordPos_ + recordIndex.startPos;
recordInfo.recordOffset = i->first - recordIndex.shadowStartPos;
recordInfo.decompressedBlockSize = recordIndex.decompressedSize;
recordInfo.compressedBlockSize = recordIndex.compressedSize;
recordInfo.recordSize = recordSize;
recordHandler.handleRecord( i->second, recordInfo );
} }
return true; return true;
} }
QString & MdxParser::substituteStylesheet( QString & article, MdxParser::StyleSheets const & styleSheets ) QString & MdictParser::substituteStylesheet( QString & article, MdictParser::StyleSheets const & styleSheets )
{ {
QRegExp rx( "`(\\d+)`" ); QRegExp rx( "`(\\d+)`" );
QString endStyle; QString endStyle;
@ -658,28 +659,4 @@ QString & MdxParser::substituteStylesheet( QString & article, MdxParser::StyleSh
return article; return article;
} }
bool MddParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex,
MddParser::ResourceHandler & resourceHandler )
{
for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); i++ )
{
size_t idx = RecordIndex::bsearch( recordBlockInfos_, i->first );
RecordIndex const & recordIndex = recordBlockInfos_[idx];
if ( idx == ( size_t )( -1 ) )
return false;
HeadWordIndex::const_iterator iNext = i + 1;
size_t resourceSize;
if ( iNext == headWordIndex.end() )
resourceSize = recordIndex.shadowEndPos - i->first;
else
resourceSize = iNext->first - i->first;
resourceHandler.handleResource( i->second, recordIndex.decompressedSize,
recordPos_ + recordIndex.startPos, recordIndex.compressedSize,
i->first - recordIndex.shadowStartPos, resourceSize );
}
return true;
} }

View file

@ -27,18 +27,46 @@
#include <QPointer> #include <QPointer>
#include <QFile> #include <QFile>
namespace Mdict
{
using std::string; using std::string;
using std::vector; using std::vector;
using std::pair; using std::pair;
using std::map; using std::map;
/// A helper class that maps a region of a QFile into memory for the lifetime
/// of the object.
///
/// The constructor calls QFile::map( offset, size ) on the given file and the
/// destructor calls QFile::unmap() on the returned address, if any. When the
/// mapping fails startAddress() returns a null pointer, so callers must check
/// it before dereferencing. The QFile is held by reference and therefore has
/// to outlive this object.
class ScopedMemMap
{
  QFile & file;
  uchar * address;

  // Non-copyable: a copy would carry the same address and unmap it a second
  // time from its own destructor. Declared private and intentionally left
  // undefined (C++03 idiom).
  ScopedMemMap( ScopedMemMap const & );
  ScopedMemMap & operator = ( ScopedMemMap const & );

public:
  /// Maps `size` bytes of `file` starting at byte `offset`.
  ScopedMemMap( QFile & file, qint64 offset, qint64 size ) :
    file( file ),
    address( file.map( offset, size ) )
  {
  }

  /// Releases the mapping if one was established.
  ~ScopedMemMap()
  {
    if ( address )
      file.unmap( address );
  }

  /// Returns the first byte of the mapped region, or a null pointer if the
  /// mapping could not be created.
  inline uchar * startAddress() const
  {
    return address;
  }
};
class MdictParser class MdictParser
{ {
public: public:
enum enum
{ {
kParserVersion = 0x0000009 kParserVersion = 0x000000b
}; };
struct RecordIndex struct RecordIndex
@ -68,6 +96,22 @@ public:
static size_t bsearch( vector<RecordIndex> const & offsets, qint64 val ); static size_t bsearch( vector<RecordIndex> const & offsets, qint64 val );
}; };
// Describes where one record (an article or a resource) lives inside the
// dictionary file. readRecordBlock() fills one of these per head word and
// passes it to RecordHandler::handleRecord().
//
// The handlers in mdx.cc store this struct as raw bytes
// ( chunks.addToBlock( &recordInfo, sizeof( recordInfo ) ) ) and read it back
// with memcpy, so the order and types of the fields determine the stored
// layout.
// NOTE(review): because of the size_t members the stored layout presumably
// differs between 32-bit and 64-bit builds -- confirm whether indexes are
// expected to be portable across them.
struct RecordInfo
{
// Position in the file of the compressed block containing the record
// (set to recordPos_ + recordIndex.startPos).
qint64 compressedBlockPos;
// Offset of the record inside the decompressed block
// (set to i->first - recordIndex.shadowStartPos).
qint64 recordOffset;
// Size of the block after decompression (recordIndex.decompressedSize).
size_t decompressedBlockSize;
// Size of the block as stored in the file (recordIndex.compressedSize).
size_t compressedBlockSize;
// Length of the record within the decompressed block: the distance to the
// next head word's offset, or to the block's shadowEndPos for the last one.
size_t recordSize;
};
/// Callback interface used by readRecordBlock() to report records.
///
/// readRecordBlock() invokes handleRecord() once for every entry of the head
/// word index it is given; implementations decide what to do with the record
/// location (the handlers in mdx.cc store it in the index chunks).
class RecordHandler
{
public:
  // Virtual destructor so that a handler deleted through a RecordHandler
  // pointer runs the derived class's destructor.
  virtual ~RecordHandler() {}

  /// @param name        the head word of the entry (a resource file name
  ///                    when an .mdd file is being parsed)
  /// @param recordInfo  where the record's data is located in the file
  virtual void handleRecord( QString const & name, RecordInfo const & recordInfo ) = 0;
};
typedef vector< pair<qint64, qint64> > BlockInfoVector; typedef vector< pair<qint64, qint64> > BlockInfoVector;
typedef vector< pair<qint64, QString> > HeadWordIndex; typedef vector< pair<qint64, QString> > HeadWordIndex;
typedef map<int, pair<QString, QString> > StyleSheets; typedef map<int, pair<QString, QString> > StyleSheets;
@ -107,9 +151,13 @@ public:
return rtl_; return rtl_;
} }
MdictParser( char const * filename );
~MdictParser() {}
bool open(); bool open();
void close(); void close();
bool readNextHeadWordIndex( HeadWordIndex & headWordIndex ); bool readNextHeadWordIndex( HeadWordIndex & headWordIndex );
bool readRecordBlock( HeadWordIndex & headWordIndex, RecordHandler & recordHandler );
// helpers // helpers
static QString toUtf16( const char * fromCode, const char * from, size_t fromSize ); static QString toUtf16( const char * fromCode, const char * from, size_t fromSize );
@ -120,11 +168,15 @@ public:
static bool parseCompressedBlock( size_t compressedBlockSize, const char * compressedBlockPtr, static bool parseCompressedBlock( size_t compressedBlockSize, const char * compressedBlockPtr,
size_t decompressedBlockSize, QByteArray & decompressedBlock ); size_t decompressedBlockSize, QByteArray & decompressedBlock );
static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets );
static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets )
{
QString s = QString::fromUtf8( article.c_str() );
substituteStylesheet( s, styleSheets );
return string( s.toUtf8().constData() );
}
protected: protected:
MdictParser( char const * filename );
~MdictParser() {}
qint64 readNumber( QDataStream & in ); qint64 readNumber( QDataStream & in );
static quint32 readU8OrU16( QDataStream & in, bool isU16 ); static quint32 readU8OrU16( QDataStream & in, bool isU16 );
bool readHeader( QDataStream & in ); bool readHeader( QDataStream & in );
@ -161,46 +213,6 @@ protected:
bool bruteForceEnd_; bool bruteForceEnd_;
}; };
class MdxParser: public MdictParser
{
public:
class ArticleHandler
{
public:
virtual void handleAritcle( QString const & headWord, QString const & article ) = 0;
};
MdxParser( const char * filename ): MdictParser( filename ) {}
~MdxParser() {}
bool readRecordBlock( HeadWordIndex & headWordIndex, ArticleHandler & articleHandler );
static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets );
static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets )
{
QString s = QString::fromUtf8( article.c_str() );
substituteStylesheet( s, styleSheets );
return string( s.toUtf8().constData() );
} }
};
class MddParser: public MdictParser
{
public:
class ResourceHandler
{
public:
virtual void handleResource( QString const & fileName, quint32 decompressedBlockSize,
quint32 compressedBlockPos, quint32 compressedBlockSize,
quint32 resourceOffset, quint32 resourceSize ) = 0;
};
MddParser( const char * filename ) : MdictParser( filename ) {}
~MddParser() {}
bool readRecordBlock( HeadWordIndex & headWordIndex, ResourceHandler & resourceHandler );
private:
};
#endif // __MDICTPARSER_HH_INCLUDED__ #endif // __MDICTPARSER_HH_INCLUDED__

188
mdx.cc
View file

@ -13,6 +13,7 @@
#include "langcoder.hh" #include "langcoder.hh"
#include "fsencoding.hh" #include "fsencoding.hh"
#include "audiolink.hh" #include "audiolink.hh"
#include "ex.hh"
#include "mdictparser.hh" #include "mdictparser.hh"
#include <map> #include <map>
@ -49,22 +50,20 @@ using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords; using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo; using BtreeIndexing::IndexInfo;
namespace using namespace Mdict;
{
/// Checks if the given string ends with the given substring /// Checks if the given string ends with the given substring
bool endsWith( string const & str, string const & tail ) static bool endsWith( string const & str, string const & tail )
{ {
return str.size() >= tail.size() && return str.size() >= tail.size() &&
str.compare( str.size() - tail.size(), tail.size(), tail ) == 0; str.compare( str.size() - tail.size(), tail.size(), tail ) == 0;
} }
}
enum enum
{ {
kSignature = 0x4349444d, // MDIC kSignature = 0x4349444d, // MDIC
kCurrentFormatVersion = 4 + BtreeIndexing::FormatVersion kCurrentFormatVersion = 7 + BtreeIndexing::FormatVersion
}; };
struct IdxHeader struct IdxHeader
@ -104,15 +103,6 @@ __attribute__( ( packed ) )
#endif #endif
; ;
struct MddIndexEntry
{
size_t decompressedBlockSize;
size_t compressedBlockPos;
size_t compressedBlockSize;
size_t resourceOffset;
size_t resourceSize;
};
// A helper method to read resources from .mdd file // A helper method to read resources from .mdd file
class IndexedMdd: public BtreeIndexing::BtreeIndex class IndexedMdd: public BtreeIndexing::BtreeIndex
{ {
@ -167,26 +157,27 @@ public:
if ( links.empty() ) if ( links.empty() )
return false; return false;
MddIndexEntry indexEntry; MdictParser::RecordInfo indexEntry;
{
vector< char > chunk; vector< char > chunk;
Mutex::Lock _( idxMutex ); Mutex::Lock _( idxMutex );
const char * indexEntryPtr = chunks.getBlock( links[ 0 ].articleOffset, chunk ); const char * indexEntryPtr = chunks.getBlock( links[ 0 ].articleOffset, chunk );
memcpy( &indexEntry, indexEntryPtr, sizeof( indexEntry ) ); memcpy( &indexEntry, indexEntryPtr, sizeof( indexEntry ) );
ScopedMemMap compressed( mddFile, indexEntry.compressedBlockPos, indexEntry.compressedBlockSize );
if ( !compressed.startAddress() )
{
return false;
} }
QByteArray decompressed; QByteArray decompressed;
mddFile.seek( indexEntry.compressedBlockPos ); if ( !MdictParser::parseCompressedBlock( indexEntry.compressedBlockSize, ( char * )compressed.startAddress(),
QByteArray compressed = mddFile.read( indexEntry.compressedBlockSize );
if ( !MdictParser::parseCompressedBlock( compressed.size(), compressed.constData(),
indexEntry.decompressedBlockSize, decompressed ) ) indexEntry.decompressedBlockSize, decompressed ) )
{ {
return false; return false;
} }
compressed.clear(); result.resize( indexEntry.recordSize );
result.resize( indexEntry.resourceSize ); memcpy( &result.front(), decompressed.constData() + indexEntry.recordOffset, indexEntry.recordSize );
memcpy( &result.front(), decompressed.constData() + indexEntry.resourceOffset, indexEntry.resourceSize );
return true; return true;
} }
@ -198,7 +189,9 @@ class MdxDictionary: public BtreeIndexing::BtreeDictionary
File::Class idx; File::Class idx;
IdxHeader idxHeader; IdxHeader idxHeader;
string dictionaryName; string dictionaryName;
string encoding;
ChunkedStorage::Reader chunks; ChunkedStorage::Reader chunks;
QFile dictFile;
IndexedMdd mddResource; IndexedMdd mddResource;
MdictParser::StyleSheets styleSheets; MdictParser::StyleSheets styleSheets;
@ -263,7 +256,7 @@ private:
void doDeferredInit(); void doDeferredInit();
/// Loads an article with the given offset, filling the given strings. /// Loads an article with the given offset, filling the given strings.
void loadArticle( uint32_t offset, string & headword, string & articleText ); void loadArticle( uint32_t offset, string & articleText );
/// Process resource links (images, audios, etc) /// Process resource links (images, audios, etc)
string filterResource( const char * articleId, const char * article ); string filterResource( const char * articleId, const char * article );
@ -283,14 +276,21 @@ MdxDictionary::MdxDictionary( string const & id, string const & indexFile,
mddResource( idxMutex, chunks ), mddResource( idxMutex, chunks ),
deferredInitRunnableStarted( false ) deferredInitRunnableStarted( false )
{ {
idx.seek( sizeof( idxHeader ) );
// Read the dictionary's name // Read the dictionary's name
idx.seek( sizeof( idxHeader ) );
size_t len = idx.read< uint32_t >(); size_t len = idx.read< uint32_t >();
vector< char > nameBuf( len ); vector< char > buf( len );
idx.read( &nameBuf.front(), len ); idx.read( &buf.front(), len );
dictionaryName = string( &buf.front(), len );
dictionaryName = string( &nameBuf.front(), len ); // then read the dictionary's encoding
len = idx.read< uint32_t >();
buf.resize( len );
idx.read( &buf.front(), len );
encoding = string( &buf.front(), len );
dictFile.setFileName( QString::fromUtf8( dictionaryFiles[ 0 ].c_str() ) );
dictFile.open( QIODevice::ReadOnly );
} }
MdxDictionary::~MdxDictionary() MdxDictionary::~MdxDictionary()
@ -300,6 +300,8 @@ MdxDictionary::~MdxDictionary()
// Wait for init runnable to complete if it was ever started // Wait for init runnable to complete if it was ever started
if ( deferredInitRunnableStarted ) if ( deferredInitRunnableStarted )
deferredInitRunnableExited.acquire(); deferredInitRunnableExited.acquire();
dictFile.close();
} }
//////// MdxDictionary::deferredInit() //////// MdxDictionary::deferredInit()
@ -530,10 +532,9 @@ void MdxArticleRequest::run()
continue; // We already have this article in the body. continue; // We already have this article in the body.
// Grab that article // Grab that article
string headword;
string articleBody; string articleBody;
dict.loadArticle( chain[ x ].articleOffset, headword, articleBody ); dict.loadArticle( chain[ x ].articleOffset, articleBody );
if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() ) if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
continue; // We already have this article in the body. continue; // We already have this article in the body.
@ -700,7 +701,7 @@ void MddResourceRequest::run()
{ {
data.push_back( '\0' ); data.push_back( '\0' );
data.push_back( '\0' ); data.push_back( '\0' );
QString target = MdxParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ), QString target = MdictParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ),
data.size() - sizeof( pattern ) ); data.size() - sizeof( pattern ) );
resourceName = gd::toWString( target.trimmed() ); resourceName = gd::toWString( target.trimmed() );
continue; continue;
@ -761,25 +762,57 @@ void MdxDictionary::loadIcon() throw()
dictionaryIconLoaded = true; dictionaryIconLoaded = true;
} }
void MdxDictionary::loadArticle( uint32_t offset, string & headword, string & articleText ) DEF_EX( exCorruptDictionary, "dictionary file tampered or corrupted", std::exception )
void MdxDictionary::loadArticle( uint32_t offset, string & articleText )
{ {
vector< char > chunk; vector< char > chunk;
Mutex::Lock _( idxMutex ); Mutex::Lock _( idxMutex );
char * articleData = chunks.getBlock( offset, chunk ); // Load record info from index
MdictParser::RecordInfo recordInfo;
char * pRecordInfo = chunks.getBlock( offset, chunk );
memcpy( &recordInfo, pRecordInfo, sizeof( recordInfo ) );
// Make an sub unique id for this article // Make an sub unique id for this article
QString articleId; QString articleId;
articleId.setNum( ( quint64 )articleData, 16 ); articleId.setNum( ( quint64 )pRecordInfo, 16 );
headword = articleData; articleText = string( QObject::tr( "Article loading error" ).toUtf8().constData() );
articleText = string( articleData + headword.size() + 1 );
articleText = MdxParser::substituteStylesheet( articleText, styleSheets ); try
articleText = filterResource( articleId.toLatin1().constData(), articleText.c_str() ); {
ScopedMemMap compressed( dictFile, recordInfo.compressedBlockPos, recordInfo.compressedBlockSize );
if ( !compressed.startAddress() )
throw exCorruptDictionary();
QByteArray decompressed;
if ( !MdictParser::parseCompressedBlock( recordInfo.compressedBlockSize, ( char * )compressed.startAddress(),
recordInfo.decompressedBlockSize, decompressed ) )
return;
QString article = MdictParser::toUtf16( encoding.c_str(),
decompressed.constData() + recordInfo.recordOffset,
recordInfo.recordSize );
article = MdictParser::substituteStylesheet( article, styleSheets );
articleText = filterResource( articleId.toLatin1().constData(), article.toUtf8().constData() );
}
catch ( std::exception & e )
{
FDPRINTF( stderr, "MDict: load article from %s failed, error: %s\n",
getDictionaryFilenames()[ 0 ].c_str(), e.what() );
}
catch ( ... )
{
FDPRINTF( stderr, "MDict: load article from %s failed, error: %s\n",
getDictionaryFilenames()[ 0 ].c_str(), "unknown error" );
}
} }
string MdxDictionary::filterResource( const char * articleId, const char * article ) string MdxDictionary::filterResource( const char * articleId, const char * article )
{ {
QString id = QString::fromStdString( getId() );
QString uniquePrefix = QString::fromStdString( getId() + "_" + articleId + "_" ); QString uniquePrefix = QString::fromStdString( getId() + "_" + articleId + "_" );
return string( QString::fromUtf8( article ) return string( QString::fromUtf8( article )
@ -794,15 +827,21 @@ string MdxDictionary::filterResource( const char * articleId, const char * artic
// sounds, and audio link script // sounds, and audio link script
.replace( QRegExp( "(<\\s*a\\s+[^>]*href\\s*=\\s*\")sound://([^\"']*)", Qt::CaseInsensitive ), .replace( QRegExp( "(<\\s*a\\s+[^>]*href\\s*=\\s*\")sound://([^\"']*)", Qt::CaseInsensitive ),
QString::fromStdString( addAudioLink( "\"gdau://" + getId() + "/\\2\"", getId() ) ) + QString::fromStdString( addAudioLink( "\"gdau://" + getId() + "/\\2\"", getId() ) ) +
"\\1gdau://" + QString::fromStdString( getId() ) + "/\\2" ) "\\1gdau://" + id + "/\\2" )
// stylesheets // stylesheets
.replace( QRegExp( "(<\\s*link\\s+[^>]*href\\s*=\\s*[\"']+)(file://)?[\\x00-\\x30\\x7f]*([^\"']*)", .replace( QRegExp( "(<\\s*link\\s+[^>]*href\\s*=\\s*[\"']+)(?:file://)?[\\x00-\\x30\\x7f]*([^\"']*)",
Qt::CaseInsensitive, QRegExp::RegExp2 ), Qt::CaseInsensitive, QRegExp::RegExp2 ),
"\\1bres://" + QString::fromStdString( getId() ) + "/\\3" ) "\\1bres://" + id + "/\\2" )
.replace( QRegExp( "(<\\s*link\\s+[^>]*href\\s*=\\s*)(?!['\"]+)(?!bres:|data:)(?:file://)?([^\\s>]+)",
Qt::CaseInsensitive, QRegExp::RegExp2 ),
"\\1\"bres://" + id + "/\\\"" )
// images // images
.replace( QRegExp( "(<\\s*img\\s+[^>]*src\\s*=\\s*[\"']+)(file://)?[\\x00-\\x30\\x7f]*([^\"']*)", .replace( QRegExp( "(<\\s*img\\s+[^>]*src\\s*=\\s*[\"']+)(?:file://)?[\\x00-\\x30\\x7f]*([^\"']*)",
Qt::CaseInsensitive, QRegExp::RegExp2 ), Qt::CaseInsensitive, QRegExp::RegExp2 ),
"\\1bres://" + QString::fromStdString( getId() ) + "/\\3" ) "\\1bres://" + id + "/\\2" )
.replace( QRegExp( "(<\\s*img\\s+[^>]*src\\s*=\\s*)(?!['\"]+)(?!bres:|data:)(?:file://)?([^\\s>]+)",
Qt::CaseInsensitive, QRegExp::RegExp2 ),
"\\1\"bres://" + id + "/\\2\"" )
.toUtf8().constData() ); .toUtf8().constData() );
} }
@ -820,36 +859,20 @@ static void addEntryToIndexSingle( QString const & word, uint32_t offset, Indexe
indexedWords.addSingleWord( gd::toWString( wordTrimmed ), offset ); indexedWords.addSingleWord( gd::toWString( wordTrimmed ), offset );
} }
class ArticleHandler: public MdxParser::ArticleHandler class ArticleHandler: public MdictParser::RecordHandler
{ {
public: public:
ArticleHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ) : ArticleHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ) :
chunks( chunks ), chunks( chunks ),
indexedWords( indexedWords ), indexedWords( indexedWords )
articleCount_( 0 )
{ {
} }
inline size_t articleCount() virtual void handleRecord( QString const & headWord, MdictParser::RecordInfo const & recordInfo )
{ {
return articleCount_; // Save the article's record info
}
void handleAritcle( QString const & headWord, QString const & article )
{
if ( !article.startsWith( "@@@LINK=" ) )
{
articleCount_++;
}
// Save the article's body itself first
uint32_t articleAddress = chunks.startNewBlock(); uint32_t articleAddress = chunks.startNewBlock();
string headWordU8 = string( headWord.toUtf8().constData() ); chunks.addToBlock( &recordInfo, sizeof( recordInfo ) );
string articleU8 = string( article.toUtf8().constData() );
chunks.addToBlock( headWordU8.c_str(), headWordU8.size() + 1 );
chunks.addToBlock( articleU8.c_str(), articleU8.size() + 1 );
// Add entries to the index // Add entries to the index
addEntryToIndex( headWord, articleAddress, indexedWords ); addEntryToIndex( headWord, articleAddress, indexedWords );
} }
@ -857,10 +880,9 @@ public:
private: private:
ChunkedStorage::Writer & chunks; ChunkedStorage::Writer & chunks;
IndexedWords & indexedWords; IndexedWords & indexedWords;
size_t articleCount_;
}; };
class ResourceHandler: public MddParser::ResourceHandler class ResourceHandler: public MdictParser::RecordHandler
{ {
public: public:
ResourceHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ): ResourceHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ):
@ -869,18 +891,10 @@ public:
{ {
} }
void handleResource( QString const & fileName, quint32 decompressedBlockSize, virtual void handleRecord( QString const & fileName, MdictParser::RecordInfo const & recordInfo )
quint32 compressedBlockPos, quint32 compressedBlockSize,
quint32 resourceOffset, quint32 resourceSize )
{ {
uint32_t resourceInfoAddress = chunks.startNewBlock(); uint32_t resourceInfoAddress = chunks.startNewBlock();
MddIndexEntry mddIndexEntry; chunks.addToBlock( &recordInfo, sizeof( recordInfo ) );
mddIndexEntry.decompressedBlockSize = decompressedBlockSize;
mddIndexEntry.compressedBlockPos = compressedBlockPos;
mddIndexEntry.compressedBlockSize = compressedBlockSize;
mddIndexEntry.resourceOffset = resourceOffset;
mddIndexEntry.resourceSize = resourceSize;
chunks.addToBlock( &mddIndexEntry, sizeof( mddIndexEntry ) );
// Add entries to the index // Add entries to the index
addEntryToIndexSingle( fileName, resourceInfoAddress, indexedWords ); addEntryToIndexSingle( fileName, resourceInfoAddress, indexedWords );
} }
@ -935,15 +949,15 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
indexIsOldOrBad( indexFile, !mddFileName.empty() ) ) indexIsOldOrBad( indexFile, !mddFileName.empty() ) )
{ {
// Building the index // Building the index
MdxParser parser( i->c_str() ); MdictParser parser( i->c_str() );
sptr<MddParser> mddParser = NULL; sptr<MdictParser> mddParser = NULL;
if ( !parser.open() ) if ( !parser.open() )
continue; continue;
if ( File::exists( mddFileName ) ) if ( File::exists( mddFileName ) )
{ {
mddParser = new MddParser( mddFileName.c_str() ); mddParser = new MdictParser( mddFileName.c_str() );
if ( !mddParser->open() ) if ( !mddParser->open() )
{ {
FDPRINTF( stderr, "Warning: Invalid mdd (resource) file: %s\n", mddFileName.c_str() ); FDPRINTF( stderr, "Warning: Invalid mdd (resource) file: %s\n", mddFileName.c_str() );
@ -960,9 +974,18 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
// We write a dummy header first. At the end of the process the header // We write a dummy header first. At the end of the process the header
// will be rewritten with the right values. // will be rewritten with the right values.
idx.write( idxHeader ); idx.write( idxHeader );
// Write the title first
idx.write< uint32_t >( title.size() ); idx.write< uint32_t >( title.size() );
idx.write( title.data(), title.size() ); idx.write( title.data(), title.size() );
// then the encoding
{
string encoding = string( parser.encoding().toUtf8().constData() );
idx.write< uint32_t >( encoding.size() );
idx.write( encoding.data(), encoding.size() );
}
// This is our index data that we accumulate during the loading process. // This is our index data that we accumulate during the loading process.
// For each new word encountered, we emit the article's body to the file // For each new word encountered, we emit the article's body to the file
// immediately, inserting the word itself and its offset in this map. // immediately, inserting the word itself and its offset in this map.
@ -976,10 +999,9 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
// Save dictionary description if there's one // Save dictionary description if there's one
{ {
string description = string( parser.description().toUtf8().constData() ); string description = string( parser.description().toUtf8().constData() );
idxHeader.descriptionSize = 0;
idxHeader.descriptionAddress = chunks.startNewBlock(); idxHeader.descriptionAddress = chunks.startNewBlock();
chunks.addToBlock( description.c_str(), description.size() + 1 ); chunks.addToBlock( description.c_str(), description.size() + 1 );
idxHeader.descriptionSize += description.size() + 1; idxHeader.descriptionSize = description.size() + 1;
} }
ArticleHandler articleHandler( chunks, indexedWords ); ArticleHandler articleHandler( chunks, indexedWords );
@ -1062,7 +1084,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
idxHeader.formatVersion = kCurrentFormatVersion; idxHeader.formatVersion = kCurrentFormatVersion;
idxHeader.parserVersion = MdictParser::kParserVersion; idxHeader.parserVersion = MdictParser::kParserVersion;
idxHeader.foldingVersion = Folding::Version; idxHeader.foldingVersion = Folding::Version;
idxHeader.articleCount = articleHandler.articleCount(); idxHeader.articleCount = parser.wordCount();
idxHeader.wordCount = parser.wordCount(); idxHeader.wordCount = parser.wordCount();
idx.rewind(); idx.rewind();