mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-24 04:24:09 +00:00
Support large dictionary files; indexing speed improvement
This commit is contained in:
parent
1e047df679
commit
0101f52abd
|
@ -38,6 +38,9 @@
|
|||
|
||||
#include "decompress.hh"
|
||||
|
||||
namespace Mdict
|
||||
{
|
||||
|
||||
static inline int u16StrSize( const ushort * unicode )
|
||||
{
|
||||
int size = 0;
|
||||
|
@ -141,10 +144,12 @@ bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIn
|
|||
|
||||
headWordIndex.clear();
|
||||
|
||||
file_->seek( headWordPos_ );
|
||||
QByteArray data = file_->read( headWordBlockSize_ );
|
||||
const char * pDataStart = data.constData();
|
||||
const char * pDataEnd = pDataStart + data.size();
|
||||
ScopedMemMap mapping( *file_, headWordPos_, headWordBlockSize_ );
|
||||
if ( !mapping.startAddress() )
|
||||
return false;
|
||||
|
||||
const char * pDataStart = ( const char * )mapping.startAddress();
|
||||
const char * pDataEnd = pDataStart + headWordBlockSize_;
|
||||
const char pattern[] = {0x02, 0x00, 0x00, 0x00};
|
||||
const char * patternBegin = pattern;
|
||||
const char * patternEnd = pattern + 4;
|
||||
|
@ -168,17 +173,20 @@ bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIn
|
|||
if ( headWordBlockInfosIter_ == headWordBlockInfos_.end() )
|
||||
return false;
|
||||
|
||||
file_->seek( headWordPos_ );
|
||||
qint64 compressedSize = headWordBlockInfosIter_->first;
|
||||
qint64 decompressedSize = headWordBlockInfosIter_->second;
|
||||
|
||||
if ( compressedSize < 8 )
|
||||
return false;
|
||||
|
||||
QByteArray compressed = file_->read( compressedSize );
|
||||
headWordPos_ = file_->pos();
|
||||
ScopedMemMap compressed( *file_, headWordPos_, compressedSize );
|
||||
if ( !compressed.startAddress() )
|
||||
return false;
|
||||
|
||||
headWordPos_ += compressedSize;
|
||||
QByteArray decompressed;
|
||||
if ( !parseCompressedBlock( compressedSize, compressed, decompressedSize, decompressed ) )
|
||||
if ( !parseCompressedBlock( compressedSize, ( char * )compressed.startAddress(),
|
||||
decompressedSize, decompressed ) )
|
||||
return false;
|
||||
|
||||
headWordIndex = splitHeadWordBlock( decompressed );
|
||||
|
@ -381,7 +389,7 @@ bool MdictParser::readHeader( QDataStream & in )
|
|||
// Read metadata
|
||||
rtl_ = headerAttributes.namedItem( "Left2Right" ).toAttr().value() != "Yes";
|
||||
QString title = headerAttributes.namedItem( "Title" ).toAttr().value();
|
||||
if ( title == "Title (No HTML code allowed)" )
|
||||
if ( title.isEmpty() || title.length() < 5 || title == "Title (No HTML code allowed)" )
|
||||
{
|
||||
// Use filename instead
|
||||
QFileInfo fi( filename_ );
|
||||
|
@ -587,49 +595,42 @@ MdictParser::HeadWordIndex MdictParser::splitHeadWordBlock( QByteArray const & b
|
|||
return index;
|
||||
}
|
||||
|
||||
bool MdxParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex,
|
||||
MdxParser::ArticleHandler & articleHandler )
|
||||
bool MdictParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex,
|
||||
MdictParser::RecordHandler & recordHandler )
|
||||
{
|
||||
size_t prevIdx = ( size_t ) ( -1 );
|
||||
QByteArray decompressed;
|
||||
// cache the index, the headWordIndex is already sorted
|
||||
size_t idx = 0;
|
||||
|
||||
for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); i++ )
|
||||
{
|
||||
size_t idx = RecordIndex::bsearch( recordBlockInfos_, i->first );
|
||||
RecordIndex const & recordIndex = recordBlockInfos_[idx];
|
||||
if ( recordBlockInfos_[idx].endPos <= i->first )
|
||||
idx = RecordIndex::bsearch( recordBlockInfos_, i->first );
|
||||
|
||||
if ( idx == ( size_t )( -1 ) )
|
||||
return false;
|
||||
|
||||
// Reload if index changes
|
||||
if ( prevIdx != idx )
|
||||
{
|
||||
prevIdx = idx;
|
||||
file_->seek( recordPos_ + recordIndex.startPos );
|
||||
|
||||
QByteArray compressed;
|
||||
compressed.resize( recordIndex.compressedSize );
|
||||
file_->read( compressed.data(), recordIndex.compressedSize );
|
||||
|
||||
if ( !parseCompressedBlock( recordIndex.compressedSize, compressed,
|
||||
recordIndex.decompressedSize, decompressed ) )
|
||||
return false;
|
||||
}
|
||||
|
||||
RecordIndex const & recordIndex = recordBlockInfos_[idx];
|
||||
HeadWordIndex::const_iterator iNext = i + 1;
|
||||
size_t articleSize;
|
||||
size_t recordSize;
|
||||
if ( iNext == headWordIndex.end() )
|
||||
articleSize = recordIndex.shadowEndPos - i->first;
|
||||
recordSize = recordIndex.shadowEndPos - i->first;
|
||||
else
|
||||
articleSize = iNext->first - i->first;
|
||||
QString article = toUtf16( encoding_, decompressed.constData() + i->first - recordIndex.shadowStartPos, articleSize );
|
||||
articleHandler.handleAritcle( i->second, article );
|
||||
recordSize = iNext->first - i->first;
|
||||
|
||||
RecordInfo recordInfo;
|
||||
recordInfo.compressedBlockPos = recordPos_ + recordIndex.startPos;
|
||||
recordInfo.recordOffset = i->first - recordIndex.shadowStartPos;
|
||||
recordInfo.decompressedBlockSize = recordIndex.decompressedSize;
|
||||
recordInfo.compressedBlockSize = recordIndex.compressedSize;
|
||||
recordInfo.recordSize = recordSize;
|
||||
|
||||
recordHandler.handleRecord( i->second, recordInfo );
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
QString & MdxParser::substituteStylesheet( QString & article, MdxParser::StyleSheets const & styleSheets )
|
||||
QString & MdictParser::substituteStylesheet( QString & article, MdictParser::StyleSheets const & styleSheets )
|
||||
{
|
||||
QRegExp rx( "`(\\d+)`" );
|
||||
QString endStyle;
|
||||
|
@ -658,28 +659,4 @@ QString & MdxParser::substituteStylesheet( QString & article, MdxParser::StyleSh
|
|||
return article;
|
||||
}
|
||||
|
||||
bool MddParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex,
|
||||
MddParser::ResourceHandler & resourceHandler )
|
||||
{
|
||||
for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); i++ )
|
||||
{
|
||||
size_t idx = RecordIndex::bsearch( recordBlockInfos_, i->first );
|
||||
RecordIndex const & recordIndex = recordBlockInfos_[idx];
|
||||
|
||||
if ( idx == ( size_t )( -1 ) )
|
||||
return false;
|
||||
|
||||
HeadWordIndex::const_iterator iNext = i + 1;
|
||||
size_t resourceSize;
|
||||
if ( iNext == headWordIndex.end() )
|
||||
resourceSize = recordIndex.shadowEndPos - i->first;
|
||||
else
|
||||
resourceSize = iNext->first - i->first;
|
||||
|
||||
resourceHandler.handleResource( i->second, recordIndex.decompressedSize,
|
||||
recordPos_ + recordIndex.startPos, recordIndex.compressedSize,
|
||||
i->first - recordIndex.shadowStartPos, resourceSize );
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
104
mdictparser.hh
104
mdictparser.hh
|
@ -27,18 +27,46 @@
|
|||
#include <QPointer>
|
||||
#include <QFile>
|
||||
|
||||
namespace Mdict
|
||||
{
|
||||
|
||||
using std::string;
|
||||
using std::vector;
|
||||
using std::pair;
|
||||
using std::map;
|
||||
|
||||
// A helper class to handle memory map for QFile
|
||||
class ScopedMemMap
|
||||
{
|
||||
QFile & file;
|
||||
uchar * address;
|
||||
|
||||
public:
|
||||
ScopedMemMap( QFile & file, qint64 offset, qint64 size ) :
|
||||
file( file ),
|
||||
address( file.map( offset, size ) )
|
||||
{
|
||||
}
|
||||
|
||||
~ScopedMemMap()
|
||||
{
|
||||
if ( address )
|
||||
file.unmap( address );
|
||||
}
|
||||
|
||||
inline uchar * startAddress()
|
||||
{
|
||||
return address;
|
||||
}
|
||||
};
|
||||
|
||||
class MdictParser
|
||||
{
|
||||
public:
|
||||
|
||||
enum
|
||||
{
|
||||
kParserVersion = 0x0000009
|
||||
kParserVersion = 0x000000b
|
||||
};
|
||||
|
||||
struct RecordIndex
|
||||
|
@ -68,6 +96,22 @@ public:
|
|||
static size_t bsearch( vector<RecordIndex> const & offsets, qint64 val );
|
||||
};
|
||||
|
||||
struct RecordInfo
|
||||
{
|
||||
qint64 compressedBlockPos;
|
||||
qint64 recordOffset;
|
||||
|
||||
size_t decompressedBlockSize;
|
||||
size_t compressedBlockSize;
|
||||
size_t recordSize;
|
||||
};
|
||||
|
||||
class RecordHandler
|
||||
{
|
||||
public:
|
||||
virtual void handleRecord( QString const & name, RecordInfo const & recordInfo ) = 0;
|
||||
};
|
||||
|
||||
typedef vector< pair<qint64, qint64> > BlockInfoVector;
|
||||
typedef vector< pair<qint64, QString> > HeadWordIndex;
|
||||
typedef map<int, pair<QString, QString> > StyleSheets;
|
||||
|
@ -107,9 +151,13 @@ public:
|
|||
return rtl_;
|
||||
}
|
||||
|
||||
MdictParser( char const * filename );
|
||||
~MdictParser() {}
|
||||
|
||||
bool open();
|
||||
void close();
|
||||
bool readNextHeadWordIndex( HeadWordIndex & headWordIndex );
|
||||
bool readRecordBlock( HeadWordIndex & headWordIndex, RecordHandler & recordHandler );
|
||||
|
||||
// helpers
|
||||
static QString toUtf16( const char * fromCode, const char * from, size_t fromSize );
|
||||
|
@ -120,11 +168,15 @@ public:
|
|||
static bool parseCompressedBlock( size_t compressedBlockSize, const char * compressedBlockPtr,
|
||||
size_t decompressedBlockSize, QByteArray & decompressedBlock );
|
||||
|
||||
static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets );
|
||||
static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets )
|
||||
{
|
||||
QString s = QString::fromUtf8( article.c_str() );
|
||||
substituteStylesheet( s, styleSheets );
|
||||
return string( s.toUtf8().constData() );
|
||||
}
|
||||
|
||||
protected:
|
||||
MdictParser( char const * filename );
|
||||
|
||||
~MdictParser() {}
|
||||
|
||||
qint64 readNumber( QDataStream & in );
|
||||
static quint32 readU8OrU16( QDataStream & in, bool isU16 );
|
||||
bool readHeader( QDataStream & in );
|
||||
|
@ -161,46 +213,6 @@ protected:
|
|||
bool bruteForceEnd_;
|
||||
};
|
||||
|
||||
class MdxParser: public MdictParser
|
||||
{
|
||||
public:
|
||||
class ArticleHandler
|
||||
{
|
||||
public:
|
||||
virtual void handleAritcle( QString const & headWord, QString const & article ) = 0;
|
||||
};
|
||||
|
||||
MdxParser( const char * filename ): MdictParser( filename ) {}
|
||||
~MdxParser() {}
|
||||
|
||||
bool readRecordBlock( HeadWordIndex & headWordIndex, ArticleHandler & articleHandler );
|
||||
static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets );
|
||||
static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets )
|
||||
{
|
||||
QString s = QString::fromUtf8( article.c_str() );
|
||||
substituteStylesheet( s, styleSheets );
|
||||
return string( s.toUtf8().constData() );
|
||||
}
|
||||
};
|
||||
|
||||
class MddParser: public MdictParser
|
||||
{
|
||||
public:
|
||||
class ResourceHandler
|
||||
{
|
||||
public:
|
||||
virtual void handleResource( QString const & fileName, quint32 decompressedBlockSize,
|
||||
quint32 compressedBlockPos, quint32 compressedBlockSize,
|
||||
quint32 resourceOffset, quint32 resourceSize ) = 0;
|
||||
};
|
||||
|
||||
MddParser( const char * filename ) : MdictParser( filename ) {}
|
||||
~MddParser() {}
|
||||
|
||||
bool readRecordBlock( HeadWordIndex & headWordIndex, ResourceHandler & resourceHandler );
|
||||
|
||||
private:
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
#endif // __MDICTPARSER_HH_INCLUDED__
|
||||
|
|
179
mdx.cc
179
mdx.cc
|
@ -13,6 +13,7 @@
|
|||
#include "langcoder.hh"
|
||||
#include "fsencoding.hh"
|
||||
#include "audiolink.hh"
|
||||
#include "ex.hh"
|
||||
#include "mdictparser.hh"
|
||||
|
||||
#include <map>
|
||||
|
@ -49,22 +50,20 @@ using BtreeIndexing::WordArticleLink;
|
|||
using BtreeIndexing::IndexedWords;
|
||||
using BtreeIndexing::IndexInfo;
|
||||
|
||||
namespace
|
||||
{
|
||||
using namespace Mdict;
|
||||
|
||||
|
||||
/// Checks if the given string ends with the given substring
|
||||
bool endsWith( string const & str, string const & tail )
|
||||
static bool endsWith( string const & str, string const & tail )
|
||||
{
|
||||
return str.size() >= tail.size() &&
|
||||
str.compare( str.size() - tail.size(), tail.size(), tail ) == 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
enum
|
||||
{
|
||||
kSignature = 0x4349444d, // MDIC
|
||||
kCurrentFormatVersion = 4 + BtreeIndexing::FormatVersion
|
||||
kCurrentFormatVersion = 7 + BtreeIndexing::FormatVersion
|
||||
};
|
||||
|
||||
struct IdxHeader
|
||||
|
@ -104,15 +103,6 @@ __attribute__( ( packed ) )
|
|||
#endif
|
||||
;
|
||||
|
||||
struct MddIndexEntry
|
||||
{
|
||||
size_t decompressedBlockSize;
|
||||
size_t compressedBlockPos;
|
||||
size_t compressedBlockSize;
|
||||
size_t resourceOffset;
|
||||
size_t resourceSize;
|
||||
};
|
||||
|
||||
// A helper method to read resources from .mdd file
|
||||
class IndexedMdd: public BtreeIndexing::BtreeIndex
|
||||
{
|
||||
|
@ -167,26 +157,27 @@ public:
|
|||
if ( links.empty() )
|
||||
return false;
|
||||
|
||||
MddIndexEntry indexEntry;
|
||||
MdictParser::RecordInfo indexEntry;
|
||||
vector< char > chunk;
|
||||
Mutex::Lock _( idxMutex );
|
||||
const char * indexEntryPtr = chunks.getBlock( links[ 0 ].articleOffset, chunk );
|
||||
memcpy( &indexEntry, indexEntryPtr, sizeof( indexEntry ) );
|
||||
|
||||
ScopedMemMap compressed( mddFile, indexEntry.compressedBlockPos, indexEntry.compressedBlockSize );
|
||||
if ( !compressed.startAddress() )
|
||||
{
|
||||
vector< char > chunk;
|
||||
Mutex::Lock _( idxMutex );
|
||||
const char * indexEntryPtr = chunks.getBlock( links[ 0 ].articleOffset, chunk );
|
||||
memcpy( &indexEntry, indexEntryPtr, sizeof( indexEntry ) );
|
||||
return false;
|
||||
}
|
||||
|
||||
QByteArray decompressed;
|
||||
mddFile.seek( indexEntry.compressedBlockPos );
|
||||
QByteArray compressed = mddFile.read( indexEntry.compressedBlockSize );
|
||||
if ( !MdictParser::parseCompressedBlock( compressed.size(), compressed.constData(),
|
||||
if ( !MdictParser::parseCompressedBlock( indexEntry.compressedBlockSize, ( char * )compressed.startAddress(),
|
||||
indexEntry.decompressedBlockSize, decompressed ) )
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
compressed.clear();
|
||||
result.resize( indexEntry.resourceSize );
|
||||
memcpy( &result.front(), decompressed.constData() + indexEntry.resourceOffset, indexEntry.resourceSize );
|
||||
result.resize( indexEntry.recordSize );
|
||||
memcpy( &result.front(), decompressed.constData() + indexEntry.recordOffset, indexEntry.recordSize );
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -198,7 +189,9 @@ class MdxDictionary: public BtreeIndexing::BtreeDictionary
|
|||
File::Class idx;
|
||||
IdxHeader idxHeader;
|
||||
string dictionaryName;
|
||||
string encoding;
|
||||
ChunkedStorage::Reader chunks;
|
||||
QFile dictFile;
|
||||
IndexedMdd mddResource;
|
||||
MdictParser::StyleSheets styleSheets;
|
||||
|
||||
|
@ -263,7 +256,7 @@ private:
|
|||
void doDeferredInit();
|
||||
|
||||
/// Loads an article with the given offset, filling the given strings.
|
||||
void loadArticle( uint32_t offset, string & headword, string & articleText );
|
||||
void loadArticle( uint32_t offset, string & articleText );
|
||||
|
||||
/// Process resource links (images, audios, etc)
|
||||
string filterResource( const char * articleId, const char * article );
|
||||
|
@ -283,14 +276,21 @@ MdxDictionary::MdxDictionary( string const & id, string const & indexFile,
|
|||
mddResource( idxMutex, chunks ),
|
||||
deferredInitRunnableStarted( false )
|
||||
{
|
||||
idx.seek( sizeof( idxHeader ) );
|
||||
|
||||
// Read the dictionary's name
|
||||
idx.seek( sizeof( idxHeader ) );
|
||||
size_t len = idx.read< uint32_t >();
|
||||
vector< char > nameBuf( len );
|
||||
idx.read( &nameBuf.front(), len );
|
||||
vector< char > buf( len );
|
||||
idx.read( &buf.front(), len );
|
||||
dictionaryName = string( &buf.front(), len );
|
||||
|
||||
dictionaryName = string( &nameBuf.front(), len );
|
||||
// then read the dictionary's encoding
|
||||
len = idx.read< uint32_t >();
|
||||
buf.resize( len );
|
||||
idx.read( &buf.front(), len );
|
||||
encoding = string( &buf.front(), len );
|
||||
|
||||
dictFile.setFileName( QString::fromUtf8( dictionaryFiles[ 0 ].c_str() ) );
|
||||
dictFile.open( QIODevice::ReadOnly );
|
||||
}
|
||||
|
||||
MdxDictionary::~MdxDictionary()
|
||||
|
@ -300,6 +300,8 @@ MdxDictionary::~MdxDictionary()
|
|||
// Wait for init runnable to complete if it was ever started
|
||||
if ( deferredInitRunnableStarted )
|
||||
deferredInitRunnableExited.acquire();
|
||||
|
||||
dictFile.close();
|
||||
}
|
||||
|
||||
//////// MdxDictionary::deferredInit()
|
||||
|
@ -530,10 +532,9 @@ void MdxArticleRequest::run()
|
|||
continue; // We already have this article in the body.
|
||||
|
||||
// Grab that article
|
||||
string headword;
|
||||
string articleBody;
|
||||
|
||||
dict.loadArticle( chain[ x ].articleOffset, headword, articleBody );
|
||||
dict.loadArticle( chain[ x ].articleOffset, articleBody );
|
||||
|
||||
if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
|
||||
continue; // We already have this article in the body.
|
||||
|
@ -700,8 +701,8 @@ void MddResourceRequest::run()
|
|||
{
|
||||
data.push_back( '\0' );
|
||||
data.push_back( '\0' );
|
||||
QString target = MdxParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ),
|
||||
data.size() - sizeof( pattern ) );
|
||||
QString target = MdictParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ),
|
||||
data.size() - sizeof( pattern ) );
|
||||
resourceName = gd::toWString( target.trimmed() );
|
||||
continue;
|
||||
}
|
||||
|
@ -761,21 +762,52 @@ void MdxDictionary::loadIcon() throw()
|
|||
dictionaryIconLoaded = true;
|
||||
}
|
||||
|
||||
void MdxDictionary::loadArticle( uint32_t offset, string & headword, string & articleText )
|
||||
DEF_EX( exCorruptDictionary, "dictionary file tampered or corrupted", std::exception )
|
||||
|
||||
void MdxDictionary::loadArticle( uint32_t offset, string & articleText )
|
||||
{
|
||||
vector< char > chunk;
|
||||
Mutex::Lock _( idxMutex );
|
||||
|
||||
char * articleData = chunks.getBlock( offset, chunk );
|
||||
// Load record info from index
|
||||
MdictParser::RecordInfo recordInfo;
|
||||
char * pRecordInfo = chunks.getBlock( offset, chunk );
|
||||
memcpy( &recordInfo, pRecordInfo, sizeof( recordInfo ) );
|
||||
|
||||
// Make an sub unique id for this article
|
||||
QString articleId;
|
||||
articleId.setNum( ( quint64 )articleData, 16 );
|
||||
articleId.setNum( ( quint64 )pRecordInfo, 16 );
|
||||
|
||||
headword = articleData;
|
||||
articleText = string( articleData + headword.size() + 1 );
|
||||
articleText = MdxParser::substituteStylesheet( articleText, styleSheets );
|
||||
articleText = filterResource( articleId.toLatin1().constData(), articleText.c_str() );
|
||||
articleText = "Article loading error";
|
||||
|
||||
try
|
||||
{
|
||||
ScopedMemMap compressed( dictFile, recordInfo.compressedBlockPos, recordInfo.compressedBlockSize );
|
||||
if ( !compressed.startAddress() )
|
||||
throw exCorruptDictionary();
|
||||
|
||||
QByteArray decompressed;
|
||||
if ( !MdictParser::parseCompressedBlock( recordInfo.compressedBlockSize, ( char * )compressed.startAddress(),
|
||||
recordInfo.decompressedBlockSize, decompressed ) )
|
||||
return;
|
||||
|
||||
QString article = MdictParser::toUtf16( encoding.c_str(),
|
||||
decompressed.constData() + recordInfo.recordOffset,
|
||||
recordInfo.recordSize );
|
||||
|
||||
article = MdictParser::substituteStylesheet( article, styleSheets );
|
||||
articleText = filterResource( articleId.toLatin1().constData(), article.toUtf8().constData() );
|
||||
}
|
||||
catch ( std::exception & e )
|
||||
{
|
||||
FDPRINTF( stderr, "MDict: load article from %s failed, error: %s\n",
|
||||
getDictionaryFilenames()[ 0 ].c_str(), e.what() );
|
||||
}
|
||||
catch ( ... )
|
||||
{
|
||||
FDPRINTF( stderr, "MDict: load article from %s failed, error: %s\n",
|
||||
getDictionaryFilenames()[ 0 ].c_str(), "unknown error" );
|
||||
}
|
||||
}
|
||||
|
||||
string MdxDictionary::filterResource( const char * articleId, const char * article )
|
||||
|
@ -820,36 +852,20 @@ static void addEntryToIndexSingle( QString const & word, uint32_t offset, Indexe
|
|||
indexedWords.addSingleWord( gd::toWString( wordTrimmed ), offset );
|
||||
}
|
||||
|
||||
class ArticleHandler: public MdxParser::ArticleHandler
|
||||
class ArticleHandler: public MdictParser::RecordHandler
|
||||
{
|
||||
public:
|
||||
ArticleHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ) :
|
||||
chunks( chunks ),
|
||||
indexedWords( indexedWords ),
|
||||
articleCount_( 0 )
|
||||
indexedWords( indexedWords )
|
||||
{
|
||||
}
|
||||
|
||||
inline size_t articleCount()
|
||||
virtual void handleRecord( QString const & headWord, MdictParser::RecordInfo const & recordInfo )
|
||||
{
|
||||
return articleCount_;
|
||||
}
|
||||
|
||||
void handleAritcle( QString const & headWord, QString const & article )
|
||||
{
|
||||
if ( !article.startsWith( "@@@LINK=" ) )
|
||||
{
|
||||
articleCount_++;
|
||||
}
|
||||
|
||||
// Save the article's body itself first
|
||||
// Save the article's record info
|
||||
uint32_t articleAddress = chunks.startNewBlock();
|
||||
string headWordU8 = string( headWord.toUtf8().constData() );
|
||||
string articleU8 = string( article.toUtf8().constData() );
|
||||
|
||||
chunks.addToBlock( headWordU8.c_str(), headWordU8.size() + 1 );
|
||||
chunks.addToBlock( articleU8.c_str(), articleU8.size() + 1 );
|
||||
|
||||
chunks.addToBlock( &recordInfo, sizeof( recordInfo ) );
|
||||
// Add entries to the index
|
||||
addEntryToIndex( headWord, articleAddress, indexedWords );
|
||||
}
|
||||
|
@ -857,10 +873,9 @@ public:
|
|||
private:
|
||||
ChunkedStorage::Writer & chunks;
|
||||
IndexedWords & indexedWords;
|
||||
size_t articleCount_;
|
||||
};
|
||||
|
||||
class ResourceHandler: public MddParser::ResourceHandler
|
||||
class ResourceHandler: public MdictParser::RecordHandler
|
||||
{
|
||||
public:
|
||||
ResourceHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ):
|
||||
|
@ -869,18 +884,10 @@ public:
|
|||
{
|
||||
}
|
||||
|
||||
void handleResource( QString const & fileName, quint32 decompressedBlockSize,
|
||||
quint32 compressedBlockPos, quint32 compressedBlockSize,
|
||||
quint32 resourceOffset, quint32 resourceSize )
|
||||
virtual void handleRecord( QString const & fileName, MdictParser::RecordInfo const & recordInfo )
|
||||
{
|
||||
uint32_t resourceInfoAddress = chunks.startNewBlock();
|
||||
MddIndexEntry mddIndexEntry;
|
||||
mddIndexEntry.decompressedBlockSize = decompressedBlockSize;
|
||||
mddIndexEntry.compressedBlockPos = compressedBlockPos;
|
||||
mddIndexEntry.compressedBlockSize = compressedBlockSize;
|
||||
mddIndexEntry.resourceOffset = resourceOffset;
|
||||
mddIndexEntry.resourceSize = resourceSize;
|
||||
chunks.addToBlock( &mddIndexEntry, sizeof( mddIndexEntry ) );
|
||||
chunks.addToBlock( &recordInfo, sizeof( recordInfo ) );
|
||||
// Add entries to the index
|
||||
addEntryToIndexSingle( fileName, resourceInfoAddress, indexedWords );
|
||||
}
|
||||
|
@ -935,15 +942,15 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
|
|||
indexIsOldOrBad( indexFile, !mddFileName.empty() ) )
|
||||
{
|
||||
// Building the index
|
||||
MdxParser parser( i->c_str() );
|
||||
sptr<MddParser> mddParser = NULL;
|
||||
MdictParser parser( i->c_str() );
|
||||
sptr<MdictParser> mddParser = NULL;
|
||||
|
||||
if ( !parser.open() )
|
||||
continue;
|
||||
|
||||
if ( File::exists( mddFileName ) )
|
||||
{
|
||||
mddParser = new MddParser( mddFileName.c_str() );
|
||||
mddParser = new MdictParser( mddFileName.c_str() );
|
||||
if ( !mddParser->open() )
|
||||
{
|
||||
FDPRINTF( stderr, "Warning: Invalid mdd (resource) file: %s\n", mddFileName.c_str() );
|
||||
|
@ -960,9 +967,18 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
|
|||
// We write a dummy header first. At the end of the process the header
|
||||
// will be rewritten with the right values.
|
||||
idx.write( idxHeader );
|
||||
|
||||
// Write the title first
|
||||
idx.write< uint32_t >( title.size() );
|
||||
idx.write( title.data(), title.size() );
|
||||
|
||||
// then the encoding
|
||||
{
|
||||
string encoding = string( parser.encoding().toUtf8().constData() );
|
||||
idx.write< uint32_t >( encoding.size() );
|
||||
idx.write( encoding.data(), encoding.size() );
|
||||
}
|
||||
|
||||
// This is our index data that we accumulate during the loading process.
|
||||
// For each new word encountered, we emit the article's body to the file
|
||||
// immediately, inserting the word itself and its offset in this map.
|
||||
|
@ -976,10 +992,9 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
|
|||
// Save dictionary description if there's one
|
||||
{
|
||||
string description = string( parser.description().toUtf8().constData() );
|
||||
idxHeader.descriptionSize = 0;
|
||||
idxHeader.descriptionAddress = chunks.startNewBlock();
|
||||
chunks.addToBlock( description.c_str(), description.size() + 1 );
|
||||
idxHeader.descriptionSize += description.size() + 1;
|
||||
idxHeader.descriptionSize = description.size() + 1;
|
||||
}
|
||||
|
||||
ArticleHandler articleHandler( chunks, indexedWords );
|
||||
|
@ -1062,7 +1077,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
|
|||
idxHeader.formatVersion = kCurrentFormatVersion;
|
||||
idxHeader.parserVersion = MdictParser::kParserVersion;
|
||||
idxHeader.foldingVersion = Folding::Version;
|
||||
idxHeader.articleCount = articleHandler.articleCount();
|
||||
idxHeader.articleCount = parser.wordCount();
|
||||
idxHeader.wordCount = parser.wordCount();
|
||||
|
||||
idx.rewind();
|
||||
|
|
Loading…
Reference in a new issue