mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-28 07:54:06 +00:00
Merge pull request #278 from timonwong/mdict_large_file
MDict: Support large dictionary files; indexing speed improvement
This commit is contained in:
commit
fd45873205
|
@ -38,6 +38,9 @@
|
||||||
|
|
||||||
#include "decompress.hh"
|
#include "decompress.hh"
|
||||||
|
|
||||||
|
namespace Mdict
|
||||||
|
{
|
||||||
|
|
||||||
static inline int u16StrSize( const ushort * unicode )
|
static inline int u16StrSize( const ushort * unicode )
|
||||||
{
|
{
|
||||||
int size = 0;
|
int size = 0;
|
||||||
|
@ -141,10 +144,12 @@ bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIn
|
||||||
|
|
||||||
headWordIndex.clear();
|
headWordIndex.clear();
|
||||||
|
|
||||||
file_->seek( headWordPos_ );
|
ScopedMemMap mapping( *file_, headWordPos_, headWordBlockSize_ );
|
||||||
QByteArray data = file_->read( headWordBlockSize_ );
|
if ( !mapping.startAddress() )
|
||||||
const char * pDataStart = data.constData();
|
return false;
|
||||||
const char * pDataEnd = pDataStart + data.size();
|
|
||||||
|
const char * pDataStart = ( const char * )mapping.startAddress();
|
||||||
|
const char * pDataEnd = pDataStart + headWordBlockSize_;
|
||||||
const char pattern[] = {0x02, 0x00, 0x00, 0x00};
|
const char pattern[] = {0x02, 0x00, 0x00, 0x00};
|
||||||
const char * patternBegin = pattern;
|
const char * patternBegin = pattern;
|
||||||
const char * patternEnd = pattern + 4;
|
const char * patternEnd = pattern + 4;
|
||||||
|
@ -168,17 +173,20 @@ bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIn
|
||||||
if ( headWordBlockInfosIter_ == headWordBlockInfos_.end() )
|
if ( headWordBlockInfosIter_ == headWordBlockInfos_.end() )
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
file_->seek( headWordPos_ );
|
|
||||||
qint64 compressedSize = headWordBlockInfosIter_->first;
|
qint64 compressedSize = headWordBlockInfosIter_->first;
|
||||||
qint64 decompressedSize = headWordBlockInfosIter_->second;
|
qint64 decompressedSize = headWordBlockInfosIter_->second;
|
||||||
|
|
||||||
if ( compressedSize < 8 )
|
if ( compressedSize < 8 )
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
QByteArray compressed = file_->read( compressedSize );
|
ScopedMemMap compressed( *file_, headWordPos_, compressedSize );
|
||||||
headWordPos_ = file_->pos();
|
if ( !compressed.startAddress() )
|
||||||
|
return false;
|
||||||
|
|
||||||
|
headWordPos_ += compressedSize;
|
||||||
QByteArray decompressed;
|
QByteArray decompressed;
|
||||||
if ( !parseCompressedBlock( compressedSize, compressed, decompressedSize, decompressed ) )
|
if ( !parseCompressedBlock( compressedSize, ( char * )compressed.startAddress(),
|
||||||
|
decompressedSize, decompressed ) )
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
headWordIndex = splitHeadWordBlock( decompressed );
|
headWordIndex = splitHeadWordBlock( decompressed );
|
||||||
|
@ -381,7 +389,7 @@ bool MdictParser::readHeader( QDataStream & in )
|
||||||
// Read metadata
|
// Read metadata
|
||||||
rtl_ = headerAttributes.namedItem( "Left2Right" ).toAttr().value() != "Yes";
|
rtl_ = headerAttributes.namedItem( "Left2Right" ).toAttr().value() != "Yes";
|
||||||
QString title = headerAttributes.namedItem( "Title" ).toAttr().value();
|
QString title = headerAttributes.namedItem( "Title" ).toAttr().value();
|
||||||
if ( title == "Title (No HTML code allowed)" )
|
if ( title.isEmpty() || title.length() < 5 || title == "Title (No HTML code allowed)" )
|
||||||
{
|
{
|
||||||
// Use filename instead
|
// Use filename instead
|
||||||
QFileInfo fi( filename_ );
|
QFileInfo fi( filename_ );
|
||||||
|
@ -587,49 +595,42 @@ MdictParser::HeadWordIndex MdictParser::splitHeadWordBlock( QByteArray const & b
|
||||||
return index;
|
return index;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MdxParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex,
|
bool MdictParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex,
|
||||||
MdxParser::ArticleHandler & articleHandler )
|
MdictParser::RecordHandler & recordHandler )
|
||||||
{
|
{
|
||||||
size_t prevIdx = ( size_t ) ( -1 );
|
// cache the index, the headWordIndex is already sorted
|
||||||
QByteArray decompressed;
|
size_t idx = 0;
|
||||||
|
|
||||||
for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); i++ )
|
for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); i++ )
|
||||||
{
|
{
|
||||||
size_t idx = RecordIndex::bsearch( recordBlockInfos_, i->first );
|
if ( recordBlockInfos_[idx].endPos <= i->first )
|
||||||
RecordIndex const & recordIndex = recordBlockInfos_[idx];
|
idx = RecordIndex::bsearch( recordBlockInfos_, i->first );
|
||||||
|
|
||||||
if ( idx == ( size_t )( -1 ) )
|
if ( idx == ( size_t )( -1 ) )
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
// Reload if index changes
|
RecordIndex const & recordIndex = recordBlockInfos_[idx];
|
||||||
if ( prevIdx != idx )
|
|
||||||
{
|
|
||||||
prevIdx = idx;
|
|
||||||
file_->seek( recordPos_ + recordIndex.startPos );
|
|
||||||
|
|
||||||
QByteArray compressed;
|
|
||||||
compressed.resize( recordIndex.compressedSize );
|
|
||||||
file_->read( compressed.data(), recordIndex.compressedSize );
|
|
||||||
|
|
||||||
if ( !parseCompressedBlock( recordIndex.compressedSize, compressed,
|
|
||||||
recordIndex.decompressedSize, decompressed ) )
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
HeadWordIndex::const_iterator iNext = i + 1;
|
HeadWordIndex::const_iterator iNext = i + 1;
|
||||||
size_t articleSize;
|
size_t recordSize;
|
||||||
if ( iNext == headWordIndex.end() )
|
if ( iNext == headWordIndex.end() )
|
||||||
articleSize = recordIndex.shadowEndPos - i->first;
|
recordSize = recordIndex.shadowEndPos - i->first;
|
||||||
else
|
else
|
||||||
articleSize = iNext->first - i->first;
|
recordSize = iNext->first - i->first;
|
||||||
QString article = toUtf16( encoding_, decompressed.constData() + i->first - recordIndex.shadowStartPos, articleSize );
|
|
||||||
articleHandler.handleAritcle( i->second, article );
|
RecordInfo recordInfo;
|
||||||
|
recordInfo.compressedBlockPos = recordPos_ + recordIndex.startPos;
|
||||||
|
recordInfo.recordOffset = i->first - recordIndex.shadowStartPos;
|
||||||
|
recordInfo.decompressedBlockSize = recordIndex.decompressedSize;
|
||||||
|
recordInfo.compressedBlockSize = recordIndex.compressedSize;
|
||||||
|
recordInfo.recordSize = recordSize;
|
||||||
|
|
||||||
|
recordHandler.handleRecord( i->second, recordInfo );
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
QString & MdxParser::substituteStylesheet( QString & article, MdxParser::StyleSheets const & styleSheets )
|
QString & MdictParser::substituteStylesheet( QString & article, MdictParser::StyleSheets const & styleSheets )
|
||||||
{
|
{
|
||||||
QRegExp rx( "`(\\d+)`" );
|
QRegExp rx( "`(\\d+)`" );
|
||||||
QString endStyle;
|
QString endStyle;
|
||||||
|
@ -658,28 +659,4 @@ QString & MdxParser::substituteStylesheet( QString & article, MdxParser::StyleSh
|
||||||
return article;
|
return article;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MddParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex,
|
|
||||||
MddParser::ResourceHandler & resourceHandler )
|
|
||||||
{
|
|
||||||
for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); i++ )
|
|
||||||
{
|
|
||||||
size_t idx = RecordIndex::bsearch( recordBlockInfos_, i->first );
|
|
||||||
RecordIndex const & recordIndex = recordBlockInfos_[idx];
|
|
||||||
|
|
||||||
if ( idx == ( size_t )( -1 ) )
|
|
||||||
return false;
|
|
||||||
|
|
||||||
HeadWordIndex::const_iterator iNext = i + 1;
|
|
||||||
size_t resourceSize;
|
|
||||||
if ( iNext == headWordIndex.end() )
|
|
||||||
resourceSize = recordIndex.shadowEndPos - i->first;
|
|
||||||
else
|
|
||||||
resourceSize = iNext->first - i->first;
|
|
||||||
|
|
||||||
resourceHandler.handleResource( i->second, recordIndex.decompressedSize,
|
|
||||||
recordPos_ + recordIndex.startPos, recordIndex.compressedSize,
|
|
||||||
i->first - recordIndex.shadowStartPos, resourceSize );
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
104
mdictparser.hh
104
mdictparser.hh
|
@ -27,18 +27,46 @@
|
||||||
#include <QPointer>
|
#include <QPointer>
|
||||||
#include <QFile>
|
#include <QFile>
|
||||||
|
|
||||||
|
namespace Mdict
|
||||||
|
{
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
using std::pair;
|
using std::pair;
|
||||||
using std::map;
|
using std::map;
|
||||||
|
|
||||||
|
// A helper class to handle memory map for QFile
|
||||||
|
class ScopedMemMap
|
||||||
|
{
|
||||||
|
QFile & file;
|
||||||
|
uchar * address;
|
||||||
|
|
||||||
|
public:
|
||||||
|
ScopedMemMap( QFile & file, qint64 offset, qint64 size ) :
|
||||||
|
file( file ),
|
||||||
|
address( file.map( offset, size ) )
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
~ScopedMemMap()
|
||||||
|
{
|
||||||
|
if ( address )
|
||||||
|
file.unmap( address );
|
||||||
|
}
|
||||||
|
|
||||||
|
inline uchar * startAddress()
|
||||||
|
{
|
||||||
|
return address;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
class MdictParser
|
class MdictParser
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
|
||||||
enum
|
enum
|
||||||
{
|
{
|
||||||
kParserVersion = 0x0000009
|
kParserVersion = 0x000000b
|
||||||
};
|
};
|
||||||
|
|
||||||
struct RecordIndex
|
struct RecordIndex
|
||||||
|
@ -68,6 +96,22 @@ public:
|
||||||
static size_t bsearch( vector<RecordIndex> const & offsets, qint64 val );
|
static size_t bsearch( vector<RecordIndex> const & offsets, qint64 val );
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct RecordInfo
|
||||||
|
{
|
||||||
|
qint64 compressedBlockPos;
|
||||||
|
qint64 recordOffset;
|
||||||
|
|
||||||
|
size_t decompressedBlockSize;
|
||||||
|
size_t compressedBlockSize;
|
||||||
|
size_t recordSize;
|
||||||
|
};
|
||||||
|
|
||||||
|
class RecordHandler
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
virtual void handleRecord( QString const & name, RecordInfo const & recordInfo ) = 0;
|
||||||
|
};
|
||||||
|
|
||||||
typedef vector< pair<qint64, qint64> > BlockInfoVector;
|
typedef vector< pair<qint64, qint64> > BlockInfoVector;
|
||||||
typedef vector< pair<qint64, QString> > HeadWordIndex;
|
typedef vector< pair<qint64, QString> > HeadWordIndex;
|
||||||
typedef map<int, pair<QString, QString> > StyleSheets;
|
typedef map<int, pair<QString, QString> > StyleSheets;
|
||||||
|
@ -107,9 +151,13 @@ public:
|
||||||
return rtl_;
|
return rtl_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
MdictParser( char const * filename );
|
||||||
|
~MdictParser() {}
|
||||||
|
|
||||||
bool open();
|
bool open();
|
||||||
void close();
|
void close();
|
||||||
bool readNextHeadWordIndex( HeadWordIndex & headWordIndex );
|
bool readNextHeadWordIndex( HeadWordIndex & headWordIndex );
|
||||||
|
bool readRecordBlock( HeadWordIndex & headWordIndex, RecordHandler & recordHandler );
|
||||||
|
|
||||||
// helpers
|
// helpers
|
||||||
static QString toUtf16( const char * fromCode, const char * from, size_t fromSize );
|
static QString toUtf16( const char * fromCode, const char * from, size_t fromSize );
|
||||||
|
@ -120,11 +168,15 @@ public:
|
||||||
static bool parseCompressedBlock( size_t compressedBlockSize, const char * compressedBlockPtr,
|
static bool parseCompressedBlock( size_t compressedBlockSize, const char * compressedBlockPtr,
|
||||||
size_t decompressedBlockSize, QByteArray & decompressedBlock );
|
size_t decompressedBlockSize, QByteArray & decompressedBlock );
|
||||||
|
|
||||||
|
static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets );
|
||||||
|
static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets )
|
||||||
|
{
|
||||||
|
QString s = QString::fromUtf8( article.c_str() );
|
||||||
|
substituteStylesheet( s, styleSheets );
|
||||||
|
return string( s.toUtf8().constData() );
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
MdictParser( char const * filename );
|
|
||||||
|
|
||||||
~MdictParser() {}
|
|
||||||
|
|
||||||
qint64 readNumber( QDataStream & in );
|
qint64 readNumber( QDataStream & in );
|
||||||
static quint32 readU8OrU16( QDataStream & in, bool isU16 );
|
static quint32 readU8OrU16( QDataStream & in, bool isU16 );
|
||||||
bool readHeader( QDataStream & in );
|
bool readHeader( QDataStream & in );
|
||||||
|
@ -161,46 +213,6 @@ protected:
|
||||||
bool bruteForceEnd_;
|
bool bruteForceEnd_;
|
||||||
};
|
};
|
||||||
|
|
||||||
class MdxParser: public MdictParser
|
}
|
||||||
{
|
|
||||||
public:
|
|
||||||
class ArticleHandler
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
virtual void handleAritcle( QString const & headWord, QString const & article ) = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
MdxParser( const char * filename ): MdictParser( filename ) {}
|
|
||||||
~MdxParser() {}
|
|
||||||
|
|
||||||
bool readRecordBlock( HeadWordIndex & headWordIndex, ArticleHandler & articleHandler );
|
|
||||||
static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets );
|
|
||||||
static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets )
|
|
||||||
{
|
|
||||||
QString s = QString::fromUtf8( article.c_str() );
|
|
||||||
substituteStylesheet( s, styleSheets );
|
|
||||||
return string( s.toUtf8().constData() );
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
class MddParser: public MdictParser
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
class ResourceHandler
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
virtual void handleResource( QString const & fileName, quint32 decompressedBlockSize,
|
|
||||||
quint32 compressedBlockPos, quint32 compressedBlockSize,
|
|
||||||
quint32 resourceOffset, quint32 resourceSize ) = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
MddParser( const char * filename ) : MdictParser( filename ) {}
|
|
||||||
~MddParser() {}
|
|
||||||
|
|
||||||
bool readRecordBlock( HeadWordIndex & headWordIndex, ResourceHandler & resourceHandler );
|
|
||||||
|
|
||||||
private:
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif // __MDICTPARSER_HH_INCLUDED__
|
#endif // __MDICTPARSER_HH_INCLUDED__
|
||||||
|
|
188
mdx.cc
188
mdx.cc
|
@ -13,6 +13,7 @@
|
||||||
#include "langcoder.hh"
|
#include "langcoder.hh"
|
||||||
#include "fsencoding.hh"
|
#include "fsencoding.hh"
|
||||||
#include "audiolink.hh"
|
#include "audiolink.hh"
|
||||||
|
#include "ex.hh"
|
||||||
#include "mdictparser.hh"
|
#include "mdictparser.hh"
|
||||||
|
|
||||||
#include <map>
|
#include <map>
|
||||||
|
@ -49,22 +50,20 @@ using BtreeIndexing::WordArticleLink;
|
||||||
using BtreeIndexing::IndexedWords;
|
using BtreeIndexing::IndexedWords;
|
||||||
using BtreeIndexing::IndexInfo;
|
using BtreeIndexing::IndexInfo;
|
||||||
|
|
||||||
namespace
|
using namespace Mdict;
|
||||||
{
|
|
||||||
|
|
||||||
/// Checks if the given string ends with the given substring
|
/// Checks if the given string ends with the given substring
|
||||||
bool endsWith( string const & str, string const & tail )
|
static bool endsWith( string const & str, string const & tail )
|
||||||
{
|
{
|
||||||
return str.size() >= tail.size() &&
|
return str.size() >= tail.size() &&
|
||||||
str.compare( str.size() - tail.size(), tail.size(), tail ) == 0;
|
str.compare( str.size() - tail.size(), tail.size(), tail ) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
enum
|
enum
|
||||||
{
|
{
|
||||||
kSignature = 0x4349444d, // MDIC
|
kSignature = 0x4349444d, // MDIC
|
||||||
kCurrentFormatVersion = 4 + BtreeIndexing::FormatVersion
|
kCurrentFormatVersion = 7 + BtreeIndexing::FormatVersion
|
||||||
};
|
};
|
||||||
|
|
||||||
struct IdxHeader
|
struct IdxHeader
|
||||||
|
@ -104,15 +103,6 @@ __attribute__( ( packed ) )
|
||||||
#endif
|
#endif
|
||||||
;
|
;
|
||||||
|
|
||||||
struct MddIndexEntry
|
|
||||||
{
|
|
||||||
size_t decompressedBlockSize;
|
|
||||||
size_t compressedBlockPos;
|
|
||||||
size_t compressedBlockSize;
|
|
||||||
size_t resourceOffset;
|
|
||||||
size_t resourceSize;
|
|
||||||
};
|
|
||||||
|
|
||||||
// A helper method to read resources from .mdd file
|
// A helper method to read resources from .mdd file
|
||||||
class IndexedMdd: public BtreeIndexing::BtreeIndex
|
class IndexedMdd: public BtreeIndexing::BtreeIndex
|
||||||
{
|
{
|
||||||
|
@ -167,26 +157,27 @@ public:
|
||||||
if ( links.empty() )
|
if ( links.empty() )
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
MddIndexEntry indexEntry;
|
MdictParser::RecordInfo indexEntry;
|
||||||
{
|
|
||||||
vector< char > chunk;
|
vector< char > chunk;
|
||||||
Mutex::Lock _( idxMutex );
|
Mutex::Lock _( idxMutex );
|
||||||
const char * indexEntryPtr = chunks.getBlock( links[ 0 ].articleOffset, chunk );
|
const char * indexEntryPtr = chunks.getBlock( links[ 0 ].articleOffset, chunk );
|
||||||
memcpy( &indexEntry, indexEntryPtr, sizeof( indexEntry ) );
|
memcpy( &indexEntry, indexEntryPtr, sizeof( indexEntry ) );
|
||||||
|
|
||||||
|
ScopedMemMap compressed( mddFile, indexEntry.compressedBlockPos, indexEntry.compressedBlockSize );
|
||||||
|
if ( !compressed.startAddress() )
|
||||||
|
{
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
QByteArray decompressed;
|
QByteArray decompressed;
|
||||||
mddFile.seek( indexEntry.compressedBlockPos );
|
if ( !MdictParser::parseCompressedBlock( indexEntry.compressedBlockSize, ( char * )compressed.startAddress(),
|
||||||
QByteArray compressed = mddFile.read( indexEntry.compressedBlockSize );
|
|
||||||
if ( !MdictParser::parseCompressedBlock( compressed.size(), compressed.constData(),
|
|
||||||
indexEntry.decompressedBlockSize, decompressed ) )
|
indexEntry.decompressedBlockSize, decompressed ) )
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
compressed.clear();
|
result.resize( indexEntry.recordSize );
|
||||||
result.resize( indexEntry.resourceSize );
|
memcpy( &result.front(), decompressed.constData() + indexEntry.recordOffset, indexEntry.recordSize );
|
||||||
memcpy( &result.front(), decompressed.constData() + indexEntry.resourceOffset, indexEntry.resourceSize );
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -198,7 +189,9 @@ class MdxDictionary: public BtreeIndexing::BtreeDictionary
|
||||||
File::Class idx;
|
File::Class idx;
|
||||||
IdxHeader idxHeader;
|
IdxHeader idxHeader;
|
||||||
string dictionaryName;
|
string dictionaryName;
|
||||||
|
string encoding;
|
||||||
ChunkedStorage::Reader chunks;
|
ChunkedStorage::Reader chunks;
|
||||||
|
QFile dictFile;
|
||||||
IndexedMdd mddResource;
|
IndexedMdd mddResource;
|
||||||
MdictParser::StyleSheets styleSheets;
|
MdictParser::StyleSheets styleSheets;
|
||||||
|
|
||||||
|
@ -263,7 +256,7 @@ private:
|
||||||
void doDeferredInit();
|
void doDeferredInit();
|
||||||
|
|
||||||
/// Loads an article with the given offset, filling the given strings.
|
/// Loads an article with the given offset, filling the given strings.
|
||||||
void loadArticle( uint32_t offset, string & headword, string & articleText );
|
void loadArticle( uint32_t offset, string & articleText );
|
||||||
|
|
||||||
/// Process resource links (images, audios, etc)
|
/// Process resource links (images, audios, etc)
|
||||||
string filterResource( const char * articleId, const char * article );
|
string filterResource( const char * articleId, const char * article );
|
||||||
|
@ -283,14 +276,21 @@ MdxDictionary::MdxDictionary( string const & id, string const & indexFile,
|
||||||
mddResource( idxMutex, chunks ),
|
mddResource( idxMutex, chunks ),
|
||||||
deferredInitRunnableStarted( false )
|
deferredInitRunnableStarted( false )
|
||||||
{
|
{
|
||||||
idx.seek( sizeof( idxHeader ) );
|
|
||||||
|
|
||||||
// Read the dictionary's name
|
// Read the dictionary's name
|
||||||
|
idx.seek( sizeof( idxHeader ) );
|
||||||
size_t len = idx.read< uint32_t >();
|
size_t len = idx.read< uint32_t >();
|
||||||
vector< char > nameBuf( len );
|
vector< char > buf( len );
|
||||||
idx.read( &nameBuf.front(), len );
|
idx.read( &buf.front(), len );
|
||||||
|
dictionaryName = string( &buf.front(), len );
|
||||||
|
|
||||||
dictionaryName = string( &nameBuf.front(), len );
|
// then read the dictionary's encoding
|
||||||
|
len = idx.read< uint32_t >();
|
||||||
|
buf.resize( len );
|
||||||
|
idx.read( &buf.front(), len );
|
||||||
|
encoding = string( &buf.front(), len );
|
||||||
|
|
||||||
|
dictFile.setFileName( QString::fromUtf8( dictionaryFiles[ 0 ].c_str() ) );
|
||||||
|
dictFile.open( QIODevice::ReadOnly );
|
||||||
}
|
}
|
||||||
|
|
||||||
MdxDictionary::~MdxDictionary()
|
MdxDictionary::~MdxDictionary()
|
||||||
|
@ -300,6 +300,8 @@ MdxDictionary::~MdxDictionary()
|
||||||
// Wait for init runnable to complete if it was ever started
|
// Wait for init runnable to complete if it was ever started
|
||||||
if ( deferredInitRunnableStarted )
|
if ( deferredInitRunnableStarted )
|
||||||
deferredInitRunnableExited.acquire();
|
deferredInitRunnableExited.acquire();
|
||||||
|
|
||||||
|
dictFile.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
//////// MdxDictionary::deferredInit()
|
//////// MdxDictionary::deferredInit()
|
||||||
|
@ -530,10 +532,9 @@ void MdxArticleRequest::run()
|
||||||
continue; // We already have this article in the body.
|
continue; // We already have this article in the body.
|
||||||
|
|
||||||
// Grab that article
|
// Grab that article
|
||||||
string headword;
|
|
||||||
string articleBody;
|
string articleBody;
|
||||||
|
|
||||||
dict.loadArticle( chain[ x ].articleOffset, headword, articleBody );
|
dict.loadArticle( chain[ x ].articleOffset, articleBody );
|
||||||
|
|
||||||
if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
|
if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
|
||||||
continue; // We already have this article in the body.
|
continue; // We already have this article in the body.
|
||||||
|
@ -700,7 +701,7 @@ void MddResourceRequest::run()
|
||||||
{
|
{
|
||||||
data.push_back( '\0' );
|
data.push_back( '\0' );
|
||||||
data.push_back( '\0' );
|
data.push_back( '\0' );
|
||||||
QString target = MdxParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ),
|
QString target = MdictParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ),
|
||||||
data.size() - sizeof( pattern ) );
|
data.size() - sizeof( pattern ) );
|
||||||
resourceName = gd::toWString( target.trimmed() );
|
resourceName = gd::toWString( target.trimmed() );
|
||||||
continue;
|
continue;
|
||||||
|
@ -761,25 +762,57 @@ void MdxDictionary::loadIcon() throw()
|
||||||
dictionaryIconLoaded = true;
|
dictionaryIconLoaded = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void MdxDictionary::loadArticle( uint32_t offset, string & headword, string & articleText )
|
DEF_EX( exCorruptDictionary, "dictionary file tampered or corrupted", std::exception )
|
||||||
|
|
||||||
|
void MdxDictionary::loadArticle( uint32_t offset, string & articleText )
|
||||||
{
|
{
|
||||||
vector< char > chunk;
|
vector< char > chunk;
|
||||||
Mutex::Lock _( idxMutex );
|
Mutex::Lock _( idxMutex );
|
||||||
|
|
||||||
char * articleData = chunks.getBlock( offset, chunk );
|
// Load record info from index
|
||||||
|
MdictParser::RecordInfo recordInfo;
|
||||||
|
char * pRecordInfo = chunks.getBlock( offset, chunk );
|
||||||
|
memcpy( &recordInfo, pRecordInfo, sizeof( recordInfo ) );
|
||||||
|
|
||||||
// Make an sub unique id for this article
|
// Make an sub unique id for this article
|
||||||
QString articleId;
|
QString articleId;
|
||||||
articleId.setNum( ( quint64 )articleData, 16 );
|
articleId.setNum( ( quint64 )pRecordInfo, 16 );
|
||||||
|
|
||||||
headword = articleData;
|
articleText = string( QObject::tr( "Article loading error" ).toUtf8().constData() );
|
||||||
articleText = string( articleData + headword.size() + 1 );
|
|
||||||
articleText = MdxParser::substituteStylesheet( articleText, styleSheets );
|
try
|
||||||
articleText = filterResource( articleId.toLatin1().constData(), articleText.c_str() );
|
{
|
||||||
|
ScopedMemMap compressed( dictFile, recordInfo.compressedBlockPos, recordInfo.compressedBlockSize );
|
||||||
|
if ( !compressed.startAddress() )
|
||||||
|
throw exCorruptDictionary();
|
||||||
|
|
||||||
|
QByteArray decompressed;
|
||||||
|
if ( !MdictParser::parseCompressedBlock( recordInfo.compressedBlockSize, ( char * )compressed.startAddress(),
|
||||||
|
recordInfo.decompressedBlockSize, decompressed ) )
|
||||||
|
return;
|
||||||
|
|
||||||
|
QString article = MdictParser::toUtf16( encoding.c_str(),
|
||||||
|
decompressed.constData() + recordInfo.recordOffset,
|
||||||
|
recordInfo.recordSize );
|
||||||
|
|
||||||
|
article = MdictParser::substituteStylesheet( article, styleSheets );
|
||||||
|
articleText = filterResource( articleId.toLatin1().constData(), article.toUtf8().constData() );
|
||||||
|
}
|
||||||
|
catch ( std::exception & e )
|
||||||
|
{
|
||||||
|
FDPRINTF( stderr, "MDict: load article from %s failed, error: %s\n",
|
||||||
|
getDictionaryFilenames()[ 0 ].c_str(), e.what() );
|
||||||
|
}
|
||||||
|
catch ( ... )
|
||||||
|
{
|
||||||
|
FDPRINTF( stderr, "MDict: load article from %s failed, error: %s\n",
|
||||||
|
getDictionaryFilenames()[ 0 ].c_str(), "unknown error" );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
string MdxDictionary::filterResource( const char * articleId, const char * article )
|
string MdxDictionary::filterResource( const char * articleId, const char * article )
|
||||||
{
|
{
|
||||||
|
QString id = QString::fromStdString( getId() );
|
||||||
QString uniquePrefix = QString::fromStdString( getId() + "_" + articleId + "_" );
|
QString uniquePrefix = QString::fromStdString( getId() + "_" + articleId + "_" );
|
||||||
|
|
||||||
return string( QString::fromUtf8( article )
|
return string( QString::fromUtf8( article )
|
||||||
|
@ -794,15 +827,21 @@ string MdxDictionary::filterResource( const char * articleId, const char * artic
|
||||||
// sounds, and audio link script
|
// sounds, and audio link script
|
||||||
.replace( QRegExp( "(<\\s*a\\s+[^>]*href\\s*=\\s*\")sound://([^\"']*)", Qt::CaseInsensitive ),
|
.replace( QRegExp( "(<\\s*a\\s+[^>]*href\\s*=\\s*\")sound://([^\"']*)", Qt::CaseInsensitive ),
|
||||||
QString::fromStdString( addAudioLink( "\"gdau://" + getId() + "/\\2\"", getId() ) ) +
|
QString::fromStdString( addAudioLink( "\"gdau://" + getId() + "/\\2\"", getId() ) ) +
|
||||||
"\\1gdau://" + QString::fromStdString( getId() ) + "/\\2" )
|
"\\1gdau://" + id + "/\\2" )
|
||||||
// stylesheets
|
// stylesheets
|
||||||
.replace( QRegExp( "(<\\s*link\\s+[^>]*href\\s*=\\s*[\"']+)(file://)?[\\x00-\\x30\\x7f]*([^\"']*)",
|
.replace( QRegExp( "(<\\s*link\\s+[^>]*href\\s*=\\s*[\"']+)(?:file://)?[\\x00-\\x30\\x7f]*([^\"']*)",
|
||||||
Qt::CaseInsensitive, QRegExp::RegExp2 ),
|
Qt::CaseInsensitive, QRegExp::RegExp2 ),
|
||||||
"\\1bres://" + QString::fromStdString( getId() ) + "/\\3" )
|
"\\1bres://" + id + "/\\2" )
|
||||||
|
.replace( QRegExp( "(<\\s*link\\s+[^>]*href\\s*=\\s*)(?!['\"]+)(?!bres:|data:)(?:file://)?([^\\s>]+)",
|
||||||
|
Qt::CaseInsensitive, QRegExp::RegExp2 ),
|
||||||
|
"\\1\"bres://" + id + "/\\\"" )
|
||||||
// images
|
// images
|
||||||
.replace( QRegExp( "(<\\s*img\\s+[^>]*src\\s*=\\s*[\"']+)(file://)?[\\x00-\\x30\\x7f]*([^\"']*)",
|
.replace( QRegExp( "(<\\s*img\\s+[^>]*src\\s*=\\s*[\"']+)(?:file://)?[\\x00-\\x30\\x7f]*([^\"']*)",
|
||||||
Qt::CaseInsensitive, QRegExp::RegExp2 ),
|
Qt::CaseInsensitive, QRegExp::RegExp2 ),
|
||||||
"\\1bres://" + QString::fromStdString( getId() ) + "/\\3" )
|
"\\1bres://" + id + "/\\2" )
|
||||||
|
.replace( QRegExp( "(<\\s*img\\s+[^>]*src\\s*=\\s*)(?!['\"]+)(?!bres:|data:)(?:file://)?([^\\s>]+)",
|
||||||
|
Qt::CaseInsensitive, QRegExp::RegExp2 ),
|
||||||
|
"\\1\"bres://" + id + "/\\2\"" )
|
||||||
.toUtf8().constData() );
|
.toUtf8().constData() );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -820,36 +859,20 @@ static void addEntryToIndexSingle( QString const & word, uint32_t offset, Indexe
|
||||||
indexedWords.addSingleWord( gd::toWString( wordTrimmed ), offset );
|
indexedWords.addSingleWord( gd::toWString( wordTrimmed ), offset );
|
||||||
}
|
}
|
||||||
|
|
||||||
class ArticleHandler: public MdxParser::ArticleHandler
|
class ArticleHandler: public MdictParser::RecordHandler
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
ArticleHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ) :
|
ArticleHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ) :
|
||||||
chunks( chunks ),
|
chunks( chunks ),
|
||||||
indexedWords( indexedWords ),
|
indexedWords( indexedWords )
|
||||||
articleCount_( 0 )
|
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
inline size_t articleCount()
|
virtual void handleRecord( QString const & headWord, MdictParser::RecordInfo const & recordInfo )
|
||||||
{
|
{
|
||||||
return articleCount_;
|
// Save the article's record info
|
||||||
}
|
|
||||||
|
|
||||||
void handleAritcle( QString const & headWord, QString const & article )
|
|
||||||
{
|
|
||||||
if ( !article.startsWith( "@@@LINK=" ) )
|
|
||||||
{
|
|
||||||
articleCount_++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Save the article's body itself first
|
|
||||||
uint32_t articleAddress = chunks.startNewBlock();
|
uint32_t articleAddress = chunks.startNewBlock();
|
||||||
string headWordU8 = string( headWord.toUtf8().constData() );
|
chunks.addToBlock( &recordInfo, sizeof( recordInfo ) );
|
||||||
string articleU8 = string( article.toUtf8().constData() );
|
|
||||||
|
|
||||||
chunks.addToBlock( headWordU8.c_str(), headWordU8.size() + 1 );
|
|
||||||
chunks.addToBlock( articleU8.c_str(), articleU8.size() + 1 );
|
|
||||||
|
|
||||||
// Add entries to the index
|
// Add entries to the index
|
||||||
addEntryToIndex( headWord, articleAddress, indexedWords );
|
addEntryToIndex( headWord, articleAddress, indexedWords );
|
||||||
}
|
}
|
||||||
|
@ -857,10 +880,9 @@ public:
|
||||||
private:
|
private:
|
||||||
ChunkedStorage::Writer & chunks;
|
ChunkedStorage::Writer & chunks;
|
||||||
IndexedWords & indexedWords;
|
IndexedWords & indexedWords;
|
||||||
size_t articleCount_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class ResourceHandler: public MddParser::ResourceHandler
|
class ResourceHandler: public MdictParser::RecordHandler
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
ResourceHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ):
|
ResourceHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ):
|
||||||
|
@ -869,18 +891,10 @@ public:
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
void handleResource( QString const & fileName, quint32 decompressedBlockSize,
|
virtual void handleRecord( QString const & fileName, MdictParser::RecordInfo const & recordInfo )
|
||||||
quint32 compressedBlockPos, quint32 compressedBlockSize,
|
|
||||||
quint32 resourceOffset, quint32 resourceSize )
|
|
||||||
{
|
{
|
||||||
uint32_t resourceInfoAddress = chunks.startNewBlock();
|
uint32_t resourceInfoAddress = chunks.startNewBlock();
|
||||||
MddIndexEntry mddIndexEntry;
|
chunks.addToBlock( &recordInfo, sizeof( recordInfo ) );
|
||||||
mddIndexEntry.decompressedBlockSize = decompressedBlockSize;
|
|
||||||
mddIndexEntry.compressedBlockPos = compressedBlockPos;
|
|
||||||
mddIndexEntry.compressedBlockSize = compressedBlockSize;
|
|
||||||
mddIndexEntry.resourceOffset = resourceOffset;
|
|
||||||
mddIndexEntry.resourceSize = resourceSize;
|
|
||||||
chunks.addToBlock( &mddIndexEntry, sizeof( mddIndexEntry ) );
|
|
||||||
// Add entries to the index
|
// Add entries to the index
|
||||||
addEntryToIndexSingle( fileName, resourceInfoAddress, indexedWords );
|
addEntryToIndexSingle( fileName, resourceInfoAddress, indexedWords );
|
||||||
}
|
}
|
||||||
|
@ -935,15 +949,15 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
|
||||||
indexIsOldOrBad( indexFile, !mddFileName.empty() ) )
|
indexIsOldOrBad( indexFile, !mddFileName.empty() ) )
|
||||||
{
|
{
|
||||||
// Building the index
|
// Building the index
|
||||||
MdxParser parser( i->c_str() );
|
MdictParser parser( i->c_str() );
|
||||||
sptr<MddParser> mddParser = NULL;
|
sptr<MdictParser> mddParser = NULL;
|
||||||
|
|
||||||
if ( !parser.open() )
|
if ( !parser.open() )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if ( File::exists( mddFileName ) )
|
if ( File::exists( mddFileName ) )
|
||||||
{
|
{
|
||||||
mddParser = new MddParser( mddFileName.c_str() );
|
mddParser = new MdictParser( mddFileName.c_str() );
|
||||||
if ( !mddParser->open() )
|
if ( !mddParser->open() )
|
||||||
{
|
{
|
||||||
FDPRINTF( stderr, "Warning: Invalid mdd (resource) file: %s\n", mddFileName.c_str() );
|
FDPRINTF( stderr, "Warning: Invalid mdd (resource) file: %s\n", mddFileName.c_str() );
|
||||||
|
@ -960,9 +974,18 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
|
||||||
// We write a dummy header first. At the end of the process the header
|
// We write a dummy header first. At the end of the process the header
|
||||||
// will be rewritten with the right values.
|
// will be rewritten with the right values.
|
||||||
idx.write( idxHeader );
|
idx.write( idxHeader );
|
||||||
|
|
||||||
|
// Write the title first
|
||||||
idx.write< uint32_t >( title.size() );
|
idx.write< uint32_t >( title.size() );
|
||||||
idx.write( title.data(), title.size() );
|
idx.write( title.data(), title.size() );
|
||||||
|
|
||||||
|
// then the encoding
|
||||||
|
{
|
||||||
|
string encoding = string( parser.encoding().toUtf8().constData() );
|
||||||
|
idx.write< uint32_t >( encoding.size() );
|
||||||
|
idx.write( encoding.data(), encoding.size() );
|
||||||
|
}
|
||||||
|
|
||||||
// This is our index data that we accumulate during the loading process.
|
// This is our index data that we accumulate during the loading process.
|
||||||
// For each new word encountered, we emit the article's body to the file
|
// For each new word encountered, we emit the article's body to the file
|
||||||
// immediately, inserting the word itself and its offset in this map.
|
// immediately, inserting the word itself and its offset in this map.
|
||||||
|
@ -976,10 +999,9 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
|
||||||
// Save dictionary description if there's one
|
// Save dictionary description if there's one
|
||||||
{
|
{
|
||||||
string description = string( parser.description().toUtf8().constData() );
|
string description = string( parser.description().toUtf8().constData() );
|
||||||
idxHeader.descriptionSize = 0;
|
|
||||||
idxHeader.descriptionAddress = chunks.startNewBlock();
|
idxHeader.descriptionAddress = chunks.startNewBlock();
|
||||||
chunks.addToBlock( description.c_str(), description.size() + 1 );
|
chunks.addToBlock( description.c_str(), description.size() + 1 );
|
||||||
idxHeader.descriptionSize += description.size() + 1;
|
idxHeader.descriptionSize = description.size() + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
ArticleHandler articleHandler( chunks, indexedWords );
|
ArticleHandler articleHandler( chunks, indexedWords );
|
||||||
|
@ -1062,7 +1084,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
|
||||||
idxHeader.formatVersion = kCurrentFormatVersion;
|
idxHeader.formatVersion = kCurrentFormatVersion;
|
||||||
idxHeader.parserVersion = MdictParser::kParserVersion;
|
idxHeader.parserVersion = MdictParser::kParserVersion;
|
||||||
idxHeader.foldingVersion = Folding::Version;
|
idxHeader.foldingVersion = Folding::Version;
|
||||||
idxHeader.articleCount = articleHandler.articleCount();
|
idxHeader.articleCount = parser.wordCount();
|
||||||
idxHeader.wordCount = parser.wordCount();
|
idxHeader.wordCount = parser.wordCount();
|
||||||
|
|
||||||
idx.rewind();
|
idx.rewind();
|
||||||
|
|
Loading…
Reference in a new issue