goldendict-ng/mdictparser.hh

220 lines
5.4 KiB
C++
Raw Normal View History

// https://bitbucket.org/xwang/mdict-analysis
2015-10-11 14:58:31 +00:00
// https://github.com/zhansliu/writemdict/blob/master/fileformat.md
// Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
//
// Copyright (C) 2012, 2013 Xiaoqiang Wang <xiaoqiangwang AT gmail DOT com>
// Copyright (C) 2013 Timon Wong <timon86.wang AT gmail DOT com>
2015-10-11 14:58:31 +00:00
// Copyright (C) 2015 Zhe Wang <0x1998 AT gmail DOT com>
//
// This program is a free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 3 of the License.
//
// You can get a copy of GNU General Public License along this program
// But you can always get it from http://www.gnu.org/licenses/gpl.txt
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
#ifndef __MDICTPARSER_HH_INCLUDED__
#define __MDICTPARSER_HH_INCLUDED__
#include <string>
#include <vector>
#include <map>
#include <utility>
#include <QPointer>
#include <QFile>
namespace Mdict
{
using std::string;
using std::vector;
using std::pair;
using std::map;
// A helper class to handle memory map for QFile
class ScopedMemMap
{
QFile & file;
uchar * address;
public:
ScopedMemMap( QFile & file, qint64 offset, qint64 size ) :
file( file ),
address( file.map( offset, size ) )
{
}
~ScopedMemMap()
{
if ( address )
file.unmap( address );
}
inline uchar * startAddress()
{
return address;
}
};
class MdictParser
{
public:
enum
{
kParserVersion = 0x000000d
};
struct RecordIndex
{
qint64 startPos;
qint64 endPos;
qint64 shadowStartPos;
qint64 shadowEndPos;
qint64 compressedSize;
qint64 decompressedSize;
inline bool operator==( qint64 rhs ) const
{
return ( shadowStartPos <= rhs ) && ( rhs < shadowEndPos );
}
inline bool operator<( qint64 rhs ) const
{
return shadowEndPos <= rhs;
}
inline bool operator>( qint64 rhs ) const
{
return shadowStartPos > rhs;
}
static size_t bsearch( vector<RecordIndex> const & offsets, qint64 val );
};
struct RecordInfo
{
qint64 compressedBlockPos;
qint64 recordOffset;
qint64 decompressedBlockSize;
qint64 compressedBlockSize;
qint64 recordSize;
};
class RecordHandler
{
public:
virtual void handleRecord( QString const & name, RecordInfo const & recordInfo ) = 0;
};
typedef vector< pair<qint64, qint64> > BlockInfoVector;
typedef vector< pair<qint64, QString> > HeadWordIndex;
typedef map<qint32, pair<QString, QString> > StyleSheets;
inline QString const & title() const
{
return title_;
}
inline QString const & description() const
{
return description_;
}
inline StyleSheets const & styleSheets() const
{
return styleSheets_;
}
inline quint32 wordCount() const
{
return wordCount_;
}
inline QString const & encoding() const
{
return encoding_;
}
inline QString const & filename() const
{
return filename_;
}
inline bool isRightToLeft() const
{
return rtl_;
}
MdictParser();
~MdictParser() {}
bool open( const char * filename );
bool readNextHeadWordIndex( HeadWordIndex & headWordIndex );
bool readRecordBlock( HeadWordIndex & headWordIndex, RecordHandler & recordHandler, bool cross_block_read=false );
// helpers
static QString toUtf16( const char * fromCode, const char * from, size_t fromSize );
static inline QString toUtf16( QString const & fromCode, const char * from, size_t fromSize )
{
return toUtf16( fromCode.toLatin1().constData(), from, fromSize );
}
static bool parseCompressedBlock( qint64 compressedBlockSize, const char * compressedBlockPtr,
2015-10-08 17:11:45 +00:00
qint64 decompressedBlockSize, QByteArray & decompressedBlock);
static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets );
static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets )
{
QString s = QString::fromUtf8( article.c_str() );
substituteStylesheet( s, styleSheets );
return s.toStdString();
}
protected:
qint64 readNumber( QDataStream & in );
static quint32 readU8OrU16( QDataStream & in, bool isU16 );
2015-10-08 17:11:45 +00:00
static bool checkAdler32(const char * buffer, unsigned int len, quint32 checksum);
static bool decryptHeadWordIndex(char * buffer, qint64 len);
bool readHeader( QDataStream & in );
bool readHeadWordBlockInfos( QDataStream & in );
bool readRecordBlockInfos();
BlockInfoVector decodeHeadWordBlockInfo( QByteArray const & headWordBlockInfo );
HeadWordIndex splitHeadWordBlock( QByteArray const & block );
protected:
QString filename_;
QPointer<QFile> file_;
StyleSheets styleSheets_;
BlockInfoVector headWordBlockInfos_;
BlockInfoVector::iterator headWordBlockInfosIter_;
vector<RecordIndex> recordBlockInfos_;
QString encoding_;
QString title_;
QString description_;
double version_;
qint64 numHeadWordBlocks_;
qint64 headWordBlockInfoSize_;
qint64 headWordBlockSize_;
qint64 headWordBlockInfoPos_;
qint64 headWordPos_;
qint64 totalRecordsSize_;
qint64 recordPos_;
quint32 wordCount_;
int numberTypeSize_;
2015-10-08 17:11:45 +00:00
int encrypted_;
bool rtl_;
};
}
#endif // __MDICTPARSER_HH_INCLUDED__