2013-04-23 12:07:05 +00:00
|
|
|
// https://bitbucket.org/xwang/mdict-analysis
|
2015-10-08 17:11:45 +00:00
|
|
|
// https://github.com/zhansliu/writemdict/blob/master/fileformat.md
|
2013-04-23 12:07:05 +00:00
|
|
|
// Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
|
|
|
|
//
|
|
|
|
// Copyright (C) 2012, 2013 Xiaoqiang Wang <xiaoqiangwang AT gmail DOT com>
|
|
|
|
// Copyright (C) 2013 Timon Wong <timon86.wang AT gmail DOT com>
|
2015-10-08 17:11:45 +00:00
|
|
|
// Copyright (C) 2015 Zhe Wang <0x1998 AT gmail DOT com>
|
2013-04-23 12:07:05 +00:00
|
|
|
//
|
|
|
|
// This program is a free software; you can redistribute it and/or modify
|
|
|
|
// it under the terms of the GNU General Public License as published by
|
|
|
|
// the Free Software Foundation, version 3 of the License.
|
|
|
|
//
|
|
|
|
// You can get a copy of GNU General Public License along this program
|
|
|
|
// But you can always get it from http://www.gnu.org/licenses/gpl.txt
|
|
|
|
//
|
|
|
|
// This program is distributed in the hope that it will be useful,
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
// GNU General Public License for more details.
|
|
|
|
|
|
|
|
#include "mdictparser.hh"
|
|
|
|
|
|
|
|
#include <errno.h>
|
|
|
|
#include <zlib.h>
|
|
|
|
#include <lzo/lzo1x.h>
|
2022-09-08 13:46:17 +00:00
|
|
|
|
2013-04-23 12:07:05 +00:00
|
|
|
#include <QtEndian>
|
|
|
|
#include <QStringList>
|
|
|
|
#include <QByteArray>
|
|
|
|
#include <QFileInfo>
|
2018-02-27 16:42:21 +00:00
|
|
|
#include <QRegularExpression>
|
2013-04-23 12:07:05 +00:00
|
|
|
#include <QDomDocument>
|
|
|
|
#include <QTextDocumentFragment>
|
2015-05-20 14:52:33 +00:00
|
|
|
#include <QDataStream>
|
2022-02-28 16:26:59 +00:00
|
|
|
#if (QT_VERSION >= QT_VERSION_CHECK(6,0,0))
|
|
|
|
#include <QtCore5Compat/QTextCodec>
|
|
|
|
#else
|
2021-10-18 16:19:25 +00:00
|
|
|
#include <QTextCodec>
|
2022-02-28 16:26:59 +00:00
|
|
|
#endif
|
2013-04-23 16:38:30 +00:00
|
|
|
#include "decompress.hh"
|
2015-10-08 17:11:45 +00:00
|
|
|
#include "gddebug.hh"
|
2015-10-11 14:58:31 +00:00
|
|
|
#include "ripemd.hh"
|
2022-04-16 07:14:26 +00:00
|
|
|
#include "utils.hh"
|
2022-04-16 11:34:12 +00:00
|
|
|
#include "htmlescape.hh"
|
2013-04-23 12:07:05 +00:00
|
|
|
|
2013-04-28 08:26:04 +00:00
|
|
|
namespace Mdict
|
|
|
|
{
|
|
|
|
|
2015-10-08 17:11:45 +00:00
|
|
|
// Bit flags that are tested against the integer value of the header's
// "Encrypted" attribute to find out which parts of the file are encrypted.
enum EncryptedSection
{
  EcryptedHeadWordHeader = 0x1,
  EcryptedHeadWordIndex  = 0x2
};
|
|
|
|
|
2013-04-23 12:07:05 +00:00
|
|
|
// Returns the number of 16-bit units that precede the zero terminator of
// `unicode`. A null pointer is treated as an empty string and yields 0.
static inline int u16StrSize( const ushort * unicode )
{
  if ( !unicode )
    return 0;

  const ushort * cursor = unicode;
  while ( *cursor != 0 )
    ++cursor;

  return static_cast< int >( cursor - unicode );
}
|
|
|
|
|
|
|
|
// Loads `headerText` into a DOM document and returns the attribute map of its
// root element. The result of the parse itself is not inspected here.
static QDomNamedNodeMap parseHeaderAttributes( const QString & headerText )
{
  QDomDocument document;
  document.setContent( headerText );

  return document.documentElement().attributes();
}
|
|
|
|
|
|
|
|
// Binary search for the entry of `offsets` that compares equal to `val`
// (using RecordIndex's own comparison operators against a qint64).
//
// Returns the index of the matching entry, or (size_t)(-1) when `offsets` is
// empty or no entry matches.
//
// The search runs over the half-open range [lo, hi) so that narrowing the
// upper bound never computes `mid - 1`; with unsigned indices that expression
// wraps around when mid == 0 and would lead to an out-of-range access.
size_t MdictParser::RecordIndex::bsearch( const vector<MdictParser::RecordIndex> & offsets, qint64 val )
{
  size_t lo = 0;
  size_t hi = offsets.size(); // exclusive upper bound

  while ( lo < hi )
  {
    // Written as lo + (hi - lo) / 2 so the midpoint cannot overflow.
    const size_t mid = lo + ( hi - lo ) / 2;
    RecordIndex const & p = offsets[ mid ];

    if ( p == val )
      return mid;

    if ( p < val )
      lo = mid + 1;
    else
      hi = mid;
  }

  return ( size_t )( -1 );
}
|
|
|
|
|
2013-05-11 05:41:26 +00:00
|
|
|
// Builds a parser that is not yet attached to a file: all counters, sizes and
// positions start at zero and the right-to-left flag is cleared. The real
// values are filled in while a file is being opened.
MdictParser::MdictParser()
  : version_( 0 )
  , numHeadWordBlocks_( 0 )
  , headWordBlockInfoSize_( 0 )
  , headWordBlockSize_( 0 )
  , headWordBlockInfoPos_( 0 )
  , headWordPos_( 0 )
  , totalRecordsSize_( 0 )
  , recordPos_( 0 )
  , wordCount_( 0 )
  , numberTypeSize_( 0 )
  , encrypted_( 0 )
  , rtl_( false )
{
}
|
|
|
|
|
2013-05-11 05:41:26 +00:00
|
|
|
bool MdictParser::open( const char * filename )
|
2013-04-23 12:07:05 +00:00
|
|
|
{
|
2013-05-11 05:41:26 +00:00
|
|
|
filename_ = QString::fromUtf8( filename );
|
2013-04-23 12:07:05 +00:00
|
|
|
file_ = new QFile( filename_ );
|
|
|
|
|
2022-05-15 04:41:24 +00:00
|
|
|
gdDebug( "MdictParser: open %s", filename );
|
2013-05-11 05:41:26 +00:00
|
|
|
|
2013-04-23 12:07:05 +00:00
|
|
|
if ( file_.isNull() || !file_->exists() )
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if ( !file_->open( QIODevice::ReadOnly ) )
|
|
|
|
return false;
|
|
|
|
|
|
|
|
QDataStream in( file_ );
|
|
|
|
in.setByteOrder( QDataStream::BigEndian );
|
|
|
|
|
|
|
|
if ( !readHeader( in ) )
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if ( !readHeadWordBlockInfos( in ) )
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if ( !readRecordBlockInfos() )
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Decodes the next headword block of the file into `headWordIndex`.
//
// Returns false when every block has already been consumed, when the block is
// too small to hold its 8-byte prefix, when the block cannot be memory-mapped,
// or when it fails to decompress. On success the internal block iterator moves
// on to the following block.
bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIndex )
{
  if ( headWordBlockInfosIter_ == headWordBlockInfos_.end() )
    return false;

  const qint64 packedSize   = headWordBlockInfosIter_->first;
  const qint64 unpackedSize = headWordBlockInfosIter_->second;

  if ( packedSize < 8 )
    return false;

  ScopedMemMap mapping( *file_, headWordPos_, packedSize );
  if ( !mapping.startAddress() )
    return false;

  // The file position is advanced as soon as the block is mapped, i.e. even
  // if decoding it fails below.
  headWordPos_ += packedSize;

  QByteArray unpacked;
  const bool decoded = parseCompressedBlock( packedSize, ( char * )mapping.startAddress(),
                                             unpackedSize, unpacked );
  if ( !decoded )
    return false;

  headWordIndex = splitHeadWordBlock( unpacked );
  ++headWordBlockInfosIter_;
  return true;
}
|
2013-04-23 12:07:05 +00:00
|
|
|
|
2015-10-08 17:11:45 +00:00
|
|
|
bool MdictParser::checkAdler32(const char * buffer, unsigned int len, quint32 checksum)
|
|
|
|
{
|
|
|
|
uLong adler = adler32( 0L, Z_NULL, 0 );
|
|
|
|
adler = adler32( adler, ( const Bytef * ) buffer, len );
|
|
|
|
return (adler & 0xFFFFFFFF) == checksum;
|
2013-04-23 12:07:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Converts `fromSize` bytes starting at `from`, encoded in the codec named
// `fromCode`, into a QString.
//
// Returns an empty QString when either pointer is null or when no codec with
// the requested name is available (QTextCodec::codecForName() returns a null
// pointer in that case, which must not be dereferenced).
QString MdictParser::toUtf16( const char * fromCode, const char * from, size_t fromSize )
{
  if ( !fromCode || !from )
    return QString();

  QTextCodec * codec = QTextCodec::codecForName( fromCode );
  if ( !codec )
  {
    gdWarning( "MDict: toUtf16: unsupported encoding %s", fromCode );
    return QString();
  }

  return codec->toUnicode( from, fromSize );
}
|
|
|
|
|
2015-10-08 17:11:45 +00:00
|
|
|
bool MdictParser::decryptHeadWordIndex(char * buffer, qint64 len)
|
2013-04-23 12:07:05 +00:00
|
|
|
{
|
2015-10-11 14:58:31 +00:00
|
|
|
RIPEMD128 ripemd;
|
|
|
|
ripemd.update( ( const uchar * ) buffer + 4, 4 );
|
|
|
|
ripemd.update( ( const uchar * ) "\x95\x36\x00\x00", 4 );
|
2013-04-23 12:07:05 +00:00
|
|
|
|
2015-10-08 17:11:45 +00:00
|
|
|
uint8_t key[16];
|
2015-10-11 14:58:31 +00:00
|
|
|
ripemd.digest( key );
|
2013-04-23 12:07:05 +00:00
|
|
|
|
2015-10-08 17:11:45 +00:00
|
|
|
buffer += 8;
|
|
|
|
len -= 8;
|
|
|
|
uint8_t prev = 0x36;
|
|
|
|
for (qint64 i = 0; i < len; ++i)
|
2013-04-23 12:07:05 +00:00
|
|
|
{
|
2015-10-08 17:11:45 +00:00
|
|
|
uint8_t byte = buffer[i];
|
|
|
|
byte = (byte >> 4) | (byte << 4);
|
|
|
|
byte = byte ^ prev ^ (i & 0xFF) ^ key[i % 16];
|
|
|
|
prev = buffer[i];
|
|
|
|
buffer[i] = byte;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
2013-05-01 05:34:56 +00:00
|
|
|
|
2015-10-08 17:11:45 +00:00
|
|
|
// Decodes one block of the file into `decompressedBlock`.
//
// Layout of the input (`compressedBlockSize` bytes at `compressedBlockPtr`):
//   bytes 0..3  compression type (0x00000000 plain, 0x01000000 LZO,
//               0x02000000 zlib), read as a big-endian 32-bit value
//   bytes 4..7  big-endian Adler-32 checksum of the decoded payload
//   bytes 8..   payload
//
// `decompressedBlockSize` is the expected size of the decoded payload; it is
// only used by the LZO branch, which needs it to size the output buffer.
//
// Returns false when the block is not larger than its 8-byte prefix, when the
// type is unknown, when decompression fails or when the checksum does not
// match.
bool MdictParser::parseCompressedBlock( qint64 compressedBlockSize,
                                        const char * compressedBlockPtr,
                                        qint64 decompressedBlockSize,
                                        QByteArray & decompressedBlock )
{
  if ( compressedBlockSize <= 8 )
    return false;

  // compression type
  quint32 type     = qFromBigEndian< quint32 >( ( const uchar * )compressedBlockPtr );
  quint32 checksum = qFromBigEndian< quint32 >( ( const uchar * )compressedBlockPtr + 4 );
  const char * buf = compressedBlockPtr + 8;
  qint64 size      = compressedBlockSize - 8;

  switch ( type )
  {
    case 0x00000000:
      // No compression
      if ( !checkAdler32( buf, size, checksum ) )
      {
        gdWarning( "MDict: parseCompressedBlock: plain: checksum not match" );
        return false;
      }

      decompressedBlock = QByteArray( buf, size );
      return true;

    case 0x01000000:
    {
      // LZO compression
      //
      // The expected size comes from the file. The decompressor is told it
      // may write that many bytes, so the buffer must really be that large:
      // reject negative sizes and sizes that were truncated when converted
      // to the QByteArray size type.
      if ( decompressedBlockSize < 0 )
      {
        gdWarning( "MDict: parseCompressedBlock: decompression failed" );
        return false;
      }

      decompressedBlock.resize( decompressedBlockSize );
      if ( decompressedBlock.size() != decompressedBlockSize )
      {
        gdWarning( "MDict: parseCompressedBlock: decompression failed" );
        return false;
      }

      lzo_uint blockSize = ( lzo_uint )decompressedBlockSize;
      int result = lzo1x_decompress_safe( ( const uchar * )buf, size,
                                          ( uchar * )decompressedBlock.data(),
                                          &blockSize, nullptr );

      if ( result != LZO_E_OK || blockSize != ( lzo_uint )decompressedBlockSize )
      {
        gdWarning( "MDict: parseCompressedBlock: decompression failed" );
        return false;
      }

      if ( checksum != lzo_adler32( lzo_adler32( 0, nullptr, 0 ),
                                    ( const uchar * )decompressedBlock.constData(),
                                    blockSize ) )
      {
        gdWarning( "MDict: parseCompressedBlock: lzo: checksum does not match" );
        return false;
      }
    }
    break;

    case 0x02000000:
      // zlib compression
      decompressedBlock = zlibDecompress( buf, size );

      if ( !checkAdler32( decompressedBlock.constData(), decompressedBlock.size(),
                          checksum ) )
      {
        gdWarning( "MDict: parseCompressedBlock: zlib: checksum does not match" );
        return false;
      }
      break;

    default:
      gdWarning( "MDict: parseCompressedBlock: unknown type" );
      return false;
  }

  return true;
}
|
|
|
|
|
|
|
|
// Reads one integer from `in` using the width selected for this file: a
// signed 64-bit value when numberTypeSize_ is 8, otherwise an unsigned 32-bit
// value widened to qint64.
qint64 MdictParser::readNumber( QDataStream & in )
{
  if ( numberTypeSize_ != 8 )
  {
    quint32 narrow;
    in >> narrow;
    return narrow;
  }

  qint64 wide;
  in >> wide;
  return wide;
}
|
|
|
|
|
|
|
|
// Reads an unsigned integer from `in`: 16 bits wide when `isU16` is set,
// 8 bits wide otherwise. The value is returned widened to quint32.
quint32 MdictParser::readU8OrU16( QDataStream & in, bool isU16 )
{
  if ( !isU16 )
  {
    quint8 oneByte;
    in >> oneByte;
    return oneByte;
  }

  quint16 twoBytes;
  in >> twoBytes;
  return twoBytes;
}
|
|
|
|
|
|
|
|
// Reads and interprets the file header.
//
// The header consists of a 32-bit length, that many bytes of UTF-16LE text
// (an XML element whose attributes carry the metadata) and a little-endian
// Adler-32 checksum of the raw text bytes. From the attributes this fills in
// encoding_, styleSheets_, version_, numberTypeSize_, encrypted_, rtl_,
// title_ and description_.
//
// Returns false when the header text cannot be read completely or its
// checksum does not match; true otherwise.
bool MdictParser::readHeader( QDataStream & in )
{
  qint32 headerTextSize;
  in >> headerTextSize;

  QByteArray headerTextUtf16 = file_->read( headerTextSize );
  if ( headerTextUtf16.size() != headerTextSize )
    return false;

  QString headerText = toUtf16( "UTF-16LE", headerTextUtf16.constData(), headerTextUtf16.size() );

  // Adler-32 checksum of the header text (little-endian)
  quint32 checksum;
  in.setByteOrder( QDataStream::LittleEndian );
  in >> checksum;
  if ( !checkAdler32( headerTextUtf16.constData(), headerTextUtf16.size(), checksum ) )
  {
    gdWarning( "MDict: readHeader: checksum does not match" );
    return false;
  }
  headerTextUtf16.clear();
  in.setByteOrder( QDataStream::BigEndian );

  // Parse the stylesheet straight from the raw header text, before control
  // characters (including the line breaks that separate its fields) are
  // stripped below.
  QString styleSheets;

  if ( headerText.contains( "StyleSheet" ) )
  {
    // a workaround to bypass https://bugreports.qt.io/browse/QTBUG-102612
    QRegularExpression rx( "StyleSheet=\"([^\"]*?)\"", QRegularExpression::CaseInsensitiveOption );

    auto match = rx.match( headerText );

    if ( match.hasMatch() || match.hasPartialMatch() )
    {
      styleSheets = match.captured( 1 );
    }
  }

  // Strip control characters: with them present, Qt 6.x cannot parse the
  // attribute values.
  headerText.remove( QRegularExpression( "\\p{C}" ) );

  QDomNamedNodeMap headerAttributes = parseHeaderAttributes( headerText );

  encoding_ = headerAttributes.namedItem( "Encoding" ).toAttr().value();
  if ( encoding_ == "GBK" || encoding_ == "GB2312" )
  {
    encoding_ = "GB18030";
  }
  else if ( encoding_.isEmpty() || encoding_ == "UTF-16" )
  {
    encoding_ = "UTF-16LE";
  }

  // stylesheet attribute if present takes form of:
  //   styleId # 1-255
  //   style.prefix
  //   style.suffix
  if ( !styleSheets.isEmpty() )
  {
    QStringList lines = styleSheets.split( QRegularExpression( "[\r\n]" ), Qt::KeepEmptyParts );

    // Consume complete (id, prefix, suffix) triples. The bound is expressed
    // as "the third line of the triple exists" so that the final triple is
    // kept even when the attribute does not end with a line break.
    for ( int i = 0; i + 2 < lines.size(); i += 3 )
    {
      styleSheets_[ lines[ i ].toInt() ] =
        pair( Html::fromHtmlEscaped( lines[ i + 1 ] ), Html::fromHtmlEscaped( lines[ i + 2 ] ) );
    }
  }

  // before version 2.0, number is 4 bytes integer
  // version 2.0 and above uses 8 bytes
  version_ = headerAttributes.namedItem( "GeneratedByEngineVersion" ).toAttr().value().toDouble();
  if ( version_ < 2.0 )
    numberTypeSize_ = 4;
  else
    numberTypeSize_ = 8;

  // Encrypted ?
  encrypted_ = headerAttributes.namedItem( "Encrypted" ).toAttr().value().toInt();

  // Read metadata
  rtl_ = headerAttributes.namedItem( "Left2Right" ).toAttr().value() != "Yes";
  QString title = headerAttributes.namedItem( "Title" ).toAttr().value();
  if ( title.isEmpty() || title.length() < 5 || title == "Title (No HTML code allowed)" )
  {
    // Use filename instead
    QFileInfo fi( filename_ );
    title_ = fi.baseName();
  }
  else
  {
    // Titles that look like markup are reduced to their plain text.
    if ( title.contains( '<' ) || title.contains( '>' ) )
      title_ = QTextDocumentFragment::fromHtml( title ).toPlainText();
    else
      title_ = title;
  }
  QString description = headerAttributes.namedItem( "Description" ).toAttr().value();
  description_        = QTextDocumentFragment::fromHtml( description ).toPlainText();
  return true;
}
|
|
|
|
|
|
|
|
// Reads the headword section header and the headword block info table that
// follows it, filling numHeadWordBlocks_, wordCount_, headWordBlockInfoSize_,
// headWordBlockSize_, headWordBlockInfoPos_, headWordBlockInfos_ and
// headWordPos_, and resetting headWordBlockInfosIter_ to the first block.
//
// For version >= 2.0 files the section header carries five numbers and is
// followed by an Adler-32 checksum; the block info table is then optionally
// decrypted and always decompressed. Older files carry four numbers and a
// plain table.
//
// Returns false on a short read, a checksum mismatch, or a decryption /
// decompression failure.
bool MdictParser::readHeadWordBlockInfos( QDataStream & in )
{
  const qint64 headerSize = version_ >= 2.0 ? ( numberTypeSize_ * 5 )
                                            : ( numberTypeSize_ * 4 );
  QByteArray header = file_->read( headerSize );
  // A truncated file yields fewer bytes; the checksum below is computed over
  // the full expected length, so a short buffer must be rejected first.
  if ( header.size() != headerSize )
    return false;

  QDataStream stream( header );

  // number of headword blocks
  numHeadWordBlocks_ = readNumber( stream );
  // number of entries
  wordCount_ = readNumber( stream );

  // number of bytes of a headword block info after decompression
  qint64 decompressedSize = 0;
  if ( version_ >= 2.0 )
    stream >> decompressedSize;

  // number of bytes of a headword block info before decompression
  headWordBlockInfoSize_ = readNumber( stream );
  // number of bytes of a headword block
  headWordBlockSize_ = readNumber( stream );

  // Adler-32 checksum of the header. If those are encrypted, it is
  // the checksum of the decrypted version
  if ( version_ >= 2.0 )
  {
    quint32 checksum;
    in >> checksum;
    if ( !checkAdler32( header.constData(), numberTypeSize_ * 5, checksum ) )
      return false;
  }

  headWordBlockInfoPos_ = file_->pos();

  // read headword block info
  QByteArray headWordBlockInfo = file_->read( headWordBlockInfoSize_ );
  if ( headWordBlockInfo.size() != headWordBlockInfoSize_ )
    return false;

  if ( version_ >= 2.0 )
  {
    // decrypt
    if ( encrypted_ & EcryptedHeadWordIndex )
    {
      if ( !decryptHeadWordIndex( headWordBlockInfo.data(),
                                  headWordBlockInfo.size() ) )
        return false;
    }

    QByteArray decompressed;
    if ( !parseCompressedBlock( headWordBlockInfo.size(), headWordBlockInfo.data(),
                                decompressedSize, decompressed ) )
      return false;

    headWordBlockInfos_ = decodeHeadWordBlockInfo( decompressed );
  }
  else
  {
    headWordBlockInfos_ = decodeHeadWordBlockInfo( headWordBlockInfo );
  }

  headWordPos_            = file_->pos();
  headWordBlockInfosIter_ = headWordBlockInfos_.begin();
  return true;
}
|
|
|
|
|
|
|
|
bool MdictParser::readRecordBlockInfos()
|
|
|
|
{
|
2015-10-08 17:11:45 +00:00
|
|
|
file_->seek( headWordBlockInfoPos_ + headWordBlockInfoSize_ +
|
|
|
|
headWordBlockSize_ );
|
2013-04-23 12:07:05 +00:00
|
|
|
|
|
|
|
QDataStream in( file_ );
|
|
|
|
in.setByteOrder( QDataStream::BigEndian );
|
|
|
|
qint64 numRecordBlocks = readNumber( in );
|
2015-10-08 17:11:45 +00:00
|
|
|
readNumber( in ); // total number of records, skip
|
2013-04-23 12:07:05 +00:00
|
|
|
qint64 recordInfoSize = readNumber( in );
|
|
|
|
totalRecordsSize_ = readNumber( in );
|
|
|
|
recordPos_ = file_->pos() + recordInfoSize;
|
|
|
|
|
|
|
|
// Build record block index
|
|
|
|
recordBlockInfos_.reserve( numRecordBlocks );
|
|
|
|
|
|
|
|
qint64 acc1 = 0;
|
|
|
|
qint64 acc2 = 0;
|
|
|
|
for ( qint64 i = 0; i < numRecordBlocks; i++ )
|
|
|
|
{
|
|
|
|
RecordIndex r;
|
|
|
|
r.compressedSize = readNumber( in );
|
|
|
|
r.decompressedSize = readNumber( in );
|
|
|
|
r.startPos = acc1;
|
|
|
|
r.endPos = acc1 + r.compressedSize;
|
|
|
|
r.shadowStartPos = acc2;
|
|
|
|
r.shadowEndPos = acc2 + r.decompressedSize;
|
|
|
|
recordBlockInfos_.push_back( r );
|
|
|
|
|
|
|
|
acc1 = r.endPos;
|
|
|
|
acc2 = r.shadowEndPos;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Walks the (already decrypted / decompressed) headword block info table and
// returns one (compressed size, decompressed size) pair per headword block.
//
// Each table entry holds: the number of keywords in the block, the length and
// text of the first headword, the length and text of the last headword, and
// the two block sizes. Only the sizes are kept; everything else is skipped.
MdictParser::BlockInfoVector MdictParser::decodeHeadWordBlockInfo( QByteArray const & headWordBlockInfo )
{
  BlockInfoVector result;

  QDataStream reader( headWordBlockInfo );
  reader.setByteOrder( QDataStream::BigEndian );

  // Version 2.0 and above store 16-bit text lengths and append one
  // terminator character to each text; older versions use 8-bit lengths and
  // no terminator.
  const bool isVersion2  = version_ >= 2.0;
  const bool isU16       = isVersion2;
  const int textTermSize = isVersion2 ? 1 : 0;

  // Text lengths count characters; UTF-16LE characters take two bytes each.
  const int bytesPerChar = ( encoding_ != "UTF-16LE" ) ? 1 : 2;

  while ( !reader.atEnd() )
  {
    // Number of keywords in the block
    reader.skipRawData( numberTypeSize_ );

    // Size of the first headword in the block, then the headword itself
    quint32 textHeadSize = readU8OrU16( reader, isU16 );
    reader.skipRawData( ( textHeadSize + textTermSize ) * bytesPerChar );

    // Size of the last headword in the block, then the headword itself
    quint32 textTailSize = readU8OrU16( reader, isU16 );
    reader.skipRawData( ( textTailSize + textTermSize ) * bytesPerChar );

    // headword block compressed size, then decompressed size
    qint64 compressedSize   = readNumber( reader );
    qint64 decompressedSize = readNumber( reader );
    result.emplace_back( compressedSize, decompressedSize );
  }

  return result;
}
|
|
|
|
|
|
|
|
// Splits a decompressed headword block into (id, headword) pairs.
//
// The block is a sequence of entries, each made of a big-endian number
// (numberTypeSize_ bytes wide) followed by a zero-terminated string in
// encoding_ (16-bit terminator for UTF-16LE, 8-bit otherwise). As before, the
// terminator is part of the bytes handed to toUtf16().
//
// Parsing stops at the first entry that does not fit completely inside the
// block (missing id bytes or missing terminator), so a truncated or corrupt
// block is never read past its end; the entries decoded up to that point are
// returned.
MdictParser::HeadWordIndex MdictParser::splitHeadWordBlock( QByteArray const & block )
{
  HeadWordIndex index;

  const char * p       = block.constData();
  const char * end     = p + block.size();
  const bool isUtf16Le = ( encoding_ == "UTF-16LE" );

  while ( p < end )
  {
    if ( end - p < numberTypeSize_ )
      break; // not enough bytes left for the id

    qint64 headWordId = ( numberTypeSize_ == 8 ) ?
                          qFromBigEndian< qint64 >( ( const uchar * )p ) :
                          qFromBigEndian< quint32 >( ( const uchar * )p );
    p += numberTypeSize_;

    // Locate the terminator without looking beyond the end of the block.
    // byteLength ends up as the size of the string including its terminator.
    const int remaining = static_cast< int >( end - p );
    int byteLength      = 0;
    bool terminated     = false;

    if ( isUtf16Le )
    {
      // Compare byte pairs rather than reading 16-bit units through a cast,
      // which also avoids unaligned accesses.
      for ( int off = 0; off + 2 <= remaining; off += 2 )
      {
        if ( p[ off ] == 0 && p[ off + 1 ] == 0 )
        {
          byteLength = off + 2;
          terminated = true;
          break;
        }
      }
    }
    else
    {
      for ( int off = 0; off < remaining; ++off )
      {
        if ( p[ off ] == 0 )
        {
          byteLength = off + 1;
          terminated = true;
          break;
        }
      }
    }

    if ( !terminated )
      break; // unterminated headword: the block is truncated or corrupt

    QByteArray headWordBuf( p, byteLength );
    p += headWordBuf.size();

    QString headWord = toUtf16( encoding_, headWordBuf.constBegin(), headWordBuf.size() );
    index.emplace_back( headWordId, headWord );
  }

  return index;
}
|
|
|
|
|
2013-04-28 08:26:04 +00:00
|
|
|
bool MdictParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex,
|
2022-08-07 03:03:46 +00:00
|
|
|
MdictParser::RecordHandler & recordHandler )
|
2013-04-23 12:07:05 +00:00
|
|
|
{
|
2013-04-28 08:26:04 +00:00
|
|
|
// cache the index, the headWordIndex is already sorted
|
|
|
|
size_t idx = 0;
|
2013-04-23 12:07:05 +00:00
|
|
|
|
2018-04-08 14:57:10 +00:00
|
|
|
for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); ++i )
|
2013-04-23 12:07:05 +00:00
|
|
|
{
|
2022-01-10 15:42:27 +00:00
|
|
|
if (recordBlockInfos_[idx].shadowEndPos <= i->first)
|
2013-04-28 08:26:04 +00:00
|
|
|
idx = RecordIndex::bsearch( recordBlockInfos_, i->first );
|
2013-04-23 12:07:05 +00:00
|
|
|
|
|
|
|
if ( idx == ( size_t )( -1 ) )
|
|
|
|
return false;
|
|
|
|
|
2013-04-28 08:26:04 +00:00
|
|
|
RecordIndex const & recordIndex = recordBlockInfos_[idx];
|
2013-04-23 12:07:05 +00:00
|
|
|
HeadWordIndex::const_iterator iNext = i + 1;
|
2013-06-04 16:17:55 +00:00
|
|
|
qint64 recordSize;
|
2022-08-07 03:03:46 +00:00
|
|
|
if ( iNext == headWordIndex.end() )
|
|
|
|
recordSize = recordIndex.shadowEndPos - i->first;
|
2013-04-23 12:07:05 +00:00
|
|
|
else
|
2022-08-07 03:03:46 +00:00
|
|
|
recordSize = iNext->first - i->first;
|
2013-04-28 08:26:04 +00:00
|
|
|
|
|
|
|
RecordInfo recordInfo;
|
|
|
|
recordInfo.compressedBlockPos = recordPos_ + recordIndex.startPos;
|
2022-08-07 03:03:46 +00:00
|
|
|
recordInfo.recordOffset = i->first - recordIndex.shadowStartPos;
|
2013-04-28 08:26:04 +00:00
|
|
|
recordInfo.decompressedBlockSize = recordIndex.decompressedSize;
|
|
|
|
recordInfo.compressedBlockSize = recordIndex.compressedSize;
|
|
|
|
recordInfo.recordSize = recordSize;
|
|
|
|
|
2022-08-07 03:03:46 +00:00
|
|
|
recordHandler.handleRecord( i->second, recordInfo );
|
2013-04-23 12:07:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2013-04-28 08:26:04 +00:00
|
|
|
// Expands the `N` style markers (a decimal id between backticks) found in
// `article`, in place, and returns a reference to `article`.
//
// Each marker is replaced by the suffix of the style that was open before it
// (if any) followed by the prefix of style N; when N is not present in
// `styleSheets` only the pending suffix is emitted. The suffix of the last
// opened style is appended at the very end of the article. When at least one
// marker was found, the text after the last marker is passed through
// Utils::rstripnull() before being appended.
QString & MdictParser::substituteStylesheet( QString & article, MdictParser::StyleSheets const & styleSheets )
{
  QRegularExpression styleMarker( "`(\\d+)`", QRegularExpression::UseUnicodePropertiesOption );

  QString rebuilt;       // article text with the markers expanded
  QString pendingSuffix; // suffix of the style that is currently open
  int consumed = 0;      // end of the last marker handled so far

  for ( QRegularExpressionMatchIterator markers = styleMarker.globalMatch( article ); markers.hasNext(); )
  {
    QRegularExpressionMatch marker = markers.next();
    const int styleId              = marker.captured( 1 ).toInt();

    // Copy the plain text between the previous marker and this one.
    rebuilt += article.mid( consumed, marker.capturedStart() - consumed );
    consumed = marker.capturedEnd();

    // Whatever style was open gets closed first, known id or not.
    rebuilt += pendingSuffix;

    StyleSheets::const_iterator found = styleSheets.find( styleId );
    if ( found == styleSheets.end() )
    {
      pendingSuffix = "";
    }
    else
    {
      rebuilt += found->second.first;
      pendingSuffix = found->second.second;
    }
  }

  // `consumed` stays 0 only when the article contained no marker at all; in
  // that case the article text is left as it is.
  if ( consumed )
  {
    rebuilt += Utils::rstripnull( article.mid( consumed ) );
    article = rebuilt;
  }

  article += pendingSuffix;
  return article;
}
|
|
|
|
|
|
|
|
}
|