MDict: Support encrypted keyword index

This commit is contained in:
Zhe Wang 2015-10-09 01:11:45 +08:00
parent b4bb1e9635
commit 3fe3c0ea2b
2 changed files with 180 additions and 179 deletions

View file

@ -1,8 +1,10 @@
// https://bitbucket.org/xwang/mdict-analysis // https://bitbucket.org/xwang/mdict-analysis
// https://github.com/zhansliu/writemdict/blob/master/fileformat.md
// Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser // Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
// //
// Copyright (C) 2012, 2013 Xiaoqiang Wang <xiaoqiangwang AT gmail DOT com> // Copyright (C) 2012, 2013 Xiaoqiang Wang <xiaoqiangwang AT gmail DOT com>
// Copyright (C) 2013 Timon Wong <timon86.wang AT gmail DOT com> // Copyright (C) 2013 Timon Wong <timon86.wang AT gmail DOT com>
// Copyright (C) 2015 Zhe Wang <0x1998 AT gmail DOT com>
// //
// This program is a free software; you can redistribute it and/or modify // This program is a free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by // it under the terms of the GNU General Public License as published by
@ -23,8 +25,9 @@
#include <iconv.h>
#include <lzo/lzo1x.h>
#include <algorithm>
#include <iterator>
extern "C" {
#include <libavutil/mem.h>
#include <libavutil/ripemd.h>
}
#include <QtEndian>
#include <QStringList>
@ -34,13 +37,18 @@
#include <QDomDocument> #include <QDomDocument>
#include <QTextDocumentFragment> #include <QTextDocumentFragment>
#include <QDebug>
#include "decompress.hh" #include "decompress.hh"
#include "gddebug.hh"
namespace Mdict namespace Mdict
{ {
// Bit flags describing which parts of the keyword section are encrypted.
// readHeadWordBlockInfos() tests them against encrypted_, which readHeader()
// fills from the "Encrypted" header attribute.
// NOTE(review): "Ecrypted" looks like a misspelling of "Encrypted"; the
// spelling is kept because the names are referenced elsewhere in this file.
enum EncryptedSection
{
  EcryptedHeadWordHeader = 1 << 0,
  EcryptedHeadWordIndex  = 1 << 1
};
static inline int u16StrSize( const ushort * unicode ) static inline int u16StrSize( const ushort * unicode )
{ {
int size = 0; int size = 0;
@ -103,9 +111,8 @@ MdictParser::MdictParser() :
recordPos_( 0 ), recordPos_( 0 ),
wordCount_( 0 ), wordCount_( 0 ),
numberTypeSize_( 0 ), numberTypeSize_( 0 ),
rtl_( false ), encrypted_( 0 ),
bruteForce_( false ), rtl_( false )
bruteForceEnd_( true )
{ {
} }
@ -114,7 +121,7 @@ bool MdictParser::open( const char * filename )
filename_ = QString::fromUtf8( filename ); filename_ = QString::fromUtf8( filename );
file_ = new QFile( filename_ ); file_ = new QFile( filename_ );
qDebug() << "MdictParser: open " << filename_; GD_DPRINTF( "MdictParser: open %s\n", filename );
if ( file_.isNull() || !file_->exists() ) if ( file_.isNull() || !file_->exists() )
return false; return false;
@ -139,39 +146,6 @@ bool MdictParser::open( const char * filename )
bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIndex ) bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIndex )
{ {
if ( bruteForce_ )
{
if ( bruteForceEnd_ )
return false;
headWordIndex.clear();
ScopedMemMap mapping( *file_, headWordPos_, headWordBlockSize_ );
if ( !mapping.startAddress() )
return false;
const char * pDataStart = ( const char * )mapping.startAddress();
const char * pDataEnd = pDataStart + headWordBlockSize_;
const char pattern[] = {0x02, 0x00, 0x00, 0x00};
const char * patternBegin = pattern;
const char * patternEnd = pattern + 4;
const char * p;
do
{
p = std::search( pDataStart + 4, pDataEnd, patternBegin, patternEnd );
QByteArray decompressed = zlibDecompress( pDataStart + 8, p - ( pDataStart + 8 ) );
HeadWordIndex currentIndex = splitHeadWordBlock( decompressed );
headWordIndex.insert( headWordIndex.end(), currentIndex.begin(), currentIndex.end() );
pDataStart = p;
}
while ( p != pDataEnd );
bruteForceEnd_ = true;
return true;
}
else
{
if ( headWordBlockInfosIter_ == headWordBlockInfos_.end() ) if ( headWordBlockInfosIter_ == headWordBlockInfos_.end() )
return false; return false;
@ -194,7 +168,13 @@ bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIn
headWordIndex = splitHeadWordBlock( decompressed ); headWordIndex = splitHeadWordBlock( decompressed );
headWordBlockInfosIter_++; headWordBlockInfosIter_++;
return true; return true;
} }
bool MdictParser::checkAdler32(const char * buffer, unsigned int len, quint32 checksum)
{
uLong adler = adler32( 0L, Z_NULL, 0 );
adler = adler32( adler, ( const Bytef * ) buffer, len );
return (adler & 0xFFFFFFFF) == checksum;
} }
QString MdictParser::toUtf16( const char * fromCode, const char * from, size_t fromSize ) QString MdictParser::toUtf16( const char * fromCode, const char * from, size_t fromSize )
@ -236,52 +216,71 @@ QString MdictParser::toUtf16( const char * fromCode, const char * from, size_t f
return QString::fromUtf16( ( const ushort * )&result.front() ); return QString::fromUtf16( ( const ushort * )&result.front() );
} }
bool MdictParser::parseCompressedBlock( qint64 compressedBlockSize, const char * compressedBlockPtr, bool MdictParser::decryptHeadWordIndex(char * buffer, qint64 len)
qint64 decompressedBlockSize, QByteArray & decompressedBlock ) {
struct AVRIPEMD * ripemd = av_ripemd_alloc();
if ( av_ripemd_init( ripemd, 128 ) != 0 )
return false;
av_ripemd_update( ripemd, ( const uchar * ) buffer + 4, 4 );
av_ripemd_update( ripemd, ( const uchar * ) "\x95\x36\x00\x00", 4 );
uint8_t key[16];
av_ripemd_final( ripemd, key );
buffer += 8;
len -= 8;
uint8_t prev = 0x36;
for (qint64 i = 0; i < len; ++i)
{
uint8_t byte = buffer[i];
byte = (byte >> 4) | (byte << 4);
byte = byte ^ prev ^ (i & 0xFF) ^ key[i % 16];
prev = buffer[i];
buffer[i] = byte;
}
return true;
}
bool MdictParser::parseCompressedBlock( qint64 compressedBlockSize,
const char * compressedBlockPtr,
qint64 decompressedBlockSize,
QByteArray & decompressedBlock )
{ {
if ( compressedBlockSize <= 8 ) if ( compressedBlockSize <= 8 )
return false; return false;
qint64 dataSize = compressedBlockSize - 8; // compression type
const char * dataPtr = compressedBlockPtr + 8; quint32 type = qFromBigEndian<quint32>( ( const uchar * ) compressedBlockPtr );
// 4bytes - type quint32 checksum = qFromBigEndian<quint32>( ( const uchar * )compressedBlockPtr + 4 );
// 4bytes - checksum const char * buf = compressedBlockPtr + 8;
quint32 type; qint64 size = compressedBlockSize - 8;
quint32 checksum;
type = qFromBigEndian<quint32>( ( const uchar * ) compressedBlockPtr );
checksum = qFromBigEndian<quint32>( ( const uchar * )compressedBlockPtr + sizeof( quint32 ) );
if ( type == 0x00000000 ) switch ( type )
{ {
case 0x00000000:
// No compression // No compression
checksum &= 0xffff; if ( !checkAdler32( buf, size, checksum ) )
quint16 sum = 0;
for ( qint64 i = 0; i < dataSize; i++ )
{ {
sum += dataPtr[i]; gdWarning( "MDict: parseCompressedBlock: plain: checksum not match" );
}
sum += 1;
if ( checksum != sum )
{
qWarning() << "MDict: parseCompressedBlock: plain: checksum not match";
return false; return false;
} }
decompressedBlock = QByteArray( dataPtr, dataSize ); decompressedBlock = QByteArray( buf, size );
} return true;
else if ( type == 0x01000000 )
case 0x01000000:
{ {
// LZO compression // LZO compression
int result; int result;
lzo_uint blockSize = ( lzo_uint )decompressedBlockSize; lzo_uint blockSize = ( lzo_uint )decompressedBlockSize;
decompressedBlock.resize( blockSize ); decompressedBlock.resize( blockSize );
result = lzo1x_decompress_safe( ( const uchar * )dataPtr, dataSize, result = lzo1x_decompress_safe( ( const uchar * ) buf, size,
( uchar * )decompressedBlock.data(), &blockSize, NULL ); ( uchar * )decompressedBlock.data(),
&blockSize, NULL );
if ( result != LZO_E_OK || blockSize != ( lzo_uint )decompressedBlockSize ) if ( result != LZO_E_OK || blockSize != ( lzo_uint )decompressedBlockSize )
{ {
qWarning() << "MDict: parseCompressedBlock: decompression failed"; gdWarning( "MDict: parseCompressedBlock: decompression failed" );
return false; return false;
} }
@ -289,24 +288,26 @@ bool MdictParser::parseCompressedBlock( qint64 compressedBlockSize, const char *
( const uchar * )decompressedBlock.constData(), ( const uchar * )decompressedBlock.constData(),
blockSize ) ) blockSize ) )
{ {
qWarning() << "MDict: parseCompressedBlock: lzo: checksum not match"; gdWarning( "MDict: parseCompressedBlock: lzo: checksum does not match" );
return false; return false;
} }
} }
else if ( type == 0x02000000 ) break;
{
// zlib compression
if ( checksum != qFromBigEndian<quint32>( ( const uchar * )dataPtr + dataSize - 4 ) )
{
qWarning() << "MDict: parseCompressedBlock: zlib: checksum not match";
return false;
}
decompressedBlock = zlibDecompress( dataPtr, dataSize ); case 0x02000000:
} // zlib compression
else decompressedBlock = zlibDecompress( buf, size );
if ( !checkAdler32( decompressedBlock.constData(), decompressedBlock.size(),
checksum ) )
{ {
qWarning() << "MDict: parseCompressedBlock: unknown type"; gdWarning( "MDict: parseCompressedBlock: zlib: checksum does not match" );
return false;
}
break;
default:
gdWarning( "MDict: parseCompressedBlock: unknown type" );
return false; return false;
} }
@ -355,7 +356,18 @@ bool MdictParser::readHeader( QDataStream & in )
return false; return false;
QString headerText = toUtf16( "UTF-16LE", headerTextUtf16.constData(), headerTextUtf16.size() ); QString headerText = toUtf16( "UTF-16LE", headerTextUtf16.constData(), headerTextUtf16.size() );
// Adler-32 checksum of the header text (little-endian)
quint32 checksum;
in.setByteOrder( QDataStream::LittleEndian );
in >> checksum;
if ( !checkAdler32( headerTextUtf16.constData(), headerTextUtf16.size(), checksum ) )
{
gdWarning( "MDict: readHeader: checksum does not match" );
return false;
}
headerTextUtf16.clear(); headerTextUtf16.clear();
in.setByteOrder( QDataStream::BigEndian );
QDomNamedNodeMap headerAttributes = parseHeaderAttributes( headerText ); QDomNamedNodeMap headerAttributes = parseHeaderAttributes( headerText );
@ -391,9 +403,8 @@ bool MdictParser::readHeader( QDataStream & in )
else else
numberTypeSize_ = 8; numberTypeSize_ = 8;
// 4 bytes unknown // Encrypted ?
if ( in.skipRawData( 4 ) != 4 ) encrypted_ = headerAttributes.namedItem("Encrypted").toAttr().value().toInt();
return false;
// Read metadata // Read metadata
rtl_ = headerAttributes.namedItem( "Left2Right" ).toAttr().value() != "Yes"; rtl_ = headerAttributes.namedItem( "Left2Right" ).toAttr().value() != "Yes";
@ -418,87 +429,77 @@ bool MdictParser::readHeader( QDataStream & in )
// Reads the keyword section header and the headword block info that follows
// it, starting at the current position of file_.
//
// The header consists of numberTypeSize_-byte numbers: four of them for
// versions below 2.0, five for 2.0 and later (the extra one is the decompressed
// size of the headword block info). For 2.0 and later the header is followed by
// a 32-bit Adler-32 checksum, read through `in`, and the headword block info is
// stored as a compressed block that may additionally be encrypted.
//
// On success numHeadWordBlocks_, wordCount_, headWordBlockInfoSize_,
// headWordBlockSize_, headWordBlockInfoPos_, headWordPos_, headWordBlockInfos_
// and headWordBlockInfosIter_ are set.
//
// Returns false on a short read, a checksum mismatch, or when decryption or
// decompression of the headword block info fails.
bool MdictParser::readHeadWordBlockInfos( QDataStream & in )
{
  bool const isV2 = version_ >= 2.0;

  // Read exactly as many bytes as the header occupies, so that the file
  // position afterwards really is the start of what follows the header.
  int const headerSize = numberTypeSize_ * ( isV2 ? 5 : 4 );
  QByteArray header = file_->read( headerSize );
  if ( header.size() != headerSize )
    return false;

  QDataStream stream( header );

  // number of headword blocks
  numHeadWordBlocks_ = readNumber( stream );
  // number of entries
  wordCount_ = readNumber( stream );

  // number of bytes of a headword block info after decompression
  qint64 decompressedSize = 0;
  if ( isV2 )
    stream >> decompressedSize;

  // number of bytes of a headword block info before decompression
  headWordBlockInfoSize_ = readNumber( stream );
  // number of bytes of a headword block
  headWordBlockSize_ = readNumber( stream );

  // Adler-32 checksum of the header. If those are encrypted, it is
  // the checksum of the decrypted version
  if ( isV2 )
  {
    quint32 checksum = 0;
    in >> checksum;
    if ( !checkAdler32( header.constData(), header.size(), checksum ) )
      return false;
  }

  headWordBlockInfoPos_ = file_->pos();

  // read headword block info
  QByteArray headWordBlockInfo = file_->read( headWordBlockInfoSize_ );
  if ( headWordBlockInfo.size() != headWordBlockInfoSize_ )
    return false;

  if ( isV2 )
  {
    // decrypt
    if ( encrypted_ & EcryptedHeadWordIndex )
    {
      if ( !decryptHeadWordIndex( headWordBlockInfo.data(),
                                  headWordBlockInfo.size() ) )
        return false;
    }

    QByteArray decompressed;
    if ( !parseCompressedBlock( headWordBlockInfo.size(), headWordBlockInfo.data(),
                                decompressedSize, decompressed ) )
      return false;

    headWordBlockInfos_ = decodeHeadWordBlockInfo( decompressed );
  }
  else
  {
    // Below version 2.0 the block info is handed to the decoder as read.
    headWordBlockInfos_ = decodeHeadWordBlockInfo( headWordBlockInfo );
  }

  headWordPos_ = file_->pos();
  headWordBlockInfosIter_ = headWordBlockInfos_.begin();
  return true;
}
bool MdictParser::readRecordBlockInfos() bool MdictParser::readRecordBlockInfos()
{ {
file_->seek( headWordBlockInfoPos_ + headWordBlockInfoSize_ + headWordBlockSize_ ); file_->seek( headWordBlockInfoPos_ + headWordBlockInfoSize_ +
headWordBlockSize_ );
QDataStream in( file_ ); QDataStream in( file_ );
in.setByteOrder( QDataStream::BigEndian ); in.setByteOrder( QDataStream::BigEndian );
qint64 numRecordBlocks = readNumber( in ); qint64 numRecordBlocks = readNumber( in );
readNumber( in ); // entry count, skip readNumber( in ); // total number of records, skip
qint64 recordInfoSize = readNumber( in ); qint64 recordInfoSize = readNumber( in );
totalRecordsSize_ = readNumber( in ); totalRecordsSize_ = readNumber( in );
recordPos_ = file_->pos() + recordInfoSize; recordPos_ = file_->pos() + recordInfoSize;
@ -544,18 +545,18 @@ MdictParser::BlockInfoVector MdictParser::decodeHeadWordBlockInfo( QByteArray co
while ( !s.atEnd() ) while ( !s.atEnd() )
{ {
// unknown // Number of keywords in the block
s.skipRawData( numberTypeSize_ ); s.skipRawData( numberTypeSize_ );
// Text head size // Size of the first headword in the block
quint32 textHeadSize = readU8OrU16( s, isU16 ); quint32 textHeadSize = readU8OrU16( s, isU16 );
// Text head // The first headword
if ( encoding_ != "UTF-16LE" ) if ( encoding_ != "UTF-16LE" )
s.skipRawData( textHeadSize + textTermSize ); s.skipRawData( textHeadSize + textTermSize );
else else
s.skipRawData( ( textHeadSize + textTermSize ) * 2 ); s.skipRawData( ( textHeadSize + textTermSize ) * 2 );
// Text tail Size // Size of the last headword in the block
quint32 textTailSize = readU8OrU16( s, isU16 ); quint32 textTailSize = readU8OrU16( s, isU16 );
// Text tail // The last headword
if ( encoding_ != "UTF-16LE" ) if ( encoding_ != "UTF-16LE" )
s.skipRawData( textTailSize + textTermSize ); s.skipRawData( textTailSize + textTermSize );
else else

View file

@ -165,8 +165,7 @@ public:
return toUtf16( fromCode.toLatin1().constData(), from, fromSize ); return toUtf16( fromCode.toLatin1().constData(), from, fromSize );
} }
static bool parseCompressedBlock( qint64 compressedBlockSize, const char * compressedBlockPtr, static bool parseCompressedBlock( qint64 compressedBlockSize, const char * compressedBlockPtr,
qint64 decompressedBlockSize, QByteArray & decompressedBlock ); qint64 decompressedBlockSize, QByteArray & decompressedBlock);
static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets ); static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets );
static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets ) static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets )
{ {
@ -178,6 +177,8 @@ public:
protected: protected:
qint64 readNumber( QDataStream & in ); qint64 readNumber( QDataStream & in );
static quint32 readU8OrU16( QDataStream & in, bool isU16 ); static quint32 readU8OrU16( QDataStream & in, bool isU16 );
static bool checkAdler32(const char * buffer, unsigned int len, quint32 checksum);
static bool decryptHeadWordIndex(char * buffer, qint64 len);
bool readHeader( QDataStream & in ); bool readHeader( QDataStream & in );
bool readHeadWordBlockInfos( QDataStream & in ); bool readHeadWordBlockInfos( QDataStream & in );
bool readRecordBlockInfos(); bool readRecordBlockInfos();
@ -207,9 +208,8 @@ protected:
quint32 wordCount_; quint32 wordCount_;
int numberTypeSize_; int numberTypeSize_;
int encrypted_;
bool rtl_; bool rtl_;
bool bruteForce_;
bool bruteForceEnd_;
}; };
} }