goldendict-ng/mdictparser.cc

686 lines
18 KiB
C++

// https://bitbucket.org/xwang/mdict-analysis
// Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
//
// Copyright (C) 2012, 2013 Xiaoqiang Wang <xiaoqiangwang AT gmail DOT com>
// Copyright (C) 2013 Timon Wong <timon86.wang AT gmail DOT com>
//
// This program is a free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 3 of the License.
//
// You can get a copy of GNU General Public License along this program
// But you can always get it from http://www.gnu.org/licenses/gpl.txt
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
#include "mdictparser.hh"
#include <errno.h>
#include <zlib.h>
#include <iconv.h>
#include <lzo/lzo1x.h>
#include <algorithm>
#include <iterator>
#include <QtEndian>
#include <QStringList>
#include <QByteArray>
#include <QFileInfo>
#include <QRegExp>
#include <QDomDocument>
#include <QTextDocumentFragment>
#include <QDebug>
#include "decompress.hh"
static inline int u16StrSize( const ushort * unicode )
{
int size = 0;
if ( unicode )
{
while ( unicode[size] != 0 )
size++;
}
return size;
}
static QDomNamedNodeMap parseHeaderAttributes( const QString & headerText )
{
QDomNamedNodeMap attributes;
QDomDocument doc;
doc.setContent( headerText );
QDomElement docElem = doc.documentElement();
attributes = docElem.attributes();
for ( int i = 0; i < attributes.count(); i++ )
{
QDomAttr attr = attributes.item( i ).toAttr();
}
return attributes;
}
size_t MdictParser::RecordIndex::bsearch( const vector<MdictParser::RecordIndex> & offsets, qint64 val )
{
if ( offsets.size() == 0 )
return ( size_t ) ( -1 );
size_t lo = 0;
size_t hi = offsets.size() - 1;
while ( lo <= hi )
{
size_t mid = ( lo + hi ) >> 1;
RecordIndex const & p = offsets[mid];
if ( p == val )
return mid;
else if ( p < val )
lo = mid + 1;
else
hi = mid - 1;
}
return ( size_t ) ( -1 );
}
MdictParser::MdictParser( const char * filename ): filename_( QString::fromUtf8( filename ) )
{
version_ = 0;
numHeadWordBlocks_ = 0;
headWordBlockInfoSize_ = 0;
headWordBlockSize_ = 0;
headWordBlockInfoPos_ = 0;
headWordPos_ = 0;
totalRecordsSize_ = 0;
recordPos_ = 0;
wordCount_ = 0;
numberTypeSize_ = 0;
rtl_ = false;
bruteForce_ = false;
bruteForceEnd_ = true;
}
bool MdictParser::open()
{
file_ = new QFile( filename_ );
if ( file_.isNull() || !file_->exists() )
return false;
if ( !file_->open( QIODevice::ReadOnly ) )
return false;
QDataStream in( file_ );
in.setByteOrder( QDataStream::BigEndian );
if ( !readHeader( in ) )
return false;
if ( !readHeadWordBlockInfos( in ) )
return false;
if ( !readRecordBlockInfos() )
return false;
return true;
}
bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIndex )
{
if ( bruteForce_ )
{
if ( bruteForceEnd_ )
return false;
headWordIndex.clear();
file_->seek( headWordPos_ );
QByteArray data = file_->read( headWordBlockSize_ );
const char * pDataStart = data.constData();
const char * pDataEnd = pDataStart + data.size();
const char pattern[] = {0x02, 0x00, 0x00, 0x00};
const char * patternBegin = pattern;
const char * patternEnd = pattern + 4;
const char * p;
do
{
p = std::search( pDataStart + 4, pDataEnd, patternBegin, patternEnd );
QByteArray decompressed = zlibDecompress( pDataStart + 8, p - ( pDataStart + 8 ) );
HeadWordIndex currentIndex = splitHeadWordBlock( decompressed );
headWordIndex.insert( headWordIndex.end(), currentIndex.begin(), currentIndex.end() );
pDataStart = p;
}
while ( p != pDataEnd );
bruteForceEnd_ = true;
return true;
}
else
{
if ( headWordBlockInfosIter_ == headWordBlockInfos_.end() )
return false;
file_->seek( headWordPos_ );
qint64 compressedSize = headWordBlockInfosIter_->first;
qint64 decompressedSize = headWordBlockInfosIter_->second;
if ( compressedSize < 8 )
return false;
QByteArray compressed = file_->read( compressedSize );
headWordPos_ = file_->pos();
QByteArray decompressed;
if ( !parseCompressedBlock( compressedSize, compressed, decompressedSize, decompressed ) )
return false;
headWordIndex = splitHeadWordBlock( decompressed );
headWordBlockInfosIter_++;
return true;
}
}
QString MdictParser::toUtf16( const char * fromCode, const char * from, size_t fromSize )
{
if ( !fromCode || !from )
return QString();
iconv_t conv = iconv_open( "UTF-16//IGNORE", fromCode );
if ( conv == ( iconv_t ) - 1 )
return QString();
vector<char> result;
const static int CHUNK_SIZE = 512;
char buf[CHUNK_SIZE];
char ** inBuf = ( char ** )&from;
while ( fromSize )
{
char * outBuf = buf;
size_t outBytesLeft = CHUNK_SIZE;
size_t ret = iconv( conv, inBuf, &fromSize, &outBuf, &outBytesLeft );
if ( ret == ( size_t ) - 1 )
{
if ( errno != E2BIG )
{
// Real problem
result.clear();
break;
}
}
result.insert( result.end(), buf, buf + CHUNK_SIZE - outBytesLeft );
}
iconv_close( conv );
if ( result.size() <= 2 )
return QString();
return QString::fromUtf16( ( const ushort * )&result.front() );
}
bool MdictParser::parseCompressedBlock( size_t compressedBlockSize, const char * compressedBlockPtr,
size_t decompressedBlockSize, QByteArray & decompressedBlock )
{
if ( compressedBlockSize <= 8 )
return false;
size_t dataSize = compressedBlockSize - 8;
const char * dataPtr = compressedBlockPtr + 8;
// 4bytes - type
// 4bytes - checksum
quint32 type;
quint32 checksum;
type = qFromBigEndian<quint32>( ( const uchar * ) compressedBlockPtr );
checksum = qFromBigEndian<quint32>( ( const uchar * )compressedBlockPtr + sizeof( type ) );
if ( type == 0x00000000 )
{
// No compression
checksum >>= 8;
if ( checksum != qFromBigEndian<quint16>( dataSize - 2 ) )
{
qWarning() << "MDict: parseCompressedBlock: plain: checksum not match";
return false;
}
decompressedBlock = QByteArray( dataPtr, dataSize );
}
else if ( type == 0x01000000 )
{
// LZO compression
int result;
lzo_uint blockSize = decompressedBlockSize;
decompressedBlock.resize( blockSize );
result = lzo1x_decompress_safe( ( const uchar * )dataPtr, dataSize,
( uchar * )decompressedBlock.data(), &blockSize, NULL );
if ( result != LZO_E_OK || blockSize != decompressedBlockSize )
{
qWarning() << "MDict: parseCompressedBlock: decompression failed";
return false;
}
if ( checksum != lzo_adler32( lzo_adler32( 0, NULL, 0 ),
( const uchar * )decompressedBlock.constData(),
blockSize ) )
{
qWarning() << "MDict: parseCompressedBlock: lzo: checksum not match";
return false;
}
}
else if ( type == 0x02000000 )
{
// zlib compression
if ( checksum != qFromBigEndian<quint32>( ( const uchar * )dataPtr + dataSize - 4 ) )
{
qWarning() << "MDict: parseCompressedBlock: zlib: checksum not match";
return false;
}
decompressedBlock = zlibDecompress( dataPtr, dataSize );
}
else
{
qWarning() << "MDict: parseCompressedBlock: unknown type";
return false;
}
return true;
}
qint64 MdictParser::readNumber( QDataStream & in )
{
if ( numberTypeSize_ == 8 )
{
qint64 val;
in >> val;
return val;
}
else
{
quint32 val;
in >> val;
return val;
}
}
quint32 MdictParser::readU8OrU16( QDataStream & in, bool isU16 )
{
if ( isU16 )
{
quint16 val;
in >> val;
return val;
}
else
{
quint8 val;
in >> val;
return val;
}
}
bool MdictParser::readHeader( QDataStream & in )
{
qint32 headerTextSize;
in >> headerTextSize;
QByteArray headerTextUtf16 = file_->read( headerTextSize );
if ( headerTextUtf16.size() != headerTextSize )
return false;
QString headerText = toUtf16( "UTF-16LE", headerTextUtf16.constData(), headerTextUtf16.size() );
headerTextUtf16.clear();
QDomNamedNodeMap headerAttributes = parseHeaderAttributes( headerText );
encoding_ = headerAttributes.namedItem( "Encoding" ).toAttr().value();
if ( encoding_ == "GBK" || encoding_ == "GB2312" )
{
encoding_ = "GB18030";
}
else if ( encoding_.isEmpty() || encoding_ == "UTF-16" )
{
encoding_ = "UTF-16LE";
}
// stylesheet attribute if present takes form of:
// styleId # 1-255
// style.prefix
// style.suffix
if ( headerAttributes.contains( "StyleSheet" ) )
{
QString styleSheets = headerAttributes.namedItem( "StyleSheet" ).toAttr().value();
QStringList lines = styleSheets.split( QRegExp( "[\r\n]" ), QString::KeepEmptyParts );
for ( int i = 0; i < lines.size() - 3; i += 3 )
{
styleSheets_[lines[i].toInt()] = pair<QString, QString>( lines[i + 1], lines[i + 2] );
}
}
// before version 2.0, number is 4 bytes integer
// version 2.0 and above uses 8 bytes
version_ = headerAttributes.namedItem( "GeneratedByEngineVersion" ).toAttr().value().toDouble();
if ( version_ < 2.0 )
numberTypeSize_ = 4;
else
numberTypeSize_ = 8;
// 4 bytes unknown
if ( in.skipRawData( 4 ) != 4 )
return false;
// Read metadata
rtl_ = headerAttributes.namedItem( "Left2Right" ).toAttr().value() != "Yes";
QString title = headerAttributes.namedItem( "Title" ).toAttr().value();
if ( title == "Title (No HTML code allowed)" )
{
// Use filename instead
QFileInfo fi( filename_ );
title_ = fi.baseName();
}
else
{
if ( title.contains( '<' ) || title.contains( '>' ) )
title_ = QTextDocumentFragment::fromHtml( title ).toPlainText();
else
title_ = title;
}
QString description = headerAttributes.namedItem( "Description" ).toAttr().value();
description_ = QTextDocumentFragment::fromHtml( description ).toPlainText();
return true;
}
bool MdictParser::readHeadWordBlockInfos( QDataStream & in )
{
// number of headword blocks
numHeadWordBlocks_ = readNumber( in );
// number of entries
wordCount_ = readNumber( in );
// unknown field
if ( version_ >= 2.0 )
{
if ( in.skipRawData( numberTypeSize_ ) != numberTypeSize_ )
return false;
}
// number of bytes of a headword block info
headWordBlockInfoSize_ = readNumber( in );
// number of bytes of a headword block
headWordBlockSize_ = readNumber( in );
// unknown field
if ( version_ >= 2.0 )
{
if ( in.skipRawData( 4 ) != 4 )
return false;
}
headWordBlockInfoPos_ = file_->pos();
// read headword block info, which indicates headword block's compressed and decompressed size
QByteArray headWordBlockInfo = file_->read( headWordBlockInfoSize_ );
if ( headWordBlockInfo.size() != headWordBlockInfoSize_ )
return false;
if ( version_ >= 2.0 )
{
quint32 type;
quint32 checksum;
quint32 value;
QDataStream headWordBlockInfoStream( headWordBlockInfo );
headWordBlockInfoStream.setByteOrder( QDataStream::BigEndian );
headWordBlockInfoStream >> type >> checksum;
headWordBlockInfoStream.skipRawData( headWordBlockInfoSize_ - 8 - 4 );
headWordBlockInfoStream >> value;
// 02 00 00 00
if ( type != 0x02000000 )
{
qWarning() << "MDict: readHeadWordBlockInfos: type not match";
return false;
}
if ( checksum == value )
{
// Decompress
headWordBlockInfo = zlibDecompress( headWordBlockInfo.data() + 8,
headWordBlockInfo.size() - 8 );
}
else
{
qWarning() << "MDict: readHeadWordBlockInfos: checksum not match, try brute force...";
headWordPos_ = file_->pos();
bruteForce_ = true;
bruteForceEnd_ = false;
return true;
}
}
headWordPos_ = file_->pos();
headWordBlockInfos_ = decodeHeadWordBlockInfo( headWordBlockInfo );
headWordBlockInfosIter_ = headWordBlockInfos_.begin();
return true;
}
bool MdictParser::readRecordBlockInfos()
{
file_->seek( headWordBlockInfoPos_ + headWordBlockInfoSize_ + headWordBlockSize_ );
QDataStream in( file_ );
in.setByteOrder( QDataStream::BigEndian );
qint64 numRecordBlocks = readNumber( in );
readNumber( in ); // entry count, skip
qint64 recordInfoSize = readNumber( in );
totalRecordsSize_ = readNumber( in );
recordPos_ = file_->pos() + recordInfoSize;
// Build record block index
recordBlockInfos_.reserve( numRecordBlocks );
qint64 acc1 = 0;
qint64 acc2 = 0;
for ( qint64 i = 0; i < numRecordBlocks; i++ )
{
RecordIndex r;
r.compressedSize = readNumber( in );
r.decompressedSize = readNumber( in );
r.startPos = acc1;
r.endPos = acc1 + r.compressedSize;
r.shadowStartPos = acc2;
r.shadowEndPos = acc2 + r.decompressedSize;
recordBlockInfos_.push_back( r );
acc1 = r.endPos;
acc2 = r.shadowEndPos;
}
return true;
}
MdictParser::BlockInfoVector MdictParser::decodeHeadWordBlockInfo( QByteArray const & headWordBlockInfo )
{
BlockInfoVector headWordBlockInfos;
QDataStream s( headWordBlockInfo );
s.setByteOrder( QDataStream::BigEndian );
bool isU16 = false;
int textTermSize = 0;
if ( version_ >= 2.0 )
{
isU16 = true;
textTermSize = 1;
}
while ( !s.atEnd() )
{
// unknown
s.skipRawData( numberTypeSize_ );
// Text head size
quint32 textHeadSize = readU8OrU16( s, isU16 );
// Text head
if ( encoding_ != "UTF-16LE" )
s.skipRawData( textHeadSize + textTermSize );
else
s.skipRawData( ( textHeadSize + textTermSize ) * 2 );
// Text tail Size
quint32 textTailSize = readU8OrU16( s, isU16 );
// Text tail
if ( encoding_ != "UTF-16LE" )
s.skipRawData( textTailSize + textTermSize );
else
s.skipRawData( ( textTailSize + textTermSize ) * 2 );
// headword block compressed size
qint64 compressedSize = readNumber( s );
// headword block decompressed size
qint64 decompressedSize = readNumber( s );
headWordBlockInfos.push_back( BlockInfoVector::value_type( compressedSize, decompressedSize ) );
}
return headWordBlockInfos;
}
MdictParser::HeadWordIndex MdictParser::splitHeadWordBlock( QByteArray const & block )
{
HeadWordIndex index;
const char * p = block.constData();
const char * end = p + block.size();
while ( p < end )
{
qint64 headWordId = ( numberTypeSize_ == 8 ) ?
qFromBigEndian<qint64>( ( const uchar * )p ) :
qFromBigEndian<quint32>( ( const uchar * )p );
p += numberTypeSize_;
QByteArray headWordBuf;
if ( encoding_ == "UTF-16LE" )
{
int headWordLength = u16StrSize( ( const ushort * )p );
headWordBuf = QByteArray( p, ( headWordLength + 1 ) * 2 );
}
else
{
int headWordLength = strlen( p );
headWordBuf = QByteArray( p, headWordLength + 1 );
}
p += headWordBuf.size();
QString headWord = toUtf16( encoding_, headWordBuf.constBegin(), headWordBuf.size() );
index.push_back( HeadWordIndex::value_type( headWordId, headWord ) );
}
return index;
}
bool MdxParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex,
MdxParser::ArticleHandler & articleHandler )
{
size_t prevIdx = ( size_t ) ( -1 );
QByteArray decompressed;
for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); i++ )
{
size_t idx = RecordIndex::bsearch( recordBlockInfos_, i->first );
RecordIndex const & recordIndex = recordBlockInfos_[idx];
if ( idx == ( size_t )( -1 ) )
return false;
// Reload if index changes
if ( prevIdx != idx )
{
prevIdx = idx;
file_->seek( recordPos_ + recordIndex.startPos );
QByteArray compressed;
compressed.resize( recordIndex.compressedSize );
file_->read( compressed.data(), recordIndex.compressedSize );
if ( !parseCompressedBlock( recordIndex.compressedSize, compressed,
recordIndex.decompressedSize, decompressed ) )
return false;
}
HeadWordIndex::const_iterator iNext = i + 1;
size_t articleSize;
if ( iNext == headWordIndex.end() )
articleSize = recordIndex.shadowEndPos - i->first;
else
articleSize = iNext->first - i->first;
QString article = toUtf16( encoding_, decompressed.constData() + i->first - recordIndex.shadowStartPos, articleSize );
articleHandler.handleAritcle( i->second, article );
}
return true;
}
QString & MdxParser::substituteStylesheet( QString & article, MdxParser::StyleSheets const & styleSheets )
{
QRegExp rx( "`(\\d+)`" );
QString endStyle;
int pos = 0;
while ( ( pos = rx.indexIn( article, pos ) ) != -1 )
{
int styleId = rx.cap( 1 ).toInt();
StyleSheets::const_iterator iter = styleSheets.find( styleId );
if ( iter != styleSheets.end() )
{
QString rep = endStyle + iter->second.first;
article.replace( pos, rx.cap( 0 ).length(), rep );
pos += rep.length();
endStyle = iter->second.second;
}
else
{
article.replace( pos, rx.cap( 0 ).length(), endStyle );
pos += endStyle.length();
endStyle = "";
}
}
article += endStyle;
return article;
}
bool MddParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex,
MddParser::ResourceHandler & resourceHandler )
{
for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); i++ )
{
size_t idx = RecordIndex::bsearch( recordBlockInfos_, i->first );
RecordIndex const & recordIndex = recordBlockInfos_[idx];
if ( idx == ( size_t )( -1 ) )
return false;
HeadWordIndex::const_iterator iNext = i + 1;
size_t resourceSize;
if ( iNext == headWordIndex.end() )
resourceSize = recordIndex.shadowEndPos - i->first;
else
resourceSize = iNext->first - i->first;
resourceHandler.handleResource( i->second, recordIndex.decompressedSize,
recordPos_ + recordIndex.startPos, recordIndex.compressedSize,
i->first - recordIndex.shadowStartPos, resourceSize );
}
return true;
}