goldendict-ng/mdictparser.cc

// https://bitbucket.org/xwang/mdict-analysis
// https://github.com/zhansliu/writemdict/blob/master/fileformat.md
// Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
//
// Copyright (C) 2012, 2013 Xiaoqiang Wang <xiaoqiangwang AT gmail DOT com>
// Copyright (C) 2013 Timon Wong <timon86.wang AT gmail DOT com>
// Copyright (C) 2015 Zhe Wang <0x1998 AT gmail DOT com>
//
// This program is a free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 3 of the License.
//
// You can get a copy of GNU General Public License along this program
// But you can always get it from http://www.gnu.org/licenses/gpl.txt
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

#include "mdictparser.hh"

#include <errno.h>
#include <zlib.h>
#include <lzo/lzo1x.h>

#include <QtEndian>
#include <QStringList>
#include <QByteArray>
#include <QFileInfo>
#include <QRegularExpression>
#include <QDomDocument>
#include <QTextDocumentFragment>
#include <QDataStream>
#if (QT_VERSION >= QT_VERSION_CHECK(6,0,0))
#include <QtCore5Compat/QTextCodec>
#else
#include <QTextCodec>
#endif
#include "decompress.hh"
#include "gddebug.hh"
#include "ripemd.hh"
#include "utils.hh"
#include "htmlescape.hh"

namespace Mdict
{

// Bit flags matched against the integer value of the header's "Encrypted"
// attribute (kept in encrypted_ and tested with a bitwise AND).
enum EncryptedSection
{
  // Bit 0. NOTE(review): presumably "headword header is encrypted"; this flag
  // is not consulted anywhere in this file - confirm against the format notes.
  EcryptedHeadWordHeader = 1 << 0,
  // Bit 1: the headword block info has to go through decryptHeadWordIndex().
  EcryptedHeadWordIndex = 1 << 1
};

// Returns how many 16-bit units precede the first zero unit of `unicode`.
// A null pointer is treated as an empty string and yields 0.
static inline int u16StrSize( const ushort * unicode )
{
  if ( !unicode )
    return 0;

  const ushort * cursor = unicode;
  while ( *cursor != 0 )
    ++cursor;

  return static_cast< int >( cursor - unicode );
}

// Parses `headerText` as XML and returns the attributes of its root element.
// The result of setContent() is not checked; whatever documentElement()
// yields afterwards is what the attributes are taken from.
static QDomNamedNodeMap parseHeaderAttributes( const QString & headerText )
{
  QDomDocument headerDoc;
  headerDoc.setContent( headerText );

  return headerDoc.documentElement().attributes();
}

// Binary-searches `offsets` for the entry that compares equal to `val`,
// using RecordIndex's own == and < comparisons against a qint64.
// `offsets` must be ordered consistently with those comparisons.
// Returns the index of the matching entry, or ( size_t )( -1 ) if none matches
// (including when `offsets` is empty).
size_t MdictParser::RecordIndex::bsearch( const vector<MdictParser::RecordIndex> & offsets, qint64 val )
{
  if ( offsets.empty() )
    return ( size_t ) ( -1 );

  size_t lo = 0;
  size_t hi = offsets.size() - 1;

  while ( lo <= hi )
  {
    // Same midpoint as ( lo + hi ) / 2, but the sum cannot wrap.
    size_t mid = lo + ( ( hi - lo ) >> 1 );
    RecordIndex const & p = offsets[mid];
    if ( p == val )
      return mid;

    if ( p < val )
    {
      lo = mid + 1;
    }
    else
    {
      // hi is unsigned: "mid - 1" at mid == 0 would wrap to SIZE_MAX, keep the
      // loop alive and index far outside the vector. Nothing is left to search.
      if ( mid == 0 )
        break;
      hi = mid - 1;
    }
  }

  return ( size_t ) ( -1 );
}

MdictParser::MdictParser() :
  version_( 0 ),
  numHeadWordBlocks_( 0 ),
  headWordBlockInfoSize_( 0 ),
  headWordBlockSize_( 0 ),
  headWordBlockInfoPos_( 0 ),
  headWordPos_( 0 ),
  totalRecordsSize_( 0 ),
  recordPos_( 0 ),
  wordCount_( 0 ),
  numberTypeSize_( 0 ),
  encrypted_( 0 ),
  rtl_( false )
{
}

// Opens the dictionary at `filename` (UTF-8 encoded path) read-only and parses,
// in order, the header, the headword block infos and the record block infos.
// Returns false as soon as the file cannot be opened or any stage fails.
bool MdictParser::open( const char * filename )
{
  filename_ = QString::fromUtf8( filename );
  file_ = new QFile( filename_ );

  gdDebug( "MdictParser: open %s", filename );

  const bool opened = !file_.isNull() && file_->exists() && file_->open( QIODevice::ReadOnly );
  if ( !opened )
    return false;

  QDataStream in( file_ );
  in.setByteOrder( QDataStream::BigEndian );

  // Short-circuit evaluation keeps the stages strictly sequential.
  return readHeader( in ) && readHeadWordBlockInfos( in ) && readRecordBlockInfos();
}

// Decodes the next headword block into `headWordIndex`.
// Returns false when every block has already been consumed, or when the block
// cannot be mapped or decompressed. The file cursor headWordPos_ moves past the
// block once it has been mapped; the block iterator advances only on success.
bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIndex )
{
  if ( headWordBlockInfosIter_ == headWordBlockInfos_.end() )
    return false;

  const qint64 packedSize = headWordBlockInfosIter_->first;
  const qint64 unpackedSize = headWordBlockInfosIter_->second;

  // Every block begins with a 4-byte type tag followed by a 4-byte checksum.
  if ( packedSize < 8 )
    return false;

  ScopedMemMap mapping( *file_, headWordPos_, packedSize );
  if ( !mapping.startAddress() )
    return false;

  headWordPos_ += packedSize;

  QByteArray unpacked;
  const bool decoded = parseCompressedBlock( packedSize, ( char * )mapping.startAddress(),
                                             unpackedSize, unpacked );
  if ( !decoded )
    return false;

  headWordIndex = splitHeadWordBlock( unpacked );
  ++headWordBlockInfosIter_;
  return true;
}

bool MdictParser::checkAdler32(const char * buffer, unsigned int len, quint32 checksum)
{
  uLong adler = adler32( 0L, Z_NULL, 0 );
  adler = adler32( adler, ( const Bytef * ) buffer, len );
  return (adler & 0xFFFFFFFF) == checksum;
}

// Converts `fromSize` bytes at `from`, encoded in the codec named `fromCode`,
// into a QString.
// Returns an empty QString when either pointer is null or when no codec with
// that name is available.
QString MdictParser::toUtf16( const char * fromCode, const char * from, size_t fromSize )
{
  if ( !fromCode || !from )
    return QString();

  // codecForName() yields a null pointer for names it does not recognise
  // (e.g. a bogus "Encoding" attribute); dereferencing it would crash.
  QTextCodec * codec = QTextCodec::codecForName( fromCode );
  if ( !codec )
    return QString();

  return codec->toUnicode( from, fromSize );
}

// Decrypts, in place, an encrypted headword block info of `len` bytes.
// The first 8 bytes are left untouched; the key is the RIPEMD-128 digest of
// bytes 4..7 followed by the constant 95 36 00 00.
// Returns false when the buffer is null or too short to hold those 8 bytes,
// true otherwise.
bool MdictParser::decryptHeadWordIndex(char * buffer, qint64 len)
{
  // Bytes 4..7 feed the key and the payload starts at offset 8, so anything
  // shorter cannot be processed without reading outside the buffer.
  if ( !buffer || len < 8 )
    return false;

  RIPEMD128 ripemd;
  ripemd.update( ( const uchar * ) buffer + 4, 4 );
  ripemd.update( ( const uchar * ) "\x95\x36\x00\x00", 4 );

  uint8_t key[16];
  ripemd.digest( key );

  buffer += 8;
  len -= 8;
  uint8_t prev = 0x36;
  for (qint64 i = 0; i < len; ++i)
  {
    uint8_t byte = buffer[i];
    // Swap the two nibbles, then undo the XOR chain: previous cipher byte,
    // low byte of the position, and the matching key byte.
    byte = (byte >> 4) | (byte << 4);
    byte = byte ^ prev ^ (i & 0xFF) ^ key[i % 16];
    // The chain uses the *encrypted* byte, so remember it before overwriting.
    prev = buffer[i];
    buffer[i] = byte;
  }
  return true;
}

// Decodes one storage block into `decompressedBlock`.
//
// The block starts with a 4-byte compression type and a 4-byte Adler-32
// checksum (both read big-endian), followed by the payload:
//   type 0x00000000 - stored as is, checksum covers the payload;
//   type 0x01000000 - LZO1X, checksum covers the decompressed data, which must
//                     be exactly `decompressedBlockSize` bytes long;
//   type 0x02000000 - zlib, checksum covers the decompressed data.
//
// Returns false for blocks of 8 bytes or fewer, unknown types, decompression
// failures and checksum mismatches.
bool MdictParser::parseCompressedBlock( qint64 compressedBlockSize,
                                        const char * compressedBlockPtr,
                                        qint64 decompressedBlockSize,
                                        QByteArray & decompressedBlock )
{
  if ( compressedBlockSize <= 8 )
    return false;

  // compression type
  quint32 type = qFromBigEndian<quint32>( ( const uchar * ) compressedBlockPtr );
  quint32 checksum = qFromBigEndian<quint32>( ( const uchar * )compressedBlockPtr + 4 );
  const char * buf = compressedBlockPtr + 8;
  qint64 size = compressedBlockSize - 8;

  switch ( type )
  {
    case 0x00000000:
      // No compression
      if ( !checkAdler32( buf, size, checksum ) )
      {
        gdWarning( "MDict: parseCompressedBlock: plain: checksum not match" );
        return false;
      }

      decompressedBlock = QByteArray( buf, size );
      return true;

    case 0x01000000:
      {
        // LZO compression
        if ( decompressedBlockSize < 0 )
        {
          gdWarning( "MDict: parseCompressedBlock: lzo: invalid decompressed size" );
          return false;
        }

        int result;
        lzo_uint blockSize = ( lzo_uint )decompressedBlockSize;
        decompressedBlock.resize( blockSize );

        // lzo1x_decompress_safe() treats blockSize as the capacity of the
        // output buffer. If the size taken from the file cannot be represented
        // by QByteArray, the buffer ends up smaller than that and the
        // decompressor would write past its end.
        if ( ( qint64 )decompressedBlock.size() != decompressedBlockSize )
        {
          gdWarning( "MDict: parseCompressedBlock: lzo: invalid decompressed size" );
          return false;
        }

        result = lzo1x_decompress_safe( ( const uchar * ) buf, size,
                                        ( uchar * )decompressedBlock.data(),
                                        &blockSize, NULL );

        if ( result != LZO_E_OK || blockSize != ( lzo_uint )decompressedBlockSize )
        {
          gdWarning( "MDict: parseCompressedBlock: decompression failed" );
          return false;
        }

        if ( checksum != lzo_adler32( lzo_adler32( 0, NULL, 0 ),
                                      ( const uchar * )decompressedBlock.constData(),
                                      blockSize ) )
        {
          gdWarning( "MDict: parseCompressedBlock: lzo: checksum does not match" );
          return false;
        }
      }
      break;

    case 0x02000000:
      // zlib compression
      decompressedBlock = zlibDecompress( buf, size );

      if ( !checkAdler32( decompressedBlock.constData(), decompressedBlock.size(),
                          checksum ) )
      {
        gdWarning( "MDict: parseCompressedBlock: zlib: checksum does not match" );
        return false;
      }
      break;

    default:
      gdWarning( "MDict: parseCompressedBlock: unknown type" );
      return false;
  }

  return true;
}

// Reads one size/offset field from `in`: a 64-bit signed value when
// numberTypeSize_ is 8, otherwise a 32-bit unsigned value widened to qint64.
qint64 MdictParser::readNumber( QDataStream & in )
{
  if ( numberTypeSize_ != 8 )
  {
    quint32 narrow;
    in >> narrow;
    return narrow;
  }

  qint64 wide;
  in >> wide;
  return wide;
}

// Reads an unsigned length field from `in`: two bytes when `isU16` is set,
// a single byte otherwise. The value is returned widened to quint32.
quint32 MdictParser::readU8OrU16( QDataStream & in, bool isU16 )
{
  if ( !isU16 )
  {
    quint8 oneByte;
    in >> oneByte;
    return oneByte;
  }

  quint16 twoBytes;
  in >> twoBytes;
  return twoBytes;
}

// Reads the dictionary header from the start of the file and fills in the
// members derived from it: encoding_, styleSheets_, version_, numberTypeSize_,
// encrypted_, rtl_, title_ and description_.
//
// Layout consumed here: a 32-bit length (in the stream's current byte order),
// that many bytes of UTF-16LE XML text, then a little-endian Adler-32 checksum
// of the text. `in` is left in big-endian mode on the success path.
//
// Returns false when the header text cannot be read completely or its checksum
// does not match.
bool MdictParser::readHeader( QDataStream & in )
{
  qint32 headerTextSize;
  in >> headerTextSize;

  QByteArray headerTextUtf16 = file_->read( headerTextSize );
  if ( headerTextUtf16.size() != headerTextSize )
    return false;

  QString headerText = toUtf16( "UTF-16LE", headerTextUtf16.constData(), headerTextUtf16.size() );

  // Adler-32 checksum of the header text (little-endian)
  quint32 checksum;
  in.setByteOrder( QDataStream::LittleEndian );
  in >> checksum;
  if ( !checkAdler32( headerTextUtf16.constData(), headerTextUtf16.size(), checksum ) )
  {
    gdWarning( "MDict: readHeader: checksum does not match" );
    return false;
  }
  headerTextUtf16.clear();
  in.setByteOrder( QDataStream::BigEndian );


  // Extract the raw StyleSheet attribute with a regular expression *before* the
  // text is cleaned up and handed to the XML parser below.
  QString styleSheets;

  if( headerText.contains( "StyleSheet" ) )
  {
    // a workaround to bypass https://bugreports.qt.io/browse/QTBUG-102612
    QRegularExpression rx( "StyleSheet=\"([^\"]*?)\"", QRegularExpression::CaseInsensitiveOption );

    auto match = rx.match( headerText );

    if( match.hasMatch() || match.hasPartialMatch() )
    {
      styleSheets = match.captured( 1 );
    }
  }

  // With control characters left in, Qt 6.x cannot parse the attribute values.
  headerText.remove(QRegularExpression("\\p{C}"));

  QDomNamedNodeMap headerAttributes = parseHeaderAttributes( headerText );

  // Map the declared encoding onto the codec name actually used for decoding.
  encoding_ = headerAttributes.namedItem( "Encoding" ).toAttr().value();
  if ( encoding_ == "GBK" || encoding_ == "GB2312" )
  {
    encoding_ = "GB18030";
  }
  else if ( encoding_.isEmpty() || encoding_ == "UTF-16" )
  {
    encoding_ = "UTF-16LE";
  }

  // stylesheet attribute if present takes form of:
  //   styleId # 1-255
  //   style.prefix
  //   style.suffix
  if ( !styleSheets.isEmpty() )
  {
    QStringList lines = styleSheets.split( QRegularExpression( "[\r\n]" ), Qt::KeepEmptyParts );

    // Consume every complete (id, prefix, suffix) triple. The bound is phrased
    // as "the whole triple fits" so that the last style is not dropped when the
    // attribute does not end with a line break.
    for( int i = 0; i + 2 < lines.size(); i += 3 )
    {
      styleSheets_[ lines[ i ].toInt() ] =
        pair( Html::fromHtmlEscaped( lines[ i + 1 ] ),  Html::fromHtmlEscaped( lines[ i + 2 ] ) );
    }
  }

  // before version 2.0, number is 4 bytes integer
  // version 2.0 and above uses 8 bytes
  version_ = headerAttributes.namedItem( "GeneratedByEngineVersion" ).toAttr().value().toDouble();
  if ( version_ < 2.0 )
    numberTypeSize_ = 4;
  else
    numberTypeSize_ = 8;

  // Encrypted ?
  encrypted_ = headerAttributes.namedItem("Encrypted").toAttr().value().toInt();

  // Read metadata
  // Any value other than "Yes" - a missing attribute included - marks the
  // dictionary as right-to-left.
  rtl_ = headerAttributes.namedItem( "Left2Right" ).toAttr().value() != "Yes";
  QString title = headerAttributes.namedItem( "Title" ).toAttr().value();
  if ( title.isEmpty() || title.length() < 5 || title == "Title (No HTML code allowed)" )
  {
    // Missing, very short or placeholder title: use the file name instead
    QFileInfo fi( filename_ );
    title_ = fi.baseName();
  }
  else
  {
    // Strip markup only when the title looks like it contains some.
    if ( title.contains( '<' ) || title.contains( '>' ) )
      title_ = QTextDocumentFragment::fromHtml( title ).toPlainText();
    else
      title_ = title;
  }
  QString description = headerAttributes.namedItem( "Description" ).toAttr().value();
  description_ = description; //QTextDocumentFragment::fromHtml( description ).toPlainText();
  return true;
}

// Reads the headword section header and the headword block info that follows
// it, filling numHeadWordBlocks_, wordCount_, headWordBlockInfoSize_,
// headWordBlockSize_, headWordBlockInfoPos_, headWordBlockInfos_ and
// headWordPos_, and resetting the block iterator to the first block.
//
// For format 2.0 and later the section header carries an extra field and an
// Adler-32 checksum, and the block info is (optionally encrypted and)
// compressed; older files store it plain.
//
// `in` is the stream positioned on the file and is used for the checksum only.
// Returns false on truncated input, a checksum mismatch, or a block info that
// cannot be decrypted or decompressed.
bool MdictParser::readHeadWordBlockInfos( QDataStream & in )
{
  const qint64 headerSize = version_ >= 2.0 ? ( numberTypeSize_ * 5 )
                                            : ( numberTypeSize_ * 4 );
  QByteArray header = file_->read( headerSize );
  // A short read means a truncated file; the checksum below would otherwise
  // be computed over bytes that are not there.
  if ( header.size() != headerSize )
    return false;

  // This stream keeps QDataStream's default byte order.
  QDataStream stream( header );

  // number of headword blocks
  numHeadWordBlocks_ = readNumber( stream );
  // number of entries
  wordCount_ = readNumber( stream );

  // number of bytes of a headword block info after decompression
  // (present, and used, only for version 2.0 and above)
  qint64 decompressedSize = 0;
  if ( version_ >= 2.0 )
    stream >> decompressedSize;

  // number of bytes of a headword block info before decompression
  headWordBlockInfoSize_ = readNumber( stream );
  // number of bytes of a headword block
  headWordBlockSize_ = readNumber( stream );

  // Adler-32 checksum of the header. If those are encrypted, it is
  // the checksum of the decrypted version
  if ( version_ >= 2.0 )
  {
    quint32 checksum;
    in >> checksum;
    if ( !checkAdler32( header.constData(), numberTypeSize_ * 5, checksum ) )
      return false;
  }

  headWordBlockInfoPos_ = file_->pos();

  // read headword block info
  QByteArray headWordBlockInfo = file_->read( headWordBlockInfoSize_ );
  if ( headWordBlockInfo.size() != headWordBlockInfoSize_ )
    return false;

  if ( version_ >= 2.0 )
  {
    // decrypt
    if ( encrypted_ & EcryptedHeadWordIndex )
    {
      if ( !decryptHeadWordIndex( headWordBlockInfo.data(),
                                  headWordBlockInfo.size() ) )
        return false;
    }

    QByteArray decompressed;
    if ( !parseCompressedBlock( headWordBlockInfo.size(), headWordBlockInfo.data(),
                                decompressedSize, decompressed) )
      return false;

    headWordBlockInfos_ = decodeHeadWordBlockInfo( decompressed );
  }
  else
  {
    headWordBlockInfos_ = decodeHeadWordBlockInfo( headWordBlockInfo );
  }

  // The headword blocks themselves start right after the block info.
  headWordPos_ = file_->pos();
  headWordBlockInfosIter_ = headWordBlockInfos_.begin();
  return true;
}

// Reads the record section header located right after the headword blocks and
// appends one RecordIndex per record block to recordBlockInfos_.
//
// Each entry stores the block's compressed and decompressed sizes together
// with its running offsets: startPos/endPos accumulate compressed sizes
// (offsets relative to recordPos_), shadowStartPos/shadowEndPos accumulate
// decompressed sizes. Also sets totalRecordsSize_ and recordPos_.
//
// Returns false when the section cannot be reached or read completely, or when
// the block count it announces is impossible for this file.
bool MdictParser::readRecordBlockInfos()
{
  if ( !file_->seek( headWordBlockInfoPos_ + headWordBlockInfoSize_ +
                     headWordBlockSize_ ) )
    return false;

  QDataStream in( file_ );
  in.setByteOrder( QDataStream::BigEndian );
  qint64 numRecordBlocks = readNumber( in );
  readNumber( in ); // total number of records, skip
  qint64 recordInfoSize = readNumber( in );
  totalRecordsSize_ = readNumber( in );
  recordPos_ = file_->pos() + recordInfoSize;

  // The count comes straight from the file. Every block needs several bytes
  // of index data, so a count that is negative or larger than the file itself
  // is corrupt; passing it to reserve() would throw.
  if ( in.status() != QDataStream::Ok || numRecordBlocks < 0 || numRecordBlocks > file_->size() )
    return false;

  // Build record block index
  recordBlockInfos_.reserve( numRecordBlocks );

  qint64 acc1 = 0; // running compressed offset
  qint64 acc2 = 0; // running decompressed ("shadow") offset
  for ( qint64 i = 0; i < numRecordBlocks; i++ )
  {
    RecordIndex r;
    r.compressedSize = readNumber( in );
    r.decompressedSize = readNumber( in );
    r.startPos = acc1;
    r.endPos = acc1 + r.compressedSize;
    r.shadowStartPos = acc2;
    r.shadowEndPos = acc2 + r.decompressedSize;
    recordBlockInfos_.push_back( r );

    acc1 = r.endPos;
    acc2 = r.shadowEndPos;
  }

  // A read past the end of the file leaves the stream in an error state.
  return in.status() == QDataStream::Ok;
}

// Walks the (already decrypted and decompressed) headword block info and
// returns one (compressed size, decompressed size) pair per headword block.
// The entry count and the first/last headword of every block are skipped.
MdictParser::BlockInfoVector MdictParser::decodeHeadWordBlockInfo( QByteArray const & headWordBlockInfo )
{
  // Format 2.0 and later stores headword lengths in 16 bits and follows each
  // headword with one terminator unit; older files use 8-bit lengths and no
  // terminator.
  const bool wideLength = ( version_ >= 2.0 );
  const int terminatorUnits = wideLength ? 1 : 0;
  // Lengths count text units: two bytes each for UTF-16LE, one byte otherwise.
  const int bytesPerUnit = ( encoding_ != "UTF-16LE" ) ? 1 : 2;

  QDataStream reader( headWordBlockInfo );
  reader.setByteOrder( QDataStream::BigEndian );

  BlockInfoVector infos;
  while ( !reader.atEnd() )
  {
    // Number of keywords in the block - not needed here
    reader.skipRawData( numberTypeSize_ );

    // First headword of the block, then the last one: a length field followed
    // by the text itself, both skipped.
    for ( int edge = 0; edge < 2; ++edge )
    {
      const quint32 textUnits = readU8OrU16( reader, wideLength );
      reader.skipRawData( ( textUnits + terminatorUnits ) * bytesPerUnit );
    }

    const qint64 packedSize = readNumber( reader );
    const qint64 unpackedSize = readNumber( reader );
    infos.emplace_back( packedSize, unpackedSize );
  }

  return infos;
}

// Splits a decompressed headword block into (record offset, headword) entries.
//
// Every entry is a big-endian number of numberTypeSize_ bytes followed by the
// headword in encoding_, terminated by a zero unit (two zero bytes for
// UTF-16LE, one otherwise). The terminator is part of the bytes handed to
// toUtf16().
//
// Scanning never leaves `block`: an entry cut off by the end of the block is
// either dropped (incomplete number) or truncated to the bytes available
// (missing terminator).
MdictParser::HeadWordIndex MdictParser::splitHeadWordBlock( QByteArray const & block )
{
  HeadWordIndex index;

  const char * p = block.constData();
  const char * end = p + block.size();
  const bool isUtf16 = ( encoding_ == "UTF-16LE" );

  while ( p < end )
  {
    // Not enough bytes left for the record offset: truncated trailing entry.
    if ( end - p < numberTypeSize_ )
      break;

    qint64 headWordId = ( numberTypeSize_ == 8 ) ?
                        qFromBigEndian<qint64>( ( const uchar * )p ) :
                        qFromBigEndian<quint32>( ( const uchar * )p );
    p += numberTypeSize_;

    const int remaining = static_cast< int >( end - p );
    int textBytes = 0; // bytes in front of the terminator
    int terminatorBytes = 1;

    if ( isUtf16 )
    {
      terminatorBytes = 2;
      // Compare byte pairs instead of casting to a 16-bit pointer: the data is
      // not guaranteed to be 2-byte aligned.
      while ( textBytes + 2 <= remaining && ( p[ textBytes ] != 0 || p[ textBytes + 1 ] != 0 ) )
        textBytes += 2;
    }
    else
    {
      while ( textBytes < remaining && p[ textBytes ] != 0 )
        ++textBytes;
    }

    // Include the terminator, but never reach beyond the block.
    int headWordBytes = textBytes + terminatorBytes;
    if ( headWordBytes > remaining )
      headWordBytes = remaining;

    QByteArray headWordBuf( p, headWordBytes );
    p += headWordBuf.size();
    QString headWord = toUtf16( encoding_, headWordBuf.constBegin(), headWordBuf.size() );
    index.emplace_back(headWordId, headWord);
  }

  return index;
}

// Resolves every entry of `headWordIndex` to the record block that contains
// it and reports it through recordHandler.handleRecord().
//
// For each headword a RecordInfo is built with the file position and sizes of
// the enclosing compressed block, the record's offset inside the decompressed
// block, and the record's size (distance to the next headword's offset, or to
// the end of the block for the last headword).
//
// Returns false as soon as a headword's offset is not covered by any record
// block; entries before it have already been reported.
bool MdictParser::readRecordBlock( MdictParser::HeadWordIndex & headWordIndex,
                                   MdictParser::RecordHandler & recordHandler )
{
  // cache the index, the headWordIndex is already sorted
  size_t idx = 0;

  for ( HeadWordIndex::const_iterator i = headWordIndex.begin(); i != headWordIndex.end(); ++i )
  {
    // Search again only when the cached block does not reach this offset.
    // The bounds test keeps an empty recordBlockInfos_ from being indexed;
    // bsearch() then reports "not found".
    if ( idx >= recordBlockInfos_.size() || recordBlockInfos_[idx].shadowEndPos <= i->first )
      idx = RecordIndex::bsearch( recordBlockInfos_, i->first );

    if ( idx == ( size_t )( -1 ) )
      return false;

    RecordIndex const & recordIndex = recordBlockInfos_[idx];
    HeadWordIndex::const_iterator iNext = i + 1;
    qint64 recordSize;
    if ( iNext == headWordIndex.end() )
      recordSize = recordIndex.shadowEndPos - i->first;
    else
      recordSize = iNext->first - i->first;

    RecordInfo recordInfo;
    recordInfo.compressedBlockPos = recordPos_ + recordIndex.startPos;
    recordInfo.recordOffset = i->first - recordIndex.shadowStartPos;
    recordInfo.decompressedBlockSize = recordIndex.decompressedSize;
    recordInfo.compressedBlockSize = recordIndex.compressedSize;
    recordInfo.recordSize = recordSize;

    recordHandler.handleRecord( i->second, recordInfo );
  }

  return true;
}

// Expands the `N` style markers (a number between backticks) in `article`,
// in place, and returns a reference to `article`.
//
// Each marker closes the style opened by the previous marker (by emitting that
// style's suffix) and, when N is a key of `styleSheets`, opens the new one (by
// emitting its prefix). Markers with an unknown N are removed and only close
// the previous style. The suffix still pending at the end is appended to the
// article.
QString & MdictParser::substituteStylesheet( QString & article, MdictParser::StyleSheets const & styleSheets )
{
  QRegularExpression styleMarker( "`(\\d+)`", QRegularExpression::UseUnicodePropertiesOption );

  QString rebuilt;       // article text with the markers expanded so far
  QString pendingSuffix; // suffix of the style that is currently open
  int consumed = 0;      // end of the last marker handled

  for ( QRegularExpressionMatchIterator markers = styleMarker.globalMatch( article ); markers.hasNext(); )
  {
    QRegularExpressionMatch marker = markers.next();
    const int id = marker.captured( 1 ).toInt();

    // Copy the plain text between the previous marker and this one.
    rebuilt += article.mid( consumed, marker.capturedStart() - consumed );
    consumed = marker.capturedEnd();

    StyleSheets::const_iterator style = styleSheets.find( id );
    if ( style == styleSheets.end() )
    {
      rebuilt += pendingSuffix;
      pendingSuffix = "";
      continue;
    }

    QString replacement = pendingSuffix + style->second.first;
    rebuilt += replacement;
    pendingSuffix = style->second.second;
  }

  // consumed stays 0 only when no marker was found; the article is then left
  // as it is.
  if ( consumed )
  {
    rebuilt += Utils::rstripnull( article.mid( consumed ) );
    article = rebuilt;
    rebuilt.clear();
  }

  article += pendingSuffix;
  return article;
}

}