MDict: Support encrypted keyword index

2024-12-18 03:14:06 +00:00 · 2015-10-09 01:11:45 +08:00 · 2015-10-09 01:11:45 +08:00 · 3fe3c0ea2b
parent b4bb1e9635
commit 3fe3c0ea2b
2 changed files with 180 additions and 179 deletions
--- a/mdictparser.cc
+++ b/mdictparser.cc
@ -1,8 +1,10 @@
 // https://bitbucket.org/xwang/mdict-analysis
 // https://github.com/zhansliu/writemdict/blob/master/fileformat.md
 // Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
 //
 // Copyright (C) 2012, 2013 Xiaoqiang Wang <xiaoqiangwang AT gmail DOT com>
 // Copyright (C) 2013 Timon Wong <timon86.wang AT gmail DOT com>
 // Copyright (C) 2015 Zhe Wang <0x1998 AT gmail DOT com>
 //
 // This program is a free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
@ -23,8 +25,9 @@
 #include <iconv.h>
 #include <lzo/lzo1x.h>
-#include <algorithm>
+extern "C" {
-#include <iterator>
+#include <libavutil/mem.h>
+#include <libavutil/ripemd.h>
 }
 #include <QtEndian>
 #include <QStringList>
@ -34,13 +37,18 @@
 #include <QDomDocument>
 #include <QTextDocumentFragment>
 #include <QDebug>
 #include "decompress.hh"
 #include "gddebug.hh"
 namespace Mdict
 {
 // Bit flags naming the parts of an MDict file that can be encrypted.
 // EcryptedHeadWordIndex is masked against the integer parsed from the
 // header's "Encrypted" attribute to decide whether the headword index
 // must be decrypted. The "Ecrypted" spelling (sic) is kept on purpose:
 // other code refers to these identifiers.
 enum EncryptedSection
 {
  EcryptedHeadWordHeader = 0x01,
  EcryptedHeadWordIndex = 0x02
 };
 static inline int u16StrSize( const ushort * unicode )
 {
  int size = 0;
@ -103,9 +111,8 @@ MdictParser::MdictParser() :
  recordPos_( 0 ),
  wordCount_( 0 ),
  numberTypeSize_( 0 ),
-  rtl_( false ),
+  encrypted_( 0 ),
-  bruteForce_( false ),
+  rtl_( false )
  bruteForceEnd_( true )
 {
 }
@ -114,7 +121,7 @@ bool MdictParser::open( const char * filename )
  filename_ = QString::fromUtf8( filename );
  file_ = new QFile( filename_ );
-  qDebug() << "MdictParser: open " << filename_;
+  GD_DPRINTF( "MdictParser: open %s\n", filename );
  if ( file_.isNull() || !file_->exists() )
    return false;
@ -139,39 +146,6 @@ bool MdictParser::open( const char * filename )
 bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIndex )
 {
  if ( bruteForce_ )
  {
    if ( bruteForceEnd_ )
      return false;
    headWordIndex.clear();
    ScopedMemMap mapping( *file_, headWordPos_, headWordBlockSize_ );
    if ( !mapping.startAddress() )
      return false;
    const char * pDataStart = ( const char * )mapping.startAddress();
    const char * pDataEnd = pDataStart + headWordBlockSize_;
    const char pattern[] = {0x02, 0x00, 0x00, 0x00};
    const char * patternBegin = pattern;
    const char * patternEnd = pattern + 4;
    const char * p;
    do
    {
      p = std::search( pDataStart + 4, pDataEnd, patternBegin, patternEnd );
      QByteArray decompressed = zlibDecompress( pDataStart + 8, p - ( pDataStart + 8 ) );
      HeadWordIndex currentIndex = splitHeadWordBlock( decompressed );
      headWordIndex.insert( headWordIndex.end(), currentIndex.begin(), currentIndex.end() );
      pDataStart = p;
    }
    while ( p != pDataEnd );
    bruteForceEnd_ = true;
    return true;
  }
  else
  {
  if ( headWordBlockInfosIter_ == headWordBlockInfos_.end() )
    return false;
@ -194,7 +168,13 @@ bool MdictParser::readNextHeadWordIndex( MdictParser::HeadWordIndex & headWordIn
  headWordIndex = splitHeadWordBlock( decompressed );
  headWordBlockInfosIter_++;
  return true;
-  }
+}
 bool MdictParser::checkAdler32(const char * buffer, unsigned int len, quint32 checksum)
 {
  uLong adler = adler32( 0L, Z_NULL, 0 );
  adler = adler32( adler, ( const Bytef * ) buffer, len );
  return (adler & 0xFFFFFFFF) == checksum;
 }
 QString MdictParser::toUtf16( const char * fromCode, const char * from, size_t fromSize )
@ -236,52 +216,71 @@ QString MdictParser::toUtf16( const char * fromCode, const char * from, size_t f
  return QString::fromUtf16( ( const ushort * )&result.front() );
 }
-bool MdictParser::parseCompressedBlock( qint64 compressedBlockSize, const char * compressedBlockPtr,
+bool MdictParser::decryptHeadWordIndex(char * buffer, qint64 len)
-                                        qint64 decompressedBlockSize, QByteArray & decompressedBlock )
+{
  // Decrypts an encrypted headword index block in place.
  //
  // buffer: start of the block; the first 8 bytes are left untouched and
  //         everything after them is overwritten with the decrypted data.
  // len:    total size of the block in bytes, including those 8 bytes.
  //
  // Returns false when the block is too short to contain the 8-byte
  // prefix, or when the RIPEMD context cannot be allocated or initialised;
  // true otherwise.
  //
  // The key is the RIPEMD-128 digest of bytes 4..7 of the block (the
  // position parseCompressedBlock reads the checksum from) followed by
  // the constant bytes 95 36 00 00.
  if ( !buffer || len < 8 )
    return false;
  struct AVRIPEMD * ripemd = av_ripemd_alloc();
  if ( !ripemd )
    return false;
  if ( av_ripemd_init( ripemd, 128 ) != 0 )
  {
    // The context is heap-allocated by libavutil; release it on every path.
    av_free( ripemd );
    return false;
  }
  av_ripemd_update( ripemd, ( const uchar * ) buffer + 4, 4 );
  av_ripemd_update( ripemd, ( const uchar * ) "\x95\x36\x00\x00", 4 );
  uint8_t key[16];
  av_ripemd_final( ripemd, key );
  av_free( ripemd );
  // Skip the 8-byte prefix; only the payload is encrypted.
  buffer += 8;
  len -= 8;
  // Each byte: swap nibbles, then XOR with the previous *encrypted* byte
  // (0x36 for the first one), the low 8 bits of its index, and the key
  // byte at index modulo 16.
  uint8_t prev = 0x36;
  for (qint64 i = 0; i < len; ++i)
  {
    uint8_t byte = buffer[i];
    byte = (byte >> 4) | (byte << 4);
    byte = byte ^ prev ^ (i & 0xFF) ^ key[i % 16];
    // Remember the ciphertext byte before it is overwritten.
    prev = buffer[i];
    buffer[i] = byte;
  }
  return true;
 }
 bool MdictParser::parseCompressedBlock( qint64 compressedBlockSize,
                                        const char * compressedBlockPtr,
                                        qint64 decompressedBlockSize,
                                        QByteArray & decompressedBlock )
 {
  if ( compressedBlockSize <= 8 )
    return false;
-  qint64 dataSize = compressedBlockSize - 8;
+  // compression type
-  const char * dataPtr = compressedBlockPtr + 8;
+  quint32 type = qFromBigEndian<quint32>( ( const uchar * ) compressedBlockPtr );
-  // 4bytes - type
+  quint32 checksum = qFromBigEndian<quint32>( ( const uchar * )compressedBlockPtr + 4 );
-  // 4bytes - checksum
+  const char * buf = compressedBlockPtr + 8;
-  quint32 type;
+  qint64 size = compressedBlockSize - 8;
  quint32 checksum;
  type = qFromBigEndian<quint32>( ( const uchar * ) compressedBlockPtr );
  checksum = qFromBigEndian<quint32>( ( const uchar * )compressedBlockPtr + sizeof( quint32 ) );
-  if ( type == 0x00000000 )
+  switch ( type )
  {
    case 0x00000000:
      // No compression
-    checksum &= 0xffff;
+      if ( !checkAdler32( buf, size, checksum ) )
    quint16 sum = 0;
    for ( qint64 i = 0; i < dataSize; i++ )
      {
-      sum += dataPtr[i];
+        gdWarning( "MDict: parseCompressedBlock: plain: checksum not match" );
    }
    sum += 1;
    if ( checksum != sum )
    {
      qWarning() << "MDict: parseCompressedBlock: plain: checksum not match";
        return false;
      }
-    decompressedBlock = QByteArray( dataPtr, dataSize );
+      decompressedBlock = QByteArray( buf, size );
-  }
+      return true;
-  else if ( type == 0x01000000 )
+
    case 0x01000000:
      {
        // LZO compression
        int result;
        lzo_uint blockSize = ( lzo_uint )decompressedBlockSize;
        decompressedBlock.resize( blockSize );
-    result = lzo1x_decompress_safe( ( const uchar * )dataPtr, dataSize,
+        result = lzo1x_decompress_safe( ( const uchar * ) buf, size,
-                                    ( uchar * )decompressedBlock.data(), &blockSize, NULL );
+                                        ( uchar * )decompressedBlock.data(),
                                        &blockSize, NULL );
        if ( result != LZO_E_OK || blockSize != ( lzo_uint )decompressedBlockSize )
        {
-      qWarning() << "MDict: parseCompressedBlock: decompression failed";
+          gdWarning( "MDict: parseCompressedBlock: decompression failed" );
          return false;
        }
@ -289,24 +288,26 @@ bool MdictParser::parseCompressedBlock( qint64 compressedBlockSize, const char *
                                      ( const uchar * )decompressedBlock.constData(),
                                      blockSize ) )
        {
-      qWarning() << "MDict: parseCompressedBlock: lzo: checksum not match";
+          gdWarning( "MDict: parseCompressedBlock: lzo: checksum does not match" );
          return false;
        }
      }
-  else if ( type == 0x02000000 )
+      break;
  {
    // zlib compression
    if ( checksum != qFromBigEndian<quint32>( ( const uchar * )dataPtr + dataSize - 4 ) )
    {
      qWarning() << "MDict: parseCompressedBlock: zlib: checksum not match";
      return false;
    }
-    decompressedBlock = zlibDecompress( dataPtr, dataSize );
+    case 0x02000000:
-  }
+      // zlib compression
-  else
+      decompressedBlock = zlibDecompress( buf, size );
      if ( !checkAdler32( decompressedBlock.constData(), decompressedBlock.size(),
                          checksum ) )
      {
-    qWarning() << "MDict: parseCompressedBlock: unknown type";
+        gdWarning( "MDict: parseCompressedBlock: zlib: checksum does not match" );
        return false;
      }
      break;
    default:
      gdWarning( "MDict: parseCompressedBlock: unknown type" );
      return false;
  }
@ -355,7 +356,18 @@ bool MdictParser::readHeader( QDataStream & in )
    return false;
  QString headerText = toUtf16( "UTF-16LE", headerTextUtf16.constData(), headerTextUtf16.size() );
  // Adler-32 checksum of the header text (little-endian)
  quint32 checksum;
  in.setByteOrder( QDataStream::LittleEndian );
  in >> checksum;
  if ( !checkAdler32( headerTextUtf16.constData(), headerTextUtf16.size(), checksum ) )
  {
    gdWarning( "MDict: readHeader: checksum does not match" );
    return false;
  }
  headerTextUtf16.clear();
  in.setByteOrder( QDataStream::BigEndian );
  QDomNamedNodeMap headerAttributes = parseHeaderAttributes( headerText );
@ -391,9 +403,8 @@ bool MdictParser::readHeader( QDataStream & in )
  else
    numberTypeSize_ = 8;
-  // 4 bytes unknown
+  // Encrypted ?
-  if ( in.skipRawData( 4 ) != 4 )
+  encrypted_ = headerAttributes.namedItem("Encrypted").toAttr().value().toInt();
    return false;
  // Read metadata
  rtl_ = headerAttributes.namedItem( "Left2Right" ).toAttr().value() != "Yes";
@ -418,87 +429,77 @@ bool MdictParser::readHeader( QDataStream & in )
 bool MdictParser::readHeadWordBlockInfos( QDataStream & in )
 {
  QByteArray header = file_->read( version_ >= 2.0 ? 40 : 32 );
  QDataStream stream( header );
  // number of headword blocks
-  numHeadWordBlocks_ = readNumber( in );
+  numHeadWordBlocks_ = readNumber( stream );
  // number of entries
-  wordCount_ = readNumber( in );
+  wordCount_ = readNumber( stream );
-  // unknown field
+  // number of bytes of a headword block info after decompression
  qint64 decompressedSize;
  if ( version_ >= 2.0 )
-  {
+    stream >> decompressedSize;
    if ( in.skipRawData( numberTypeSize_ ) != numberTypeSize_ )
      return false;
  }
-  // number of bytes of a headword block info
+  // number of bytes of a headword block info before decompression
-  headWordBlockInfoSize_ = readNumber( in );
+  headWordBlockInfoSize_ = readNumber( stream );
  // number of bytes of a headword block
-  headWordBlockSize_ = readNumber( in );
+  headWordBlockSize_ = readNumber( stream );
-  // unknown field
+  // Adler-32 checksum of the preceding header bytes. If they are encrypted,
  // this is the checksum of the decrypted version
  if ( version_ >= 2.0 )
  {
-    if ( in.skipRawData( 4 ) != 4 )
+    quint32 checksum;
    in >> checksum;
    if ( !checkAdler32( header.constData(), 40, checksum ) )
      return false;
  }
  headWordBlockInfoPos_ = file_->pos();
-  // read headword block info, which indicates headword block's compressed and decompressed size
+  // read headword block info
  QByteArray headWordBlockInfo = file_->read( headWordBlockInfoSize_ );
  if ( headWordBlockInfo.size() != headWordBlockInfoSize_ )
    return false;
  if ( version_ >= 2.0 )
  {
-    quint32 type;
+    // decrypt
-    quint32 checksum;
+    if ( encrypted_ & EcryptedHeadWordIndex )
    quint32 value;
    QDataStream headWordBlockInfoStream( headWordBlockInfo );
    headWordBlockInfoStream.setByteOrder( QDataStream::BigEndian );
    headWordBlockInfoStream >> type >> checksum;
    headWordBlockInfoStream.skipRawData( headWordBlockInfoSize_ - 8 - 4 );
    headWordBlockInfoStream >> value;
    // 02 00 00 00
    if ( type != 0x02000000 )
    {
-      qWarning() << "MDict: readHeadWordBlockInfos: type not match";
+      if ( !decryptHeadWordIndex( headWordBlockInfo.data(),
                                  headWordBlockInfo.size() ) )
        return false;
    }
-    if ( checksum == value )
+    QByteArray decompressed;
-    {
+    if ( !parseCompressedBlock( headWordBlockInfo.size(), headWordBlockInfo.data(),
-      // Decompress
+                                decompressedSize, decompressed) )
-      headWordBlockInfo = zlibDecompress( headWordBlockInfo.data() + 8,
+      return false;
-                                          headWordBlockInfo.size() - 8 );
+
    headWordBlockInfos_ = decodeHeadWordBlockInfo( decompressed );
  }
  else
  {
      qWarning() << "MDict: readHeadWordBlockInfos: checksum not match, try brute force...";
      headWordPos_ = file_->pos();
      bruteForce_ = true;
      bruteForceEnd_ = false;
      return true;
    }
  }
  headWordPos_ = file_->pos();
    headWordBlockInfos_ = decodeHeadWordBlockInfo( headWordBlockInfo );
  }
  headWordPos_ = file_->pos();
  headWordBlockInfosIter_ = headWordBlockInfos_.begin();
  return true;
 }
 bool MdictParser::readRecordBlockInfos()
 {
-  file_->seek( headWordBlockInfoPos_ + headWordBlockInfoSize_ + headWordBlockSize_ );
+  file_->seek( headWordBlockInfoPos_ + headWordBlockInfoSize_ +
               headWordBlockSize_ );
  QDataStream in( file_ );
  in.setByteOrder( QDataStream::BigEndian );
  qint64 numRecordBlocks = readNumber( in );
-  readNumber( in ); // entry count, skip
+  readNumber( in ); // total number of records, skip
  qint64 recordInfoSize = readNumber( in );
  totalRecordsSize_ = readNumber( in );
  recordPos_ = file_->pos() + recordInfoSize;
@ -544,18 +545,18 @@ MdictParser::BlockInfoVector MdictParser::decodeHeadWordBlockInfo( QByteArray co
  while ( !s.atEnd() )
  {
-    // unknown
+    // Number of keywords in the block
    s.skipRawData( numberTypeSize_ );
-    // Text head size
+    // Size of the first headword in the block
    quint32 textHeadSize = readU8OrU16( s, isU16 );
-    // Text head
+    // The first headword
    if ( encoding_ != "UTF-16LE" )
      s.skipRawData( textHeadSize + textTermSize );
    else
      s.skipRawData( ( textHeadSize + textTermSize ) * 2 );
-    // Text tail Size
+    // Size of the last headword in the block
    quint32 textTailSize = readU8OrU16( s, isU16 );
-    // Text tail
+    // The last headword
    if ( encoding_ != "UTF-16LE" )
      s.skipRawData( textTailSize + textTermSize );
    else
--- a/mdictparser.hh
+++ b/mdictparser.hh
@ -165,8 +165,7 @@ public:
    return toUtf16( fromCode.toLatin1().constData(), from, fromSize );
  }
  static bool parseCompressedBlock( qint64 compressedBlockSize, const char * compressedBlockPtr,
-                                    qint64 decompressedBlockSize, QByteArray & decompressedBlock );
+                                    qint64 decompressedBlockSize, QByteArray & decompressedBlock);
  static QString & substituteStylesheet( QString & article, StyleSheets const & styleSheets );
  static inline string substituteStylesheet( string const & article, StyleSheets const & styleSheets )
  {
@ -178,6 +177,8 @@ public:
 protected:
  qint64 readNumber( QDataStream & in );
  static quint32 readU8OrU16( QDataStream & in, bool isU16 );
  static bool checkAdler32(const char * buffer, unsigned int len, quint32 checksum);
  static bool decryptHeadWordIndex(char * buffer, qint64 len);
  bool readHeader( QDataStream & in );
  bool readHeadWordBlockInfos( QDataStream & in );
  bool readRecordBlockInfos();
@ -207,9 +208,8 @@ protected:
  quint32 wordCount_;
  int numberTypeSize_;
  int encrypted_;
  bool rtl_;
  bool bruteForce_;
  bool bruteForceEnd_;
 };
 }