2012-02-20 21:47:14 +00:00
|
|
|
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
|
|
|
|
|
|
|
#include "indexedzip.hh"
|
|
|
|
#include "zipfile.hh"
|
|
|
|
#include <zlib.h>
|
2013-11-16 18:34:09 +00:00
|
|
|
#include "gddebug.hh"
|
2012-01-30 13:13:58 +00:00
|
|
|
#include "utf8.hh"
|
|
|
|
#include "iconv.hh"
|
2014-10-10 12:51:39 +00:00
|
|
|
#include "wstring_qt.hh"
|
2022-02-27 05:17:37 +00:00
|
|
|
#if (QT_VERSION >= QT_VERSION_CHECK(6,0,0))
|
2022-02-28 16:26:59 +00:00
|
|
|
#include <QtCore5Compat/QTextCodec>
|
|
|
|
#else
|
|
|
|
#include <QTextCodec>
|
2022-02-27 05:17:37 +00:00
|
|
|
#endif
|
2013-09-20 14:25:44 +00:00
|
|
|
|
2009-10-25 22:49:24 +00:00
|
|
|
using namespace BtreeIndexing;
|
|
|
|
using std::vector;
|
|
|
|
|
|
|
|
/// Points the underlying file object at @p name and tries to open it
/// read-only. The outcome is stored in zipIsOpen (which the other methods
/// check before doing any work) and is also returned to the caller.
bool IndexedZip::openZipFile( QString const & name )
{
  zip.setFileName( name );

  // Record and report the result of the open attempt in one step
  return ( zipIsOpen = zip.open( QFile::ReadOnly ) );
}
|
|
|
|
|
|
|
|
/// Tells whether the index holds at least one entry for @p name.
/// Always returns false while no zip file is open.
bool IndexedZip::hasFile( gd::wstring const & name )
{
  // Short-circuit: the index is not consulted unless the zip file is open
  return zipIsOpen && !findArticles( name ).empty();
}
|
|
|
|
|
|
|
|
/// Looks @p name up in the index and, if found, loads the contents of the
/// corresponding zip entry into @p data.
///
/// @return false if no zip file is open or the name is not in the index;
///         otherwise the result of loading by offset.
bool IndexedZip::loadFile( gd::wstring const & name, vector< char > & data )
{
  if ( zipIsOpen )
  {
    vector< WordArticleLink > const matches = findArticles( name );

    // Only the first match is used; its article offset is handed to the
    // offset-based overload
    if ( !matches.empty() )
      return loadFile( matches.front().articleOffset, data );
  }

  return false;
}
|
|
|
|
|
|
|
|
bool IndexedZip::loadFile( uint32_t offset, vector< char > & data )
|
|
|
|
{
|
|
|
|
if ( !zipIsOpen )
|
|
|
|
return false;
|
|
|
|
|
2009-10-25 22:49:24 +00:00
|
|
|
// Now seek into the zip file and read its header
|
|
|
|
|
2012-09-28 12:39:52 +00:00
|
|
|
if ( !zip.seek( offset ) )
|
2009-10-25 22:49:24 +00:00
|
|
|
return false;
|
|
|
|
|
|
|
|
ZipFile::LocalFileHeader header;
|
|
|
|
|
|
|
|
if ( !ZipFile::readLocalHeader( zip, header ) )
|
|
|
|
{
|
2022-06-01 15:40:00 +00:00
|
|
|
GD_DPRINTF( "Failed to load header" );
|
2009-10-25 22:49:24 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Which algorithm was used?
|
|
|
|
|
|
|
|
switch( header.compressionMethod )
|
|
|
|
{
|
|
|
|
case ZipFile::Uncompressed:
|
2022-06-01 15:40:00 +00:00
|
|
|
GD_DPRINTF( "Uncompressed" );
|
2009-10-25 22:49:24 +00:00
|
|
|
data.resize( header.uncompressedSize );
|
2012-10-31 13:58:35 +00:00
|
|
|
return (size_t) zip.read( &data.front(), data.size() ) == data.size();
|
2009-10-25 22:49:24 +00:00
|
|
|
|
|
|
|
case ZipFile::Deflated:
|
|
|
|
{
|
|
|
|
// Now do the deflation
|
|
|
|
|
|
|
|
QByteArray compressedData = zip.read( header.compressedSize );
|
|
|
|
|
|
|
|
if ( compressedData.size() != (int)header.compressedSize )
|
|
|
|
return false;
|
|
|
|
|
|
|
|
data.resize( header.uncompressedSize );
|
|
|
|
|
|
|
|
z_stream stream;
|
|
|
|
|
|
|
|
memset( &stream, 0, sizeof( stream ) );
|
|
|
|
|
|
|
|
stream.next_in = ( Bytef * ) compressedData.data();
|
|
|
|
stream.avail_in = compressedData.size();
|
|
|
|
stream.next_out = ( Bytef * ) &data.front();
|
|
|
|
stream.avail_out = data.size();
|
|
|
|
|
|
|
|
if ( inflateInit2( &stream, -MAX_WBITS ) != Z_OK )
|
|
|
|
{
|
|
|
|
data.clear();
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( inflate( &stream, Z_FINISH ) != Z_STREAM_END )
|
|
|
|
{
|
2014-05-10 21:02:31 +00:00
|
|
|
GD_DPRINTF( "Not zstream end!" );
|
2009-10-25 22:49:24 +00:00
|
|
|
|
|
|
|
data.clear();
|
|
|
|
|
|
|
|
inflateEnd( &stream );
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
inflateEnd( &stream );
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
2012-01-30 13:13:58 +00:00
|
|
|
|
2014-10-10 12:52:22 +00:00
|
|
|
/// Walks the central directory of the opened zip file and adds every entry
/// with a supported compression method to @p zipFileNames, using the decoded
/// file name as the word and the entry's local header offset as the link.
///
/// Names made of 7-bit characters only are added once. Zip files do not say
/// which encoding they use, so any other name is added under each decoding
/// that works: UTF-8 first, then - unless the entry is flagged as UTF-8 -
/// the system locale, CP866 and CP1251 (the last two only when the result
/// differs from the system-locale one).
///
/// @param zipFileNames index that receives the file names
/// @param filesCount   optional counter; when non-null it is reset to zero
///                     once the central directory has been located and then
///                     incremented once per entry added under at least one name
/// @return false if no zip file is open or the central directory cannot be
///         located, true otherwise
bool IndexedZip::indexFile( BtreeIndexing::IndexedWords &zipFileNames, quint32 * filesCount )
{
  if ( !zipIsOpen || !ZipFile::positionAtCentralDir( zip ) )
    return false;

  // File seems to be a valid zip file

  QTextCodec * const systemCodec = QTextCodec::codecForLocale();

  ZipFile::CentralDirEntry entry;

  if ( filesCount )
    *filesCount = 0;

  while ( ZipFile::readNextEntry( zip, entry ) )
  {
    if ( entry.compressionMethod == ZipFile::Unsupported )
    {
      qWarning( "Zip warning: compression method unsupported -- skipping file \"%s\"\n",
                entry.fileName.data() );
      continue;
    }

    // Set once the current entry has been added to the counter, so that an
    // entry indexed under several spellings is still counted only once
    bool counted = false;

    // Adds one spelling of the current entry's name to the index and makes
    // sure the entry is reflected in *filesCount
    auto const registerName = [ & ]( wstring const & spelling )
    {
      zipFileNames.addSingleWord( spelling, entry.localHeaderOffset );

      if ( filesCount && !counted )
      {
        *filesCount += 1;
        counted = true;
      }
    };

    // Check if the file name has some non-ascii letters: scan up to the
    // terminating zero looking for a byte with the high bit set
    bool sevenBitOnly = true;

    for ( unsigned char const * byte =
            reinterpret_cast< unsigned char const * >( entry.fileName.constData() );
          *byte; ++byte )
    {
      if ( *byte & 0x80 )
      {
        sevenBitOnly = false;
        break;
      }
    }

    if ( sevenBitOnly )
    {
      // Add entry as is
      registerName( Utf8::decode( entry.fileName.data() ) );
      continue;
    }

    // Try assuming different encodings. Those are UTF8, system locale and two
    // Russian ones (Windows and Windows OEM). Unfortunately, zip
    // files do not say which encoding they utilize.

    // Utf8
    try
    {
      registerName( Utf8::decode( entry.fileName.constData() ) );
    }
    catch( Utf8::exCantDecode & )
    {
      // Failed to decode
    }

    if ( entry.fileNameInUTF8 )
      continue; // The entry is flagged as UTF-8, no further guesses are made

    // System locale
    wstring nameInSystemLocale;

    if ( systemCodec )
    {
      nameInSystemLocale = gd::toWString(
        systemCodec->toUnicode( entry.fileName.constData(), entry.fileName.size() ) );

      if ( !nameInSystemLocale.empty() )
        registerName( nameInSystemLocale );
    }

    // CP866, then CP1251
    char const * const russianEncodings[] = { "CP866", "CP1251" };

    for ( char const * encoding : russianEncodings )
    {
      try
      {
        wstring const decoded = Iconv::toWstring( encoding, entry.fileName.constData(),
                                                  entry.fileName.size() );

        // Skip the spelling already added for the system locale
        if ( nameInSystemLocale.compare( decoded ) != 0 )
          registerName( decoded );
      }
      catch( Iconv::Ex & )
      {
        // Failed to decode
      }
    }
  }

  return true;
}
|