goldendict-ng/zim.cc
Igor Kushnir 5034348c1a Allow silencing failed-loading-resource warnings
When a referenced audio resource is not found in a DSL or XDXF
dictionary, GoldenDict searches for this resource by filename in all
other dictionaries within the current group. Naturally, the file is
absent from most dictionaries (see #970). Therefore a "Failed loading
resource" warning is printed for almost every dictionary in the current
group.

These warnings are by far the most frequent on my system. And in the
scenario described above there is nothing wrong at all. So the user may
want to silence these warnings to help notice less frequent and more
important messages. Implement categorized logging to enable this
customization. These warnings can now be disabled by adding the
following line in the [Rules] section of a logging configuration file
(e.g. ~/.config/QtProject/qtlogging.ini on GNU/Linux):
    goldendict.dictionary.resource.warning=false
See also https://doc.qt.io/qt-5/qloggingcategory.html#logging-rules
2022-11-25 22:48:38 +03:00

1903 lines
52 KiB
C++

/* This file is (c) 2012 Abs62
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#ifdef MAKE_ZIM_SUPPORT
#include "zim.hh"
#include "btreeidx.hh"
#include "fsencoding.hh"
#include "folding.hh"
#include "categorized_logging.hh"
#include "gddebug.hh"
#include "utf8.hh"
#include "decompress.hh"
#include "langcoder.hh"
#include "wstring_qt.hh"
#include "filetype.hh"
#include "file.hh"
#include "qt4x5.hh"
#include "tiff.hh"
#include "ftshelpers.hh"
#include "htmlescape.hh"
#include "splitfile.hh"
#ifdef _MSC_VER
#include <stub_msvc.h>
#endif
#include <QByteArray>
#include <QFile>
#include <QFileInfo>
#include <QString>
#include <QRunnable>
#include <QSemaphore>
#include <QAtomicInt>
#include <QImage>
#include <QDir>
#include <QDebug>
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
#include <QRegularExpression>
#endif
#include <string>
#include <set>
#include <map>
#include <algorithm>
namespace Zim {
/// Number of decompressed clusters each ZimFile keeps cached
/// (dimension of the ZimFile::cache array).
#define CACHE_SIZE 3
using std::string;
using std::map;
using std::vector;
using std::multimap;
using std::pair;
using std::set;
using gd::wstring;
using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo;
/// Exception types of the Zim backend, all derived from Dictionary::Ex.
/// exNotZimFile: presumably raised when a file fails Zim validation; it is not
/// thrown in the part of the file visible here.
DEF_EX_STR( exNotZimFile, "Not an Zim file", Dictionary::Ex )
/// exCantReadFile: thrown by ZimFile::open() when the cluster offset table
/// cannot be read; it is constructed with the first file name.
DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex )
/// exUserAbort: thrown inside ZimDictionary::makeFTSIndex() when the
/// cancellation flag is set.
DEF_EX( exUserAbort, "User abort", Dictionary::Ex )
//namespace {
class ZimFile;
#pragma pack( push, 1 )
/// Cluster compression scheme. ZimFile::getClusterData() takes it from the low
/// four bits of the first byte of a cluster; Default and None are both
/// handled there as uncompressed data.
enum CompressionType
{
Default = 0, None, Zlib, Bzip2, Lzma2, Zstd
};
/// Zim file header.
/// ZimFile::open() reads sizeof( ZIM_header ) bytes from the start of the
/// archive straight into this structure, so the field order and widths are
/// the on-disk layout and must not be changed.
struct ZIM_header
{
quint32 magicNumber;
quint16 majorVersion; // >= 6 is required for 8-byte blob offsets (see ZimFile::getClusterData)
quint16 minorVersion;
quint8 uuid[ 16 ];
quint32 articleCount; // article numbers >= articleCount are rejected by the readers
quint32 clusterCount; // number of entries in the table at clusterPtrPos
quint64 urlPtrPos; // file position of a table of 8-byte entry positions, indexed by article number
quint64 titlePtrPos;
quint64 clusterPtrPos; // file position of a table of 8-byte cluster offsets
quint64 mimeListPos; // file position of the zero-terminated MIME type strings
quint32 mainPage;
quint32 layoutPage;
quint64 checksumPos;
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
/// Fixed-size head of an article entry as stored in the archive.
/// The readers in this file first read the 2-byte mimetype on its own and,
/// when it is not 0xFFFF, read the remaining sizeof( ArticleEntry ) - 2 bytes
/// directly into the structure, so the layout must match the file format.
struct ArticleEntry
{
quint16 mimetype; // 0xFFFF marks a redirect, in which case RedirectEntry applies instead
quint8 parameterLen;
char nameSpace;
quint32 revision;
quint32 clusterNumber; // cluster holding the article data
quint32 blobNumber; // index of the article's blob inside that cluster
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
/// Fixed-size head of a redirect entry (an entry whose mimetype is 0xFFFF).
/// As with ArticleEntry, everything after the mimetype is read from the file
/// directly into the structure.
struct RedirectEntry
{
quint16 mimetype;
quint8 parameterLen;
char nameSpace;
quint32 revision;
quint32 redirectIndex; // article number this entry redirects to
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
/// Constants identifying GoldenDict's own index file for a Zim dictionary;
/// indexIsOldOrBad() compares them with the stored IdxHeader fields.
enum
{
Signature = 0x584D495A, // ZIMX on little-endian, XMIZ on big-endian
// Changes whenever this value, the btree format or the folding rules change
CurrentFormatVersion = 3 + BtreeIndexing::FormatVersion + Folding::Version
};
/// Header of GoldenDict's index file for a Zim dictionary.
/// It is read as a whole record (see indexIsOldOrBad() and the ZimDictionary
/// constructor), so the layout is part of the index file format.
struct IdxHeader
{
quint32 signature; // First comes the signature, ZIMX
quint32 formatVersion; // File format version (CurrentFormatVersion)
quint32 indexBtreeMaxElements; // Two fields from IndexInfo
quint32 indexRootOffset;
quint32 resourceIndexBtreeMaxElements; // Two fields from IndexInfo
quint32 resourceIndexRootOffset;
quint32 wordCount;
quint32 articleCount;
quint32 namePtr; // article number holding the dictionary name, or 0xFFFFFFFF if none
quint32 descriptionPtr; // article number holding the description, or 0xFFFFFFFF if none
quint32 langFrom; // Source language
quint32 langTo; // Target language
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
#pragma pack( pop )
// Cluster cache slot, followed by the reader class for (possibly split) zim files
/// One slot of the decompressed-cluster cache kept by ZimFile.
struct Cache
{
char * data; // malloc'ed buffer with the decompressed cluster, or 0 when the slot is empty
quint32 clusterNumber; // cluster stored in the slot; a slot only counts as a hit when count != 0
int stamp; // value of ZimFile::stamp at last use; the smallest stamp is reused first
int count, size; // count: bytes of valid data; size: byte size recorded when the buffer was allocated
unsigned blobs_offset_size; // width in bytes of the blob offsets of the cached cluster
// Starts as an empty slot that is the first candidate for reuse (stamp -1).
Cache() :
data( 0 ),
clusterNumber( 0 ),
stamp( -1 ),
count( 0 ),
size( 0 ),
blobs_offset_size( 0 )
{}
};
/// Reader for a Zim archive built on SplitFile::SplitFile.
/// It keeps the archive header, the list of MIME types, the cluster offsets
/// sorted by file position and a CACHE_SIZE-slot cache of decompressed clusters.
class ZimFile : public SplitFile::SplitFile
{
public:

  ZimFile();
  ZimFile( const QString & name );
  ~ZimFile();

  /// Attaches the archive `name` (and, for "*.zimaa", its sibling parts).
  virtual void setFileName( const QString & name );

  /// Opens the archive and loads header, cluster offsets and MIME types.
  bool open();

  /// Closes the underlying file(s) and releases all cached clusters.
  void close()
  {
    SplitFile::close();
    clearCache();
  }

  /// Header as read by the last open() (all zeroes before that).
  const ZIM_header & header() const
  {
    return zimHeader;
  }

  /// Returns the decompressed data of cluster `cluster_nom` (empty on failure)
  /// and reports the width of its blob offsets through `blob_offset_size`.
  string getClusterData( quint32 cluster_nom, unsigned & blob_offset_size );

  /// MIME type string number `nom`, or an empty string when out of range.
  const QString getMimeType( quint16 nom )
  {
    return mimeTypes.value( nom );
  }

  /// True when the MIME type starts with "text/html" or "text/plain".
  bool isArticleMime( quint16 mime_type )
  {
    if( getMimeType( mime_type ).startsWith( "text/html", Qt::CaseInsensitive ) )
      return true;

    return getMimeType( mime_type ).startsWith( "text/plain", Qt::CaseInsensitive );
  }

  /// Follows a redirect entry and returns the MIME type number of its target.
  quint16 redirectedMimeType( RedirectEntry const & redEntry );

private:

  ZIM_header zimHeader;
  Cache cache[ CACHE_SIZE ];
  int stamp;                                            // use counter for cache replacement
  QVector< QPair< quint64, quint32 > > clusterOffsets;  // (file offset, cluster number), sorted
  QStringList mimeTypes;

  void clearCache();
};
/// Creates a reader with no file attached: zeroed header, use counter at 0.
ZimFile::ZimFile()
  : stamp( 0 )
{
  memset( &zimHeader, 0, sizeof( ZIM_header ) );
}
/// Creates a reader attached to `name`.
/// setFileName() zeroes the header and, through clearCache(), resets the
/// use counter, so no member is left uninitialised.
ZimFile::ZimFile( const QString & name )
{
  this->setFileName( name );
}
/// Frees the buffers owned by the cluster cache.
ZimFile::~ZimFile()
{
  this->clearCache();
}
/// Attaches the archive `name` to this reader.
///
/// Any previously attached file is closed, the header is zeroed and the
/// cluster cache is dropped. When `name` ends with ".zimaa" the archive is
/// treated as split: the following parts ("ab", "ac", ..., "az", "ba", "bb",
/// ...) are appended for as long as they exist as regular files, and the
/// search stops at the first missing part.
void ZimFile::setFileName( const QString & name )
{
  close();
  memset( &zimHeader, 0, sizeof( zimHeader ) );
  clearCache();

  appendFile( name );

  if( !name.endsWith( ".zimaa", Qt::CaseInsensitive ) )
    return;

  QString fname = name;

  for( int i = 0; i < 26; i++ )
  {
    fname[ fname.size() - 2 ] = (char)( 'a' + i );

    // The "aa" part was appended above, so only the first series starts at
    // 'b'. Every later series must start at 'a', otherwise "ba", "ca", ...
    // would never be probed and the remaining parts would be lost.
    int j;
    for( j = ( i == 0 ? 1 : 0 ); j < 26; j++ )
    {
      fname[ fname.size() - 1 ] = (char)( 'a' + j );
      if( !QFileInfo( fname ).isFile() )
        break;
      appendFile( fname );
    }

    // A missing part ends the whole sequence
    if( j < 26 )
      break;
  }
}
void ZimFile::clearCache()
{
for( int i = 0; i < CACHE_SIZE; i++ )
{
if( cache[ i ].data )
{
free( cache[ i ].data );
cache[ i ].data = 0;
}
cache[ i ].clusterNumber = 0;
cache[ i ].stamp = -1;
cache[ i ].count = 0;
cache[ i ].size = 0;
}
stamp = 0;
}
/// Opens the attached archive read-only and loads its metadata.
///
/// Reads the header, builds the list of cluster offsets sorted by file
/// position (needed to compute cluster sizes) and reads the MIME type list.
///
/// @return false when the file cannot be opened or the header is incomplete.
/// @throws exCantReadFile when the cluster offset table cannot be read.
bool ZimFile::open()
{
  if( !SplitFile::open( QIODevice::ReadOnly ) )
    return false;

  memset( &zimHeader, 0, sizeof( zimHeader ) );

  // MIME types are appended below; drop those left over from a previous
  // open() so that the indices always refer to the current archive.
  mimeTypes.clear();

  if( read( reinterpret_cast< char * >( &zimHeader ), sizeof( zimHeader ) ) != sizeof( zimHeader ) )
    return false;

  // Clusters in zim file may be placed in random order.
  // We create sorted offsets list to calculate clusters size.

  clusterOffsets.resize( zimHeader.clusterCount );
  QVector< quint64 > offs;
  offs.resize( zimHeader.clusterCount );

  seek( zimHeader.clusterPtrPos );
  qint64 size = zimHeader.clusterCount * sizeof( quint64 );
  if( read( reinterpret_cast< char * >( offs.data() ), size) != size )
  {
    vector< string > names;
    getFilenames( names );
    throw exCantReadFile( names[ 0 ] );
  }

  for( quint32 i = 0; i < zimHeader.clusterCount; i++ )
    clusterOffsets[ i ] = QPair< quint64, quint32 >( offs.at( i ), i );

  std::sort( clusterOffsets.begin(), clusterOffsets.end() );

  // Read mime types: zero-terminated strings, the list ends with an empty one

  string type;
  char ch;

  seek( zimHeader.mimeListPos );

  for( ; ; )
  {
    type.clear();
    while( getChar( &ch ) )
    {
      if( ch == 0 )
        break;
      type.push_back( ch );
    }
    if( type.empty() )
      break;

    QString s = QString::fromUtf8( type.c_str(), type.size() );
    mimeTypes.append( s );
  }

  return true;
}
/// Returns the decompressed contents of cluster `cluster_nom`.
///
/// The data starts with the table of blob offsets; `blobs_offset_size`
/// receives the width of one offset in bytes (8 for extended clusters of
/// archives with major version >= 6, otherwise 4).
///
/// Clusters holding more than one blob are kept in a small cache whose least
/// recently used slot is replaced on a miss.
///
/// @return the decompressed cluster, or an empty string when the cluster
///         number is unknown, the data cannot be read or decompressed, or
///         the result is too short to hold even the first blob offset.
string ZimFile::getClusterData( quint32 cluster_nom, unsigned & blobs_offset_size )
{
  // Check cache: look for the cluster and, at the same time, for the slot
  // with the oldest stamp in case it has to be replaced

  int target = 0;
  bool found = false;
  int lastStamp = INT_MAX;

  for( int i = 0; i < CACHE_SIZE; i++ )
  {
    if( cache[ i ].clusterNumber == cluster_nom && cache[ i ].count )
    {
      found = true;
      target = i;
      break;
    }

    if( cache[ i ].stamp < lastStamp )
    {
      lastStamp = cache[ i ].stamp;
      target = i;
    }
  }

  cache[ target ].stamp = ++stamp;
  if( stamp < 0 )
  {
    // The counter wrapped around: restart the ageing from scratch
    stamp = 0;
    for( int i = 0; i < CACHE_SIZE; i++ )
      cache[ i ].stamp = -1;
  }

  if( found )
  {
    // Cache hit
    blobs_offset_size = cache[ target ].blobs_offset_size;
    return string( cache[ target ].data, cache[ target ].count );
  }

  // Cache miss, read data from file

  // Calculate cluster size as the distance to the next cluster in file order

  quint64 clusterSize;
  quint32 nom;

  for( nom = 0; nom < zimHeader.clusterCount; nom++ )
    if( clusterOffsets.at( nom ).second == cluster_nom )
      break;

  if( nom >= zimHeader.clusterCount ) // Invalid cluster nom
    return string();

  if( nom < zimHeader.clusterCount - 1 )
    clusterSize = clusterOffsets.at( nom + 1 ).first - clusterOffsets.at( nom ).first;
  else
    clusterSize = size() - clusterOffsets.at( nom ).first;

  // Read cluster data

  seek( clusterOffsets.at( nom ).first );

  char compressionType, cluster_info;
  if( !getChar( &cluster_info ) )
    return string();

  compressionType = cluster_info & 0x0F;
  blobs_offset_size = cluster_info & 0x10 && zimHeader.majorVersion >= 6 ? 8 : 4;

  string decompressedData;

  QByteArray data = read( clusterSize );

  if( compressionType == Default || compressionType == None )
    decompressedData = string( data.data(), data.size() );
  else
  if( compressionType == Zlib )
    decompressedData = decompressZlib( data.constData(), data.size() );
  else
  if( compressionType == Bzip2 )
    decompressedData = decompressBzip2( data.constData(), data.size() );
  else
  if( compressionType == Lzma2 )
    decompressedData = decompressLzma2( data.constData(), data.size() );
  else
  if( compressionType == Zstd )
    decompressedData = decompressZstd( data.constData(), data.size() );
  else
    return string();

  if( decompressedData.empty() )
    return string();

  // A cluster that cannot even hold its first blob offset is corrupt;
  // reading the offset below would run past the end of the buffer.
  if( decompressedData.size() < blobs_offset_size )
    return string();

  // Check BLOBs number in the cluster
  // We cache multi-element clusters only

  quint32 firstOffset32;
  quint64 firstOffset;

  if( blobs_offset_size == 8 )
    memcpy( &firstOffset, decompressedData.data(), sizeof(firstOffset) );
  else
  {
    memcpy( &firstOffset32, decompressedData.data(), sizeof(firstOffset32) );
    firstOffset = firstOffset32;
  }

  // The first offset points just past the offset table, which gives the
  // number of blobs. Guard the subtraction against a bogus first offset.
  quint32 blobCount = 0;
  if( firstOffset >= blobs_offset_size )
    blobCount = ( firstOffset - blobs_offset_size ) / blobs_offset_size;

  if( blobCount > 1 )
  {
    // Fill cache
    int dataSize = decompressedData.size();

    // Reallocate only when the buffer recorded for the slot is too small
    if( cache[ target ].size < dataSize )
    {
      if( cache[ target ].data )
        free( cache[ target ].data );
      cache[ target ].data = ( char * )malloc( dataSize );
      if( cache[ target ].data )
        cache[ target ].size = dataSize;
      else
      {
        // Out of memory: leave the slot empty, the data is still returned
        cache[ target ].size = 0;
        cache[ target ].count = 0;
      }
    }
    if( cache[ target ].size )
    {
      memcpy( cache[ target ].data, decompressedData.c_str(), dataSize );
      cache[ target ].count = dataSize;
      cache[ target ].clusterNumber = cluster_nom;
      cache[ target ].blobs_offset_size = blobs_offset_size;
    }
  }

  return decompressedData;
}
/// Follows the redirect chain starting at `redEntry` and returns the MIME
/// type number of the first non-redirect entry reached.
///
/// The file position is restored before returning.
///
/// @return the MIME type number, or 0xFFFF when the chain cannot be resolved
///         (read error, index out of range, or a redirect cycle).
quint16 ZimFile::redirectedMimeType( RedirectEntry const & redEntry )
{
  RedirectEntry current_entry = redEntry;
  quint64 current_pos = pos();
  quint16 mimetype = 0xFFFF;

  // Entries already visited; stops chains such as A -> B -> A, which the
  // plain self-redirect check below cannot detect
  set< quint32 > visited;

  for( ; ; )
  {
    quint32 current_nom = current_entry.redirectIndex;

    if( current_nom >= zimHeader.articleCount )
      break;

    if( !visited.insert( current_nom ).second )
      break;

    seek( zimHeader.urlPtrPos + (quint64)current_nom * 8 );
    quint64 new_pos;
    if( read( reinterpret_cast< char * >( &new_pos ), sizeof(new_pos) ) != sizeof(new_pos) )
      break;

    seek( new_pos );
    quint16 new_mimetype;
    if( read( reinterpret_cast< char * >( &new_mimetype ), sizeof(new_mimetype) ) != sizeof(new_mimetype) )
      break;

    if( new_mimetype == 0xFFFF ) // Redirect to other article
    {
      // The mimetype was consumed above, read the rest of the entry
      if( read( reinterpret_cast< char * >( &current_entry ) + 2, sizeof( current_entry ) - 2 ) != sizeof( current_entry ) - 2 )
        break;
      if( current_nom == current_entry.redirectIndex )
        break;
    }
    else
    {
      mimetype = new_mimetype;
      break;
    }
  }

  seek( current_pos );
  return mimetype;
}
// Some supporting functions
/// Tells whether the index file must be rebuilt: true when its header cannot
/// be read or carries a different signature or format version.
bool indexIsOldOrBad( string const & indexFile )
{
  File::Class idx( indexFile, "rb" );

  IdxHeader header;

  if( idx.readRecords( &header, sizeof( header ), 1 ) != 1 )
    return true;

  if( header.signature != Signature )
    return true;

  return header.formatVersion != CurrentFormatVersion;
}
/// Returns the number of the cluster that stores article `articleNumber`,
/// following redirects.
///
/// @return the cluster number, or 0xFFFFFFFF when the article number is out
///         of range, the entry cannot be read, or the redirects form a cycle.
quint32 getArticleCluster( ZimFile & file, quint32 articleNumber )
{
  ZIM_header const & header = file.header();

  // Redirect entries already followed; needed to stop cycles longer than a
  // self-redirect (A -> B -> A), which would otherwise loop forever
  set< quint32 > followedRedirects;

  while( articleNumber < header.articleCount )
  {
    file.seek( header.urlPtrPos + (quint64)articleNumber * 8 );
    quint64 pos;
    if( file.read( reinterpret_cast< char * >( &pos ), sizeof(pos) ) != sizeof(pos) )
      break;

    // Read article info

    quint16 mimetype;

    file.seek( pos );
    if( file.read( reinterpret_cast< char * >( &mimetype ), sizeof(mimetype) ) != sizeof(mimetype) )
      break;

    if( mimetype == 0xFFFF ) // Redirect to other article
    {
      RedirectEntry redEntry;
      if( file.read( reinterpret_cast< char * >( &redEntry ) + 2, sizeof(redEntry) - 2 ) != sizeof(redEntry) - 2 )
        break;
      if( articleNumber == redEntry.redirectIndex )
        break;
      if( !followedRedirects.insert( articleNumber ).second )
        break;
      articleNumber = redEntry.redirectIndex;
      continue;
    }

    ArticleEntry artEntry;
    artEntry.mimetype = mimetype;
    if( file.read( reinterpret_cast< char * >( &artEntry ) + 2, sizeof(artEntry) - 2 ) != sizeof(artEntry) - 2 )
      break;

    return artEntry.clusterNumber;
  }

  return 0xFFFFFFFF;
}
/// Reads the data of article `articleNumber` into `result`, following
/// redirects.
///
/// @param file            archive to read from
/// @param articleNumber   number of the article (or of a redirect to it)
/// @param result          receives the article data; cleared first
/// @param loadedArticles  optional set of article numbers to skip: when the
///                        resolved article is in the set nothing is read
/// @return the number of the article actually read, or 0xFFFFFFFF when
///         nothing was read (out-of-range number, read error, redirect cycle,
///         article already in `loadedArticles`, or inconsistent cluster data)
quint32 readArticle( ZimFile & file, quint32 articleNumber, string & result,
                     set< quint32 > * loadedArticles = NULL )
{
  result.clear();

  ZIM_header const & header = file.header();

  // Redirect entries already followed; stops cycles such as A -> B -> A
  set< quint32 > followedRedirects;

  while( articleNumber < header.articleCount )
  {
    file.seek( header.urlPtrPos + (quint64)articleNumber * 8 );
    quint64 pos;
    if( file.read( reinterpret_cast< char * >( &pos ), sizeof(pos) ) != sizeof(pos) )
      break;

    // Read article info

    quint16 mimetype;

    file.seek( pos );
    if( file.read( reinterpret_cast< char * >( &mimetype ), sizeof(mimetype) ) != sizeof(mimetype) )
      break;

    if( mimetype == 0xFFFF ) // Redirect to other article
    {
      RedirectEntry redEntry;
      if( file.read( reinterpret_cast< char * >( &redEntry ) + 2, sizeof(redEntry) - 2 ) != sizeof(redEntry) - 2 )
        break;
      if( articleNumber == redEntry.redirectIndex )
        break;
      if( !followedRedirects.insert( articleNumber ).second )
        break;
      articleNumber = redEntry.redirectIndex;
      continue;
    }

    if( loadedArticles && loadedArticles->find( articleNumber ) != loadedArticles->end() )
      break;

    ArticleEntry artEntry;
    artEntry.mimetype = mimetype;
    if( file.read( reinterpret_cast< char * >( &artEntry ) + 2, sizeof(artEntry) - 2 ) != sizeof(artEntry) - 2 )
      break;

    // Read cluster data

    unsigned offset_size = 0;
    string decompressedData = file.getClusterData( artEntry.clusterNumber, offset_size );
    if( decompressedData.empty() )
      break;

    quint64 const dataSize = decompressedData.size();

    if( offset_size == 0 || dataSize < offset_size )
      break;

    // Take article data from cluster

    quint64 firstOffset;

    if( offset_size == 8 )
      memcpy( &firstOffset, decompressedData.data(), sizeof(firstOffset) );
    else
    {
      quint32 firstOffset32;
      memcpy( &firstOffset32, decompressedData.data(), sizeof(firstOffset32) );
      firstOffset = firstOffset32;
    }

    // The first offset points just past the offset table, so the table must
    // hold at least one entry and fit into the data
    if( firstOffset < offset_size || firstOffset > dataSize )
      break;

    // The table has one offset more than there are blobs (the last one marks
    // the end of the last blob), so valid blob numbers are 0 .. blobCount - 1
    quint64 const blobCount = ( firstOffset - offset_size ) / offset_size;
    if( artEntry.blobNumber >= blobCount )
      break;

    quint64 blobBegin, blobEnd;

    if( offset_size == 8 )
    {
      quint64 offsets[ 2 ];
      memcpy( offsets, decompressedData.data() + (quint64)artEntry.blobNumber * 8, sizeof(offsets) );
      blobBegin = offsets[ 0 ];
      blobEnd = offsets[ 1 ];
    }
    else
    {
      quint32 offsets[ 2 ];
      memcpy( offsets, decompressedData.data() + (quint64)artEntry.blobNumber * 4, sizeof(offsets) );
      blobBegin = offsets[ 0 ];
      blobEnd = offsets[ 1 ];
    }

    // Reject blobs that start outside the data or have a negative length
    if( blobEnd < blobBegin || blobBegin > dataSize )
      break;

    quint64 const blobSize = std::min< quint64 >( blobEnd - blobBegin, dataSize - blobBegin );
    result.append( decompressedData, blobBegin, blobSize );

    return articleNumber;
  }

  return 0xFFFFFFFF;
}
// ZimDictionary
/// Dictionary backed by a Zim archive.
/// Headword lookup comes from the BtreeDictionary base class; a second btree
/// (resourceIndex) is used for resources. Article data is read from the
/// archive `df`, and every such read in this class is made under zimMutex.
class ZimDictionary: public BtreeIndexing::BtreeDictionary
{
// How links inside articles refer to other articles; detected lazily in convert()
enum LINKS_TYPE { UNKNOWN, SLASH, NO_SLASH };
Mutex idxMutex;
Mutex zimMutex, idxResourceMutex;
File::Class idx;
BtreeIndex resourceIndex;
IdxHeader idxHeader;
string dictionaryName;
ZimFile df;
set< quint32 > articlesIndexedForFTS;
LINKS_TYPE linksType;
public:
/// Opens the index file and the archive `dictionaryFiles[ 0 ]`.
ZimDictionary( string const & id, string const & indexFile,
vector< string > const & dictionaryFiles );
~ZimDictionary();
virtual string getName() throw()
{ return dictionaryName; }
virtual map< Dictionary::Property, string > getProperties() throw()
{ return map< Dictionary::Property, string >(); }
virtual unsigned long getArticleCount() throw()
{ return idxHeader.articleCount; }
virtual unsigned long getWordCount() throw()
{ return idxHeader.wordCount; }
inline virtual quint32 getLangFrom() const
{ return idxHeader.langFrom; }
inline virtual quint32 getLangTo() const
{ return idxHeader.langTo; }
virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
vector< wstring > const & alts,
wstring const &,
bool ignoreDiacritics )
THROW_SPEC( std::exception );
virtual sptr< Dictionary::DataRequest > getResource( string const & name )
THROW_SPEC( std::exception );
/// Returns the description, reading it from the archive on first use.
virtual QString const& getDescription();
/// Loads the resource.
/// `data` is left untouched when the resource is not in the resource index.
void loadResource( std::string &resourceName, string & data );
virtual sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString,
int searchMode, bool matchCase,
int distanceBetweenWords,
int maxResults,
bool ignoreWordsOrder,
bool ignoreDiacritics,
QThreadPool * ftsThreadPoolPtr );
/// Fetches the HTML-unescaped raw text of an article; `headword` is cleared.
virtual void getArticleText( uint32_t articleAddress, QString & headword, QString & text );
/// Same as above, but skips articles listed in `loadedArticles` and returns
/// the number of the article read (0xFFFFFFFF when nothing was read).
quint32 getArticleText( uint32_t articleAddress, QString & headword, QString & text,
set< quint32 > * loadedArticles );
virtual void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration );
/// Full-text search is enabled only when it is switched on, "ZIM" is not
/// among the disabled types and the article count fits the size limit
/// (a limit of 0 means no limit).
virtual void setFTSParameters( Config::FullTextSearch const & fts )
{
can_FTS = fts.enabled
&& !fts.disabledTypes.contains( "ZIM", Qt::CaseInsensitive )
&& ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
}
/// Reorders `offsets` by the cluster each article is stored in.
virtual void sortArticlesOffsetsForFTS( QVector< uint32_t > & offsets, QAtomicInt & isCancelled );
protected:
virtual void loadIcon() throw();
private:
/// Loads the article.
/// With `rawText` false the text is additionally passed through convert().
quint32 loadArticle( quint32 address,
string & articleText,
set< quint32 > * loadedArticles,
bool rawText = false );
/// Rewrites links and resource references of an article for display.
string convert( string const & in_data );
friend class ZimArticleRequest;
friend class ZimResourceRequest;
};
/// Opens the index file and the archive, attaches both btree indexes and
/// determines the dictionary name and the state of the full-text index.
ZimDictionary::ZimDictionary( string const & id,
                              string const & indexFile,
                              vector< string > const & dictionaryFiles ) :
  BtreeDictionary( id, dictionaryFiles ),
  idx( indexFile, "rb" ),
  idxHeader( idx.read< IdxHeader >() ),
  df( FsEncoding::decode( dictionaryFiles[ 0 ].c_str() ) ),
  linksType( UNKNOWN )
{
  // Open data file

  df.open();

  // Initialize the indexes

  openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
                        idxHeader.indexRootOffset ),
             idx, idxMutex );

  resourceIndex.openIndex( IndexInfo( idxHeader.resourceIndexBtreeMaxElements,
                                      idxHeader.resourceIndexRootOffset ),
                           idx, idxResourceMutex );

  // Read dictionary name

  if( idxHeader.namePtr != 0xFFFFFFFF )
  {
    // The name is stored as an article of the archive
    readArticle( df, idxHeader.namePtr, dictionaryName );
  }
  else
  {
    // No name stored: fall back to the file name without its directory
    QString path = QDir::fromNativeSeparators( FsEncoding::decode( dictionaryFiles[ 0 ].c_str() ) );
    int lastSlash = path.lastIndexOf( '/' );
    QString baseName = path.mid( lastSlash + 1 );
    dictionaryName = string( baseName.toUtf8().constData() );
  }

  // Full-text search parameters

  can_FTS = true;

  ftsIdxName = indexFile + "_FTS";

  if( !Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName )
      && !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) )
  {
    // An up-to-date full-text index already exists
    FTS_index_completed.ref();
  }
}
/// Closes the archive, which also drops its cluster cache.
ZimDictionary::~ZimDictionary()
{
  this->df.close();
}
void ZimDictionary::loadIcon() throw()
{
if ( dictionaryIconLoaded )
return;
QString fileName =
QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
// Remove the extension
fileName.chop( 3 );
if( !loadIconFromFile( fileName ) )
{
// Load failed -- use default icons
dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/icon32_zim.png");
}
dictionaryIconLoaded = true;
}
/// Reads an article from the archive under zimMutex and, unless `rawText`
/// is set, passes it through convert().
///
/// @return what readArticle() returned: the number of the article read, or
///         0xFFFFFFFF when nothing was read.
quint32 ZimDictionary::loadArticle( quint32 address,
                                    string & articleText,
                                    set< quint32 > * loadedArticles,
                                    bool rawText )
{
  quint32 articleNumber;

  {
    // Only the file access needs the lock, the conversion does not
    Mutex::Lock _( zimMutex );
    articleNumber = readArticle( df, address, articleText, loadedArticles );
  }

  if( rawText )
    return articleNumber;

  articleText = convert( articleText );
  return articleNumber;
}
/// Prepares the HTML of an article for display.
///
/// The text is decoded as UTF-8 and then rewritten in several passes:
///  - background settings of <body> are dropped;
///  - img/script/link references to absolute or "../" paths are redirected
///    to "bres://<dictionary id>/";
///  - unquoted hrefs are quoted;
///  - links to English Wikimedia sites become gdlookup:// links;
///  - remaining local links become gdlookup:// links (on the first link
///    containing '/', the index is probed to decide whether link targets
///    keep their path, see linksType);
///  - escaped "<br>" sequences inside vertically written link texts are
///    turned back into real <br/> tags.
/// Finally a clearing <br> is appended.
///
/// @param in  article data as stored in the archive (UTF-8)
/// @return the converted article, UTF-8 encoded
string ZimDictionary::convert( const string & in )
{
  QString text = QString::fromUtf8( in.c_str() );

#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
  // replace background
  text.replace( QRegularExpression( "<\\s*body\\s+([^>]*)(background(|-color)):([^;\"]*(;|))" ),
                QString( "<body \\1" ) );

  // pattern of img and script
  text.replace( QRegularExpression( "<\\s*(img|script)\\s+([^>]*)src=(\"|)(\\.\\.|)/" ),
                QString( "<\\1 \\2src=\\3bres://%1/").arg( getId().c_str() ) );

  // Fix links without '"'
  text.replace( QRegularExpression( "href=(\\.\\.|)/([^\\s>]+)" ),
                QString( "href=\"\\1/\\2\"" ) );

  // pattern <link... href="..." ...>
  text.replace( QRegularExpression( "<\\s*link\\s+([^>]*)href=\"(\\.\\.|)/" ),
                QString( "<link \\1href=\"bres://%1/").arg( getId().c_str() ) );

  // localize the http://en.wiki***.com|org/wiki/<key> series links
  // excluding those keywords that have ":" in it
  QString urlWiki = "\"http(s|)://en\\.(wiki(pedia|books|news|quote|source|voyage|versity)|wiktionary)\\.(org|com)/wiki/([^:\"]*)\"";
  text.replace( QRegularExpression( "<\\s*a\\s+(class=\"external\"\\s+|)href=" + urlWiki ),
                QString( "<a href=\"gdlookup://localhost/\\6\"" ) );
#else
  // replace background
  text.replace( QRegExp( "<\\s*body\\s+([^>]*)(background(|-color)):([^;\"]*(|;))" ),
                QString( "<body \\1" ) );

  // pattern of img and script
  text.replace( QRegExp( "<\\s*(img|script)\\s+([^>]*)src=(\"|)(\\.\\.|)/" ),
                QString( "<\\1 \\2src=\\3bres://%1/").arg( getId().c_str() ) );

  // Fix links without '"'
  text.replace( QRegExp( "href=(\\.\\.|)/([^\\s>]+)" ), QString( "href=\"\\1/\\2\"" ) );

  // pattern <link... href="..." ...>
  text.replace( QRegExp( "<\\s*link\\s+([^>]*)href=\"(\\.\\.|)/" ),
                QString( "<link \\1href=\"bres://%1/").arg( getId().c_str() ) );

  // localize the http://en.wiki***.com|org/wiki/<key> series links
  // excluding those keywords that have ":" in it
  QString urlWiki = "\"http(s|)://en\\.(wiki(pedia|books|news|quote|source|voyage|versity)|wiktionary)\\.(org|com)/wiki/([^:\"]*)\"";
  text.replace( QRegExp( "<\\s*a\\s+(class=\"external\"\\s+|)href=" + urlWiki ),
                QString( "<a href=\"gdlookup://localhost/\\6\"" ) );
#endif

  // pattern <a href="..." ...>, excluding any known protocols such as http://, mailto:, #(comment)
  // these links will be translated into local definitions
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
  QRegularExpression rxLink( "<\\s*a\\s+([^>]*)href=\"(?!(?:\\w+://|#|mailto:|tel:))(/|)([^\"]*)\"\\s*(title=\"[^\"]*\")?[^>]*>" );
  QRegularExpressionMatchIterator it = rxLink.globalMatch( text );
  int pos = 0;
  QString newText;
  while( it.hasNext() )
  {
    QRegularExpressionMatch match = it.next();

    // Copy the text between the previous match and this one unchanged
    newText += text.midRef( pos, match.capturedStart() - pos );
    pos = match.capturedEnd();

    QStringList list = match.capturedTexts();
    // Add empty strings for compatibility with QRegExp behaviour
    for( int i = match.lastCapturedIndex() + 1; i < 5; i++ )
      list.append( QString() );
#else
  QRegExp rxLink( "<\\s*a\\s+([^>]*)href=\"(?!(\\w+://|#|mailto:|tel:))(/|)([^\"]*)\"\\s*(title=\"[^\"]*\")?[^>]*>",
                  Qt::CaseSensitive,
                  QRegExp::RegExp2 );
  int pos = 0;
  while( (pos = rxLink.indexIn( text, pos )) >= 0 )
  {
    QStringList list = rxLink.capturedTexts();
#endif
    QString tag = list[3];     // a url, ex: Precambrian_Chaotian.html
    if ( !list[4].isEmpty() )  // a title, ex: title="Precambrian/Chaotian"
      tag = list[4].split("\"")[1];

    // Check type of links inside articles
    if( linksType == UNKNOWN && tag.indexOf( '/' ) >= 0 )
    {
      QString word = QUrl::fromPercentEncoding( tag.toLatin1() );
      word.remove( QRegExp( "\\.(s|)htm(l|)$", Qt::CaseInsensitive ) ).
           replace( "_", " " );

      vector< WordArticleLink > links;
      links = findArticles( gd::toWString( word ) );

      if( !links.empty() )
      {
        linksType = SLASH;
      }
      else
      {
        // Not found with the path, try the last path component only
        word.remove( QRegExp(".*/") );
        links = findArticles( gd::toWString( word ) );
        if( !links.empty() )
        {
          linksType = NO_SLASH;
          links.clear();
        }
      }
    }

    if( linksType == SLASH || linksType == UNKNOWN )
    {
      tag.remove( QRegExp( "\\.(s|)htm(l|)$", Qt::CaseInsensitive ) ).
          replace( "_", "%20" ).
          prepend( "<a href=\"gdlookup://localhost/" ).
          append( "\" " + list[4] + ">" );
    }
    else
    {
      tag.remove( QRegExp(".*/") ).
          remove( QRegExp( "\\.(s|)htm(l|)$", Qt::CaseInsensitive ) ).
          replace( "_", "%20" ).
          prepend( "<a href=\"gdlookup://localhost/" ).
          append( "\" " + list[4] + ">" );
    }

#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
    newText += tag;
  }
  if( pos )
  {
    // Something was replaced: add the tail and take over the new text
    newText += text.midRef( pos );
    text = newText;
  }
  newText.clear();
#else
    text.replace( pos, list[0].length(), tag );
    pos += tag.length() + 1;
  }
#endif

  // Occasionally words needs to be displayed in vertical, but <br/> were changed to <br\> somewhere
  // proper style: <a href="gdlookup://localhost/Neoptera" ... >N<br/>e<br/>o<br/>p<br/>t<br/>e<br/>r<br/>a</a>
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
  QRegularExpression rxBR( "(<a href=\"gdlookup://localhost/[^\"]*\"\\s*[^>]*>)\\s*((\\w\\s*&lt;br(\\\\|/|)&gt;\\s*)+\\w)\\s*</a>",
                           QRegularExpression::UseUnicodePropertiesOption );
  pos = 0;
  // This pass must iterate over the matches of rxBR (not rxLink) and take
  // them from its own iterator; the iterator of the link pass is exhausted.
  QRegularExpressionMatchIterator it2 = rxBR.globalMatch( text );
  while( it2.hasNext() )
  {
    QRegularExpressionMatch match = it2.next();

    newText += text.midRef( pos, match.capturedStart() - pos );
    pos = match.capturedEnd();

    QStringList list = match.capturedTexts();
    // Add empty strings for compatibility with QRegExp behaviour
    for( int i = match.lastCapturedIndex() + 1; i < 3; i++ )
      list.append( QString() );

    QString tag = list[2];
    tag.replace( QRegExp( "&lt;br( |)(\\\\|/|)&gt;", Qt::CaseInsensitive ) , "<br/>" ).
        prepend( list[1] ).
        append( "</a>" );

    newText += tag;
  }
  if( pos )
  {
    newText += text.midRef( pos );
    text = newText;
  }
  newText.clear();
#else
  QRegExp rxBR( "(<a href=\"gdlookup://localhost/[^\"]*\"\\s*[^>]*>)\\s*((\\w\\s*&lt;br(\\\\|/|)&gt;\\s*)+\\w)\\s*</a>",
                Qt::CaseSensitive,
                QRegExp::RegExp2 );
  pos = 0;
  while( (pos = rxBR.indexIn( text, pos )) >= 0 )
  {
    QStringList list = rxBR.capturedTexts();
    QString tag = list[2];
    tag.replace( QRegExp( "&lt;br( |)(\\\\|/|)&gt;", Qt::CaseInsensitive ) , "<br/>" ).
        prepend( list[1] ).
        append( "</a>" );

    text.replace( pos, list[0].length(), tag );
    pos += tag.length() + 1;
  }
#endif

  // // output all links in the page - only for analysis
  // QRegExp rxPrintAllLinks( "<\\s*a\\s+[^>]*href=\"[^\"]*\"[^>]*>",
  //                         Qt::CaseSensitive,
  //                         QRegExp::RegExp2 );
  // pos = 0;
  // while( (pos = rxPrintAllLinks.indexIn( text, pos )) >= 0 )
  // {
  //   QStringList list = rxPrintAllLinks.capturedTexts();
  //   qDebug() << "\n--Alllinks--" << list[0];
  //   pos += list[0].length() + 1;
  // }

  // Fix outstanding elements
  text += "<br style=\"clear:both;\" />";

  return text.toUtf8().data();
}
/// Loads the resource `resourceName` into `data`.
///
/// The name is looked up in the resource index; the first match is read from
/// the archive under zimMutex. When the name is not in the index `data` is
/// left untouched.
void ZimDictionary::loadResource( std::string & resourceName, string & data )
{
  vector< WordArticleLink > link;

  link = resourceIndex.findArticles( Utf8::decode( resourceName ) );

  if( link.empty() )
    return;

  {
    Mutex::Lock _( zimMutex );
    readArticle( df, link[ 0 ].articleOffset, data );
  }
}
/// Returns the dictionary description.
/// On first use it is read from the archive article referenced by the index
/// header, unless the header has no description pointer (0xFFFFFFFF).
QString const& ZimDictionary::getDescription()
{
  bool const alreadyLoaded = !dictionaryDescription.isEmpty();
  if( alreadyLoaded || idxHeader.descriptionPtr == 0xFFFFFFFF )
    return dictionaryDescription;

  string raw;

  {
    Mutex::Lock _( zimMutex );
    readArticle( df, idxHeader.descriptionPtr, raw );
  }

  if( raw.empty() )
    return dictionaryDescription;

  dictionaryDescription = QString::fromUtf8( raw.c_str(), raw.size() );
  return dictionaryDescription;
}
/// Builds the full-text search index file of this dictionary.
///
/// Nothing is built when an index already exists, when ensureInitDone()
/// reports a problem, or when `firstIteration` is true. Otherwise all
/// article offsets are collected, ordered by cluster, their text is parsed
/// into words and the result is written as chunked storage plus a btree.
/// `isCancelled` is polled between the steps; cancellation and any other
/// std::exception are caught here, logged as a warning, and the partly
/// written index file is removed.
void ZimDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
{
// An index that is present and current only needs to be marked as complete
if( !( Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
|| FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) )
FTS_index_completed.ref();
if( haveFTSIndex() )
return;
if( ensureInitDone().size() )
return;
if( firstIteration )
return;
gdDebug( "Zim: Building the full-text index for dictionary: %s\n",
getName().c_str() );
try
{
Mutex::Lock _( getFtsMutex() );
File::Class ftsIdx( ftsIndexName(), "wb" );
FtsHelpers::FtsIdxHeader ftsIdxHeader;
memset( &ftsIdxHeader, 0, sizeof( ftsIdxHeader ) );
// We write a dummy header first. At the end of the process the header
// will be rewritten with the right values.
ftsIdx.write( ftsIdxHeader );
ChunkedStorage::Writer chunks( ftsIdx );
BtreeIndexing::IndexedWords indexedWords;
// Collect the offsets of all articles known to the headword index
QSet< uint32_t > setOfOffsets;
setOfOffsets.reserve( getWordCount() );
findArticleLinks( 0, &setOfOffsets, 0, &isCancelled );
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
// We should sort articles order by cluster number
// to effective use clusters data caching
QVector< QPair< quint32, uint32_t > > offsetsWithClusters;
offsetsWithClusters.reserve( setOfOffsets.size() );
for( QSet< uint32_t >::ConstIterator it = setOfOffsets.constBegin();
it != setOfOffsets.constEnd(); ++it )
{
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
// The archive is accessed under zimMutex, one article at a time
Mutex::Lock _( zimMutex );
offsetsWithClusters.append( QPair< uint32_t, quint32 >( getArticleCluster( df, *it ), *it ) );
}
// Free memory
setOfOffsets.clear();
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
// Pairs compare by cluster number first, then by offset
std::sort( offsetsWithClusters.begin(), offsetsWithClusters.end() );
QVector< uint32_t > offsets;
offsets.resize( offsetsWithClusters.size() );
for( int i = 0; i < offsetsWithClusters.size(); i++ )
offsets[ i ] = offsetsWithClusters.at( i ).second;
// Free memory
offsetsWithClusters.clear();
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
QMap< QString, QVector< uint32_t > > ftsWords;
// Articles already indexed; passed to getArticleText() so that an article
// reached a second time (e.g. through a redirect) is skipped
set< quint32 > indexedArticles;
quint32 articleNumber;
// index articles for full-text search
for( int i = 0; i < offsets.size(); i++ )
{
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
QString headword, articleStr;
articleNumber = getArticleText( offsets.at( i ), headword, articleStr,
&indexedArticles );
// 0xFFFFFFFF: nothing was read for this offset
if( articleNumber == 0xFFFFFFFF )
continue;
indexedArticles.insert( articleNumber );
FtsHelpers::parseArticleForFts( offsets.at( i ), articleStr, ftsWords );
}
// Free memory
offsets.clear();
// Write one block per word: the number of offsets followed by the offsets.
// Entries are erased as they are written.
QMap< QString, QVector< uint32_t > >::iterator it = ftsWords.begin();
while( it != ftsWords.end() )
{
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
uint32_t offset = chunks.startNewBlock();
uint32_t size = it.value().size();
chunks.addToBlock( &size, sizeof(uint32_t) );
chunks.addToBlock( it.value().data(), size * sizeof(uint32_t) );
indexedWords.addSingleWord( gd::toWString( it.key() ), offset );
it = ftsWords.erase( it );
}
// Free memory
ftsWords.clear();
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
ftsIdxHeader.chunksOffset = chunks.finish();
ftsIdxHeader.wordCount = indexedWords.size();
if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
throw exUserAbort();
BtreeIndexing::IndexInfo ftsIdxInfo = BtreeIndexing::buildIndex( indexedWords, ftsIdx );
// Free memory
indexedWords.clear();
ftsIdxHeader.indexBtreeMaxElements = ftsIdxInfo.btreeMaxElements;
ftsIdxHeader.indexRootOffset = ftsIdxInfo.rootOffset;
ftsIdxHeader.signature = FtsHelpers::FtsSignature;
ftsIdxHeader.formatVersion = FtsHelpers::CurrentFtsFormatVersion + getFtsIndexVersion();
// Replace the dummy header written at the start with the final one
ftsIdx.rewind();
ftsIdx.writeRecords( &ftsIdxHeader, sizeof(ftsIdxHeader), 1 );
FTS_index_completed.ref();
}
catch( std::exception &ex )
{
gdWarning( "Zim: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );
// Do not leave a partly written index file behind
QFile::remove( FsEncoding::decode( ftsIdxName.c_str() ) );
}
}
/// Reorders `offsets` so that articles stored in the same cluster become
/// neighbours (ordered by cluster number, then by offset).
/// When `isCancelled` gets set while the clusters are being looked up the
/// function returns early and `offsets` keeps its original order.
void ZimDictionary::sortArticlesOffsetsForFTS( QVector< uint32_t > & offsets,
                                               QAtomicInt & isCancelled )
{
  QVector< QPair< quint32, uint32_t > > clusterAndOffset;
  clusterAndOffset.reserve( offsets.size() );

  for( int i = 0; i < offsets.size(); ++i )
  {
    if( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
      return;

    uint32_t const articleOffset = offsets.at( i );

    // The archive is accessed under zimMutex, one article at a time
    Mutex::Lock _( zimMutex );
    quint32 const cluster = getArticleCluster( df, articleOffset );
    clusterAndOffset.append( QPair< uint32_t, quint32 >( cluster, articleOffset ) );
  }

  std::sort( clusterAndOffset.begin(), clusterAndOffset.end() );

  int const total = clusterAndOffset.size();
  for( int i = 0; i < total; ++i )
    offsets[ i ] = clusterAndOffset.at( i ).second;
}
void ZimDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
{
try
{
headword.clear();
string articleText;
loadArticle( articleAddress, articleText, 0, true );
text = Html::unescape( QString::fromUtf8( articleText.data(), articleText.size() ) );
}
catch( std::exception &ex )
{
gdWarning( "Zim: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
}
}
/// Variant of getArticleText() used while indexing: articles listed in
/// `loadedArticles` are skipped.
///
/// @return the number of the article that was read, or 0xFFFFFFFF when
///         nothing was read or an exception was caught (which is logged).
quint32 ZimDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text,
                                       set< quint32 > * loadedArticles )
{
  quint32 readNumber = 0xFFFFFFFF;

  try
  {
    string rawArticle;

    headword.clear();
    readNumber = loadArticle( articleAddress, rawArticle, loadedArticles, true );

    QString const decoded = QString::fromUtf8( rawArticle.data(), rawArticle.size() );
    text = Html::unescape( decoded );
  }
  catch( std::exception & ex )
  {
    gdWarning( "Zim: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
  }

  return readNumber;
}
/// Creates a full-text search request over this dictionary; all arguments
/// are handed on unchanged to FtsHelpers::FTSResultsRequest.
sptr< Dictionary::DataRequest > ZimDictionary::getSearchResults( QString const & searchString,
                                                                 int searchMode, bool matchCase,
                                                                 int distanceBetweenWords,
                                                                 int maxResults,
                                                                 bool ignoreWordsOrder,
                                                                 bool ignoreDiacritics,
                                                                 QThreadPool * ftsThreadPoolPtr )
{
  return new FtsHelpers::FTSResultsRequest( *this,
                                            searchString,
                                            searchMode,
                                            matchCase,
                                            distanceBetweenWords,
                                            maxResults,
                                            ignoreWordsOrder,
                                            ignoreDiacritics,
                                            ftsThreadPoolPtr );
}
/// ZimDictionary::getArticle()
class ZimArticleRequest;
/// Thread-pool task that drives a ZimArticleRequest. It holds references to
/// the request and to the request's semaphore; the semaphore is released when
/// the task object is destroyed, which is what the request's destructor
/// waits for.
class ZimArticleRequestRunnable: public QRunnable
{
  ZimArticleRequest & r;
  QSemaphore & hasExited;

public:

  ZimArticleRequestRunnable( ZimArticleRequest & request,
                             QSemaphore & exitSignal ):
    r( request ),
    hasExited( exitSignal )
  {
  }

  ~ZimArticleRequestRunnable()
  {
    // Signals the owning request that this task no longer references it
    hasExited.release();
  }

  virtual void run();
};
/// Asynchronous article lookup in a ZimDictionary. The constructor queues a
/// ZimArticleRequestRunnable on the global thread pool; that task calls run()
/// from a pool thread. Destroying the request flags cancellation and then
/// blocks until the task object has been destroyed.
class ZimArticleRequest: public Dictionary::DataRequest
{
  friend class ZimArticleRequestRunnable;

  wstring word;
  vector< wstring > alts;
  ZimDictionary & dict;
  bool ignoreDiacritics;

  QAtomicInt isCancelled; // Non-zero once cancel() or the destructor ran
  QSemaphore hasExited;   // Released by the runnable's destructor

public:

  ZimArticleRequest( wstring const & requestedWord,
                     vector< wstring > const & alternateForms,
                     ZimDictionary & owner, bool skipDiacritics ):
    word( requestedWord ),
    alts( alternateForms ),
    dict( owner ),
    ignoreDiacritics( skipDiacritics )
  {
    // NOTE(review): the task is never deleted here - presumably the pool
    // deletes it after running (QRunnable auto-delete); confirm.
    QRunnable * const task = new ZimArticleRequestRunnable( *this, hasExited );
    QThreadPool::globalInstance()->start( task );
  }

  void run(); // Run from another thread by ZimArticleRequestRunnable

  virtual void cancel()
  {
    isCancelled.ref();
  }

  ~ZimArticleRequest()
  {
    // Ask a still-running run() to stop, then wait for the task to go away
    isCancelled.ref();
    hasExited.acquire();
  }
};
/// Task entry point: forwards to ZimArticleRequest::run() of the request this
/// runnable was created for.
void ZimArticleRequestRunnable::run()
{
r.run();
}
void ZimArticleRequest::run()
{
if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
{
finish();
return;
}
vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
for( unsigned x = 0; x < alts.size(); ++x )
{
/// Make an additional query for each alt
vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
chain.insert( chain.end(), altChain.begin(), altChain.end() );
}
multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
set< quint32 > articlesIncluded; // Some synonims make it that the articles
// appear several times. We combat this
// by only allowing them to appear once.
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
if( ignoreDiacritics )
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
for( unsigned x = 0; x < chain.size(); ++x )
{
if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
{
finish();
return;
}
// Now grab that article
string headword, articleText;
headword = chain[ x ].word;
quint32 articleNumber = 0xFFFFFFFF;
try
{
articleNumber = dict.loadArticle( chain[ x ].articleOffset, articleText, &articlesIncluded );
}
catch(...)
{
}
if( articleNumber == 0xFFFFFFFF )
continue; // No article loaded
if ( articlesIncluded.find( articleNumber ) != articlesIncluded.end() )
continue; // We already have this article in the body.
// Ok. Now, does it go to main articles, or to alternate ones? We list
// main ones first, and alternates after.
// We do the case-folded comparison here.
wstring headwordStripped =
Folding::applySimpleCaseOnly( Utf8::decode( headword ) );
if( ignoreDiacritics )
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
multimap< wstring, pair< string, string > > & mapToUse =
( wordCaseFolded == headwordStripped ) ?
mainArticles : alternateArticles;
mapToUse.insert( pair< wstring, pair< string, string > >(
Folding::applySimpleCaseOnly( Utf8::decode( headword ) ),
pair< string, string >( headword, articleText ) ) );
articlesIncluded.insert( articleNumber );
}
if ( mainArticles.empty() && alternateArticles.empty() )
{
// No such word
finish();
return;
}
string result;
// See Issue #271: A mechanism to clean-up invalid HTML cards.
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
"</font>""</font>""</font>""</font>""</font>""</font>"
"</b></b></b></b></b></b></b></b>"
"</i></i></i></i></i></i></i></i>"
"</a></a></a></a></a></a></a></a>";
multimap< wstring, pair< string, string > >::const_iterator i;
for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
{
result += "<div class=\"zimdict\">";
result += "<h2 class=\"zimdict_headword\">";
result += i->second.first;
result += "</h2>";
result += i->second.second;
result += cleaner + "</div>";
}
for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
{
result += "<div class=\"zimdict\">";
result += "<h2 class=\"zimdict_headword\">";
result += i->second.first;
result += "</h2>";
result += i->second.second;
result += cleaner + "</div>";
}
Mutex::Lock _( dataMutex );
data.resize( result.size() );
memcpy( &data.front(), result.data(), result.size() );
hasAnyData = true;
finish();
}
/// Starts an asynchronous article lookup for `word` and its alternate forms
/// `alts` and returns the request object. The third (unnamed) argument is
/// ignored by this dictionary type.
sptr< Dictionary::DataRequest > ZimDictionary::getArticle( wstring const & word,
                                                           vector< wstring > const & alts,
                                                           wstring const &,
                                                           bool ignoreDiacritics )
  THROW_SPEC( std::exception )
{
  // Constructing the request already queues its worker on the thread pool
  ZimArticleRequest * const request =
    new ZimArticleRequest( word, alts, *this, ignoreDiacritics );

  return request;
}
//// ZimDictionary::getResource()
class ZimResourceRequest;
/// Thread-pool task that drives a ZimResourceRequest. It holds references to
/// the request and to the request's semaphore; the semaphore is released when
/// the task object is destroyed, which is what the request's destructor
/// waits for.
class ZimResourceRequestRunnable: public QRunnable
{
  ZimResourceRequest & r;
  QSemaphore & hasExited;

public:

  ZimResourceRequestRunnable( ZimResourceRequest & request,
                              QSemaphore & exitSignal ):
    r( request ),
    hasExited( exitSignal )
  {
  }

  ~ZimResourceRequestRunnable()
  {
    // Signals the owning request that this task no longer references it
    hasExited.release();
  }

  virtual void run();
};
/// Asynchronous resource lookup in a ZimDictionary. The constructor queues a
/// ZimResourceRequestRunnable on the global thread pool; that task calls
/// run() from a pool thread. Destroying the request flags cancellation and
/// then blocks until the task object has been destroyed.
class ZimResourceRequest: public Dictionary::DataRequest
{
  friend class ZimResourceRequestRunnable;

  ZimDictionary & dict;
  string resourceName;

  QAtomicInt isCancelled; // Non-zero once cancel() or the destructor ran
  QSemaphore hasExited;   // Released by the runnable's destructor

public:

  ZimResourceRequest( ZimDictionary & owner,
                      string const & requestedName ):
    dict( owner ),
    resourceName( requestedName )
  {
    // NOTE(review): the task is never deleted here - presumably the pool
    // deletes it after running (QRunnable auto-delete); confirm.
    QRunnable * const task = new ZimResourceRequestRunnable( *this, hasExited );
    QThreadPool::globalInstance()->start( task );
  }

  void run(); // Run from another thread by ZimResourceRequestRunnable

  virtual void cancel()
  {
    isCancelled.ref();
  }

  ~ZimResourceRequest()
  {
    // Ask a still-pending run() to stop, then wait for the task to go away
    isCancelled.ref();
    hasExited.acquire();
  }
};
/// Task entry point: forwards to ZimResourceRequest::run() of the request
/// this runnable was created for.
void ZimResourceRequestRunnable::run()
{
r.run();
}
void ZimResourceRequest::run()
{
// Some runnables linger enough that they are cancelled before they start
if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
{
finish();
return;
}
try
{
string resource;
dict.loadResource( resourceName, resource );
if( resource.empty() )
throw File::Ex();
if( Filetype::isNameOfCSS( resourceName ) )
{
QString css = QString::fromUtf8( resource.data(), resource.size() );
dict.isolateCSS( css, ".zimdict" );
QByteArray bytes = css.toUtf8();
Mutex::Lock _( dataMutex );
data.resize( bytes.size() );
memcpy( &data.front(), bytes.constData(), bytes.size() );
}
else
if ( Filetype::isNameOfTiff( resourceName ) )
{
// Convert it
dataMutex.lock();
QImage img = QImage::fromData( reinterpret_cast< const uchar * >( resource.data() ), resource.size() );
#ifdef MAKE_EXTRA_TIFF_HANDLER
if( img.isNull() )
GdTiff::tiffToQImage( &data.front(), data.size(), img );
#endif
dataMutex.unlock();
if ( !img.isNull() )
{
// Managed to load -- now store it back as BMP
QByteArray ba;
QBuffer buffer( &ba );
buffer.open( QIODevice::WriteOnly );
img.save( &buffer, "BMP" );
Mutex::Lock _( dataMutex );
data.resize( buffer.size() );
memcpy( &data.front(), buffer.data(), data.size() );
}
}
else
{
Mutex::Lock _( dataMutex );
data.resize( resource.size() );
memcpy( &data.front(), resource.data(), data.size() );
}
Mutex::Lock _( dataMutex );
hasAnyData = true;
}
catch( std::exception &ex )
{
gdCWarning( dictionaryResourceLc, "ZIM: Failed loading resource \"%s\" from \"%s\", reason: %s\n",
resourceName.c_str(), dict.getName().c_str(), ex.what() );
// Resource not loaded -- we don't set the hasAnyData flag then
}
finish();
}
/// Starts an asynchronous lookup of the resource called `name` and returns
/// the request object.
sptr< Dictionary::DataRequest > ZimDictionary::getResource( string const & name )
  THROW_SPEC( std::exception )
{
  // Constructing the request already queues its worker on the thread pool
  ZimResourceRequest * const request = new ZimResourceRequest( *this, name );

  return request;
}
//} // anonymous namespace
/// Creates a ZimDictionary for every ".zim" / ".zimaa" file in `fileNames`.
///
/// For each such file the index stored under `indicesDir` is rebuilt when it
/// is missing, outdated or bad. Rebuilding walks all directory entries of the
/// ZIM file and fills two indices: one for article headwords and one for
/// resources; the 'M' namespace supplies the name, description and language
/// of the dictionary, and the 'X' namespace is skipped.
///
/// @param fileNames            candidate files; other extensions are ignored
/// @param indicesDir           prefix to which the dictionary id is appended
///                             to form the index file name
/// @param initializing         receives the name of the dictionary being indexed
/// @param maxHeadwordsToExpand if non-zero and the file has at least this many
///                             entries, headwords are indexed as single words
/// @return the dictionaries that were set up successfully; a file whose setup
///         throws is logged and skipped
vector< sptr< Dictionary::Class > > makeDictionaries(
                                      vector< string > const & fileNames,
                                      string const & indicesDir,
                                      Dictionary::Initializing & initializing,
                                      unsigned maxHeadwordsToExpand )
  THROW_SPEC( std::exception )
{
  vector< sptr< Dictionary::Class > > dictionaries;

  for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
       ++i )
  {
    // Skip files with the extensions different to .zim to speed up the
    // scanning

    QString firstName = QDir::fromNativeSeparators( FsEncoding::decode( i->c_str() ) );
    if( !firstName.endsWith( ".zim") && !firstName.endsWith( ".zimaa" ) )
      continue;

    // Got the file -- check if we need to rebuild the index

    ZimFile df( firstName );

    vector< string > dictFiles;
    df.getFilenames( dictFiles );

    string dictId = Dictionary::makeDictionaryId( dictFiles );

    string indexFile = indicesDir + dictId;

    try
    {
      if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
           indexIsOldOrBad( indexFile ) )
      {
        gdDebug( "Zim: Building the index for dictionary: %s\n", i->c_str() );

        unsigned articleCount = 0;
        unsigned wordCount = 0;

        df.open();

        ZIM_header const & zh = df.header();

        // NOTE(review): this is not a lexicographic version comparison (e.g.
        // 7.0 would yield false) - confirm it matches the ZIM format rules.
        bool new_namespaces = ( zh.majorVersion >= 6 && zh.minorVersion >= 1 );

        if( zh.magicNumber != 0x44D495A )
          throw exNotZimFile( i->c_str() );

        {
          int n = firstName.lastIndexOf( '/' );
          initializing.indexingDictionary( firstName.mid( n + 1 ).toUtf8().constData() );
        }

        File::Class idx( indexFile, "wb" );
        IdxHeader idxHeader;
        memset( &idxHeader, 0, sizeof( idxHeader ) );
        idxHeader.namePtr = 0xFFFFFFFF;
        idxHeader.descriptionPtr = 0xFFFFFFFF;

        // We write a dummy header first. At the end of the process the header
        // will be rewritten with the right values.

        idx.write( idxHeader );

        IndexedWords indexedWords, indexedResources;

        // Table of 8-byte file positions, one per directory entry
        df.seek( zh.urlPtrPos );
        QByteArray artEntries = df.read( (quint64)zh.articleCount * 8 );

        // A truncated file yields a short table; walking it with
        // zh.articleCount entries would read past the end of the buffer.
        if( (quint64)artEntries.size() != (quint64)zh.articleCount * 8 )
          throw exCantReadFile( i->c_str() );

        const quint64 * const entryPositions =
          reinterpret_cast< const quint64 * >( artEntries.constData() );

        quint16 mimetype = 0, redirected_mime = 0xFFFF;
        ArticleEntry artEntry;
        RedirectEntry redEntry;
        string url, title;
        char nameSpace;

        for( unsigned n = 0; n < zh.articleCount; n++ )
        {
          df.seek( entryPositions[ n ] );

          // Every entry starts with a 16-bit mime type; 0xFFFF marks a redirect
          if( df.read( reinterpret_cast< char * >( &mimetype ), sizeof(mimetype) )
              != static_cast< qint64 >( sizeof(mimetype) ) )
            throw exCantReadFile( i->c_str() );

          if( mimetype == 0xFFFF )
          {
            redEntry.mimetype = mimetype;
            qint64 ret = df.read( reinterpret_cast< char * >( &redEntry ) + 2, sizeof(RedirectEntry) - 2 );
            if( ret != sizeof(RedirectEntry) - 2 )
              throw exCantReadFile( i->c_str() );

            redirected_mime = df.redirectedMimeType( redEntry );
            nameSpace = redEntry.nameSpace;
          }
          else
          {
            artEntry.mimetype = mimetype;
            qint64 ret = df.read( reinterpret_cast< char * >( &artEntry ) + 2, sizeof(ArticleEntry) - 2 );
            if( ret != sizeof(ArticleEntry) - 2 )
              throw exCantReadFile( i->c_str() );

            nameSpace = artEntry.nameSpace;

            if( ( nameSpace == 'A' || ( nameSpace == 'C' && new_namespaces ) ) && df.isArticleMime( mimetype ) )
              articleCount++;
          }

          // Read article url and title (two consecutive zero-terminated strings)
          char ch;

          url.clear();
          while( df.getChar( &ch ) )
          {
            if( ch == 0 )
              break;
            url.push_back( ch );
          }

          title.clear();
          while( df.getChar( &ch ) )
          {
            if( ch == 0 )
              break;
            title.push_back( ch );
          }

          if( nameSpace == 'A' || ( nameSpace == 'C' && new_namespaces && ( df.isArticleMime( mimetype )
              || ( mimetype == 0xFFFF && df.isArticleMime( redirected_mime ) ) ) ) )
          {
            // The title is the headword; the url is used when there is no title
            wstring word;
            if( !title.empty() )
              word = Utf8::decode( title );
            else
              word = Utf8::decode( url );

            if( df.isArticleMime( mimetype )
                || ( mimetype == 0xFFFF && df.isArticleMime( redirected_mime ) ) )
            {
              if( maxHeadwordsToExpand && zh.articleCount >= maxHeadwordsToExpand )
                indexedWords.addSingleWord( word, n );
              else
                indexedWords.addWord( word, n );
              wordCount++;
            }
            else
            {
              // Non-article entry in the article namespace: index it as a
              // resource under "<namespace>/<url>"
              url.insert( url.begin(), '/' );
              url.insert( url.begin(), nameSpace );
              indexedResources.addSingleWord( Utf8::decode( url ), n );
            }
          }
          else
          if( nameSpace == 'M' )
          {
            if( url.compare( "Title" ) == 0 )
            {
              idxHeader.namePtr = n;
              string name;
              readArticle( df, n, name );
              initializing.indexingDictionary( name );
            }
            else
            if( url.compare( "Description" ) == 0 )
              idxHeader.descriptionPtr = n;
            else
            if( url.compare( "Language" ) == 0 )
            {
              string lang;
              readArticle( df, n, lang );
              if( lang.size() == 2 )
                idxHeader.langFrom = LangCoder::code2toInt( lang.c_str() );
              else
              if( lang.size() == 3 )
                idxHeader.langFrom = LangCoder::findIdForLanguageCode3( lang.c_str() );
              idxHeader.langTo = idxHeader.langFrom;
            }
          }
          else
          if( nameSpace == 'X' )
          {
            continue;
          }
          else
          {
            url.insert( url.begin(), '/' );
            url.insert( url.begin(), nameSpace );
            indexedResources.addSingleWord( Utf8::decode( url ), n );
          }
        }

        // Build index

        {
          IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );

          idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
          idxHeader.indexRootOffset = idxInfo.rootOffset;

          indexedWords.clear(); // Release memory -- no need for this data
        }

        {
          IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedResources, idx );

          idxHeader.resourceIndexBtreeMaxElements = idxInfo.btreeMaxElements;
          idxHeader.resourceIndexRootOffset = idxInfo.rootOffset;

          indexedResources.clear(); // Release memory -- no need for this data
        }

        idxHeader.signature = Signature;
        idxHeader.formatVersion = CurrentFormatVersion;

        idxHeader.articleCount = articleCount;
        idxHeader.wordCount = wordCount;

        // Replace the dummy header written at the start of the file
        idx.rewind();

        idx.write( &idxHeader, sizeof( idxHeader ) );
      }

      dictionaries.push_back( new ZimDictionary( dictId,
                                                 indexFile,
                                                 dictFiles ) );
    }
    catch( std::exception & e )
    {
      gdWarning( "Zim dictionary initializing failed: %s, error: %s\n",
                 i->c_str(), e.what() );
      continue;
    }
    catch( ... )
    {
      qWarning( "Zim dictionary initializing failed\n" );
      continue;
    }
  }
  return dictionaries;
}
} // namespace Zim
#endif