mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-12-18 03:14:06 +00:00
1760 lines
49 KiB
C++
1760 lines
49 KiB
C++
/* This file is (c) 2012 Abs62
|
|
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
|
|
|
#ifdef MAKE_ZIM_SUPPORT
|
|
|
|
#include "zim.hh"
|
|
#include "btreeidx.hh"
|
|
#include "fsencoding.hh"
|
|
#include "folding.hh"
|
|
#include "gddebug.hh"
|
|
#include "utf8.hh"
|
|
#include "decompress.hh"
|
|
#include "langcoder.hh"
|
|
#include "wstring_qt.hh"
|
|
#include "filetype.hh"
|
|
#include "file.hh"
|
|
#include "utils.hh"
|
|
#include "tiff.hh"
|
|
#include "ftshelpers.hh"
|
|
#include "htmlescape.hh"
|
|
#include "splitfile.hh"
|
|
|
|
#ifdef _MSC_VER
|
|
#include <stub_msvc.h>
|
|
#endif
|
|
|
|
#include <QByteArray>
|
|
#include <QFile>
|
|
#include <QFileInfo>
|
|
#include <QString>
|
|
#include <QRunnable>
|
|
#include <QSemaphore>
|
|
#include <QAtomicInt>
|
|
#include <QImage>
|
|
#include <QDir>
|
|
#include <QDebug>
|
|
|
|
#include <QRegularExpression>
|
|
|
|
#include <string>
|
|
#include <set>
|
|
#include <map>
|
|
#include <algorithm>
|
|
#include <QtConcurrent>
|
|
|
|
namespace Zim {
|
|
|
|
#define CACHE_SIZE 3
|
|
|
|
using std::string;
|
|
using std::map;
|
|
using std::vector;
|
|
using std::multimap;
|
|
using std::pair;
|
|
using std::set;
|
|
using gd::wstring;
|
|
|
|
using BtreeIndexing::WordArticleLink;
|
|
using BtreeIndexing::IndexedWords;
|
|
using BtreeIndexing::IndexInfo;
|
|
|
|
DEF_EX_STR( exNotZimFile, "Not an Zim file", Dictionary::Ex )
|
|
DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex )
|
|
DEF_EX( exUserAbort, "User abort", Dictionary::Ex )
|
|
|
|
|
|
//namespace {
|
|
|
|
class ZimFile;
|
|
|
|
#pragma pack( push, 1 )
|
|
|
|
/// Compression scheme of a cluster, as compared against the low four bits of
/// the cluster's leading info byte in ZimFile::getClusterData().
enum CompressionType
{
  Default = 0, // handled like None: the cluster bytes are used as-is
  None    = 1,
  Zlib    = 2,
  Bzip2   = 3,
  Lzma2   = 4,
  Zstd    = 5
};
|
|
|
|
/// Zim file header
|
|
struct ZIM_header
|
|
{
|
|
quint32 magicNumber;
|
|
quint16 majorVersion;
|
|
quint16 minorVersion;
|
|
quint8 uuid[ 16 ];
|
|
quint32 articleCount;
|
|
quint32 clusterCount;
|
|
quint64 urlPtrPos;
|
|
quint64 titlePtrPos;
|
|
quint64 clusterPtrPos;
|
|
quint64 mimeListPos;
|
|
quint32 mainPage;
|
|
quint32 layoutPage;
|
|
quint64 checksumPos;
|
|
}
|
|
#ifndef _MSC_VER
|
|
__attribute__((packed))
|
|
#endif
|
|
;
|
|
|
|
struct ArticleEntry
|
|
{
|
|
quint16 mimetype;
|
|
quint8 parameterLen;
|
|
char nameSpace;
|
|
quint32 revision;
|
|
quint32 clusterNumber;
|
|
quint32 blobNumber;
|
|
}
|
|
#ifndef _MSC_VER
|
|
__attribute__((packed))
|
|
#endif
|
|
;
|
|
|
|
struct RedirectEntry
|
|
{
|
|
quint16 mimetype;
|
|
quint8 parameterLen;
|
|
char nameSpace;
|
|
quint32 revision;
|
|
quint32 redirectIndex;
|
|
}
|
|
#ifndef _MSC_VER
|
|
__attribute__((packed))
|
|
#endif
|
|
;
|
|
|
|
enum
|
|
{
|
|
Signature = 0x584D495A, // ZIMX on little-endian, XMIZ on big-endian
|
|
CurrentFormatVersion = 3 + BtreeIndexing::FormatVersion + Folding::Version
|
|
};
|
|
|
|
struct IdxHeader
|
|
{
|
|
quint32 signature; // First comes the signature, ZIMX
|
|
quint32 formatVersion; // File format version (CurrentFormatVersion)
|
|
quint32 indexBtreeMaxElements; // Two fields from IndexInfo
|
|
quint32 indexRootOffset;
|
|
quint32 resourceIndexBtreeMaxElements; // Two fields from IndexInfo
|
|
quint32 resourceIndexRootOffset;
|
|
quint32 wordCount;
|
|
quint32 articleCount;
|
|
quint32 namePtr;
|
|
quint32 descriptionPtr;
|
|
quint32 langFrom; // Source language
|
|
quint32 langTo; // Target language
|
|
}
|
|
#ifndef _MSC_VER
|
|
__attribute__((packed))
|
|
#endif
|
|
;
|
|
|
|
#pragma pack( pop )
|
|
|
|
// Support for (possibly split) zim files: a cluster cache slot and the ZimFile reader
|
|
|
|
struct Cache
|
|
{
|
|
char * data;
|
|
quint32 clusterNumber;
|
|
int stamp;
|
|
int count, size;
|
|
unsigned blobs_offset_size;
|
|
|
|
Cache() :
|
|
data( 0 ),
|
|
clusterNumber( 0 ),
|
|
stamp( -1 ),
|
|
count( 0 ),
|
|
size( 0 ),
|
|
blobs_offset_size( 0 )
|
|
{}
|
|
};
|
|
|
|
class ZimFile : public SplitFile::SplitFile
|
|
{
|
|
public:
|
|
ZimFile();
|
|
ZimFile( const QString & name );
|
|
~ZimFile();
|
|
|
|
virtual void setFileName( const QString & name );
|
|
bool open();
|
|
void close()
|
|
{
|
|
SplitFile::close();
|
|
clearCache();
|
|
}
|
|
const ZIM_header & header() const
|
|
{ return zimHeader; }
|
|
|
|
string getClusterData( quint32 cluster_nom, unsigned & blob_offset_size );
|
|
|
|
const QString getMimeType( quint16 nom )
|
|
{ return mimeTypes.value( nom ); }
|
|
|
|
bool isArticleMime( quint16 mime_type )
|
|
{ return getMimeType( mime_type ).startsWith( "text/html", Qt::CaseInsensitive )
|
|
|| getMimeType( mime_type ).startsWith( "text/plain", Qt::CaseInsensitive ); }
|
|
|
|
|
|
quint16 redirectedMimeType( RedirectEntry const & redEntry );
|
|
|
|
private:
|
|
ZIM_header zimHeader;
|
|
Cache cache[ CACHE_SIZE ];
|
|
int stamp;
|
|
QVector< QPair< quint64, quint32 > > clusterOffsets;
|
|
QStringList mimeTypes;
|
|
|
|
void clearCache();
|
|
};
|
|
|
|
/// Creates a reader that is not yet bound to any file; the header starts out zeroed.
ZimFile::ZimFile():
  stamp( 0 )
{
  memset( &zimHeader, 0, sizeof zimHeader );
}
|
|
|
|
/// Creates a reader bound to the given archive (not opened yet).
/// setFileName() zeroes the header and resets the cache, which also gives
/// `stamp` its initial value.
ZimFile::ZimFile( const QString & name )
{
  setFileName( name );
}
|
|
|
|
/// Frees the buffers owned by the cluster cache.
ZimFile::~ZimFile()
{
  clearCache();
}
|
|
|
|
void ZimFile::setFileName( const QString & name )
|
|
{
|
|
close();
|
|
memset( &zimHeader, 0, sizeof( zimHeader ) );
|
|
clearCache();
|
|
|
|
appendFile( name );
|
|
|
|
if( name.endsWith( ".zimaa", Qt::CaseInsensitive ) )
|
|
{
|
|
QString fname = name;
|
|
|
|
for( int i = 0; i < 26; i++ )
|
|
{
|
|
fname[ fname.size() - 2 ] = (char)( 'a' + i );
|
|
|
|
int j;
|
|
for( j = 1; j < 26; j++ )
|
|
{
|
|
fname[ fname.size() - 1 ] = (char)( 'a' + j );
|
|
if( !QFileInfo( fname ).isFile() )
|
|
break;
|
|
|
|
appendFile( fname );
|
|
}
|
|
|
|
if( j < 26 )
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
void ZimFile::clearCache()
|
|
{
|
|
for( int i = 0; i < CACHE_SIZE; i++ )
|
|
{
|
|
if( cache[ i ].data )
|
|
{
|
|
free( cache[ i ].data );
|
|
cache[ i ].data = 0;
|
|
}
|
|
cache[ i ].clusterNumber = 0;
|
|
cache[ i ].stamp = -1;
|
|
cache[ i ].count = 0;
|
|
cache[ i ].size = 0;
|
|
}
|
|
stamp = 0;
|
|
}
|
|
|
|
bool ZimFile::open()
|
|
{
|
|
if( !SplitFile::open( QIODevice::ReadOnly ) )
|
|
return false;
|
|
|
|
memset( &zimHeader, 0, sizeof( zimHeader ) );
|
|
|
|
if( read( reinterpret_cast< char * >( &zimHeader ), sizeof( zimHeader ) ) != sizeof( zimHeader ) )
|
|
return false;
|
|
|
|
// Clusters in zim file may be placed in random order.
|
|
// We create sorted offsets list to calculate clusters size.
|
|
|
|
clusterOffsets.resize( zimHeader.clusterCount );
|
|
QVector< quint64 > offs;
|
|
offs.resize( zimHeader.clusterCount );
|
|
|
|
seek( zimHeader.clusterPtrPos );
|
|
qint64 size = zimHeader.clusterCount * sizeof( quint64 );
|
|
if( read( reinterpret_cast< char * >( offs.data() ), size) != size )
|
|
{
|
|
vector< string > names;
|
|
getFilenames( names );
|
|
throw exCantReadFile( names[ 0 ] );
|
|
}
|
|
|
|
for( quint32 i = 0; i < zimHeader.clusterCount; i++ )
|
|
clusterOffsets[ i ] = QPair< quint64, quint32 >( offs.at( i ), i );
|
|
|
|
std::sort( clusterOffsets.begin(), clusterOffsets.end() );
|
|
|
|
// Read mime types
|
|
|
|
string type;
|
|
char ch;
|
|
|
|
seek( zimHeader.mimeListPos );
|
|
|
|
for( ; ; )
|
|
{
|
|
type.clear();
|
|
while( getChar( &ch ) )
|
|
{
|
|
if( ch == 0 )
|
|
break;
|
|
type.push_back( ch );
|
|
}
|
|
if( type.empty() )
|
|
break;
|
|
|
|
QString s = QString::fromUtf8( type.c_str(), type.size() );
|
|
mimeTypes.append( s );
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/// Returns the decompressed contents of a cluster.
///
/// @param cluster_nom        cluster number as stored in article entries.
/// @param blobs_offset_size  receives the width in bytes (4 or 8) of the blob
///                           offsets at the start of the returned data. It is
///                           not written when the cluster number is unknown or
///                           the cluster's info byte cannot be read.
/// @return the decompressed data, or an empty string when the cluster number
///         is unknown, the data cannot be read or decompressed, the
///         compression type is unsupported, or the data is too short to hold
///         even its first blob offset.
///
/// Clusters holding more than one blob are kept in a CACHE_SIZE-slot cache;
/// the slot with the oldest use stamp is replaced.
string ZimFile::getClusterData( quint32 cluster_nom, unsigned & blobs_offset_size )
{
  // Check cache. While scanning, remember the least recently used slot as
  // the one to overwrite on a miss.
  int target = 0;
  bool found = false;
  int lastStamp = INT_MAX;

  for( int i = 0; i < CACHE_SIZE; i++ )
  {
    if( cache[ i ].clusterNumber == cluster_nom && cache[ i ].count )
    {
      found = true;
      target = i;
      break;
    }

    if( cache[ i ].stamp < lastStamp )
    {
      lastStamp = cache[ i ].stamp;
      target = i;
    }
  }

  cache[ target ].stamp = ++stamp;
  if( stamp < 0 )
  {
    // The use counter wrapped around: restart the stamps from scratch
    stamp = 0;
    for( int i = 0; i < CACHE_SIZE; i++ )
      cache[ i ].stamp = -1;
  }

  if( found )
  {
    // Cache hit
    blobs_offset_size = cache[ target ].blobs_offset_size;
    return string( cache[ target ].data, cache[ target ].count );
  }

  // Cache miss, read data from file

  // Find the cluster in the offset-sorted table
  quint32 nom;
  for( nom = 0; nom < zimHeader.clusterCount; nom++ )
    if( clusterOffsets.at( nom ).second == cluster_nom )
      break;

  if( nom >= zimHeader.clusterCount ) // Invalid cluster nom
    return string();

  // A cluster extends up to the next cluster in file order, or to the end of
  // the file for the last one.
  quint64 clusterSize;
  if( nom < zimHeader.clusterCount - 1 )
    clusterSize = clusterOffsets.at( nom + 1 ).first - clusterOffsets.at( nom ).first;
  else
    clusterSize = size() - clusterOffsets.at( nom ).first;

  // Read cluster data

  seek( clusterOffsets.at( nom ).first );

  char cluster_info;
  if( !getChar( &cluster_info ) )
    return string();

  const char compressionType = cluster_info & 0x0F;
  blobs_offset_size = ( ( cluster_info & 0x10 ) && zimHeader.majorVersion >= 6 ) ? 8 : 4;

  QByteArray data = read( clusterSize );

  string decompressedData;

  switch( compressionType )
  {
    case Default:
    case None:
      decompressedData = string( data.data(), data.size() );
      break;
    case Zlib:
      decompressedData = decompressZlib( data.constData(), data.size() );
      break;
    case Bzip2:
      decompressedData = decompressBzip2( data.constData(), data.size() );
      break;
    case Lzma2:
      decompressedData = decompressLzma2( data.constData(), data.size() );
      break;
    case Zstd:
      decompressedData = decompressZstd( data.constData(), data.size() );
      break;
    default:
      return string();
  }

  // Covers both failed decompression and data too short to contain the first
  // blob offset; copying that offset out below would otherwise read past the
  // end of the buffer.
  if( decompressedData.size() < blobs_offset_size )
    return string();

  // Check BLOBs number in the cluster
  // We cache multi-element clusters only

  quint64 firstOffset;
  if( blobs_offset_size == 8 )
    memcpy( &firstOffset, decompressedData.data(), sizeof( firstOffset ) );
  else
  {
    quint32 firstOffset32;
    memcpy( &firstOffset32, decompressedData.data(), sizeof( firstOffset32 ) );
    firstOffset = firstOffset32;
  }

  // A first offset smaller than one table entry is malformed; count it as
  // "no blobs" instead of letting the subtraction wrap around.
  quint32 blobCount = 0;
  if( firstOffset >= blobs_offset_size )
    blobCount = ( firstOffset - blobs_offset_size ) / blobs_offset_size;

  if( blobCount > 1 )
  {
    // Fill cache
    Cache & slot = cache[ target ];
    const int dataSize = decompressedData.size();

    // Reallocate only when the existing buffer is really too small
    if( slot.size < dataSize )
    {
      free( slot.data );
      slot.data = ( char * )malloc( dataSize );

      if( slot.data )
        slot.size = dataSize;
      else
      {
        // Out of memory: leave the slot empty, the data is still returned
        slot.size = 0;
        slot.count = 0;
      }
    }

    if( slot.size )
    {
      memcpy( slot.data, decompressedData.c_str(), dataSize );
      slot.count = dataSize;
      slot.clusterNumber = cluster_nom;
      slot.blobs_offset_size = blobs_offset_size;
    }
  }

  return decompressedData;
}
|
|
|
|
/// Follows the redirect chain starting at redEntry and returns the MIME type
/// index of the first non-redirect entry reached.
///
/// @return the MIME type index, or 0xFFFF when the chain cannot be resolved
///         (read error, self-redirect, or a redirect cycle).
///
/// The file position is restored before returning.
quint16 ZimFile::redirectedMimeType( RedirectEntry const & redEntry )
{
  RedirectEntry current_entry = redEntry;
  quint64 current_pos = pos();
  quint16 mimetype = 0xFFFF;

  // A chain that takes more hops than the archive has entries must contain a
  // cycle, so the hop count is bounded instead of looping forever.
  for( quint64 hops = 0; hops <= zimHeader.articleCount; hops++ )
  {
    quint32 current_nom = current_entry.redirectIndex;

    seek( zimHeader.urlPtrPos + (quint64)current_nom * 8 );
    quint64 new_pos;
    if( read( reinterpret_cast< char * >( &new_pos ), sizeof( new_pos ) ) != sizeof( new_pos ) )
      break;

    seek( new_pos );
    quint16 new_mimetype;
    if( read( reinterpret_cast< char * >( &new_mimetype ), sizeof( new_mimetype ) ) != sizeof( new_mimetype ) )
      break;

    if( new_mimetype != 0xFFFF )
    {
      mimetype = new_mimetype;
      break;
    }

    // Redirect to other article: the mimetype has been consumed already,
    // read the rest of the entry behind it.
    if( read( reinterpret_cast< char * >( &current_entry ) + 2, sizeof( current_entry ) - 2 )
        != sizeof( current_entry ) - 2 )
      break;

    if( current_nom == current_entry.redirectIndex ) // entry redirects to itself
      break;
  }

  seek( current_pos );
  return mimetype;
}
|
|
|
|
|
|
// Some supporting functions
|
|
|
|
/// Tells whether the index file has to be rebuilt: true when its header
/// cannot be read, carries a foreign signature, or was written with a
/// different format version.
bool indexIsOldOrBad( string const & indexFile )
{
  File::Class idx( indexFile, "rb" );

  IdxHeader header;

  if( idx.readRecords( &header, sizeof( header ), 1 ) != 1 )
    return true;

  if( header.signature != Signature )
    return true;

  return header.formatVersion != CurrentFormatVersion;
}
|
|
|
|
/// Resolves an article number (following redirects) to the number of the
/// cluster that stores its data.
///
/// @return the cluster number, or 0xFFFFFFFF when the article number is out
///         of range, an entry cannot be read, or the redirects form a loop.
quint32 getArticleCluster( ZimFile & file, quint32 articleNumber )
{
  ZIM_header const & header = file.header();

  // Every pass except the last one follows a redirect. More hops than there
  // are entries means the redirects form a cycle, which would otherwise keep
  // this loop spinning forever.
  for( quint64 hops = 0; hops <= header.articleCount; hops++ )
  {
    if( articleNumber >= header.articleCount )
      break;

    file.seek( header.urlPtrPos + (quint64)articleNumber * 8 );
    quint64 pos;
    if( file.read( reinterpret_cast< char * >( &pos ), sizeof( pos ) ) != sizeof( pos ) )
      break;

    // Read article info

    quint16 mimetype;

    file.seek( pos );
    if( file.read( reinterpret_cast< char * >( &mimetype ), sizeof( mimetype ) ) != sizeof( mimetype ) )
      break;

    if( mimetype == 0xFFFF ) // Redirect to other article
    {
      RedirectEntry redEntry;
      if( file.read( reinterpret_cast< char * >( &redEntry ) + 2, sizeof( redEntry ) - 2 ) != sizeof( redEntry ) - 2 )
        break;
      if( articleNumber == redEntry.redirectIndex )
        break;
      articleNumber = redEntry.redirectIndex;
      continue;
    }

    // The mimetype has been consumed already; read the rest of the entry
    ArticleEntry artEntry;
    artEntry.mimetype = mimetype;
    if( file.read( reinterpret_cast< char * >( &artEntry ) + 2, sizeof( artEntry ) - 2 ) != sizeof( artEntry ) - 2 )
      break;

    return artEntry.clusterNumber;
  }

  return 0xFFFFFFFF;
}
|
|
|
|
/// Reads the data of an article (following redirects) into `result`.
///
/// @param file            opened zim archive.
/// @param articleNumber   number of the article (or redirect) to read.
/// @param result          cleared first, then receives the raw blob.
/// @param loadedArticles  optional set of article numbers to skip: when the
///                        resolved article is in it nothing is read.
/// @return the number of the article actually read (after redirects), or
///         0xFFFFFFFF when nothing was read: bad number, read error, redirect
///         loop, article already in loadedArticles, or malformed cluster.
quint32 readArticle( ZimFile & file, quint32 articleNumber, string & result,
                     set< quint32 > * loadedArticles = NULL )
{
  result.clear();

  ZIM_header const & header = file.header();

  // More hops than entries means the redirects form a cycle; bail out
  // instead of looping forever.
  for( quint64 hops = 0; hops <= header.articleCount; hops++ )
  {
    if( articleNumber >= header.articleCount )
      break;

    file.seek( header.urlPtrPos + (quint64)articleNumber * 8 );
    quint64 pos;
    if( file.read( reinterpret_cast< char * >( &pos ), sizeof( pos ) ) != sizeof( pos ) )
      break;

    // Read article info

    quint16 mimetype;

    file.seek( pos );
    if( file.read( reinterpret_cast< char * >( &mimetype ), sizeof( mimetype ) ) != sizeof( mimetype ) )
      break;

    if( mimetype == 0xFFFF ) // Redirect to other article
    {
      RedirectEntry redEntry;
      if( file.read( reinterpret_cast< char * >( &redEntry ) + 2, sizeof( redEntry ) - 2 ) != sizeof( redEntry ) - 2 )
        break;
      if( articleNumber == redEntry.redirectIndex )
        break;
      articleNumber = redEntry.redirectIndex;
      continue;
    }

    if( loadedArticles && loadedArticles->find( articleNumber ) != loadedArticles->end() )
      break;

    ArticleEntry artEntry;
    artEntry.mimetype = mimetype;
    if( file.read( reinterpret_cast< char * >( &artEntry ) + 2, sizeof( artEntry ) - 2 ) != sizeof( artEntry ) - 2 )
      break;

    // Read cluster data

    unsigned offset_size = 0;
    string const clusterData = file.getClusterData( artEntry.clusterNumber, offset_size );
    if( clusterData.empty() || offset_size == 0 || clusterData.size() < offset_size )
      break;

    // Take article data from cluster.
    // The cluster starts with a table of blobCount + 1 offsets; blob i spans
    // [ offset[ i ], offset[ i + 1 ] ). The first offset points just past the
    // table and therefore also tells how long the table is.

    auto offsetAt = [ & ]( quint64 index ) -> quint64
    {
      if( offset_size == 8 )
      {
        quint64 value;
        memcpy( &value, clusterData.data() + index * 8, sizeof( value ) );
        return value;
      }
      quint32 value;
      memcpy( &value, clusterData.data() + index * 4, sizeof( value ) );
      return value;
    };

    quint64 const firstOffset = offsetAt( 0 );

    // The table needs at least two entries and has to fit into the data
    if( firstOffset < 2 * (quint64)offset_size || firstOffset > clusterData.size() )
      break;

    quint64 const blobCount = firstOffset / offset_size - 1;

    // Valid blob numbers are 0 .. blobCount - 1. Blob number == blobCount
    // would take its end offset from beyond the table.
    if( artEntry.blobNumber >= blobCount )
      break;

    quint64 const blobBegin = offsetAt( artEntry.blobNumber );
    quint64 const blobEnd   = offsetAt( (quint64)artEntry.blobNumber + 1 );

    // Reject ranges that run backwards or past the data instead of letting
    // string::append() throw or copy the wrong bytes.
    if( blobBegin > blobEnd || blobEnd > clusterData.size() )
      break;

    result.append( clusterData, blobBegin, blobEnd - blobBegin );

    return articleNumber;
  }

  return 0xFFFFFFFF;
}
|
|
|
|
// ZimDictionary
|
|
|
|
class ZimDictionary: public BtreeIndexing::BtreeDictionary
|
|
{
|
|
enum LINKS_TYPE { UNKNOWN, SLASH, NO_SLASH };
|
|
|
|
Mutex idxMutex;
|
|
Mutex zimMutex, idxResourceMutex;
|
|
File::Class idx;
|
|
BtreeIndex resourceIndex;
|
|
IdxHeader idxHeader;
|
|
string dictionaryName;
|
|
ZimFile df;
|
|
set< quint32 > articlesIndexedForFTS;
|
|
LINKS_TYPE linksType;
|
|
|
|
public:
|
|
|
|
ZimDictionary( string const & id, string const & indexFile,
|
|
vector< string > const & dictionaryFiles );
|
|
|
|
~ZimDictionary();
|
|
|
|
virtual string getName() noexcept
|
|
{ return dictionaryName; }
|
|
|
|
virtual map< Dictionary::Property, string > getProperties() noexcept
|
|
{ return map< Dictionary::Property, string >(); }
|
|
|
|
virtual unsigned long getArticleCount() noexcept
|
|
{ return idxHeader.articleCount; }
|
|
|
|
virtual unsigned long getWordCount() noexcept
|
|
{ return idxHeader.wordCount; }
|
|
|
|
inline virtual quint32 getLangFrom() const
|
|
{ return idxHeader.langFrom; }
|
|
|
|
inline virtual quint32 getLangTo() const
|
|
{ return idxHeader.langTo; }
|
|
|
|
virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
|
|
vector< wstring > const & alts,
|
|
wstring const &,
|
|
bool ignoreDiacritics )
|
|
;
|
|
|
|
virtual sptr< Dictionary::DataRequest > getResource( string const & name )
|
|
;
|
|
|
|
virtual QString const& getDescription();
|
|
|
|
/// Loads the resource.
|
|
void loadResource( std::string &resourceName, string & data );
|
|
|
|
virtual sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString,
|
|
int searchMode, bool matchCase,
|
|
int distanceBetweenWords,
|
|
int maxResults,
|
|
bool ignoreWordsOrder,
|
|
bool ignoreDiacritics );
|
|
virtual void getArticleText( uint32_t articleAddress, QString & headword, QString & text );
|
|
|
|
quint32 getArticleText( uint32_t articleAddress, QString & headword, QString & text,
|
|
set< quint32 > * loadedArticles );
|
|
|
|
virtual void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration );
|
|
|
|
virtual void setFTSParameters( Config::FullTextSearch const & fts )
|
|
{
|
|
can_FTS = fts.enabled
|
|
&& !fts.disabledTypes.contains( "ZIM", Qt::CaseInsensitive )
|
|
&& ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
|
|
}
|
|
|
|
virtual void sortArticlesOffsetsForFTS( QVector< uint32_t > & offsets, QAtomicInt & isCancelled );
|
|
|
|
protected:
|
|
|
|
virtual void loadIcon() noexcept;
|
|
|
|
private:
|
|
|
|
/// Loads the article.
|
|
quint32 loadArticle( quint32 address,
|
|
string & articleText,
|
|
set< quint32 > * loadedArticles,
|
|
bool rawText = false );
|
|
|
|
string convert( string const & in_data );
|
|
friend class ZimArticleRequest;
|
|
friend class ZimResourceRequest;
|
|
};
|
|
|
|
/// Opens the index file and the zim archive (the first entry of
/// dictionaryFiles), attaches both btree indexes, resolves the dictionary
/// name and checks whether a usable full-text index already exists.
ZimDictionary::ZimDictionary( string const & id,
                              string const & indexFile,
                              vector< string > const & dictionaryFiles ):
  BtreeDictionary( id, dictionaryFiles ),
  idx( indexFile, "rb" ),
  idxHeader( idx.read< IdxHeader >() ),
  df( FsEncoding::decode( dictionaryFiles[ 0 ].c_str() ) ),
  linksType( UNKNOWN )
{
  // Open data file
  df.open();

  // Initialize the indexes
  IndexInfo const wordIndexInfo( idxHeader.indexBtreeMaxElements,
                                 idxHeader.indexRootOffset );
  openIndex( wordIndexInfo, idx, idxMutex );

  IndexInfo const resourceIndexInfo( idxHeader.resourceIndexBtreeMaxElements,
                                     idxHeader.resourceIndexRootOffset );
  resourceIndex.openIndex( resourceIndexInfo, idx, idxResourceMutex );

  // Read dictionary name
  if( idxHeader.namePtr != 0xFFFFFFFF )
  {
    readArticle( df, idxHeader.namePtr, dictionaryName );
  }
  else
  {
    // No name stored in the archive: fall back to the bare file name
    QString const path = QDir::fromNativeSeparators( FsEncoding::decode( dictionaryFiles[ 0 ].c_str() ) );
    dictionaryName = path.mid( path.lastIndexOf( '/' ) + 1 ).toStdString();
  }

  // Full-text search parameters
  can_FTS = true;

  ftsIdxName = indexFile + "_FTS";

  bool const ftsIndexUsable = !Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName )
                              && !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this );
  if( ftsIndexUsable )
    FTS_index_completed.ref();
}
|
|
|
|
/// Closes the zim archive, which also drops its cluster cache.
ZimDictionary::~ZimDictionary()
{
  df.close();
}
|
|
|
|
void ZimDictionary::loadIcon() noexcept
|
|
{
|
|
if ( dictionaryIconLoaded )
|
|
return;
|
|
|
|
QString fileName =
|
|
QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
|
|
|
|
// Remove the extension
|
|
fileName.chop( 3 );
|
|
|
|
if( !loadIconFromFile( fileName ) )
|
|
{
|
|
// Load failed -- use default icons
|
|
dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/icon32_zim.png");
|
|
}
|
|
|
|
dictionaryIconLoaded = true;
|
|
}
|
|
|
|
/// Reads an article from the archive and, unless rawText is set, runs it
/// through convert().
///
/// @return the value returned by readArticle(): the number of the article
///         actually read, or 0xFFFFFFFF when nothing was read.
quint32 ZimDictionary::loadArticle( quint32 address,
                                    string & articleText,
                                    set< quint32 > * loadedArticles,
                                    bool rawText )
{
  quint32 articleNumber;

  {
    // Only the file access is serialized; the conversion runs unlocked
    Mutex::Lock _( zimMutex );
    articleNumber = readArticle( df, address, articleText, loadedArticles );
  }

  if( rawText )
    return articleNumber;

  articleText = convert( articleText );
  return articleNumber;
}
|
|
|
|
/// Rewrites the HTML of a zim article for display inside GoldenDict:
///  - strips the background / background-color declaration of <body>;
///  - points img/script/link resources to bres://<dictionary id>/...;
///  - turns links to English Wikimedia sites and local article links into
///    gdlookup://localhost/<headword> links;
///  - normalizes the <br> variants inside "vertical" link texts;
///  - appends a clearing <br> at the end.
///
/// The first local link that contains a '/' is also used to detect (once per
/// dictionary, stored in linksType) whether headwords keep the path part of a
/// link or only its last component.
string ZimDictionary::convert( const string & in )
{
  QString text = QString::fromUtf8( in.c_str() );

  // replace background
  text.replace( QRegularExpression( "<\\s*body\\s+([^>]*)(background(|-color)):([^;\"]*(;|))" ),
                QString( "<body \\1" ) );

  // pattern of img and script
  text.replace( QRegularExpression( "<\\s*(img|script)\\s+([^>]*)src=(\"|)(\\.\\.|)/" ),
                QString( "<\\1 \\2src=\\3bres://%1/").arg( getId().c_str() ) );

  // Fix links without '"'
  text.replace( QRegularExpression( "href=(\\.\\.|)/([^\\s>]+)" ),
                QString( "href=\"\\1/\\2\"" ) );

  // pattern <link... href="..." ...>
  text.replace( QRegularExpression( "<\\s*link\\s+([^>]*)href=\"(\\.\\.|)/" ),
                QString( "<link \\1href=\"bres://%1/").arg( getId().c_str() ) );

  // localize the http://en.wiki***.com|org/wiki/<key> series links
  // excluding those keywords that have ":" in it
  QString urlWiki = "\"http(s|)://en\\.(wiki(pedia|books|news|quote|source|voyage|versity)|wiktionary)\\.(org|com)/wiki/([^:\"]*)\"";
  text.replace( QRegularExpression( "<\\s*a\\s+(class=\"external\"\\s+|)href=" + urlWiki ),
                QString( "<a href=\"gdlookup://localhost/\\6\"" ) );

  // pattern <a href="..." ...>, excluding any known protocols such as http://, mailto:, #(comment)
  // these links will be translated into local definitions
  // <meta http-equiv="Refresh" content="0;url=../dsalsrv02.uchicago.edu/cgi-bin/0994.html">
  QRegularExpression rxLink( "<\\s*(?:a|meta)\\s+([^>]*)(?:href|url)=\"?(?!(?:\\w+://|#|mailto:|tel:))(/|)([^\"]*)\"\\s*(title=\"[^\"]*\")?[^>]*>" );

  // Compiled once per call instead of once per link
  QRegularExpression const rxHtmlExtension( "\\.(s|)htm(l|)$", QRegularExpression::CaseInsensitiveOption );
  QRegularExpression const rxLeadingPath( ".*/" );

  QRegularExpressionMatchIterator it = rxLink.globalMatch( text );
  int pos = 0;
  QString newText;
  while( it.hasNext() )
  {
    QRegularExpressionMatch match = it.next();

    newText += text.mid( pos, match.capturedStart() - pos );
    pos = match.capturedEnd();

    // captured() yields an empty string for a group that did not take part
    // in the match, so the optional title needs no special handling.
    QString const titleAttr = match.captured( 4 ); // a title, ex: title="Precambrian/Chaotian"
    QString tag = match.captured( 3 );             // a url, ex: Precambrian_Chaotian.html
    if( !titleAttr.isEmpty() )
      tag = titleAttr.split( "\"" )[ 1 ];

    // Check type of links inside articles
    if( linksType == UNKNOWN && tag.indexOf( '/' ) >= 0 )
    {
      QString word = QUrl::fromPercentEncoding( tag.toLatin1() );
      word.remove( rxHtmlExtension ).
           replace( "_", " " );

      if( !findArticles( gd::toWString( word ) ).empty() )
      {
        linksType = SLASH;
      }
      else
      {
        word.remove( rxLeadingPath );
        if( !findArticles( gd::toWString( word ) ).empty() )
          linksType = NO_SLASH;
      }
    }

    // Headwords without the path part: keep only the last link component
    if( linksType == NO_SLASH )
      tag.remove( rxLeadingPath );

    tag.remove( rxHtmlExtension ).
        replace( "_", "%20" ).
        prepend( "<a href=\"gdlookup://localhost/" ).
        append( "\" " + titleAttr + ">" );

    newText += tag;
  }
  if( pos )
  {
    newText += text.mid( pos );
    text = newText;
  }
  newText.clear();

  // Occasionally words needs to be displayed in vertical, but <br/> were changed to <br\> somewhere
  // proper style: <a href="gdlookup://localhost/Neoptera" ... >N<br/>e<br/>o<br/>p<br/>t<br/>e<br/>r<br/>a</a>
  QRegularExpression rxBR( "(<a href=\"gdlookup://localhost/[^\"]*\"\\s*[^>]*>)\\s*((\\w\\s*<br(\\\\|/|)>\\s*)+\\w)\\s*</a>",
                           QRegularExpression::UseUnicodePropertiesOption );
  QRegularExpression const rxBrVariant( "<br( |)(\\\\|/|)>", QRegularExpression::PatternOption::CaseInsensitiveOption );

  pos = 0;
  QRegularExpressionMatchIterator it2 = rxBR.globalMatch( text );
  while( it2.hasNext() )
  {
    QRegularExpressionMatch match = it2.next();

    newText += text.mid( pos, match.capturedStart() - pos );
    pos = match.capturedEnd();

    QString tag = match.captured( 2 );
    tag.replace( rxBrVariant, "<br/>" ).
        prepend( match.captured( 1 ) ).
        append( "</a>" );

    newText += tag;
  }
  if( pos )
  {
    newText += text.mid( pos );
    text = newText;
  }
  newText.clear();

  // Fix outstanding elements
  text += "<br style=\"clear:both;\" />";

  return text.toUtf8().data();
}
|
|
|
|
void ZimDictionary::loadResource( std::string & resourceName, string & data )
|
|
{
|
|
vector< WordArticleLink > link;
|
|
string resData;
|
|
|
|
link = resourceIndex.findArticles( Utf8::decode( resourceName ) );
|
|
|
|
if( link.empty() )
|
|
return;
|
|
|
|
{
|
|
Mutex::Lock _( zimMutex );
|
|
readArticle( df, link[ 0 ].articleOffset, data );
|
|
}
|
|
}
|
|
|
|
/// Returns the dictionary description, reading it from the archive on first
/// use. Stays empty when the index records no description article
/// (descriptionPtr == 0xFFFFFFFF) or that article cannot be read.
QString const& ZimDictionary::getDescription()
{
  bool const alreadyLoaded = !dictionaryDescription.isEmpty();
  if( alreadyLoaded || idxHeader.descriptionPtr == 0xFFFFFFFF )
    return dictionaryDescription;

  string rawDescription;
  {
    Mutex::Lock _( zimMutex );
    readArticle( df, idxHeader.descriptionPtr, rawDescription );
  }

  if( !rawDescription.empty() )
    dictionaryDescription = QString::fromUtf8( rawDescription.c_str(), rawDescription.size() );

  return dictionaryDescription;
}
|
|
|
|
/// Builds the full-text search index of this dictionary unless an up-to-date
/// one exists already.
///
/// Nothing is built on the first iteration or while the dictionary reports an
/// initialization error. Cancellation (isCancelled becoming non-zero) and any
/// other std::exception abort the build; the partial index file is removed.
void ZimDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
{
  bool const indexIsStale = Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
                            || FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this );
  if( !indexIsStale )
    FTS_index_completed.ref();

  if( haveFTSIndex() || ensureInitDone().size() || firstIteration )
    return;

  gdDebug( "Zim: Building the full-text index for dictionary: %s\n",
           getName().c_str() );

  // Cancellation is reported by throwing, so that the handler at the bottom
  // cleans up the partially written file.
  auto abortIfCancelled = [ &isCancelled ]()
  {
    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      throw exUserAbort();
  };

  try
  {
    Mutex::Lock _( getFtsMutex() );

    File::Class ftsIdx( ftsIndexName(), "wb" );

    // We write a dummy header first. At the end of the process the header
    // will be rewritten with the right values.
    FtsHelpers::FtsIdxHeader ftsIdxHeader;
    memset( &ftsIdxHeader, 0, sizeof( ftsIdxHeader ) );
    ftsIdx.write( ftsIdxHeader );

    ChunkedStorage::Writer chunks( ftsIdx );

    BtreeIndexing::IndexedWords indexedWords;

    // Collect the addresses of all articles known to the headword index
    QSet< uint32_t > setOfOffsets;
    setOfOffsets.reserve( getWordCount() );

    findArticleLinks( 0, &setOfOffsets, 0, &isCancelled );

    abortIfCancelled();

    // Articles are processed in cluster order so that consecutive reads hit
    // the cluster cache.
    QVector< QPair< quint32, uint32_t > > offsetsWithClusters;
    offsetsWithClusters.reserve( setOfOffsets.size() );

    for( QSet< uint32_t >::ConstIterator offsetIt = setOfOffsets.constBegin();
         offsetIt != setOfOffsets.constEnd(); ++offsetIt )
    {
      abortIfCancelled();

      Mutex::Lock _( zimMutex );
      quint32 const cluster = getArticleCluster( df, *offsetIt );
      offsetsWithClusters.append( QPair< quint32, uint32_t >( cluster, *offsetIt ) );
    }

    // Free memory
    setOfOffsets.clear();

    abortIfCancelled();

    std::sort( offsetsWithClusters.begin(), offsetsWithClusters.end() );

    QVector< uint32_t > offsets;
    offsets.resize( offsetsWithClusters.size() );
    for( int n = 0; n < offsetsWithClusters.size(); n++ )
      offsets[ n ] = offsetsWithClusters.at( n ).second;

    // Free memory
    offsetsWithClusters.clear();

    abortIfCancelled();

    QMap< QString, QVector< uint32_t > > ftsWords;

    // Articles already indexed; several addresses may resolve (through
    // redirects) to the same article, which is then read only once.
    set< quint32 > indexedArticles;

    // index articles for full-text search
    for( int n = 0; n < offsets.size(); n++ )
    {
      abortIfCancelled();

      QString headword, articleStr;

      quint32 const articleNumber = getArticleText( offsets.at( n ), headword, articleStr,
                                                    &indexedArticles );
      if( articleNumber == 0xFFFFFFFF )
        continue;

      indexedArticles.insert( articleNumber );

      FtsHelpers::parseArticleForFts( offsets.at( n ), articleStr, ftsWords );
    }

    // Free memory
    offsets.clear();

    // Store the article list of every word as one chunk and index the word
    // by the chunk's offset; entries are erased as they are written.
    QMap< QString, QVector< uint32_t > >::iterator wordIt = ftsWords.begin();
    while( wordIt != ftsWords.end() )
    {
      abortIfCancelled();

      uint32_t const chunkOffset = chunks.startNewBlock();
      uint32_t size = wordIt.value().size();

      chunks.addToBlock( &size, sizeof(uint32_t) );
      chunks.addToBlock( wordIt.value().data(), size * sizeof(uint32_t) );

      indexedWords.addSingleWord( gd::toWString( wordIt.key() ), chunkOffset );

      wordIt = ftsWords.erase( wordIt );
    }

    // Free memory
    ftsWords.clear();

    abortIfCancelled();

    ftsIdxHeader.chunksOffset = chunks.finish();
    ftsIdxHeader.wordCount = indexedWords.size();

    abortIfCancelled();

    BtreeIndexing::IndexInfo ftsIdxInfo = BtreeIndexing::buildIndex( indexedWords, ftsIdx );

    // Free memory
    indexedWords.clear();

    ftsIdxHeader.indexBtreeMaxElements = ftsIdxInfo.btreeMaxElements;
    ftsIdxHeader.indexRootOffset = ftsIdxInfo.rootOffset;

    ftsIdxHeader.signature = FtsHelpers::FtsSignature;
    ftsIdxHeader.formatVersion = FtsHelpers::CurrentFtsFormatVersion + getFtsIndexVersion();

    // Replace the dummy header written at the start
    ftsIdx.rewind();
    ftsIdx.writeRecords( &ftsIdxHeader, sizeof(ftsIdxHeader), 1 );

    FTS_index_completed.ref();
  }
  catch( std::exception &ex )
  {
    gdWarning( "Zim: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );
    QFile::remove( FsEncoding::decode( ftsIdxName.c_str() ) );
  }
}
|
|
|
|
/// Reorders \a offsets in place so that they are sorted by the value
/// getArticleCluster() returns for each of them, ties being broken by the
/// offset itself.
///
/// @param offsets      Article offsets to reorder.
/// @param isCancelled  Polled before every lookup; once it is set the function
///                     returns immediately and \a offsets is left untouched.
void ZimDictionary::sortArticlesOffsetsForFTS( QVector< uint32_t > & offsets,
                                               QAtomicInt & isCancelled )
{
  typedef QPair< quint32, uint32_t > ClusterAndOffset;

  QVector< ClusterAndOffset > keyed;
  keyed.reserve( offsets.size() );

  for( int idx = 0; idx < offsets.size(); ++idx )
  {
    if( Utils::AtomicInt::loadAcquire( isCancelled ) )
      return;

    uint32_t const articleOffset = offsets.at( idx );

    // The lock is taken per lookup, not around the whole loop
    Mutex::Lock _( zimMutex );
    keyed.append( ClusterAndOffset( getArticleCluster( df, articleOffset ), articleOffset ) );
  }

  // QPair compares lexicographically: cluster value first, then offset
  std::sort( keyed.begin(), keyed.end() );

  // Write the offsets back in their new order
  std::transform( keyed.constBegin(), keyed.constEnd(), offsets.begin(),
                  []( ClusterAndOffset const & entry ) { return entry.second; } );
}
|
|
|
|
void ZimDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
|
|
{
|
|
try
|
|
{
|
|
headword.clear();
|
|
string articleText;
|
|
|
|
loadArticle( articleAddress, articleText, 0, true );
|
|
text = Html::unescape( QString::fromUtf8( articleText.data(), articleText.size() ) );
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdWarning( "Zim: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
|
|
}
|
|
}
|
|
|
|
/// Loads the article at \a articleAddress and stores its HTML-unescaped
/// contents in \a text; \a headword is always cleared.
///
/// @param loadedArticles  Passed through to loadArticle() unchanged; may be null.
/// @return The value loadArticle() returned, or 0xFFFFFFFF when loading threw
///         (the failure is logged, not propagated).
quint32 ZimDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text,
                                       set< quint32 > * loadedArticles )
{
  quint32 loadedNumber = 0xFFFFFFFF; // reported when nothing could be loaded

  headword.clear();

  try
  {
    string rawArticle;
    loadedNumber = loadArticle( articleAddress, rawArticle, loadedArticles, true );

    QString const decoded = QString::fromUtf8( rawArticle.data(), rawArticle.size() );
    text = Html::unescape( decoded );
  }
  catch( std::exception & ex )
  {
    gdWarning( "Zim: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
  }

  return loadedNumber;
}
|
|
|
|
/// Creates a full-text search request over this dictionary; every argument is
/// forwarded unchanged to FtsHelpers::FTSResultsRequest.
sptr< Dictionary::DataRequest > ZimDictionary::getSearchResults( QString const & searchString,
                                                                 int searchMode, bool matchCase,
                                                                 int distanceBetweenWords,
                                                                 int maxResults,
                                                                 bool ignoreWordsOrder,
                                                                 bool ignoreDiacritics )
{
  return sptr< Dictionary::DataRequest >(
    new FtsHelpers::FTSResultsRequest( *this,
                                       searchString,
                                       searchMode,
                                       matchCase,
                                       distanceBetweenWords,
                                       maxResults,
                                       ignoreWordsOrder,
                                       ignoreDiacritics ) );
}
|
|
|
|
/// ZimDictionary::getArticle()
|
|
|
|
class ZimArticleRequest: public Dictionary::DataRequest
|
|
{
|
|
wstring word;
|
|
vector< wstring > alts;
|
|
ZimDictionary & dict;
|
|
bool ignoreDiacritics;
|
|
|
|
QAtomicInt isCancelled;
|
|
QSemaphore hasExited;
|
|
QFuture< void > f;
|
|
|
|
public:
|
|
|
|
ZimArticleRequest( wstring const & word_,
|
|
vector< wstring > const & alts_,
|
|
ZimDictionary & dict_, bool ignoreDiacritics_ ):
|
|
word( word_ ), alts( alts_ ), dict( dict_ ), ignoreDiacritics( ignoreDiacritics_ )
|
|
{
|
|
f = QtConcurrent::run( [ this ]() { this->run(); } );
|
|
// QThreadPool::globalInstance()->start( [ this ]() { this->run(); } );
|
|
}
|
|
|
|
void run();
|
|
|
|
virtual void cancel()
|
|
{
|
|
isCancelled.ref();
|
|
}
|
|
|
|
~ZimArticleRequest()
|
|
{
|
|
isCancelled.ref();
|
|
f.waitForFinished();
|
|
//hasExited.acquire();
|
|
}
|
|
};
|
|
|
|
void ZimArticleRequest::run()
|
|
{
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
|
|
|
|
for( unsigned x = 0; x < alts.size(); ++x )
|
|
{
|
|
/// Make an additional query for each alt
|
|
|
|
vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
|
|
|
|
chain.insert( chain.end(), altChain.begin(), altChain.end() );
|
|
}
|
|
|
|
multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
|
|
|
|
set< quint32 > articlesIncluded; // Some synonims make it that the articles
|
|
// appear several times. We combat this
|
|
// by only allowing them to appear once.
|
|
|
|
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
|
|
if( ignoreDiacritics )
|
|
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
|
|
|
|
for( unsigned x = 0; x < chain.size(); ++x )
|
|
{
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
// Now grab that article
|
|
|
|
string headword, articleText;
|
|
|
|
headword = chain[ x ].word;
|
|
|
|
quint32 articleNumber = 0xFFFFFFFF;
|
|
try
|
|
{
|
|
articleNumber = dict.loadArticle( chain[ x ].articleOffset, articleText, &articlesIncluded );
|
|
}
|
|
catch(...)
|
|
{
|
|
}
|
|
|
|
if( articleNumber == 0xFFFFFFFF )
|
|
continue; // No article loaded
|
|
|
|
if ( articlesIncluded.find( articleNumber ) != articlesIncluded.end() )
|
|
continue; // We already have this article in the body.
|
|
|
|
// Ok. Now, does it go to main articles, or to alternate ones? We list
|
|
// main ones first, and alternates after.
|
|
|
|
// We do the case-folded comparison here.
|
|
|
|
wstring headwordStripped =
|
|
Folding::applySimpleCaseOnly( Utf8::decode( headword ) );
|
|
if( ignoreDiacritics )
|
|
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
|
|
|
|
multimap< wstring, pair< string, string > > & mapToUse =
|
|
( wordCaseFolded == headwordStripped ) ?
|
|
mainArticles : alternateArticles;
|
|
|
|
mapToUse.insert( pair< wstring, pair< string, string > >(
|
|
Folding::applySimpleCaseOnly( Utf8::decode( headword ) ),
|
|
pair< string, string >( headword, articleText ) ) );
|
|
|
|
articlesIncluded.insert( articleNumber );
|
|
}
|
|
|
|
if ( mainArticles.empty() && alternateArticles.empty() )
|
|
{
|
|
// No such word
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
string result;
|
|
|
|
// See Issue #271: A mechanism to clean-up invalid HTML cards.
|
|
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
|
|
"</font>""</font>""</font>""</font>""</font>""</font>"
|
|
"</b></b></b></b></b></b></b></b>"
|
|
"</i></i></i></i></i></i></i></i>"
|
|
"</a></a></a></a></a></a></a></a>";
|
|
|
|
multimap< wstring, pair< string, string > >::const_iterator i;
|
|
|
|
|
|
for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
|
|
{
|
|
result += "<div class=\"zimdict\">";
|
|
result += "<h2 class=\"zimdict_headword\">";
|
|
result += i->second.first;
|
|
result += "</h2>";
|
|
result += i->second.second;
|
|
result += cleaner + "</div>";
|
|
}
|
|
|
|
for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
|
|
{
|
|
result += "<div class=\"zimdict\">";
|
|
result += "<h2 class=\"zimdict_headword\">";
|
|
result += i->second.first;
|
|
result += "</h2>";
|
|
result += i->second.second;
|
|
result += cleaner + "</div>";
|
|
}
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
data.resize( result.size() );
|
|
|
|
memcpy( &data.front(), result.data(), result.size() );
|
|
|
|
hasAnyData = true;
|
|
|
|
finish();
|
|
}
|
|
|
|
/// Starts an asynchronous article lookup for \a word and its alternate forms
/// \a alts. The third argument is accepted but not used by this dictionary.
sptr< Dictionary::DataRequest > ZimDictionary::getArticle( wstring const & word,
                                                           vector< wstring > const & alts,
                                                           wstring const &,
                                                           bool ignoreDiacritics )
{
  return sptr< Dictionary::DataRequest >(
    new ZimArticleRequest( word, alts, *this, ignoreDiacritics ) );
}
|
|
|
|
//// ZimDictionary::getResource()
|
|
|
|
class ZimResourceRequest: public Dictionary::DataRequest
|
|
{
|
|
ZimDictionary & dict;
|
|
|
|
string resourceName;
|
|
|
|
QAtomicInt isCancelled;
|
|
QSemaphore hasExited;
|
|
QFuture< void > f;
|
|
|
|
public:
|
|
ZimResourceRequest(ZimDictionary &dict_, string const &resourceName_)
|
|
: dict(dict_), resourceName(resourceName_) {
|
|
f = QtConcurrent::run( [ this ]() { this->run(); } );
|
|
// QThreadPool::globalInstance()->start( [ this ]() { this->run(); } );
|
|
}
|
|
|
|
void run();
|
|
|
|
virtual void cancel()
|
|
{
|
|
isCancelled.ref();
|
|
}
|
|
|
|
~ZimResourceRequest()
|
|
{
|
|
isCancelled.ref();
|
|
f.waitForFinished();
|
|
//hasExited.acquire();
|
|
}
|
|
};
|
|
|
|
void ZimResourceRequest::run()
|
|
{
|
|
// Some runnables linger enough that they are cancelled before they start
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
try
|
|
{
|
|
string resource;
|
|
dict.loadResource( resourceName, resource );
|
|
if( resource.empty() )
|
|
throw File::Ex();
|
|
|
|
if( Filetype::isNameOfCSS( resourceName ) )
|
|
{
|
|
QString css = QString::fromUtf8( resource.data(), resource.size() );
|
|
dict.isolateCSS( css, ".zimdict" );
|
|
QByteArray bytes = css.toUtf8();
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
data.resize( bytes.size() );
|
|
memcpy( &data.front(), bytes.constData(), bytes.size() );
|
|
}
|
|
else
|
|
if ( Filetype::isNameOfTiff( resourceName ) )
|
|
{
|
|
// Convert it
|
|
Mutex::Lock _( dataMutex );
|
|
GdTiff::tiff2img( data );
|
|
}
|
|
else
|
|
{
|
|
Mutex::Lock _( dataMutex );
|
|
data.resize( resource.size() );
|
|
memcpy( &data.front(), resource.data(), data.size() );
|
|
}
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
hasAnyData = true;
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdWarning( "ZIM: Failed loading resource \"%s\" from \"%s\", reason: %s\n",
|
|
resourceName.c_str(), dict.getName().c_str(), ex.what() );
|
|
// Resource not loaded -- we don't set the hasAnyData flag then
|
|
}
|
|
|
|
finish();
|
|
}
|
|
|
|
/// Starts an asynchronous request for the resource called \a name.
sptr< Dictionary::DataRequest > ZimDictionary::getResource( string const & name )
{
  return sptr< Dictionary::DataRequest >( new ZimResourceRequest( *this, name ) );
}
|
|
|
|
//} // anonymous namespace
|
|
|
|
/// Creates a ZimDictionary for every ".zim" / ".zimaa" file in \a fileNames,
/// building its index file first when Dictionary::needToRebuildIndex() or
/// indexIsOldOrBad() reports that this is necessary.
///
/// @param fileNames             Candidate files; names with any other ending
///                              are skipped.
/// @param indicesDir            Prefix the dictionary id is appended to in
///                              order to form the index file name.
/// @param initializing          Receives indexingDictionary() notifications
///                              while an index is being built.
/// @param maxHeadwordsToExpand  When non-zero and the file contains at least
///                              this many entries, headwords are stored with
///                              addSingleWord() instead of addWord().
/// @return The dictionaries that could be set up. A file whose processing
///         throws is logged and left out; the remaining files are still
///         processed.
vector< sptr< Dictionary::Class > > makeDictionaries(
                                      vector< string > const & fileNames,
                                      string const & indicesDir,
                                      Dictionary::Initializing & initializing,
                                      unsigned maxHeadwordsToExpand )
{
  vector< sptr< Dictionary::Class > > dictionaries;

  for( string const & fileName : fileNames )
  {
    // Skip files with the extensions different to .zim to speed up the
    // scanning

    QString firstName = QDir::fromNativeSeparators( FsEncoding::decode( fileName.c_str() ) );
    if( !firstName.endsWith( ".zim" ) && !firstName.endsWith( ".zimaa" ) )
      continue;

    // Got the file -- check if we need to rebuild the index
    ZimFile df( firstName );

    vector< string > dictFiles;
    df.getFilenames( dictFiles );

    string dictId = Dictionary::makeDictionaryId( dictFiles );

    string indexFile = indicesDir + dictId;

    try
    {
      if( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
          indexIsOldOrBad( indexFile ) )
      {
        gdDebug( "Zim: Building the index for dictionary: %s\n", fileName.c_str() );

        unsigned articleCount = 0;
        unsigned wordCount = 0;

        df.open();
        ZIM_header const & zh = df.header();
        bool new_namespaces = ( zh.majorVersion >= 6 && zh.minorVersion >= 1 );

        if( zh.magicNumber != 0x44D495A )
          throw exNotZimFile( fileName.c_str() );

        {
          // Report the bare file name until a title entry is found
          int n = firstName.lastIndexOf( '/' );
          initializing.indexingDictionary( firstName.mid( n + 1 ).toUtf8().constData() );
        }

        File::Class idx( indexFile, "wb" );
        IdxHeader idxHeader;
        memset( &idxHeader, 0, sizeof( idxHeader ) );
        idxHeader.namePtr = 0xFFFFFFFF;
        idxHeader.descriptionPtr = 0xFFFFFFFF;

        // We write a dummy header first. At the end of the process the header
        // will be rewritten with the right values.

        idx.write( idxHeader );

        IndexedWords indexedWords, indexedResources;

        // Table with one 8-byte file position per directory entry. A short
        // read must be rejected here, otherwise the loop below would read
        // past the end of the buffer.
        quint64 const urlTableBytes = (quint64)zh.articleCount * 8;
        df.seek( zh.urlPtrPos );
        QByteArray const artEntries = df.read( urlTableBytes );
        if( static_cast< quint64 >( artEntries.size() ) < urlTableBytes )
          throw exCantReadFile( fileName.c_str() );

        quint16 mimetype, redirected_mime = 0xFFFF;
        ArticleEntry artEntry;
        RedirectEntry redEntry;
        string url, title;
        char nameSpace;

        // Reads characters from df into out until a NUL or the end of data
        auto const readZeroTerminated = [ &df ]( string & out )
        {
          char ch;
          out.clear();
          while( df.getChar( &ch ) && ch != 0 )
            out.push_back( ch );
        };

        for( unsigned n = 0; n < zh.articleCount; n++ )
        {
          quint64 entryPos;
          memcpy( &entryPos,
                  artEntries.constData() + static_cast< size_t >( n ) * sizeof( entryPos ),
                  sizeof( entryPos ) );
          df.seek( entryPos );

          {
            qint64 ret = df.read( reinterpret_cast< char * >( &mimetype ), sizeof( mimetype ) );
            if( ret != sizeof( mimetype ) )
              throw exCantReadFile( fileName.c_str() );
          }

          if( mimetype == 0xFFFF )
          {
            // Mimetype 0xFFFF: the rest of the record is a RedirectEntry
            redEntry.mimetype = mimetype;
            qint64 ret = df.read( reinterpret_cast< char * >( &redEntry ) + 2, sizeof( RedirectEntry ) - 2 );
            if( ret != sizeof( RedirectEntry ) - 2 )
              throw exCantReadFile( fileName.c_str() );

            redirected_mime = df.redirectedMimeType( redEntry );
            nameSpace = redEntry.nameSpace;
          }
          else
          {
            artEntry.mimetype = mimetype;
            qint64 ret = df.read( reinterpret_cast< char * >( &artEntry ) + 2, sizeof( ArticleEntry ) - 2 );
            if( ret != sizeof( ArticleEntry ) - 2 )
              throw exCantReadFile( fileName.c_str() );

            nameSpace = artEntry.nameSpace;

            if( ( nameSpace == 'A' || ( nameSpace == 'C' && new_namespaces ) ) && df.isArticleMime( mimetype ) )
              articleCount++;
          }

          // Read article url and title
          readZeroTerminated( url );
          readZeroTerminated( title );

          // True when the entry, or the target of a redirect entry, has an
          // article mime type
          auto const hasArticleMime = [ & ]()
          {
            return df.isArticleMime( mimetype )
                   || ( mimetype == 0xFFFF && df.isArticleMime( redirected_mime ) );
          };

          // Files the entry in the resource index under "<namespace>/<url>"
          auto const indexAsResource = [ & ]()
          {
            url.insert( url.begin(), '/' );
            url.insert( url.begin(), nameSpace );
            indexedResources.addSingleWord( Utf8::decode( url ), n );
          };

          if( nameSpace == 'A' || ( nameSpace == 'C' && new_namespaces && hasArticleMime() ) )
          {
            // The title is the headword; the url is used when there is none
            wstring word = Utf8::decode( title.empty() ? url : title );

            if( hasArticleMime() )
            {
              if( maxHeadwordsToExpand && zh.articleCount >= maxHeadwordsToExpand )
                indexedWords.addSingleWord( word, n );
              else
                indexedWords.addWord( word, n );
              wordCount++;
            }
            else
              indexAsResource();
          }
          else
          if( nameSpace == 'M' )
          {
            if( url.compare( "Title" ) == 0 )
            {
              idxHeader.namePtr = n;
              string name;
              readArticle( df, n, name );
              initializing.indexingDictionary( name );
            }
            else
            if( url.compare( "Description" ) == 0 )
              idxHeader.descriptionPtr = n;
            else
            if( url.compare( "Language" ) == 0 )
            {
              string lang;
              readArticle( df, n, lang );
              if( lang.size() == 2 )
                idxHeader.langFrom = LangCoder::code2toInt( lang.c_str() );
              else
              if( lang.size() == 3 )
                idxHeader.langFrom = LangCoder::findIdForLanguageCode3( lang.c_str() );
              idxHeader.langTo = idxHeader.langFrom;
            }
          }
          else
          if( nameSpace != 'X' ) // entries in namespace 'X' are not indexed at all
            indexAsResource();
        }

        // Build index

        {
          IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );

          idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
          idxHeader.indexRootOffset = idxInfo.rootOffset;

          indexedWords.clear(); // Release memory -- no need for this data
        }

        {
          IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedResources, idx );

          idxHeader.resourceIndexBtreeMaxElements = idxInfo.btreeMaxElements;
          idxHeader.resourceIndexRootOffset = idxInfo.rootOffset;

          indexedResources.clear(); // Release memory -- no need for this data
        }

        idxHeader.signature = Signature;
        idxHeader.formatVersion = CurrentFormatVersion;

        idxHeader.articleCount = articleCount;
        idxHeader.wordCount = wordCount;

        // Replace the dummy header written at the start with the real one
        idx.rewind();

        idx.write( &idxHeader, sizeof( idxHeader ) );
      }

      dictionaries.push_back( new ZimDictionary( dictId,
                                                 indexFile,
                                                 dictFiles ) );
    }
    catch( std::exception & e )
    {
      gdWarning( "Zim dictionary initializing failed: %s, error: %s\n",
                 fileName.c_str(), e.what() );
    }
    catch( ... )
    {
      qWarning( "Zim dictionary initializing failed\n" );
    }
  }

  return dictionaries;
}
|
|
|
|
} // namespace Zim
|
|
|
|
#endif
|