2013-09-14 16:17:32 +00:00
|
|
|
/* This file is (c) 2012 Abs62
|
|
|
|
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
|
|
|
|
|
|
|
#ifdef MAKE_ZIM_SUPPORT
|
|
|
|
|
|
|
|
#include "zim.hh"
|
|
|
|
#include "btreeidx.hh"
|
2023-04-28 16:09:45 +00:00
|
|
|
|
2013-09-14 16:17:32 +00:00
|
|
|
#include "folding.hh"
|
2013-11-16 18:34:09 +00:00
|
|
|
#include "gddebug.hh"
|
2023-05-27 04:12:16 +00:00
|
|
|
#include "utf8.hh"
|
|
|
|
#include "langcoder.hh"
|
|
|
|
#include "filetype.hh"
|
|
|
|
#include "file.hh"
|
|
|
|
#include "utils.hh"
|
|
|
|
#include "tiff.hh"
|
|
|
|
#include "ftshelpers.hh"
|
|
|
|
#include "htmlescape.hh"
|
|
|
|
|
|
|
|
#ifdef _MSC_VER
|
|
|
|
#include <stub_msvc.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <QByteArray>
|
|
|
|
#include <QFile>
|
|
|
|
#include <QString>
|
|
|
|
#include <QAtomicInt>
|
|
|
|
#include <QImage>
|
|
|
|
#include <QDir>
|
|
|
|
|
|
|
|
#include <QRegularExpression>
|
|
|
|
|
|
|
|
#include <string>
|
|
|
|
#include <set>
|
|
|
|
#include <map>
|
|
|
|
#include <algorithm>
|
|
|
|
#include <QtConcurrent>
|
|
|
|
#include <utility>
|
|
|
|
#include "globalregex.hh"
|
|
|
|
#include <zim/zim.h>
|
|
|
|
#include <zim/archive.h>
|
|
|
|
#include <zim/entry.h>
|
|
|
|
#include <zim/item.h>
|
2023-06-18 09:04:50 +00:00
|
|
|
#include <zim/error.h>
|
|
|
|
|
2013-09-14 16:17:32 +00:00
|
|
|
namespace Zim {
|
|
|
|
|
|
|
|
using std::string;
|
|
|
|
using std::map;
|
|
|
|
using std::vector;
|
|
|
|
using std::multimap;
|
|
|
|
using std::pair;
|
|
|
|
using std::set;
|
|
|
|
using gd::wstring;
|
|
|
|
|
|
|
|
using BtreeIndexing::WordArticleLink;
|
|
|
|
using BtreeIndexing::IndexedWords;
|
|
|
|
using BtreeIndexing::IndexInfo;
|
|
|
|
|
|
|
|
// Exception types declared for this module. Each is passed Dictionary::Ex as
// the last macro argument (presumably its base class -- see the DEF_EX*
// macro definitions to confirm).
DEF_EX_STR( exNotZimFile, "Not an Zim file", Dictionary::Ex )
using Dictionary::exCantReadFile;
DEF_EX_STR( exInvalidZimHeader, "Invalid Zim header", Dictionary::Ex )
DEF_EX( exUserAbort, "User abort", Dictionary::Ex )
|
|
|
|
|
2013-09-14 16:17:32 +00:00
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
// Archive handle type (from libzim's <zim/archive.h>) used by the helpers and
// by ZimDictionary below.
using ZimFile = zim::Archive;
|
2013-09-14 16:17:32 +00:00
|
|
|
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
#pragma pack( push, 1 )
|
2013-09-14 16:17:32 +00:00
|
|
|
|
|
|
|
// Constants written to / checked against the header of the index file.
enum {
  Signature = 0x584D495A, // ZIMX on little-endian, XMIZ on big-endian
  // Sum of this module's own layout number (4) and the btree/folding versions,
  // so a change in any of the three yields a different value; indexIsOldOrBad()
  // compares the stored header value against it.
  CurrentFormatVersion = 4 + BtreeIndexing::FormatVersion + Folding::Version
};
|
|
|
|
|
|
|
|
// Header record of the index file. It is the first thing read from the index
// both by indexIsOldOrBad() and by the ZimDictionary constructor. The struct is
// byte-packed (see the surrounding #pragma pack and the packed attribute), so
// the member order below is the on-disk layout and must not be changed.
struct IdxHeader
{
  quint32 signature;                     // First comes the signature, ZIMX
  quint32 formatVersion;                 // File format version (CurrentFormatVersion)
  quint32 indexBtreeMaxElements;         // Two fields from IndexInfo
  quint32 indexRootOffset;
  quint32 resourceIndexBtreeMaxElements; // Two fields from IndexInfo
  quint32 resourceIndexRootOffset;
  quint32 wordCount;                     // Returned by ZimDictionary::getWordCount()
  quint32 articleCount;                  // Returned by ZimDictionary::getArticleCount()
  quint32 namePtr;
  quint32 descriptionPtr;
  quint32 langFrom;                      // Source language
  quint32 langTo;                        // Target language
}
#ifndef _MSC_VER
__attribute__( ( packed ) )
#endif
;
|
|
|
|
|
2013-09-20 14:25:44 +00:00
|
|
|
#pragma pack( pop )
|
2013-09-14 16:17:32 +00:00
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
// Some supporting functions
|
|
|
|
/// Tells whether the index file at @p indexFile has to be rebuilt.
///
/// @return true when the header record cannot be read in full, or when its
///         signature or format version differs from Signature /
///         CurrentFormatVersion; false otherwise.
bool indexIsOldOrBad( string const & indexFile )
{
  File::Index indexStream( indexFile, "rb" );

  IdxHeader hdr;

  // A short read leaves hdr unfilled, so bail out before looking at its fields.
  if ( indexStream.readRecords( &hdr, sizeof( hdr ), 1 ) != 1 )
    return true;

  if ( hdr.signature != Signature )
    return true;

  return hdr.formatVersion != CurrentFormatVersion;
}
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
/// Resolves entry number @p articleNumber in @p file to its item and returns
/// that item's index (getItem( true ) -- presumably following redirects; see
/// the libzim documentation).
///
/// @return the item index, or 0xFFFFFFFF when libzim throws; the exception
///         text is written to the debug log.
quint32 getArticleCluster( ZimFile const & file, quint32 articleNumber )
{
  try {
    return file.getEntryByPath( articleNumber ).getItem( true ).getIndex();
  }
  catch ( const std::exception & err ) {
    qDebug() << err.what();
  }

  return 0xFFFFFFFF;
}
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
/// Returns true when @p mime_type is exactly "text/html".
/// "text/plain" is deliberately not accepted (that alternative is disabled).
bool isArticleMime( const string & mime_type )
{
  return mime_type.compare( "text/html" ) == 0;
}
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
/// Reads the content of entry number @p articleNumber from @p file.
///
/// @param file          archive to read from
/// @param articleNumber entry number passed to getEntryByPath()
/// @param result        receives the raw item bytes; left untouched on failure
/// @return the index of the resolved item, or 0xFFFFFFFF when libzim throws
///         (the exception text is written to the debug log).
quint32 readArticle( ZimFile const & file, quint32 articleNumber, string & result )
{
  try {
    auto entry = file.getEntryByPath( articleNumber );
    auto item  = entry.getItem( true );

    // Ask for the blob once and reuse it for both pointer and length instead
    // of calling getData() twice.
    const auto blob = item.getData( 0 );
    result.assign( blob.data(), blob.size() );

    return item.getIndex();
  }
  catch ( const std::exception & e ) {
    qDebug() << e.what();
    return 0xFFFFFFFF;
  }
}
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
/// Reads the content of the entry stored under @p path in @p file.
///
/// @param result receives the item data; left untouched on failure
/// @return the index of the resolved item, or 0xFFFFFFFF when libzim throws
///         (the exception text is written to the debug log).
quint32 readArticleByPath( ZimFile const & file, const string & path, string & result )
{
  try {
    auto resolved = file.getEntryByPath( path ).getItem( true );
    result        = resolved.getData();
    return resolved.getIndex();
  }
  catch ( const std::exception & err ) {
    qDebug() << err.what();
  }

  return 0xFFFFFFFF;
}
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
// ZimDictionary
|
|
|
|
|
|
|
|
class ZimDictionary: public BtreeIndexing::BtreeDictionary
|
2018-03-07 13:45:54 +00:00
|
|
|
{
|
2023-05-29 13:56:04 +00:00
|
|
|
QMutex idxMutex;
|
|
|
|
QMutex zimMutex;
|
2024-03-23 04:29:51 +00:00
|
|
|
File::Index idx;
|
2023-05-27 04:12:16 +00:00
|
|
|
IdxHeader idxHeader;
|
|
|
|
ZimFile df;
|
|
|
|
set< quint32 > articlesIndexedForFTS;
|
2018-03-07 13:45:54 +00:00
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
public:
|
2018-03-07 13:45:54 +00:00
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
ZimDictionary( string const & id, string const & indexFile, vector< string > const & dictionaryFiles );
|
2018-03-07 13:45:54 +00:00
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
~ZimDictionary() = default;
|
2018-03-07 13:45:54 +00:00
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
string getName() noexcept override
|
2018-03-07 13:45:54 +00:00
|
|
|
{
|
2023-05-27 04:12:16 +00:00
|
|
|
return dictionaryName;
|
2018-03-07 13:45:54 +00:00
|
|
|
}
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
map< Dictionary::Property, string > getProperties() noexcept override
|
2022-06-01 14:51:56 +00:00
|
|
|
{
|
2023-05-27 04:12:16 +00:00
|
|
|
return {};
|
2022-06-01 14:51:56 +00:00
|
|
|
}
|
2018-03-07 13:45:54 +00:00
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
unsigned long getArticleCount() noexcept override
|
2018-03-07 13:45:54 +00:00
|
|
|
{
|
2023-05-27 04:12:16 +00:00
|
|
|
return idxHeader.articleCount;
|
2018-03-07 13:45:54 +00:00
|
|
|
}
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
unsigned long getWordCount() noexcept override
|
2022-05-31 18:07:03 +00:00
|
|
|
{
|
2023-05-27 04:12:16 +00:00
|
|
|
return idxHeader.wordCount;
|
2022-05-31 18:07:03 +00:00
|
|
|
}
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
inline quint32 getLangFrom() const override
|
2018-03-07 20:32:20 +00:00
|
|
|
{
|
2023-05-27 04:12:16 +00:00
|
|
|
return idxHeader.langFrom;
|
2018-03-07 20:32:20 +00:00
|
|
|
}
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
inline quint32 getLangTo() const override
|
2013-09-14 16:17:32 +00:00
|
|
|
{
|
2023-05-27 04:12:16 +00:00
|
|
|
return idxHeader.langTo;
|
2013-09-14 16:17:32 +00:00
|
|
|
}
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
sptr< Dictionary::DataRequest >
|
|
|
|
getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override;
|
2013-09-19 14:04:04 +00:00
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
sptr< Dictionary::DataRequest > getResource( string const & name ) override;
|
2015-05-27 15:04:48 +00:00
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
QString const & getDescription() override;
|
2013-09-14 16:17:32 +00:00
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
/// Loads the resource.
|
|
|
|
void loadResource( std::string & resourceName, string & data );
|
2013-09-14 16:17:32 +00:00
|
|
|
|
2023-05-30 23:42:31 +00:00
|
|
|
sptr< Dictionary::DataRequest >
|
|
|
|
getSearchResults( QString const & searchString, int searchMode, bool matchCase, bool ignoreDiacritics ) override;
|
2023-05-27 04:12:16 +00:00
|
|
|
void getArticleText( uint32_t articleAddress, QString & headword, QString & text ) override;
|
2013-09-14 16:17:32 +00:00
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
void makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration ) override;
|
2013-09-14 16:17:32 +00:00
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
void setFTSParameters( Config::FullTextSearch const & fts ) override
|
|
|
|
{
|
2023-07-26 02:03:20 +00:00
|
|
|
can_FTS = enable_FTS && fts.enabled && !fts.disabledTypes.contains( "ZIM", Qt::CaseInsensitive )
|
2023-05-27 04:12:16 +00:00
|
|
|
&& ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
|
|
|
|
}
|
2014-04-17 14:31:51 +00:00
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
void sortArticlesOffsetsForFTS( QVector< uint32_t > & offsets, QAtomicInt & isCancelled ) override;
|
2018-03-07 21:17:09 +00:00
|
|
|
|
2013-09-14 16:17:32 +00:00
|
|
|
protected:
|
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
void loadIcon() noexcept override;
|
2013-09-14 16:17:32 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
|
|
|
|
/// Loads the article.
|
2023-05-27 04:12:16 +00:00
|
|
|
quint32 loadArticle( quint32 address, string & articleText, bool rawText = false );
|
|
|
|
|
|
|
|
string convert( string const & in_data );
|
|
|
|
friend class ZimArticleRequest;
|
|
|
|
friend class ZimResourceRequest;
|
2013-09-14 16:17:32 +00:00
|
|
|
};
|
|
|
|
|
2023-04-13 10:08:32 +00:00
|
|
|
/// Opens the index file and the ZIM archive and prepares the dictionary.
///
/// @param id              dictionary id, forwarded to BtreeDictionary
/// @param indexFile       path of the btree index file; its header is read first
/// @param dictionaryFiles dictionary file list; element 0 is opened as the archive
///
/// Exceptions thrown while opening the index or the archive propagate to the
/// caller.
ZimDictionary::ZimDictionary( string const & id, string const & indexFile, vector< string > const & dictionaryFiles ):
  BtreeDictionary( id, dictionaryFiles ),
  idx( indexFile, "rb" ),
  idxHeader( idx.read< IdxHeader >() ),
  df( dictionaryFiles[ 0 ] )
{
  // Initialize the indexes
  openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, idxHeader.indexRootOffset ), idx, idxMutex );

  // Read dictionary name. libzim reports an absent metadata entry by throwing
  // zim::EntryNotFound; treat that like an empty title so the file-name
  // fallback below still applies.
  try {
    dictionaryName = df.getMetadata( "Title" );
  }
  catch ( const zim::EntryNotFound & ) {
    dictionaryName.clear();
  }

  if ( dictionaryName.empty() ) {
    // Fall back to the archive's file name (the part after the last '/').
    QString name   = QDir::fromNativeSeparators( dictionaryFiles[ 0 ].c_str() );
    int n          = name.lastIndexOf( '/' );
    dictionaryName = name.mid( n + 1 ).toStdString();
  }

  // Full-text search parameters
  ftsIdxName = indexFile + Dictionary::getFtsSuffix();
}
|
|
|
|
|
2022-06-03 13:28:41 +00:00
|
|
|
void ZimDictionary::loadIcon() noexcept
|
2013-09-14 16:17:32 +00:00
|
|
|
{
|
|
|
|
if ( dictionaryIconLoaded )
|
|
|
|
return;
|
|
|
|
|
2023-06-18 09:04:50 +00:00
|
|
|
// Try to load Original GD's user provided icon
|
2023-04-13 10:08:32 +00:00
|
|
|
QString fileName = QDir::fromNativeSeparators( getDictionaryFilenames()[ 0 ].c_str() );
|
2013-09-14 16:17:32 +00:00
|
|
|
// Remove the extension
|
|
|
|
fileName.chop( 3 );
|
2023-06-18 09:04:50 +00:00
|
|
|
if ( loadIconFromFile( fileName ) ) {
|
|
|
|
dictionaryIconLoaded = true;
|
|
|
|
return;
|
|
|
|
}
|
2013-09-14 16:17:32 +00:00
|
|
|
|
2023-06-18 09:04:50 +00:00
|
|
|
// Try to load zim's illustration, which is usually 48x48 png
|
|
|
|
try {
|
|
|
|
auto illustration = df.getIllustrationItem( 48 ).getData();
|
|
|
|
QImage img = QImage::fromData( reinterpret_cast< const uchar * >( illustration.data() ), illustration.size() );
|
2023-06-19 09:01:27 +00:00
|
|
|
|
|
|
|
if ( img.isNull() ) {
|
|
|
|
// Fallback to default icon
|
|
|
|
dictionaryIcon = QIcon( ":/icons/icon32_zim.png" );
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
dictionaryIcon = QIcon( QPixmap::fromImage( img ) );
|
|
|
|
}
|
2023-06-18 09:04:50 +00:00
|
|
|
|
|
|
|
dictionaryIconLoaded = true;
|
|
|
|
return;
|
2013-09-14 16:17:32 +00:00
|
|
|
}
|
2023-06-18 09:04:50 +00:00
|
|
|
catch ( zim::EntryNotFound & e ) {
|
|
|
|
gdDebug( "ZIM icon not loaded for: %s", dictionaryName.c_str() );
|
|
|
|
}
|
2013-09-14 16:17:32 +00:00
|
|
|
}
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
/// Reads the article at @p address into @p articleText.
///
/// @param rawText when false, the text is additionally passed through convert()
/// @return whatever readArticle() returned (0xFFFFFFFF signals a failed read)
quint32 ZimDictionary::loadArticle( quint32 address, string & articleText, bool rawText )
{
  quint32 articleIndex = 0;

  {
    // The archive is only touched while zimMutex is held.
    QMutexLocker locker( &zimMutex );
    articleIndex = readArticle( df, address, articleText );
  }

  if ( rawText )
    return articleIndex;

  articleText = convert( articleText );
  return articleIndex;
}
|
|
|
|
|
|
|
|
/// Rewrites the HTML of one article so that it can be shown by the article view.
///
/// Steps, in order:
///  - strip background / background-color from the <body> tag;
///  - point relative src= of img/script/source tags to bres://<dict id>/...;
///  - quote unquoted href=/... and href=../... values;
///  - point <link href="/..."> and <link href="../..."> to bres://<dict id>/...;
///  - turn links to English Wikimedia sites into gdlookup:// links;
///  - turn remaining relative <a>/<meta> targets into gdlookup:// links;
///  - normalise <br> variants inside vertically written link texts;
///  - append a clearing <br>.
///
/// @param in article HTML, UTF-8
/// @return the rewritten HTML, UTF-8
string ZimDictionary::convert( const string & in )
{
  // Length-aware conversion, so the text is not cut at an embedded NUL byte.
  QString text = QString::fromStdString( in );

  // Replaces every match of rx in text with makeReplacement( match ); text is
  // left as it is when rx does not match at all.
  auto rewriteMatches = [ &text ]( const QRegularExpression & rx, auto && makeReplacement ) {
    QString rebuilt;
    int pos = 0;

    QRegularExpressionMatchIterator it = rx.globalMatch( text );
    while ( it.hasNext() ) {
      const QRegularExpressionMatch match = it.next();

      rebuilt += text.mid( pos, match.capturedStart() - pos );
      pos = match.capturedEnd();

      rebuilt += makeReplacement( match );
    }

    if ( pos != 0 ) {
      rebuilt += text.mid( pos );
      text = rebuilt;
    }
  };

  // replace background
  text.replace( QRegularExpression( R"(<\s*body\s+([^>]*)(background(|-color)):([^;"]*(;|)))" ),
                QString( "<body \\1" ) );

  // pattern of img and script
  QRegularExpression rxImgScript( R"(<\s*(img|script|source)\s+([^>]*)src=(")([^"]*)\3)" );
  rewriteMatches( rxImgScript, [ this ]( const QRegularExpressionMatch & match ) -> QString {
    QString url = match.captured( 4 ); // a url

    // Empty, protocol-relative and absolute http(s) sources are left untouched.
    if ( url.isEmpty() || url.startsWith( "//" ) || url.startsWith( "http://" ) || url.startsWith( "https://" ) )
      return match.captured();

    //the pattern like : <\\1 \\2src=\\3bres://%1/
    //remove leading dot and slash
    url.remove( RX::Zim::leadingDotSlash );
    return QString( "<%1 %2 src=\"bres://%3/%4\"" )
      .arg( match.captured( 1 ), match.captured( 2 ), QString::fromStdString( getId() ), url );
  } );

  // Fix links without '"'
  text.replace( QRegularExpression( R"(href=(\.\.|)/([^\s>]+))" ), QString( R"(href="\1/\2")" ) );

  // pattern <link... href="..." ...>
  text.replace( QRegularExpression( R"(<\s*link\s+([^>]*)href="(\.\.|)/)" ),
                QString( "<link \\1href=\"bres://%1/" ).arg( getId().c_str() ) );

  // localize the http://en.wiki***.com|org/wiki/<key> series links
  // excluding those keywords that have ":" in it
  QString urlWiki =
    "\"http(s|)://en\\.(wiki(pedia|books|news|quote|source|voyage|versity)|wiktionary)\\.(org|com)/wiki/([^:\"]*)\"";
  text.replace( QRegularExpression( R"(<\s*a\s+(class="external"\s+|)href=)" + urlWiki ),
                QString( R"(<a href="gdlookup://localhost/\6")" ) );

  // pattern <a href="..." ...>, excluding any known protocols such as http://, mailto:, #(comment)
  // these links will be translated into local definitions
  // <meta http-equiv="Refresh" content="0;url=../dsalsrv02.uchicago.edu/cgi-bin/0994.html">
  QRegularExpression rxLink(
    R"lit(<\s*(?:a|meta)\s+([^>]*)(?:href|url)="?(?!(?:\w+://|#|mailto:|tel:))()([^"]*)"\s*(title="[^"]*")?[^>]*>)lit" );
  rewriteMatches( rxLink, []( const QRegularExpressionMatch & match ) -> QString {
    const QString url   = match.captured( 3 ); // a url, ex: Precambrian_Chaotian.html
    const QString title = match.captured( 4 ); // a title, ex: title="Precambrian/Chaotian" (empty when absent)

    QString link = match.captured();
    if ( url.isEmpty() || url.startsWith( "//" ) )
      return link;

    QString formatTag;
    if ( !title.isEmpty() ) {
      // The lookup word is the text between the quotes of the title attribute.
      formatTag = title.split( "\"" )[ 1 ];
    }
    else {
      formatTag = url;
      formatTag.remove( RX::Zim::leadingDotSlash );
    }

    // Replace the url exactly where it was captured. A textual search for the
    // url would also rewrite other occurrences of the same text inside the tag
    // (for example the title attribute, or the tag name for a url like "a").
    link.replace( match.capturedStart( 3 ) - match.capturedStart(),
                  url.length(),
                  "gdlookup://localhost/" + formatTag );
    return link;
  } );

  // Occasionally words needs to be displayed in vertical, but <br/> were changed to <br\> somewhere
  // proper style: <a href="gdlookup://localhost/Neoptera" ... >N<br/>e<br/>o<br/>p<br/>t<br/>e<br/>r<br/>a</a>
  QRegularExpression rxBR(
    R"((<a href="gdlookup://localhost/[^"]*"\s*[^>]*>)\s*((\w\s*<br(\\|/|)>\s*)+\w)\s*</a>)",
    QRegularExpression::UseUnicodePropertiesOption );
  rewriteMatches( rxBR, []( const QRegularExpressionMatch & match ) -> QString {
    QString letters = match.captured( 2 );
    letters
      .replace(
        QRegularExpression( "<br( |)(\\\\|/|)>", QRegularExpression::PatternOption::CaseInsensitiveOption ),
        "<br/>" )
      .prepend( match.captured( 1 ) )
      .append( "</a>" );
    return letters;
  } );

  // Fix outstanding elements
  text += "<br style=\"clear:both;\" />";

  // toStdString() carries the length, unlike building the result from a char pointer.
  return text.toStdString();
}
|
|
|
|
|
|
|
|
void ZimDictionary::loadResource( std::string & resourceName, string & data )
|
|
|
|
{
|
2023-05-27 04:12:16 +00:00
|
|
|
if ( resourceName.empty() )
|
2013-09-14 16:17:32 +00:00
|
|
|
return;
|
2023-05-27 04:12:16 +00:00
|
|
|
QMutexLocker _( &zimMutex );
|
|
|
|
readArticleByPath( df, resourceName, data );
|
2013-09-14 16:17:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the archive's "Description" metadata, reading it on first use.
///
/// An archive without that metadata yields an empty description: libzim
/// signals the absence by throwing, which is caught and logged here instead
/// of being passed on to the caller.
QString const & ZimDictionary::getDescription()
{
  if ( dictionaryDescription.isEmpty() ) {
    try {
      dictionaryDescription = QString::fromStdString( df.getMetadata( "Description" ) );
    }
    catch ( const std::exception & e ) {
      gdDebug( "Zim: no description available for \"%s\", reason: %s\n", getName().c_str(), e.what() );
    }
  }

  return dictionaryDescription;
}
|
|
|
|
|
2014-04-16 16:18:28 +00:00
|
|
|
/// Builds the full-text search index unless one is already available.
///
/// @param isCancelled    cancellation flag, forwarded to the index builder
/// @param firstIteration when true only the "index already complete"
///                       bookkeeping is done and no index is built
void ZimDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
{
  const bool indexUpToDate = !Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
    && !FtsHelpers::ftsIndexIsOldOrBad( this );
  if ( indexUpToDate )
    FTS_index_completed.ref();

  // Nothing to build: index present, dictionary failed to initialise, or first pass.
  if ( haveFTSIndex() || !ensureInitDone().empty() || firstIteration )
    return;

  gdDebug( "Zim: Building the full-text index for dictionary: %s\n", getName().c_str() );

  try {
    FtsHelpers::makeFTSIndex( this, isCancelled );
    FTS_index_completed.ref();
  }
  catch ( std::exception & ex ) {
    gdWarning( "Zim: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );
    // Remove the index file of the failed build.
    QFile::remove( ftsIdxName.c_str() );
  }
}
|
|
|
|
|
2018-03-08 08:46:19 +00:00
|
|
|
/// Reorders @p offsets in place by the value getArticleCluster() reports for
/// each of them (ties are ordered by the offset itself).
/// If @p isCancelled becomes set while the keys are collected, the function
/// returns early and @p offsets is left unchanged.
void ZimDictionary::sortArticlesOffsetsForFTS( QVector< uint32_t > & offsets, QAtomicInt & isCancelled )
{
  QVector< QPair< quint32, uint32_t > > keyed;
  keyed.reserve( offsets.size() );

  for ( int n = 0; n < offsets.size(); ++n ) {
    if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
      return;

    const uint32_t offset = offsets.at( n );

    // The archive is queried under zimMutex, one article at a time.
    QMutexLocker locker( &zimMutex );
    keyed.append( QPair< quint32, uint32_t >( getArticleCluster( df, offset ), offset ) );
  }

  std::sort( keyed.begin(), keyed.end() );

  // keyed has exactly one element per offset here; write the offsets back in sorted order.
  std::transform( keyed.cbegin(), keyed.cend(), offsets.begin(), []( const QPair< quint32, uint32_t > & item ) {
    return item.second;
  } );
}
|
|
|
|
|
2014-04-16 16:18:28 +00:00
|
|
|
void ZimDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
|
|
|
|
{
|
|
|
|
try {
|
|
|
|
headword.clear();
|
|
|
|
string articleText;
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
loadArticle( articleAddress, articleText, true );
|
2014-04-16 16:18:28 +00:00
|
|
|
text = Html::unescape( QString::fromUtf8( articleText.data(), articleText.size() ) );
|
|
|
|
}
|
|
|
|
catch ( std::exception & ex ) {
|
|
|
|
gdWarning( "Zim: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-05-30 23:42:31 +00:00
|
|
|
/// Creates a full-text search request over this dictionary; all arguments are
/// forwarded unchanged to FtsHelpers::FTSResultsRequest.
sptr< Dictionary::DataRequest >
ZimDictionary::getSearchResults( QString const & searchString, int searchMode, bool matchCase, bool ignoreDiacritics )
{
  auto request =
    std::make_shared< FtsHelpers::FTSResultsRequest >( *this, searchString, searchMode, matchCase, ignoreDiacritics );
  return request;
}
|
|
|
|
|
2013-09-14 16:17:32 +00:00
|
|
|
/// ZimDictionary::getArticle()
|
|
|
|
|
|
|
|
class ZimArticleRequest: public Dictionary::DataRequest
|
|
|
|
{
|
|
|
|
wstring word;
|
|
|
|
vector< wstring > alts;
|
|
|
|
ZimDictionary & dict;
|
2018-06-13 16:00:42 +00:00
|
|
|
bool ignoreDiacritics;
|
2013-09-14 16:17:32 +00:00
|
|
|
|
|
|
|
QAtomicInt isCancelled;
|
2022-06-19 12:24:34 +00:00
|
|
|
QFuture< void > f;
|
2013-09-14 16:17:32 +00:00
|
|
|
|
|
|
|
public:
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
ZimArticleRequest( wstring word_, vector< wstring > const & alts_, ZimDictionary & dict_, bool ignoreDiacritics_ ):
|
|
|
|
word( std::move( word_ ) ),
|
|
|
|
alts( alts_ ),
|
|
|
|
dict( dict_ ),
|
|
|
|
ignoreDiacritics( ignoreDiacritics_ )
|
2013-09-14 16:17:32 +00:00
|
|
|
{
|
2022-06-19 12:24:34 +00:00
|
|
|
f = QtConcurrent::run( [ this ]() {
|
|
|
|
this->run();
|
|
|
|
} );
|
2013-09-14 16:17:32 +00:00
|
|
|
}
|
|
|
|
|
2022-05-28 08:30:18 +00:00
|
|
|
void run();
|
2013-09-14 16:17:32 +00:00
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
void cancel() override
|
2013-09-14 16:17:32 +00:00
|
|
|
{
|
|
|
|
isCancelled.ref();
|
|
|
|
}
|
|
|
|
|
|
|
|
~ZimArticleRequest()
|
|
|
|
{
|
|
|
|
isCancelled.ref();
|
2022-06-19 12:24:34 +00:00
|
|
|
f.waitForFinished();
|
2013-09-14 16:17:32 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
void ZimArticleRequest::run()
|
|
|
|
{
|
2021-11-27 07:17:33 +00:00
|
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) {
|
2013-09-14 16:17:32 +00:00
|
|
|
finish();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2018-06-13 16:00:42 +00:00
|
|
|
vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
|
2013-09-14 16:17:32 +00:00
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
for ( const auto & alt : alts ) {
|
2013-09-14 16:17:32 +00:00
|
|
|
/// Make an additional query for each alt
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
vector< WordArticleLink > altChain = dict.findArticles( alt, ignoreDiacritics );
|
2013-09-14 16:17:32 +00:00
|
|
|
|
|
|
|
chain.insert( chain.end(), altChain.begin(), altChain.end() );
|
|
|
|
}
|
|
|
|
|
|
|
|
multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
set< quint32 > articlesIncluded; // Some synonyms make it that the articles
|
|
|
|
// appear several times. We combat this
|
|
|
|
// by only allowing them to appear once.
|
2013-09-14 16:17:32 +00:00
|
|
|
|
|
|
|
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
|
2018-06-13 16:00:42 +00:00
|
|
|
if ( ignoreDiacritics )
|
|
|
|
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
|
2013-09-14 16:17:32 +00:00
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
for ( auto & x : chain ) {
|
2021-11-27 07:17:33 +00:00
|
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) {
|
2013-09-14 16:17:32 +00:00
|
|
|
finish();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Now grab that article
|
|
|
|
|
|
|
|
string headword, articleText;
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
headword = x.word;
|
2013-09-14 16:17:32 +00:00
|
|
|
|
2013-09-19 14:04:04 +00:00
|
|
|
quint32 articleNumber = 0xFFFFFFFF;
|
2013-09-14 16:17:32 +00:00
|
|
|
try {
|
2023-05-27 04:12:16 +00:00
|
|
|
articleNumber = dict.loadArticle( x.articleOffset, articleText );
|
2013-09-14 16:17:32 +00:00
|
|
|
}
|
|
|
|
catch ( ... ) {
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( articleNumber == 0xFFFFFFFF )
|
|
|
|
continue; // No article loaded
|
|
|
|
|
|
|
|
if ( articlesIncluded.find( articleNumber ) != articlesIncluded.end() )
|
|
|
|
continue; // We already have this article in the body.
|
|
|
|
|
|
|
|
// Ok. Now, does it go to main articles, or to alternate ones? We list
|
|
|
|
// main ones first, and alternates after.
|
|
|
|
|
|
|
|
// We do the case-folded comparison here.
|
|
|
|
|
2023-04-29 02:35:56 +00:00
|
|
|
wstring headwordStripped = Folding::applySimpleCaseOnly( headword );
|
2018-06-13 16:00:42 +00:00
|
|
|
if ( ignoreDiacritics )
|
|
|
|
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
|
2013-09-14 16:17:32 +00:00
|
|
|
|
|
|
|
multimap< wstring, pair< string, string > > & mapToUse =
|
|
|
|
( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles;
|
|
|
|
|
2023-04-29 02:35:56 +00:00
|
|
|
mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) );
|
2013-09-14 16:17:32 +00:00
|
|
|
|
|
|
|
articlesIncluded.insert( articleNumber );
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( mainArticles.empty() && alternateArticles.empty() ) {
|
|
|
|
// No such word
|
|
|
|
finish();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
string result;
|
|
|
|
|
2022-04-10 12:07:00 +00:00
|
|
|
// See Issue #271: A mechanism to clean-up invalid HTML cards.
|
2023-04-15 07:39:49 +00:00
|
|
|
string cleaner = Utils::Html::getHtmlCleaner();
|
2013-09-14 16:17:32 +00:00
|
|
|
|
|
|
|
multimap< wstring, pair< string, string > >::const_iterator i;
|
|
|
|
|
|
|
|
|
|
|
|
for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) {
|
|
|
|
result += "<div class=\"zimdict\">";
|
|
|
|
result += "<h2 class=\"zimdict_headword\">";
|
|
|
|
result += i->second.first;
|
|
|
|
result += "</h2>";
|
|
|
|
result += i->second.second;
|
|
|
|
result += cleaner + "</div>";
|
|
|
|
}
|
|
|
|
|
|
|
|
for ( i = alternateArticles.begin(); i != alternateArticles.end(); ++i ) {
|
|
|
|
result += "<div class=\"zimdict\">";
|
|
|
|
result += "<h2 class=\"zimdict_headword\">";
|
|
|
|
result += i->second.first;
|
|
|
|
result += "</h2>";
|
|
|
|
result += i->second.second;
|
|
|
|
result += cleaner + "</div>";
|
|
|
|
}
|
|
|
|
|
2023-06-23 15:09:31 +00:00
|
|
|
appendString( result );
|
2013-09-14 16:17:32 +00:00
|
|
|
|
|
|
|
hasAnyData = true;
|
|
|
|
|
|
|
|
finish();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Starts an asynchronous article lookup for @p word and its alternate forms
/// @p alts. The third (unnamed) argument is not used by this dictionary type.
sptr< Dictionary::DataRequest > ZimDictionary::getArticle( wstring const & word,
                                                           vector< wstring > const & alts,
                                                           wstring const &,
                                                           bool ignoreDiacritics )
{
  auto request = std::make_shared< ZimArticleRequest >( word, alts, *this, ignoreDiacritics );
  return request;
}
|
|
|
|
|
|
|
|
//// ZimDictionary::getResource()
|
|
|
|
|
|
|
|
class ZimResourceRequest: public Dictionary::DataRequest
|
|
|
|
{
|
2023-05-27 04:12:16 +00:00
|
|
|
//the dict will outlive this object, so the reference & used here is proper.
|
2013-09-14 16:17:32 +00:00
|
|
|
ZimDictionary & dict;
|
|
|
|
|
|
|
|
string resourceName;
|
|
|
|
|
|
|
|
QAtomicInt isCancelled;
|
2022-06-19 12:24:34 +00:00
|
|
|
QFuture< void > f;
|
2013-09-14 16:17:32 +00:00
|
|
|
|
|
|
|
public:
|
2023-05-27 04:12:16 +00:00
|
|
|
ZimResourceRequest( ZimDictionary & dict_, string resourceName_ ):
|
|
|
|
dict( dict_ ),
|
|
|
|
resourceName( std::move( resourceName_ ) )
|
|
|
|
{
|
|
|
|
f = QtConcurrent::run( [ this ]() {
|
|
|
|
this->run();
|
|
|
|
} );
|
2013-09-14 16:17:32 +00:00
|
|
|
}
|
|
|
|
|
2022-05-28 08:30:18 +00:00
|
|
|
void run();
|
2013-09-14 16:17:32 +00:00
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
void cancel() override
|
2013-09-14 16:17:32 +00:00
|
|
|
{
|
|
|
|
isCancelled.ref();
|
|
|
|
}
|
|
|
|
|
|
|
|
~ZimResourceRequest()
|
|
|
|
{
|
|
|
|
isCancelled.ref();
|
2022-06-19 12:24:34 +00:00
|
|
|
f.waitForFinished();
|
2013-09-14 16:17:32 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
void ZimResourceRequest::run()
|
|
|
|
{
|
|
|
|
// Some runnables linger enough that they are cancelled before they start
|
2021-11-27 07:17:33 +00:00
|
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) {
|
2013-09-14 16:17:32 +00:00
|
|
|
finish();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
try {
|
|
|
|
string resource;
|
|
|
|
dict.loadResource( resourceName, resource );
|
|
|
|
if ( resource.empty() )
|
|
|
|
throw File::Ex();
|
|
|
|
|
|
|
|
if ( Filetype::isNameOfCSS( resourceName ) ) {
|
|
|
|
QString css = QString::fromUtf8( resource.data(), resource.size() );
|
|
|
|
dict.isolateCSS( css, ".zimdict" );
|
|
|
|
QByteArray bytes = css.toUtf8();
|
2019-11-19 16:20:44 +00:00
|
|
|
|
2023-05-29 13:56:04 +00:00
|
|
|
QMutexLocker _( &dataMutex );
|
2013-09-14 16:17:32 +00:00
|
|
|
data.resize( bytes.size() );
|
|
|
|
memcpy( &data.front(), bytes.constData(), bytes.size() );
|
|
|
|
}
|
|
|
|
else if ( Filetype::isNameOfTiff( resourceName ) ) {
|
|
|
|
// Convert it
|
2023-05-29 13:56:04 +00:00
|
|
|
QMutexLocker _( &dataMutex );
|
2022-04-05 13:25:07 +00:00
|
|
|
GdTiff::tiff2img( data );
|
2013-09-14 16:17:32 +00:00
|
|
|
}
|
|
|
|
else {
|
2023-05-29 13:56:04 +00:00
|
|
|
QMutexLocker _( &dataMutex );
|
2013-09-14 16:17:32 +00:00
|
|
|
data.resize( resource.size() );
|
|
|
|
memcpy( &data.front(), resource.data(), data.size() );
|
|
|
|
}
|
|
|
|
|
2023-05-29 13:56:04 +00:00
|
|
|
QMutexLocker _( &dataMutex );
|
2013-09-14 16:17:32 +00:00
|
|
|
hasAnyData = true;
|
|
|
|
}
|
2013-09-24 13:56:47 +00:00
|
|
|
catch ( std::exception & ex ) {
|
2013-11-16 18:34:09 +00:00
|
|
|
gdWarning( "ZIM: Failed loading resource \"%s\" from \"%s\", reason: %s\n",
|
|
|
|
resourceName.c_str(),
|
|
|
|
dict.getName().c_str(),
|
|
|
|
ex.what() );
|
2013-09-24 13:56:47 +00:00
|
|
|
// Resource not loaded -- we don't set the hasAnyData flag then
|
2013-09-14 16:17:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
finish();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates the resource request for `name`. Whatever the
/// RX::Zim::leadingDotSlash pattern matches is removed from the name before
/// the request is constructed.
sptr< Dictionary::DataRequest > ZimDictionary::getResource( string const & name )
{
  QString resourcePath = QString::fromStdString( name );
  resourcePath.remove( RX::Zim::leadingDotSlash );

  return std::make_shared< ZimResourceRequest >( *this, resourcePath.toStdString() );
}
|
|
|
|
|
2023-05-27 04:12:16 +00:00
|
|
|
wstring normalizeWord( const std::string & url );
|
2013-09-14 16:17:32 +00:00
|
|
|
/// Scans `fileNames` for ZIM archives, (re)builds their index files when
/// needed and returns one dictionary object per archive that could be set up.
///
/// @param fileNames            candidate paths; names not ending in ".zim" or
///                             ".zimaa" are skipped.
/// @param indicesDir           prefix to which the archive checksum is appended
///                             to form the index file name.
/// @param initializing         notified through indexingDictionary().
/// @param maxHeadwordsToExpand when non-zero and the archive reports at least
///                             this many articles, headwords are added with
///                             addSingleWord() instead of addWord().
/// @return dictionaries for all archives that initialised successfully;
///         failures are logged and the archive is skipped.
vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames,
                                                      string const & indicesDir,
                                                      Dictionary::Initializing & initializing,
                                                      unsigned maxHeadwordsToExpand )
{
  vector< sptr< Dictionary::Class > > dictionaries;

  for ( const auto & fileName : fileNames ) {
    // Skip files with the extensions different to .zim to speed up the
    // scanning
    QString firstName          = QDir::fromNativeSeparators( fileName.c_str() );
    const bool hasChunkSuffix = firstName.endsWith( ".zimaa" );
    if ( !hasChunkSuffix && !firstName.endsWith( ".zim" ) ) {
      continue;
    }

    // Got the file -- check if we need to rebuild the index.
    // The archive is opened by its logical name, so drop the trailing "aa".
    if ( hasChunkSuffix ) {
      firstName.chop( 2 );
    }

    try {
      // Opening the archive and reading its checksum happen inside the try
      // block so that an exception raised for one unreadable file is logged
      // below and does not abort the scan of the remaining files.
      const string archivePath = firstName.toStdString();
      ZimFile df( archivePath );

      vector< string > dictFiles;
      dictFiles.push_back( archivePath );

      string dictId    = df.getChecksum();
      string indexFile = indicesDir + dictId;

      initializing.indexingDictionary( df.getFilename() );

      //only check zim file.
      if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) || indexIsOldOrBad( indexFile ) ) {
        gdDebug( "Zim: Building the index for dictionary: %s\n", fileName.c_str() );

        unsigned articleCount = df.getArticleCount();
        unsigned wordCount    = 0;

        {
          // Report only the file name part (everything after the last '/').
          const auto n = firstName.lastIndexOf( '/' );
          initializing.indexingDictionary( firstName.mid( n + 1 ).toUtf8().constData() );
        }

        File::Index idx( indexFile, "wb" );
        IdxHeader idxHeader;
        memset( &idxHeader, 0, sizeof( idxHeader ) );
        idxHeader.namePtr        = 0xFFFFFFFF;
        idxHeader.descriptionPtr = 0xFFFFFFFF;

        // Two-letter and three-letter language codes are mapped to an id;
        // any other length leaves langFrom at the zero set by memset().
        auto lang = df.getMetadata( "Language" );
        if ( lang.size() == 2 )
          idxHeader.langFrom = LangCoder::code2toInt( lang.c_str() );
        else if ( lang.size() == 3 )
          idxHeader.langFrom = LangCoder::findIdForLanguageCode3( lang.c_str() );
        idxHeader.langTo = idxHeader.langFrom;

        // We write a dummy header first. At the end of the process the header
        // will be rewritten with the right values.
        idx.write( idxHeader );

        IndexedWords indexedWords;

        // Loop-invariant: large archives get one index entry per headword.
        const bool singleWordOnly = maxHeadwordsToExpand > 0 && articleCount >= maxHeadwordsToExpand;

        //only iterate the article
        for ( const auto & entry : df.iterByTitle() ) {
          auto item = entry.getItem( true );

          // Filter on the MIME type first so that path/title are only
          // fetched for entries that will actually be indexed.
          if ( !isArticleMime( item.getMimetype() ) ) {
            continue;
          }

          // The title is the preferred headword; the normalized path is the fallback.
          const auto title = item.getTitle();
          wstring headword;
          if ( !title.empty() ) {
            headword = Utf8::decode( title );
          }
          else {
            const auto url = item.getPath();
            if ( url.empty() ) {
              continue; // nothing to index this entry under
            }
            headword = normalizeWord( url );
          }

          const auto index = item.getIndex();
          if ( singleWordOnly ) {
            indexedWords.addSingleWord( headword, index );
          }
          else {
            indexedWords.addWord( headword, index );
          }

          // Count the headword in both modes; previously archives indexed
          // with addSingleWord() ended up with a stored word count of zero.
          wordCount++;
        }

        // Build index
        {
          IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );

          idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
          idxHeader.indexRootOffset       = idxInfo.rootOffset;

          indexedWords.clear(); // Release memory -- no need for this data
        }

        idxHeader.signature     = Signature;
        idxHeader.formatVersion = CurrentFormatVersion;

        idxHeader.articleCount = articleCount;
        idxHeader.wordCount    = wordCount;

        // Replace the dummy header written above with the final values.
        idx.rewind();
        idx.write( &idxHeader, sizeof( idxHeader ) );
      }

      dictionaries.push_back( std::make_shared< ZimDictionary >( dictId, indexFile, dictFiles ) );
    }
    catch ( const std::exception & e ) {
      gdWarning( "Zim dictionary initializing failed: %s, error: %s\n", fileName.c_str(), e.what() );
    }
    catch ( ... ) {
      qWarning( "Zim dictionary initializing failed\n" );
    }
  }

  return dictionaries;
}
|
2023-05-27 04:12:16 +00:00
|
|
|
/// Turns an entry path into a headword: removes whatever the
/// RX::Zim::leadingDotSlash pattern matches and returns the remainder as a
/// UTF-32 string.
wstring normalizeWord( const std::string & url )
{
  QString headword = QString::fromStdString( url );
  headword.remove( RX::Zim::leadingDotSlash );

  return headword.toStdU32String();
}
|
2013-09-14 16:17:32 +00:00
|
|
|
|
|
|
|
} // namespace Zim
|
|
|
|
|
|
|
|
#endif
|