/* This file is (c) 2012 Abs62 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ #ifdef MAKE_ZIM_SUPPORT #include "zim.hh" #include "btreeidx.hh" #include "fsencoding.hh" #include "folding.hh" #include "gddebug.hh" #include "utf8.hh" #include "decompress.hh" #include "langcoder.hh" #include "wstring_qt.hh" #include "filetype.hh" #include "file.hh" #include "utils.hh" #include "tiff.hh" #include "ftshelpers.hh" #include "htmlescape.hh" #include "splitfile.hh" #ifdef _MSC_VER #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "base/globalregex.hh" namespace Zim { #define CACHE_SIZE 3 using std::string; using std::map; using std::vector; using std::multimap; using std::pair; using std::set; using gd::wstring; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; using BtreeIndexing::IndexInfo; DEF_EX_STR( exNotZimFile, "Not an Zim file", Dictionary::Ex ) DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex ) DEF_EX_STR( exInvalidZimHeader, "Invalid Zim header", Dictionary::Ex ) DEF_EX( exUserAbort, "User abort", Dictionary::Ex ) //namespace { class ZimFile; #pragma pack( push, 1 ) enum CompressionType { Default = 0, None, Zlib, Bzip2, Lzma2, Zstd }; /// Zim file header struct ZIM_header { quint32 magicNumber; quint16 majorVersion; quint16 minorVersion; quint8 uuid[ 16 ]; quint32 articleCount; quint32 clusterCount; quint64 urlPtrPos; quint64 titlePtrPos; quint64 clusterPtrPos; quint64 mimeListPos; quint32 mainPage; quint32 layoutPage; quint64 checksumPos; } #ifndef _MSC_VER __attribute__((packed)) #endif ; struct ArticleEntry { quint16 mimetype; quint8 parameterLen; char nameSpace; quint32 revision; quint32 clusterNumber; quint32 blobNumber; } #ifndef _MSC_VER __attribute__((packed)) #endif ; struct RedirectEntry { quint16 mimetype; quint8 parameterLen; char nameSpace; quint32 revision; quint32 
redirectIndex; } #ifndef _MSC_VER __attribute__((packed)) #endif ; enum { Signature = 0x584D495A, // ZIMX on little-endian, XMIZ on big-endian CurrentFormatVersion = 4 + BtreeIndexing::FormatVersion + Folding::Version }; struct IdxHeader { quint32 signature; // First comes the signature, ZIMX quint32 formatVersion; // File format version (CurrentFormatVersion) quint32 indexBtreeMaxElements; // Two fields from IndexInfo quint32 indexRootOffset; quint32 resourceIndexBtreeMaxElements; // Two fields from IndexInfo quint32 resourceIndexRootOffset; quint32 wordCount; quint32 articleCount; quint32 namePtr; quint32 descriptionPtr; quint32 langFrom; // Source language quint32 langTo; // Target language } #ifndef _MSC_VER __attribute__((packed)) #endif ; #pragma pack( pop ) // Class for support of split zim files struct Cache { char * data; quint32 clusterNumber; int stamp; int count, size; unsigned blobs_offset_size; Cache() : data( 0 ), clusterNumber( 0 ), stamp( -1 ), count( 0 ), size( 0 ), blobs_offset_size( 0 ) {} }; class ZimFile : public SplitFile::SplitFile { public: ZimFile(); ZimFile( const QString & name ); ~ZimFile(); void setFileName( const QString & name ) override; bool open(); void close() { SplitFile::close(); clearCache(); } const ZIM_header & header() const { return zimHeader; } string getClusterData( quint32 cluster_nom, unsigned & blob_offset_size ); const QString getMimeType( quint16 nom ) { return mimeTypes.value( nom ); } bool isArticleMime( quint16 mime_type ) { return getMimeType( mime_type ).startsWith( "text/html", Qt::CaseInsensitive ) || getMimeType( mime_type ).startsWith( "text/plain", Qt::CaseInsensitive ); } quint16 redirectedMimeType( RedirectEntry const & redEntry ); private: ZIM_header zimHeader; Cache cache[ CACHE_SIZE ]; int stamp; QVector< QPair< quint64, quint32 > > clusterOffsets; QStringList mimeTypes; void clearCache(); }; ZimFile::ZimFile() : stamp( 0 ) { memset( &zimHeader, 0, sizeof( zimHeader ) ); } ZimFile::ZimFile( const 
QString & name ) { setFileName( name ); } ZimFile::~ZimFile() { clearCache(); } void ZimFile::setFileName( const QString & name ) { close(); memset( &zimHeader, 0, sizeof( zimHeader ) ); clearCache(); appendFile( name ); if( name.endsWith( ".zimaa", Qt::CaseInsensitive ) ) { QString fname = name; for( int i = 0; i < 26; i++ ) { fname[ fname.size() - 2 ] = (char)( 'a' + i ); int j; for( j = 1; j < 26; j++ ) { fname[ fname.size() - 1 ] = (char)( 'a' + j ); if( !QFileInfo( fname ).isFile() ) break; appendFile( fname ); } if( j < 26 ) break; } } } void ZimFile::clearCache() { for( int i = 0; i < CACHE_SIZE; i++ ) { if( cache[ i ].data ) { free( cache[ i ].data ); cache[ i ].data = 0; } cache[ i ].clusterNumber = 0; cache[ i ].stamp = -1; cache[ i ].count = 0; cache[ i ].size = 0; } stamp = 0; } bool ZimFile::open() { if( !SplitFile::open( QIODevice::ReadOnly ) ) return false; memset( &zimHeader, 0, sizeof( zimHeader ) ); if( read( reinterpret_cast< char * >( &zimHeader ), sizeof( zimHeader ) ) != sizeof( zimHeader ) ) return false; if( zimHeader.magicNumber != 0x44D495A || zimHeader.mimeListPos != sizeof( zimHeader ) ) return false; // Clusters in zim file may be placed in random order. // We create sorted offsets list to calculate clusters size. 
clusterOffsets.resize( zimHeader.clusterCount );
  QVector< quint64 > offs;
  offs.resize( zimHeader.clusterCount );

  // Read the raw table of cluster offsets (one quint64 per cluster).
  seek( zimHeader.clusterPtrPos );
  qint64 size = zimHeader.clusterCount * sizeof( quint64 );
  if( read( reinterpret_cast< char * >( offs.data() ), size) != size )
  {
    vector< string > names;
    getFilenames( names );
    throw exCantReadFile( names[ 0 ] );
  }

  // Pair every offset with its cluster number, then order the pairs by offset
  // so that the distance to the next entry gives the cluster size.
  for( quint32 i = 0; i < zimHeader.clusterCount; i++ )
    clusterOffsets[ i ] = QPair< quint64, quint32 >( offs.at( i ), i );

  std::sort( clusterOffsets.begin(), clusterOffsets.end() );

  // Read mime types
  // The list is a sequence of zero-terminated strings; an empty string
  // (or end of data) ends it.

  string type;
  char ch;

  seek( zimHeader.mimeListPos );

  for( ; ; )
  {
    type.clear();
    while( getChar( &ch ) )
    {
      if( ch == 0 )
        break;
      type.push_back( ch );
    }
    if( type.empty() )
      break;

    QString s = QString::fromUtf8( type.c_str(), type.size() );
    mimeTypes.append( s );
  }

  return true;
}

/// Returns the decompressed contents of cluster @p cluster_nom.
///
/// @param cluster_nom        number of the cluster to load
/// @param blobs_offset_size  receives the width in bytes (4 or 8) of the
///                           entries of the blob offset table at the start of
///                           the returned data
/// @return the decompressed cluster, or an empty string when the cluster
///         number is unknown, the data cannot be read, the compression type
///         is not handled, or decompression yields nothing
///
/// Clusters holding more than one blob are kept in a small cache of
/// CACHE_SIZE slots.
string ZimFile::getClusterData( quint32 cluster_nom, unsigned & blobs_offset_size )
{
  // Check cache
  // 'target' ends up as the slot holding this cluster, or otherwise the slot
  // with the smallest stamp, which will be overwritten on a miss.
  int target = 0;
  bool found = false;
  int lastStamp = INT_MAX;

  for( int i = 0; i < CACHE_SIZE; i++ )
  {
    if( cache[ i ].clusterNumber == cluster_nom && cache[ i ].count )
    {
      found = true;
      target = i;
      break;
    }

    if( cache[ i ].stamp < lastStamp )
    {
      lastStamp = cache[ i ].stamp;
      target = i;
    }
  }

  cache[ target ].stamp = ++stamp;
  // NOTE(review): presumably handles wrap-around of the stamp counter --
  // all stamps are reset when it turns negative.
  if( stamp < 0 )
  {
    stamp = 0;
    for (int i = 0; i < CACHE_SIZE; i++)
      cache[ i ].stamp = -1;
  }

  if( found )
  {
    // Cache hit
    blobs_offset_size = cache[ target ].blobs_offset_size;
    return string( cache[ target ].data, cache[ target ].count );
  }

  // Cache miss, read data from file

  // Calculate cluster size
  // Locate the cluster in the offset-sorted list; its size is the distance to
  // the next offset, or to the end of the file for the last entry.
  quint64 clusterSize;
  quint32 nom;
  for( nom = 0; nom < zimHeader.clusterCount; nom++ )
    if( clusterOffsets.at( nom ).second == cluster_nom )
      break;

  if( nom >= zimHeader.clusterCount ) // Invalid cluster nom
    return string();

  if( nom < zimHeader.clusterCount - 1 )
    clusterSize = clusterOffsets.at( nom + 1 ).first - clusterOffsets.at( nom ).first;
  else
    clusterSize = size() - clusterOffsets.at( nom ).first;

  // Read cluster data

  seek( clusterOffsets.at( nom ).first );

  // First byte of a cluster: low 4 bits select the compression type,
  // bit 0x10 selects 8-byte blob offsets (honoured for majorVersion >= 6 only).
  char compressionType, cluster_info;
  if( !getChar( &cluster_info ) )
    return string();

  compressionType = cluster_info & 0x0F;
  blobs_offset_size = cluster_info & 0x10 && zimHeader.majorVersion >= 6 ? 8 : 4;

  string decompressedData;

  QByteArray data = read( clusterSize );

  if( compressionType == Default || compressionType == None )
    decompressedData = string( data.data(), data.size() );
  else
  if( compressionType == Zlib )
    decompressedData = decompressZlib( data.constData(), data.size() );
  else
  if( compressionType == Bzip2 )
    decompressedData = decompressBzip2( data.constData(), data.size() );
  else
  if( compressionType == Lzma2 )
    decompressedData = decompressLzma2( data.constData(), data.size() );
  else
  if( compressionType == Zstd )
    decompressedData = decompressZstd( data.constData(), data.size() );
  else
    return string();

  if( decompressedData.empty() )
    return string();

  // Check BLOBs number in the cluster
  // We cache multi-element clusters only
  // The first entry of the offset table is used to derive the number of blobs.

  quint32 firstOffset32;
  quint64 firstOffset;

  if( blobs_offset_size == 8 )
    memcpy( &firstOffset, decompressedData.data(), sizeof(firstOffset) );
  else
  {
    memcpy( &firstOffset32, decompressedData.data(), sizeof(firstOffset32) );
    firstOffset = firstOffset32;
  }

  quint32 blobCount = ( firstOffset - blobs_offset_size ) / blobs_offset_size;

  if( blobCount > 1 )
  {
    // Fill cache
    // The slot's buffer is reallocated when the bytes it currently holds
    // ('count') are fewer than the new cluster needs.
    int size = decompressedData.size();
    if( cache[ target ].count < size )
    {
      if( cache[ target ].data )
        free( cache[ target ].data );
      cache[ target ].data = ( char * )malloc( size );
      if( cache[ target ].data )
        cache[ target ].size = size;
      else
      {
        // Allocation failed: leave the slot empty, the data is still returned.
        cache[ target ].size = 0;
        cache[ target ].count = 0;
      }
    }
    if( cache[ target ].size )
    {
      memcpy( cache[ target ].data, decompressedData.c_str(), size );
      cache[ target ].count = size;
      cache[ target ].clusterNumber = cluster_nom;
      cache[ target ].blobs_offset_size = blobs_offset_size;
    }
  }

  return decompressedData;
}

/// Follows the redirect chain starting at @p redEntry and returns the MIME
/// type index of the entry it finally points to (0xFFFF when no such entry
/// is reached). The file position is restored before returning.
quint16 ZimFile::redirectedMimeType( RedirectEntry const & redEntry )
{
  RedirectEntry
current_entry = redEntry; quint64 current_pos = pos(); quint16 mimetype = 0xFFFF; for( ; ; ) { quint32 current_nom = current_entry.redirectIndex; seek( zimHeader.urlPtrPos + (quint64)current_nom * 8 ); quint64 new_pos; if( read( reinterpret_cast< char * >( &new_pos ), sizeof(new_pos) ) != sizeof(new_pos) ) break; seek( new_pos ); quint16 new_mimetype; if( read( reinterpret_cast< char * >( &new_mimetype ), sizeof(new_mimetype) ) != sizeof(new_mimetype) ) break; if( new_mimetype == 0xFFFF ) // Redirect to other article { if( read( reinterpret_cast< char * >( ¤t_entry ) + 2, sizeof( current_entry ) - 2 ) != sizeof( current_entry ) - 2 ) break; if( current_nom == current_entry.redirectIndex ) break; } else { mimetype = new_mimetype; break; } } seek( current_pos ); return mimetype; } // Some supporting functions bool indexIsOldOrBad( string const & indexFile ) { File::Class idx( indexFile, "rb" ); IdxHeader header; return idx.readRecords( &header, sizeof( header ), 1 ) != 1 || header.signature != Signature || header.formatVersion != CurrentFormatVersion; } quint32 getArticleCluster( ZimFile & file, quint32 articleNumber ) { while( 1 ) { ZIM_header const & header = file.header(); if( articleNumber >= header.articleCount ) break; file.seek( header.urlPtrPos + (quint64)articleNumber * 8 ); quint64 pos; if( file.read( reinterpret_cast< char * >( &pos ), sizeof(pos) ) != sizeof(pos) ) break; // Read article info quint16 mimetype; file.seek( pos ); if( file.read( reinterpret_cast< char * >( &mimetype ), sizeof(mimetype) ) != sizeof(mimetype) ) break; if( mimetype == 0xFFFF ) // Redirect to other article { RedirectEntry redEntry; if( file.read( reinterpret_cast< char * >( &redEntry ) + 2, sizeof(redEntry) - 2 ) != sizeof(redEntry) - 2 ) break; if( articleNumber == redEntry.redirectIndex ) break; articleNumber = redEntry.redirectIndex; continue; } ArticleEntry artEntry; artEntry.mimetype = mimetype; if( file.read( reinterpret_cast< char * >( &artEntry ) + 2, sizeof(artEntry) - 
2 ) != sizeof(artEntry) - 2 ) break; return artEntry.clusterNumber; } return 0xFFFFFFFF; } quint32 readArticle( ZimFile & file, quint32 articleNumber, string & result, set< quint32 > * loadedArticles = NULL ) { result.clear(); while( 1 ) { ZIM_header const & header = file.header(); if( articleNumber >= header.articleCount ) break; file.seek( header.urlPtrPos + (quint64)articleNumber * 8 ); quint64 pos; if( file.read( reinterpret_cast< char * >( &pos ), sizeof(pos) ) != sizeof(pos) ) break; // Read article info quint16 mimetype; file.seek( pos ); if( file.read( reinterpret_cast< char * >( &mimetype ), sizeof(mimetype) ) != sizeof(mimetype) ) break; if( mimetype == 0xFFFF ) // Redirect to other article { RedirectEntry redEntry; if( file.read( reinterpret_cast< char * >( &redEntry ) + 2, sizeof(redEntry) - 2 ) != sizeof(redEntry) - 2 ) break; if( articleNumber == redEntry.redirectIndex ) break; articleNumber = redEntry.redirectIndex; continue; } if( loadedArticles && loadedArticles->find( articleNumber ) != loadedArticles->end() ) break; ArticleEntry artEntry; artEntry.mimetype = mimetype; if( file.read( reinterpret_cast< char * >( &artEntry ) + 2, sizeof(artEntry) - 2 ) != sizeof(artEntry) - 2 ) break; // Read cluster data unsigned offset_size = 0; string decompressedData = file.getClusterData( artEntry.clusterNumber, offset_size ); if( decompressedData.empty() ) break; // Take article data from cluster quint32 firstOffset32; quint64 firstOffset; if( offset_size == 8 ) memcpy( &firstOffset, decompressedData.data(), sizeof(firstOffset) ); else { memcpy( &firstOffset32, decompressedData.data(), sizeof(firstOffset32) ); firstOffset = firstOffset32; } quint32 blobCount = ( firstOffset - offset_size ) / offset_size; if( artEntry.blobNumber > blobCount ) break; quint32 size; if( offset_size == 8 ) { quint64 offsets[ 2 ]; memcpy( offsets, decompressedData.data() + artEntry.blobNumber * 8, sizeof(offsets) ); size = offsets[ 1 ] - offsets[ 0 ]; result.append( 
decompressedData, offsets[ 0 ], size ); } else { quint32 offsets[ 2 ]; memcpy( offsets, decompressedData.data() + artEntry.blobNumber * 4, sizeof(offsets) ); size = offsets[ 1 ] - offsets[ 0 ]; result.append( decompressedData, offsets[ 0 ], size ); } return articleNumber; } return 0xFFFFFFFF; } // ZimDictionary class ZimDictionary: public BtreeIndexing::BtreeDictionary { enum LINKS_TYPE { UNKNOWN, SLASH, NO_SLASH }; Mutex idxMutex; Mutex zimMutex, idxResourceMutex; File::Class idx; BtreeIndex resourceIndex; IdxHeader idxHeader; string dictionaryName; ZimFile df; set< quint32 > articlesIndexedForFTS; LINKS_TYPE linksType; public: ZimDictionary( string const & id, string const & indexFile, vector< string > const & dictionaryFiles ); ~ZimDictionary(); string getName() noexcept override { return dictionaryName; } map< Dictionary::Property, string > getProperties() noexcept override { return map< Dictionary::Property, string >(); } unsigned long getArticleCount() noexcept override { return idxHeader.articleCount; } unsigned long getWordCount() noexcept override { return idxHeader.wordCount; } inline quint32 getLangFrom() const override { return idxHeader.langFrom; } inline quint32 getLangTo() const override { return idxHeader.langTo; } sptr< Dictionary::DataRequest > getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override ; sptr< Dictionary::DataRequest > getResource( string const & name ) override ; QString const& getDescription() override; /// Loads the resource. 
void loadResource( std::string &resourceName, string & data ); sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString, int searchMode, bool matchCase, int distanceBetweenWords, int maxResults, bool ignoreWordsOrder, bool ignoreDiacritics ) override; void getArticleText( uint32_t articleAddress, QString & headword, QString & text ) override; quint32 getArticleText( uint32_t articleAddress, QString & headword, QString & text, set< quint32 > * loadedArticles ); void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration ) override; void setFTSParameters( Config::FullTextSearch const & fts ) override { can_FTS = fts.enabled && !fts.disabledTypes.contains( "ZIM", Qt::CaseInsensitive ) && ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize ); } void sortArticlesOffsetsForFTS( QVector< uint32_t > & offsets, QAtomicInt & isCancelled ) override; protected: void loadIcon() noexcept override; private: /// Loads the article. quint32 loadArticle( quint32 address, string & articleText, set< quint32 > * loadedArticles, bool rawText = false ); string convert( string const & in_data ); friend class ZimArticleRequest; friend class ZimResourceRequest; }; ZimDictionary::ZimDictionary( string const & id, string const & indexFile, vector< string > const & dictionaryFiles ): BtreeDictionary( id, dictionaryFiles ), idx( indexFile, "rb" ), idxHeader( idx.read< IdxHeader >() ), df( FsEncoding::decode( dictionaryFiles[ 0 ].c_str() ) ), linksType( UNKNOWN ) { // Open data file df.open(); // Initialize the indexes openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, idxHeader.indexRootOffset ), idx, idxMutex ); resourceIndex.openIndex( IndexInfo( idxHeader.resourceIndexBtreeMaxElements, idxHeader.resourceIndexRootOffset ), idx, idxResourceMutex ); // Read dictionary name if( idxHeader.namePtr == 0xFFFFFFFF ) { QString name = QDir::fromNativeSeparators( FsEncoding::decode( dictionaryFiles[ 0 ].c_str() ) ); int n = name.lastIndexOf( '/' ); 
dictionaryName = name.mid( n + 1 ).toStdString(); } else { readArticle( df, idxHeader.namePtr, dictionaryName ); } // Full-text search parameters can_FTS = true; ftsIdxName = indexFile + Dictionary::getFtsSuffix(); if( !Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName ) && !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) FTS_index_completed.ref(); } ZimDictionary::~ZimDictionary() { df.close(); } void ZimDictionary::loadIcon() noexcept { if ( dictionaryIconLoaded ) return; QString fileName = QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) ); // Remove the extension fileName.chop( 3 ); if( !loadIconFromFile( fileName ) ) { // Load failed -- use default icons dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/icon32_zim.png"); } dictionaryIconLoaded = true; } quint32 ZimDictionary::loadArticle( quint32 address, string & articleText, set< quint32 > * loadedArticles, bool rawText ) { quint32 ret; { Mutex::Lock _( zimMutex ); ret = readArticle( df, address, articleText, loadedArticles ); } if( !rawText ) articleText = convert( articleText ); return ret; } string ZimDictionary::convert( const string & in ) { QString text = QString::fromUtf8( in.c_str() ); // replace background text.replace( QRegularExpression( R"(<\s*body\s+([^>]*)(background(|-color)):([^;"]*(;|)))" ), QString( "]*)src=(\")([^\"]*)\\3" ), // QString( "<\\1 \\2src=\\3bres://%1/").arg( getId().c_str() ) ); QRegularExpression rxImgScript( R"(<\s*(img|script)\s+([^>]*)src=(")([^"]*)\3)" ); QRegularExpressionMatchIterator it = rxImgScript.globalMatch( text ); int pos = 0; QString newText; while( it.hasNext() ) { QRegularExpressionMatch match = it.next(); newText += text.mid( pos, match.capturedStart() - pos ); pos = match.capturedEnd(); QStringList list = match.capturedTexts(); QString url = list[ 4 ]; // a url QString urlLink = match.captured(); QString replacedLink = urlLink; if( !url.isEmpty() && !url.startsWith( "//" ) && !url.startsWith( 
"http://" ) && !url.startsWith( "https://" ) ) { //<\\1 \\2src=\\3bres://%1/ url.remove(QRegularExpression(R"(^\.*\/[A-Z]\/)", QRegularExpression::CaseInsensitiveOption)); replacedLink = QString( "<%1 %2 src=\"bres://%3/%4\"" ).arg( list[ 1 ], list[ 2 ], QString::fromStdString( getId() ), url ); } newText += replacedLink; } if( pos ) { newText += text.mid( pos ); text = newText; } newText.clear(); // Fix links without '"' text.replace( QRegularExpression( R"(href=(\.\.|)/([^\s>]+))" ), QString( R"(href="\1/\2")" ) ); // pattern text.replace( QRegularExpression( R"(<\s*link\s+([^>]*)href="(\.\.|)/)" ), QString( " series links // excluding those keywords that have ":" in it QString urlWiki = "\"http(s|)://en\\.(wiki(pedia|books|news|quote|source|voyage|versity)|wiktionary)\\.(org|com)/wiki/([^:\"]*)\""; text.replace( QRegularExpression( R"(<\s*a\s+(class="external"\s+|)href=)" + urlWiki ), QString( R"(, excluding any known protocols such as http://, mailto:, #(comment) // these links will be translated into local definitions // QRegularExpression rxLink( R"lit(<\s*(?:a|meta)\s+([^>]*)(?:href|url)="?(?!(?:\w+://|#|mailto:|tel:))()([^"]*)"\s*(title="[^"]*")?[^>]*>)lit" ); it = rxLink.globalMatch( text ); pos = 0; while( it.hasNext() ) { QRegularExpressionMatch match = it.next(); newText += text.mid( pos, match.capturedStart() - pos ); pos = match.capturedEnd(); QStringList list = match.capturedTexts(); // Add empty strings for compatibility with QRegExp behaviour for( int i = list.size(); i < 5; i++ ) list.append( QString() ); QString formatTag; QString tag = list[ 3 ]; // a url, ex: Precambrian_Chaotian.html QString url = tag; if( !list[ 4 ].isEmpty() ) // a title, ex: title="Precambrian/Chaotian" { tag = list[ 4 ]; formatTag = tag.split( "\"" )[ 1 ]; } else { //tag from list[3] formatTag = tag; formatTag.replace( RX::Zim::linkSpecialChar, "" ); } QString urlLink = match.captured(); QString replacedLink = urlLink; if( !url.isEmpty() && !url.startsWith( "//" ) ) { 
replacedLink = urlLink.replace( url, "gdlookup://localhost/" + formatTag ); } newText += replacedLink; } if( pos ) { newText += text.mid( pos ); text = newText; } newText.clear(); // Occasionally words needs to be displayed in vertical, but
were changed to somewhere // proper style: N
e
o
p
t
e
r
a QRegularExpression rxBR( R"((]*>)\s*((\w\s*<br(\\|/|)>\s*)+\w)\s*)", QRegularExpression::UseUnicodePropertiesOption ); pos = 0; QRegularExpressionMatchIterator it2 = rxBR.globalMatch( text ); while( it2.hasNext() ) { QRegularExpressionMatch match = it2.next(); newText += text.mid( pos, match.capturedStart() - pos ); pos = match.capturedEnd(); QStringList list = match.capturedTexts(); // Add empty strings for compatibility with QRegExp behaviour for( int i = match.lastCapturedIndex() + 1; i < 3; i++ ) list.append( QString() ); QString tag = list[2]; tag.replace( QRegularExpression( "<br( |)(\\\\|/|)>", QRegularExpression::PatternOption::CaseInsensitiveOption ) , "
" ). prepend( list[1] ). append( "" ); newText += tag; } if( pos ) { newText += text.mid( pos ); text = newText; } newText.clear(); // // output all links in the page - only for analysis // QRegExp rxPrintAllLinks( "<\\s*a\\s+[^>]*href=\"[^\"]*\"[^>]*>", // Qt::CaseSensitive, // QRegExp::RegExp2 ); // pos = 0; // while( (pos = rxPrintAllLinks.indexIn( text, pos )) >= 0 ) // { // QStringList list = rxPrintAllLinks.capturedTexts(); // qDebug() << "\n--Alllinks--" << list[0]; // pos += list[0].length() + 1; // } // Fix outstanding elements text += "
"; return text.toUtf8().data(); } void ZimDictionary::loadResource( std::string & resourceName, string & data ) { vector< WordArticleLink > link; string resData; link = resourceIndex.findArticles( Utf8::decode( resourceName ) ); if( link.empty() ) return; { Mutex::Lock _( zimMutex ); readArticle( df, link[ 0 ].articleOffset, data ); } } QString const& ZimDictionary::getDescription() { if( !dictionaryDescription.isEmpty() || idxHeader.descriptionPtr == 0xFFFFFFFF ) return dictionaryDescription; string str; { Mutex::Lock _( zimMutex ); readArticle( df, idxHeader.descriptionPtr, str ); } if( !str.empty() ) dictionaryDescription = QString::fromUtf8( str.c_str(), str.size() ); return dictionaryDescription; } void ZimDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration ) { if( !( Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName ) || FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) ) FTS_index_completed.ref(); if( haveFTSIndex() ) return; if( ensureInitDone().size() ) return; if( firstIteration ) return; gdDebug( "Zim: Building the full-text index for dictionary: %s\n", getName().c_str() ); try { #ifdef USE_XAPIAN return FtsHelpers::makeFTSIndexXapian(this,isCancelled); #endif Mutex::Lock _( getFtsMutex() ); File::Class ftsIdx( ftsIndexName(), "wb" ); FtsHelpers::FtsIdxHeader ftsIdxHeader; memset( &ftsIdxHeader, 0, sizeof( ftsIdxHeader ) ); // We write a dummy header first. At the end of the process the header // will be rewritten with the right values. 
ftsIdx.write( ftsIdxHeader ); ChunkedStorage::Writer chunks( ftsIdx ); BtreeIndexing::IndexedWords indexedWords; QSet< uint32_t > setOfOffsets; setOfOffsets.reserve( getWordCount() ); findArticleLinks( 0, &setOfOffsets, 0, &isCancelled ); if( Utils::AtomicInt::loadAcquire( isCancelled ) ) throw exUserAbort(); // We should sort articles order by cluster number // to effective use clusters data caching QVector< QPair< quint32, uint32_t > > offsetsWithClusters; offsetsWithClusters.reserve( setOfOffsets.size() ); for( QSet< uint32_t >::ConstIterator it = setOfOffsets.constBegin(); it != setOfOffsets.constEnd(); ++it ) { if( Utils::AtomicInt::loadAcquire( isCancelled ) ) throw exUserAbort(); Mutex::Lock _( zimMutex ); offsetsWithClusters.append( QPair< uint32_t, quint32 >( getArticleCluster( df, *it ), *it ) ); } // Free memory setOfOffsets.clear(); if( Utils::AtomicInt::loadAcquire( isCancelled ) ) throw exUserAbort(); std::sort( offsetsWithClusters.begin(), offsetsWithClusters.end() ); QVector< uint32_t > offsets; offsets.resize( offsetsWithClusters.size() ); for( int i = 0; i < offsetsWithClusters.size(); i++ ) offsets[ i ] = offsetsWithClusters.at( i ).second; // Free memory offsetsWithClusters.clear(); if( Utils::AtomicInt::loadAcquire( isCancelled ) ) throw exUserAbort(); QMap< QString, QVector< uint32_t > > ftsWords; set< quint32 > indexedArticles; quint32 articleNumber; // index articles for full-text search for( int i = 0; i < offsets.size(); i++ ) { if( Utils::AtomicInt::loadAcquire( isCancelled ) ) throw exUserAbort(); QString headword, articleStr; articleNumber = getArticleText( offsets.at( i ), headword, articleStr, &indexedArticles ); if( articleNumber == 0xFFFFFFFF ) continue; indexedArticles.insert( articleNumber ); FtsHelpers::parseArticleForFts( offsets.at( i ), articleStr, ftsWords ); setIndexedFtsDoc(i); } // Free memory offsets.clear(); QMap< QString, QVector< uint32_t > >::iterator it = ftsWords.begin(); while( it != ftsWords.end() ) { if( 
Utils::AtomicInt::loadAcquire( isCancelled ) ) throw exUserAbort(); uint32_t offset = chunks.startNewBlock(); uint32_t size = it.value().size(); chunks.addToBlock( &size, sizeof(uint32_t) ); chunks.addToBlock( it.value().data(), size * sizeof(uint32_t) ); indexedWords.addSingleWord( gd::toWString( it.key() ), offset ); it = ftsWords.erase( it ); } // Free memory ftsWords.clear(); if( Utils::AtomicInt::loadAcquire( isCancelled ) ) throw exUserAbort(); ftsIdxHeader.chunksOffset = chunks.finish(); ftsIdxHeader.wordCount = indexedWords.size(); if( Utils::AtomicInt::loadAcquire( isCancelled ) ) throw exUserAbort(); BtreeIndexing::IndexInfo ftsIdxInfo = BtreeIndexing::buildIndex( indexedWords, ftsIdx ); // Free memory indexedWords.clear(); ftsIdxHeader.indexBtreeMaxElements = ftsIdxInfo.btreeMaxElements; ftsIdxHeader.indexRootOffset = ftsIdxInfo.rootOffset; ftsIdxHeader.signature = FtsHelpers::FtsSignature; ftsIdxHeader.formatVersion = FtsHelpers::CurrentFtsFormatVersion + getFtsIndexVersion(); ftsIdx.rewind(); ftsIdx.writeRecords( &ftsIdxHeader, sizeof(ftsIdxHeader), 1 ); FTS_index_completed.ref(); } catch( std::exception &ex ) { gdWarning( "Zim: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() ); QFile::remove( FsEncoding::decode( ftsIdxName.c_str() ) ); } } void ZimDictionary::sortArticlesOffsetsForFTS( QVector< uint32_t > & offsets, QAtomicInt & isCancelled ) { QVector< QPair< quint32, uint32_t > > offsetsWithClusters; offsetsWithClusters.reserve( offsets.size() ); for( QVector< uint32_t >::ConstIterator it = offsets.constBegin(); it != offsets.constEnd(); ++it ) { if( Utils::AtomicInt::loadAcquire( isCancelled ) ) return; Mutex::Lock _( zimMutex ); offsetsWithClusters.append( QPair< uint32_t, quint32 >( getArticleCluster( df, *it ), *it ) ); } std::sort( offsetsWithClusters.begin(), offsetsWithClusters.end() ); for( int i = 0; i < offsetsWithClusters.size(); i++ ) offsets[ i ] = offsetsWithClusters.at( i ).second; } void 
ZimDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text ) { try { headword.clear(); string articleText; loadArticle( articleAddress, articleText, 0, true ); text = Html::unescape( QString::fromUtf8( articleText.data(), articleText.size() ) ); } catch( std::exception &ex ) { gdWarning( "Zim: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() ); } } quint32 ZimDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text, set< quint32 > * loadedArticles ) { quint32 articleNumber = 0xFFFFFFFF; try { headword.clear(); string articleText; articleNumber = loadArticle( articleAddress, articleText, loadedArticles, true ); text = Html::unescape( QString::fromUtf8( articleText.data(), articleText.size() ) ); } catch( std::exception &ex ) { gdWarning( "Zim: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() ); } return articleNumber; } sptr< Dictionary::DataRequest > ZimDictionary::getSearchResults( QString const & searchString, int searchMode, bool matchCase, int distanceBetweenWords, int maxResults, bool ignoreWordsOrder, bool ignoreDiacritics ) { return std::make_shared( *this, searchString,searchMode, matchCase, distanceBetweenWords, maxResults, ignoreWordsOrder, ignoreDiacritics ); } /// ZimDictionary::getArticle() class ZimArticleRequest: public Dictionary::DataRequest { wstring word; vector< wstring > alts; ZimDictionary & dict; bool ignoreDiacritics; QAtomicInt isCancelled; QSemaphore hasExited; QFuture< void > f; public: ZimArticleRequest( wstring const & word_, vector< wstring > const & alts_, ZimDictionary & dict_, bool ignoreDiacritics_ ): word( word_ ), alts( alts_ ), dict( dict_ ), ignoreDiacritics( ignoreDiacritics_ ) { f = QtConcurrent::run( [ this ]() { this->run(); } ); // QThreadPool::globalInstance()->start( [ this ]() { this->run(); } ); } void run(); void cancel() override { isCancelled.ref(); } ~ZimArticleRequest() { 
isCancelled.ref(); f.waitForFinished(); //hasExited.acquire(); } }; void ZimArticleRequest::run() { if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) { finish(); return; } vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics ); for( unsigned x = 0; x < alts.size(); ++x ) { /// Make an additional query for each alt vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics ); chain.insert( chain.end(), altChain.begin(), altChain.end() ); } multimap< wstring, pair< string, string > > mainArticles, alternateArticles; set< quint32 > articlesIncluded; // Some synonims make it that the articles // appear several times. We combat this // by only allowing them to appear once. wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); if( ignoreDiacritics ) wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); for( unsigned x = 0; x < chain.size(); ++x ) { if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) { finish(); return; } // Now grab that article string headword, articleText; headword = chain[ x ].word; quint32 articleNumber = 0xFFFFFFFF; try { articleNumber = dict.loadArticle( chain[ x ].articleOffset, articleText, &articlesIncluded ); } catch(...) { } if( articleNumber == 0xFFFFFFFF ) continue; // No article loaded if ( articlesIncluded.find( articleNumber ) != articlesIncluded.end() ) continue; // We already have this article in the body. // Ok. Now, does it go to main articles, or to alternate ones? We list // main ones first, and alternates after. // We do the case-folded comparison here. wstring headwordStripped = Folding::applySimpleCaseOnly( Utf8::decode( headword ) ); if( ignoreDiacritics ) headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); multimap< wstring, pair< string, string > > & mapToUse = ( wordCaseFolded == headwordStripped ) ? 
mainArticles : alternateArticles; mapToUse.insert( pair< wstring, pair< string, string > >( Folding::applySimpleCaseOnly( Utf8::decode( headword ) ), pair< string, string >( headword, articleText ) ) ); articlesIncluded.insert( articleNumber ); } if ( mainArticles.empty() && alternateArticles.empty() ) { // No such word finish(); return; } string result; // See Issue #271: A mechanism to clean-up invalid HTML cards. string cleaner = """""""""""" """""""""""" "" "" ""; multimap< wstring, pair< string, string > >::const_iterator i; for( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { result += "

"; result += "

"; result += i->second.first; result += "

"; result += i->second.second; result += cleaner + "

"; } for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i ) { result += "

"; result += "

"; result += i->second.first; result += "

"; result += i->second.second; result += cleaner + "

"; } Mutex::Lock _( dataMutex ); data.resize( result.size() ); memcpy( &data.front(), result.data(), result.size() ); hasAnyData = true; finish(); } sptr< Dictionary::DataRequest > ZimDictionary::getArticle( wstring const & word, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) { return std::make_shared( word, alts, *this, ignoreDiacritics ); } //// ZimDictionary::getResource() class ZimResourceRequest: public Dictionary::DataRequest { ZimDictionary & dict; string resourceName; QAtomicInt isCancelled; QSemaphore hasExited; QFuture< void > f; public: ZimResourceRequest(ZimDictionary &dict_, string const &resourceName_) : dict(dict_), resourceName(resourceName_) { f = QtConcurrent::run( [ this ]() { this->run(); } ); // QThreadPool::globalInstance()->start( [ this ]() { this->run(); } ); } void run(); void cancel() override { isCancelled.ref(); } ~ZimResourceRequest() { isCancelled.ref(); f.waitForFinished(); //hasExited.acquire(); } }; void ZimResourceRequest::run() { // Some runnables linger enough that they are cancelled before they start if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) { finish(); return; } try { string resource; dict.loadResource( resourceName, resource ); if( resource.empty() ) throw File::Ex(); if( Filetype::isNameOfCSS( resourceName ) ) { QString css = QString::fromUtf8( resource.data(), resource.size() ); dict.isolateCSS( css, ".zimdict" ); QByteArray bytes = css.toUtf8(); Mutex::Lock _( dataMutex ); data.resize( bytes.size() ); memcpy( &data.front(), bytes.constData(), bytes.size() ); } else if ( Filetype::isNameOfTiff( resourceName ) ) { // Convert it Mutex::Lock _( dataMutex ); GdTiff::tiff2img( data ); } else { Mutex::Lock _( dataMutex ); data.resize( resource.size() ); memcpy( &data.front(), resource.data(), data.size() ); } Mutex::Lock _( dataMutex ); hasAnyData = true; } catch( std::exception &ex ) { gdWarning( "ZIM: Failed loading resource \"%s\" from \"%s\", reason: %s\n", resourceName.c_str(), 
dict.getName().c_str(), ex.what() ); // Resource not loaded -- we don't set the hasAnyData flag then } finish(); } sptr< Dictionary::DataRequest > ZimDictionary::getResource( string const & name ) { auto formatedName = QString::fromStdString(name).remove(QRegularExpression(R"(^\.*\/[A-Z]\/)", QRegularExpression::CaseInsensitiveOption)); return std::make_shared( *this, formatedName.toStdString() ); } //} // anonymous namespace vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, string const & indicesDir, Dictionary::Initializing & initializing, unsigned maxHeadwordsToExpand ) { vector< sptr< Dictionary::Class > > dictionaries; for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end(); ++i ) { // Skip files with the extensions different to .zim to speed up the // scanning QString firstName = QDir::fromNativeSeparators( FsEncoding::decode( i->c_str() ) ); if( !firstName.endsWith( ".zim") && !firstName.endsWith( ".zimaa" ) ) continue; // Got the file -- check if we need to rebuid the index ZimFile df( firstName ); vector< string > dictFiles; df.getFilenames( dictFiles ); string dictId = Dictionary::makeDictionaryId( dictFiles ); string indexFile = indicesDir + dictId; try { if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) || indexIsOldOrBad( indexFile ) ) { gdDebug( "Zim: Building the index for dictionary: %s\n", i->c_str() ); unsigned articleCount = 0; unsigned wordCount = 0; df.open(); ZIM_header const & zh = df.header(); if( zh.magicNumber != 0x44D495A ) throw exNotZimFile( i->c_str() ); if( zh.mimeListPos != sizeof( ZIM_header ) ) throw exInvalidZimHeader( i->c_str() ); bool new_namespaces = ( zh.majorVersion >= 6 && zh.minorVersion >= 1 ); { int n = firstName.lastIndexOf( '/' ); initializing.indexingDictionary( firstName.mid( n + 1 ).toUtf8().constData() ); } File::Class idx( indexFile, "wb" ); IdxHeader idxHeader; memset( &idxHeader, 0, sizeof( idxHeader ) ); idxHeader.namePtr = 0xFFFFFFFF; 
idxHeader.descriptionPtr = 0xFFFFFFFF; // We write a dummy header first. At the end of the process the header // will be rewritten with the right values. idx.write( idxHeader ); IndexedWords indexedWords, indexedResources; QByteArray artEntries; df.seek( zh.urlPtrPos ); artEntries = df.read( (quint64)zh.articleCount * 8 ); QVector< quint64 > clusters; clusters.reserve( zh.clusterCount ); df.seek( zh.clusterPtrPos ); { QByteArray data = df.read( (quint64)zh.clusterCount * 8 ); for( unsigned n = 0; n < zh.clusterCount; n++ ) clusters.append( *( reinterpret_cast< const quint64 * >( data.constData() ) + n ) ); } const quint64 * ptr; quint16 mimetype, redirected_mime = 0xFFFF; ArticleEntry artEntry; RedirectEntry redEntry; string url, title; char nameSpace; for( unsigned n = 0; n < zh.articleCount; n++ ) { ptr = reinterpret_cast< const quint64 * >( artEntries.constData() ) + n; df.seek( *ptr ); df.read( reinterpret_cast< char * >( &mimetype ), sizeof(mimetype) ); if( mimetype == 0xFFFF ) { redEntry.mimetype = mimetype; qint64 ret = df.read( reinterpret_cast< char * >( &redEntry ) + 2, sizeof(RedirectEntry) - 2 ); if( ret != sizeof(RedirectEntry) - 2 ) throw exCantReadFile( i->c_str() ); redirected_mime = df.redirectedMimeType( redEntry ); nameSpace = redEntry.nameSpace; } else { artEntry.mimetype = mimetype; qint64 ret = df.read( reinterpret_cast< char * >( &artEntry ) + 2, sizeof(ArticleEntry) - 2 ); if( ret != sizeof(ArticleEntry) - 2 ) throw exCantReadFile( i->c_str() ); nameSpace = artEntry.nameSpace; if( ( nameSpace == 'A' || ( nameSpace == 'C' && new_namespaces ) ) && df.isArticleMime( mimetype ) ) articleCount++; } // Read article url and title char ch; url.clear(); while( df.getChar( &ch ) ) { if( ch == 0 ) break; url.push_back( ch ); } title.clear(); while( df.getChar( &ch ) ) { if( ch == 0 ) break; title.push_back( ch ); } if( nameSpace == 'A' || ( nameSpace == 'C' && new_namespaces && ( df.isArticleMime( mimetype ) || ( mimetype == 0xFFFF && df.isArticleMime( 
redirected_mime ) ) ) ) ) { wstring word; if( df.isArticleMime( mimetype ) || ( mimetype == 0xFFFF && df.isArticleMime( redirected_mime ) ) ) { if( maxHeadwordsToExpand && zh.articleCount >= maxHeadwordsToExpand ) { if( !title.empty() ) { word = Utf8::decode( title ); indexedWords.addSingleWord( word, n ); } if( !url.empty() ) { auto formatedUrl = QString::fromStdString( url ).replace( RX::Zim::linkSpecialChar, "" ); indexedWords.addSingleWord( Utf8::decode( formatedUrl.toStdString() ), n ); } } else { if( !title.empty() ) { word = Utf8::decode( title ); indexedWords.addWord( word, n ); } if( !url.empty() ) { auto formatedUrl = QString::fromStdString( url ).replace( RX::Zim::linkSpecialChar, "" ); indexedWords.addWord( Utf8::decode( formatedUrl.toStdString() ), n ); } } wordCount++; } else { // url.insert( url.begin(), '/' ); // url.insert( url.begin(), nameSpace ); // auto formatedUrl = QString::fromStdString(url).replace(RX::Zim::linkSpecialChar," "); indexedResources.addSingleWord( Utf8::decode( url ), n ); } } else if( nameSpace == 'M' ) { if( url.compare( "Title" ) == 0 ) { idxHeader.namePtr = n; string name; readArticle( df, n, name ); initializing.indexingDictionary( name ); } else if( url.compare( "Description" ) == 0 ) idxHeader.descriptionPtr = n; else if( url.compare( "Language" ) == 0 ) { string lang; readArticle( df, n, lang ); if( lang.size() == 2 ) idxHeader.langFrom = LangCoder::code2toInt( lang.c_str() ); else if( lang.size() == 3 ) idxHeader.langFrom = LangCoder::findIdForLanguageCode3( lang.c_str() ); idxHeader.langTo = idxHeader.langFrom; } } else if( nameSpace == 'X' || nameSpace=='V' || nameSpace=='U'|| nameSpace=='W' ) { continue; } else { // auto formatedUrl = QString::fromStdString(url).replace(RX::Zim::linkSpecialChar," "); indexedResources.addSingleWord( Utf8::decode( url ), n ); } } // Build index { IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx ); idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements; 
idxHeader.indexRootOffset = idxInfo.rootOffset; indexedWords.clear(); // Release memory -- no need for this data } { IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedResources, idx ); idxHeader.resourceIndexBtreeMaxElements = idxInfo.btreeMaxElements; idxHeader.resourceIndexRootOffset = idxInfo.rootOffset; indexedResources.clear(); // Release memory -- no need for this data } idxHeader.signature = Signature; idxHeader.formatVersion = CurrentFormatVersion; idxHeader.articleCount = articleCount; idxHeader.wordCount = wordCount; idx.rewind(); idx.write( &idxHeader, sizeof( idxHeader ) ); } dictionaries.push_back( std::make_shared( dictId, indexFile, dictFiles ) ); } catch( std::exception & e ) { gdWarning( "Zim dictionary initializing failed: %s, error: %s\n", i->c_str(), e.what() ); continue; } catch( ... ) { qWarning( "Zim dictionary initializing failed\n" ); continue; } } return dictionaries; } } // namespace Zim #endif