/* This file is (c) 2008-2012 Konstantin Isakov * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ #include "stardict.hh" #include "btreeidx.hh" #include "folding.hh" #include "utf8.hh" #include "chunkedstorage.hh" #include "dictzip.h" #include "xdxf2html.hh" #include "htmlescape.hh" #include "langcoder.hh" #include "gddebug.hh" #include "fsencoding.hh" #include "filetype.hh" #include "indexedzip.hh" #include "tiff.hh" #include "ftshelpers.hh" #include "wstring_qt.hh" #include "audiolink.hh" #include #include #include #include // msvc defines _WIN32 https://learn.microsoft.com/en-us/cpp/preprocessor/predefined-macros?view=msvc-170 // gcc also defines __WIN32, _WIN32, __WIN32__ // todo: unify how windows are detected on headers #ifndef _WIN32 #include #else #include #endif #include #ifdef _MSC_VER #include #endif #include #include #include #include #include #if (QT_VERSION >= QT_VERSION_CHECK(6,0,0)) #include #else #include #endif #include #include #include #include "ufile.hh" #include "utils.hh" #include namespace Stardict { using std::map; using std::multimap; using std::pair; using std::set; using std::string; using gd::wstring; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; using BtreeIndexing::IndexInfo; namespace { DEF_EX( exNotAnIfoFile, "Not an .ifo file", Dictionary::Ex ) DEF_EX_STR( exBadFieldInIfo, "Bad field in .ifo file encountered:", Dictionary::Ex ) DEF_EX_STR( exNoIdxFile, "No corresponding .idx file was found for", Dictionary::Ex ) DEF_EX_STR( exNoDictFile, "No corresponding .dict file was found for", Dictionary::Ex ) DEF_EX_STR( exNoSynFile, "No corresponding .syn file was found for", Dictionary::Ex ) DEF_EX( ex64BitsNotSupported, "64-bit indices are not presently supported, sorry", Dictionary::Ex ) DEF_EX( exDicttypeNotSupported, "Dictionaries with dicttypes are not supported, sorry", Dictionary::Ex ) DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex ) DEF_EX_STR( exWordIsTooLarge, "Enountered a word that is too large:", Dictionary::Ex ) DEF_EX_STR( exSuddenEndOfFile, "Sudden end of file", Dictionary::Ex ) DEF_EX_STR( exDictzipError, "DICTZIP error", Dictionary::Ex ) DEF_EX_STR( exIncorrectOffset, "Incorrect offset encountered in file", Dictionary::Ex ) /// Contents of an ifo file struct Ifo { string version; string bookname; uint32_t wordcount, synwordcount, idxfilesize, idxoffsetbits; string sametypesequence, dicttype, description; string copyright, author, email, website, date; Ifo( File::Class & ); }; enum { Signature = 0x58444953, // SIDX on little-endian, XDIS on big-endian CurrentFormatVersion = 9 + BtreeIndexing::FormatVersion + Folding::Version }; struct IdxHeader { uint32_t signature; // First comes the signature, SIDX uint32_t formatVersion; // File format version (CurrentFormatVersion) uint32_t chunksOffset; // The offset to chunks' storage uint32_t indexBtreeMaxElements; // Two fields from IndexInfo uint32_t indexRootOffset; uint32_t wordCount; // Saved from Ifo::wordcount uint32_t synWordCount; // Saved from Ifo::synwordcount uint32_t bookNameSize; // Book name's length. Used to read it then. uint32_t sameTypeSequenceSize; // That string's size. Used to read it then. uint32_t langFrom; // Source language uint32_t langTo; // Target language uint32_t hasZipFile; // Non-zero means there's a zip file with resources present uint32_t zipIndexBtreeMaxElements; // Two fields from IndexInfo of the zip // resource index. uint32_t zipIndexRootOffset; } #ifndef _MSC_VER __attribute__((packed)) #endif ; bool indexIsOldOrBad( string const & indexFile ) { File::Class idx( indexFile, "rb" ); IdxHeader header; return idx.readRecords( &header, sizeof( header ), 1 ) != 1 || header.signature != Signature || header.formatVersion != CurrentFormatVersion; } class StardictDictionary: public BtreeIndexing::BtreeDictionary { Mutex idxMutex; File::Class idx; IdxHeader idxHeader; string bookName; string sameTypeSequence; ChunkedStorage::Reader chunks; Mutex dzMutex; dictData * dz; Mutex resourceZipMutex; IndexedZip resourceZip; public: StardictDictionary( string const & id, string const & indexFile, vector< string > const & dictionaryFiles ); ~StardictDictionary(); virtual string getName() noexcept { return bookName; } virtual map< Dictionary::Property, string > getProperties() noexcept { return map< Dictionary::Property, string >(); } virtual unsigned long getArticleCount() noexcept { return idxHeader.wordCount; } virtual unsigned long getWordCount() noexcept { return idxHeader.wordCount + idxHeader.synWordCount; } inline virtual quint32 getLangFrom() const { return idxHeader.langFrom; } inline virtual quint32 getLangTo() const { return idxHeader.langTo; } virtual sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & ) ; virtual sptr< Dictionary::DataRequest > getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) ; virtual sptr< Dictionary::DataRequest > getResource( string const & name ) ; virtual QString const& getDescription(); virtual QString getMainFilename(); virtual sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString, int searchMode, bool matchCase, int distanceBetweenWords, int maxResults, bool ignoreWordsOrder, bool ignoreDiacritics ); virtual void getArticleText( uint32_t articleAddress, QString & headword, QString & text ); virtual void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration ); virtual void setFTSParameters( Config::FullTextSearch const & fts ) { can_FTS = fts.enabled && !fts.disabledTypes.contains( "STARDICT", Qt::CaseInsensitive ) && ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize ); } protected: void loadIcon() noexcept; private: /// Retrieves the article's offset/size in .dict file, and its headword. void getArticleProps( uint32_t articleAddress, string & headword, uint32_t & offset, uint32_t & size ); /// Loads the article, storing its headword and formatting the data it has /// into an html. void loadArticle( uint32_t address, string & headword, string & articleText ); string loadString( size_t size ); string handleResource( char type, char const * resource, size_t size ); void pangoToHtml( QString & text ); friend class StardictResourceRequest; friend class StardictArticleRequest; friend class StardictHeadwordsRequest; }; StardictDictionary::StardictDictionary( string const & id, string const & indexFile, vector< string > const & dictionaryFiles ): BtreeDictionary( id, dictionaryFiles ), idx( indexFile, "rb" ), idxHeader( idx.read< IdxHeader >() ), bookName( loadString( idxHeader.bookNameSize ) ), sameTypeSequence( loadString( idxHeader.sameTypeSequenceSize ) ), chunks( idx, idxHeader.chunksOffset ) { // Open the .dict file DZ_ERRORS error; dz = dict_data_open( dictionaryFiles[ 2 ].c_str(), &error, 0 ); if ( !dz ) throw exDictzipError( string( dz_error_str( error ) ) + "(" + dictionaryFiles[ 2 ] + ")" ); // Initialize the index openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, idxHeader.indexRootOffset ), idx, idxMutex ); // Open a resource zip file, if there's one if ( idxHeader.hasZipFile && ( idxHeader.zipIndexBtreeMaxElements || idxHeader.zipIndexRootOffset ) ) { resourceZip.openIndex( IndexInfo( idxHeader.zipIndexBtreeMaxElements, idxHeader.zipIndexRootOffset ), idx, idxMutex ); QString zipName = QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames().back().c_str() ) ); if ( zipName.endsWith( ".zip", Qt::CaseInsensitive ) ) // Sanity check resourceZip.openZipFile( zipName ); } // Full-text search parameters can_FTS = true; ftsIdxName = indexFile + Dictionary::getFtsSuffix(); if( !Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName ) && !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) FTS_index_completed.ref(); } StardictDictionary::~StardictDictionary() { if ( dz ) dict_data_close( dz ); } void StardictDictionary::loadIcon() noexcept { if ( dictionaryIconLoaded ) return; QString fileName = QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) ); // Remove the extension fileName.chop( 3 ); if( !loadIconFromFile( fileName ) ) { // Load failed -- use default icons dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/icon32_stardict.png"); } dictionaryIconLoaded = true; } string StardictDictionary::loadString( size_t size ) { if( size == 0 ) return string(); vector< char > data( size ); idx.read( &data.front(), data.size() ); return string( &data.front(), data.size() ); } void StardictDictionary::getArticleProps( uint32_t articleAddress, string & headword, uint32_t & offset, uint32_t & size ) { vector< char > chunk; Mutex::Lock _( idxMutex ); char * articleData = chunks.getBlock( articleAddress, chunk ); memcpy( &offset, articleData, sizeof( uint32_t ) ); articleData += sizeof( uint32_t ); memcpy( &size, articleData, sizeof( uint32_t ) ); articleData += sizeof( uint32_t ); headword = articleData; } class PowerWordDataProcessor{ class PWSyntaxTranslate{ public: PWSyntaxTranslate(const char* re, const char* replacement) : _re(re, QRegularExpression::UseUnicodePropertiesOption ) , _replacement(replacement) { } const QRegularExpression & re() const { return _re; } const QString & replacement() const { return _replacement; } private: QRegularExpression _re; QString _replacement; }; public: PowerWordDataProcessor(const char* resource, size_t size) : _data(QString::fromUtf8(resource, size)) { } string process() { QDomDocument doc; QString ss; ss = "

"; if (!doc.setContent(_data)) { ss += _data ; } else { QStringList sl; walkNode(doc.firstChild(), sl); QStringListIterator itr(sl); while (itr.hasNext()) { QString s = itr.next(); translatePW(s); ss += s; ss += "
"; } } ss += "

"; QByteArray ba = ss.toUtf8(); return string(ba.data(), ba.size()); } private: void walkNode(const QDomNode& e, QStringList& sl) { if (e.isNull()) { return; } if (e.isText()) { sl.append(e.toText().data()); } else { QDomNodeList l = e.childNodes(); for (int i = 0; i < l.size(); ++i) { QDomNode n = l.at(i); if (n.isText()) { sl.append(n.toText().data()); } else { walkNode(n, sl); } } } } void translatePW(QString& s){ const int TRANSLATE_TBL_SIZE=5; static PWSyntaxTranslate t[TRANSLATE_TBL_SIZE]={ PWSyntaxTranslate("&[bB]\\s*\\{([^\\{}&]+)\\}", "\\1"), PWSyntaxTranslate("&[iI]\\s*\\{([^\\{}&]+)\\}", "\\1"), PWSyntaxTranslate("&[uU]\\s*\\{([^\\{}&]+)\\}", "\\1"), PWSyntaxTranslate("&[lL]\\s*\\{([^\\{}&]+)\\}", "\\1"), PWSyntaxTranslate("&[2]\\s*\\{([^\\{}&]+)\\}", "\\1") }; QString old; while (s.compare(old) != 0) { for (int i = 0; i < TRANSLATE_TBL_SIZE; ++i) { PWSyntaxTranslate& a = t[i]; s.replace(a.re(), a.replacement()); } old = s; } s.replace(QRegularExpression( "&.\\s*\\{", QRegularExpression::UseUnicodePropertiesOption | QRegularExpression::DotMatchesEverythingOption), ""); s.replace("}", ""); } private: QString _data; }; /// This function tries to make an html of the Stardict's resource typed /// 'type', contained in a block pointed to by 'resource', 'size' bytes long. string StardictDictionary::handleResource( char type, char const * resource, size_t size ) { QString text; switch( type ) { case 'x': // Xdxf content return Xdxf2Html::convert( string( resource, size ), Xdxf2Html::STARDICT, NULL, this, &resourceZip ); case 'h': // Html content { QString articleText = QString( "

" ) + QString::fromUtf8( resource, size ) + "

"; QRegularExpression imgRe( "(<\\s*img\\s+[^>]*src\\s*=\\s*[\"']+)(?!(?:data|https?|ftp):)", QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption ); QRegularExpression linkRe( "(<\\s*link\\s+[^>]*href\\s*=\\s*[\"']+)(?!(?:data|https?|ftp):)", QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption ); articleText.replace( imgRe , "\\1bres://" + QString::fromStdString( getId() ) + "/" ) .replace( linkRe, "\\1bres://" + QString::fromStdString( getId() ) + "/" ); // Handle links to articles QRegularExpression linksReg( "]*)href\\s*=\\s*['\"](bword://)?([^'\"]+)['\"]", QRegularExpression::CaseInsensitiveOption ); int pos = 0; QString articleNewText; QRegularExpressionMatchIterator it = linksReg.globalMatch( articleText ); while( it.hasNext() ) { QRegularExpressionMatch match = it.next(); articleNewText += articleText.mid( pos, match.capturedStart() - pos ); pos = match.capturedEnd(); QString link = match.captured( 3 ); if( link.indexOf( ':' ) < 0 ) { QString newLink; if( link.indexOf( '#' ) < 0 ) newLink = QString( " 0 ) { newLink = QString( "(.*)", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption | QRegularExpression::InvertedGreedinessOption ); pos = 0; it = audioRe.globalMatch( articleText ); while( it.hasNext() ) { QRegularExpressionMatch match = it.next(); articleNewText += articleText.mid( pos, match.capturedStart() - pos ); pos = match.capturedEnd(); QString src = match.captured( 2 ); if( src.indexOf( "://" ) >= 0 ) articleNewText += match.captured(); else { std::string href = "\"gdau://" + getId() + "/" + src.toUtf8().data() + "\""; QString newTag = QString::fromUtf8( ( addAudioLink( href, getId() ) + "" ).c_str() ); newTag += match.captured( 4 ); if( match.captured( 4 ).indexOf( " $\"Play\"$ "; newTag += ""; articleNewText += newTag; } } if( pos ) { articleNewText += articleText.mid( pos ); articleText = articleNewText; articleNewText.clear(); } return ( articleText.toUtf8().data() ); } case 'm': // Pure meaning, usually means preformatted text return "

" + Html::preformat( string( resource, size ), isToLanguageRTL() ) + "

"; case 'l': // Same as 'm', but not in utf8, instead in current locale's // encoding. // We just use Qt here, it should know better about system's // locale. return "

" + Html::preformat( QString::fromLocal8Bit( resource, size ).toUtf8().data(), isToLanguageRTL() ) + "

"; case 'g': // Pango markup. text = QString::fromUtf8( resource, size ); pangoToHtml( text ); return "

" + string( text.toUtf8().data() ) + "

"; case 't': // Transcription return "