/* This file is (c) 2008-2012 Konstantin Isakov
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */

#include "bgl.hh"
#include "btreeidx.hh"
#include "bgl_babylon.hh"
#include "file.hh"
#include "folding.hh"
#include "utf8.hh"
#include "chunkedstorage.hh"
#include "langcoder.hh"
#include "language.hh"
#include "gddebug.hh"
#include "htmlescape.hh"
#include "ftshelpers.hh"

// NOTE(review): the header names of every angle-bracket include below are
// missing from this copy of the file (all "<...>" text was lost). They are
// kept as found; restore the names from version control.
#include
#include
#include
#include
#include
#include

#ifdef _MSC_VER
#include
#endif

#include
#include
#include
#if (QT_VERSION >= QT_VERSION_CHECK(6,0,0))
#include
#else
#include
#endif
#include

#include "utils.hh"

namespace Bgl {

using std::map;
using std::multimap;
using std::set;
using gd::wstring;
using gd::wchar;
using std::list;
using std::pair;
using std::string;

using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo;

namespace {

enum
{
  Signature = 0x584c4742, // BGLX on little-endian, XLGB on big-endian
  CurrentFormatVersion = 19 + BtreeIndexing::FormatVersion
};

/// Header stored at the very beginning of the index file.
struct IdxHeader
{
  uint32_t signature; // First comes the signature, BGLX
  uint32_t formatVersion; // File format version, currently 1.
  uint32_t parserVersion; // Version of the parser used to parse the BGL file.
                          // If it's lower than the current one, the file is to
                          // be re-parsed.
  uint32_t foldingVersion; // Version of the folding algorithm used when building
                           // index. If it's different from the current one,
                           // the file is to be rebuilt.
  uint32_t articleCount; // Total number of articles, for informative purposes only
  uint32_t wordCount; // Total number of words, for informative purposes only
  /// Add more fields here, like name, description, author and such.
  uint32_t chunksOffset; // The offset to chunks' storage
  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
  uint32_t indexRootOffset;
  uint32_t resourceListOffset; // The offset of the list of resources
  uint32_t resourcesCount; // Number of resources stored
  uint32_t langFrom;  // Source language
  uint32_t langTo;    // Target language
  uint32_t iconAddress; // Address of the icon in the chunks' storage
  uint32_t iconSize; // Size of the icon in the chunks' storage, 0 = no icon
  uint32_t descriptionAddress; // Address of the dictionary description in the chunks' storage
  uint32_t descriptionSize; // Size of the description in the chunks' storage, 0 = no description
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;

/// Returns true when the index file cannot be read or when any of its
/// signature/version fields differs from what this build expects.
bool indexIsOldOrBad( string const & indexFile )
{
  File::Class idx( indexFile, "rb" );

  IdxHeader header;

  return idx.readRecords( &header, sizeof( header ), 1 ) != 1 ||
         header.signature != Signature ||
         header.formatVersion != CurrentFormatVersion ||
         header.parserVersion != Babylon::ParserVersion ||
         header.foldingVersion != Folding::Version;
}

/// Removes the $1$-like postfix (a run of digits enclosed in '$').
/// Returns the input unchanged when it does not end with such a postfix.
string removePostfix( string const & in )
{
  if ( in.size() && in[ in.size() - 1 ] == '$' )
  {
    // Find the end of it and cut it, barring any unexpectedness
    for( long x = in.size() - 2; x >= 0; x-- )
    {
      if ( in[ x ] == '$' )
        return in.substr( 0, x );
      // isdigit() requires a value representable as unsigned char; UTF-8
      // bytes are negative where char is signed.
      else if ( !isdigit( static_cast< unsigned char >( in[ x ] ) ) )
        break;
    }
  }

  return in;
}

/// Removes any leading or trailing whitespace, in place.
void trimWs( string & word )
{
  if ( word.size() )
  {
    unsigned begin = 0;

    while( begin < word.size() && Utf8::isspace( word[ begin ] ) )
      ++begin;

    if ( begin == word.size() ) // Consists of ws entirely?
      word.clear();
    else
    {
      unsigned end = word.size();

      // Doesn't consist of ws entirely, so must end with just isspace()
      // condition.
      while( Utf8::isspace( word[ end - 1 ] ) )
        --end;

      if ( end != word.size() || begin )
        word = string( word, begin, end - begin );
    }
  }
}

/// Normalizes the headword (modifying it in place) and adds it to
/// indexedWords pointing at articleOffset. wcharBuffer is not used here.
void addEntryToIndex( string & word,
                      uint32_t articleOffset,
                      IndexedWords & indexedWords,
                      vector< wchar > & wcharBuffer )
{
  // Strip any leading or trailing whitespaces
  trimWs( word );

  // If the word starts with a slash, we drop it. There are quite a lot
  // of them, and they all seem to be redundant duplicates.
  if ( word.size() && word[ 0 ] == '/' )
    return;

  // Check the input word for a superscript postfix ($1$, $2$ etc), which
  // signifies different meaning in Bgl files. We emit different meaning
  // as different articles, but they appear in the index as the same word.
  if ( word.size() && word[ word.size() - 1 ] == '$' )
  {
    word = removePostfix( word );
    trimWs( word );
  }

  // Convert the word from utf8 to wide chars
  indexedWords.addWord( Utf8::decode( word ), articleOffset );
}

DEF_EX( exFailedToDecompressArticle, "Failed to decompress article's body", Dictionary::Ex )
DEF_EX( exChunkIndexOutOfRange, "Chunk index is out of range", Dictionary::Ex )

/// Dictionary backed by an index file built from a .bgl file.
class BglDictionary: public BtreeIndexing::BtreeDictionary
{
  Mutex idxMutex;
  File::Class idx;
  IdxHeader idxHeader;
  ChunkedStorage::Reader chunks;

public:

  BglDictionary( string const & id, string const & indexFile,
                 string const & dictionaryFile );

  map< Dictionary::Property, string > getProperties() noexcept override
  { return map< Dictionary::Property, string >(); }

  unsigned long getArticleCount() noexcept override
  { return idxHeader.articleCount; }

  unsigned long getWordCount() noexcept override
  { return idxHeader.wordCount; }

  inline quint32 getLangFrom() const override
  { return idxHeader.langFrom; }

  inline quint32 getLangTo() const override
  { return idxHeader.langTo; }

  sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & ) override
    ;

  sptr< Dictionary::DataRequest > getArticle( wstring const &,
                                              vector< wstring > const & alts,
                                              wstring const &,
                                              bool ignoreDiacritics ) override
    ;

  sptr< Dictionary::DataRequest > getResource( string const & name ) override
    ;

  sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString,
                                                    int searchMode, bool matchCase,
                                                    int distanceBetweenWords,
                                                    int maxResults,
                                                    bool ignoreWordsOrder,
                                                    bool ignoreDiacritics ) override;
  QString const& getDescription() override;

  void getArticleText( uint32_t articleAddress, QString & headword, QString & text ) override;

  void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration ) override;

  void setFTSParameters( Config::FullTextSearch const & fts ) override
  {
    can_FTS = fts.enabled
              && !fts.disabledTypes.contains( "BGL", Qt::CaseInsensitive )
              && ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
  }

protected:

  void loadIcon() noexcept override;

private:

  /// Loads an article with the given offset, filling the given strings.
  void loadArticle( uint32_t offset, string & headword,
                    string & displayedHeadword, string & articleText );

  static void replaceCharsetEntities( string & );

  friend class BglHeadwordsRequest;
  friend class BglArticleRequest;
  friend class BglResourceRequest;
};

/// Opens the index file, reads the header and the dictionary name stored
/// right after it, opens the btree index and checks the state of the
/// full-text search index.
BglDictionary::BglDictionary( string const & id,
                              string const & indexFile,
                              string const & dictionaryFile ):
  BtreeDictionary( id, vector< string >( 1, dictionaryFile ) ),
  idx( indexFile, "rb" ),
  idxHeader( idx.read< IdxHeader >() ),
  chunks( idx, idxHeader.chunksOffset )
{
  idx.seek( sizeof( idxHeader ) );

  // Read the dictionary's name
  size_t len = idx.read< uint32_t >();

  if( len )
  {
    vector< char > nameBuf( len );

    idx.read( &nameBuf.front(), len );

    dictionaryName = string( &nameBuf.front(), len );
  }

  // Initialize the index
  openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
                        idxHeader.indexRootOffset ),
             idx, idxMutex );

  can_FTS = true;

  ftsIdxName = indexFile + Dictionary::getFtsSuffix();

  if( !Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
      && !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) )
    FTS_index_completed.ref();
}

/// Loads the dictionary icon once: first from a file next to the dictionary,
/// then from the icon stored in the index, and finally the built-in one.
void BglDictionary::loadIcon() noexcept
{
  if ( dictionaryIconLoaded )
    return;

  QString fileName =
    QDir::fromNativeSeparators( QString::fromStdString(getDictionaryFilenames()[ 0 ] ) );

  // Remove the extension
  fileName.chop( 3 );

  if( !loadIconFromFile( fileName ) )
  {
    if( idxHeader.iconSize )
    {
      // Try loading icon now
      vector< char > chunk;

      Mutex::Lock _( idxMutex );

      char * iconData = chunks.getBlock( idxHeader.iconAddress, chunk );

      QImage img;

      if (img.loadFromData( ( unsigned char *) iconData, idxHeader.iconSize ) )
      {
        // Load successful
        dictionaryNativeIcon = QIcon( QPixmap::fromImage( img ) );

        // Transform it to be square
        int max = img.width() > img.height() ? img.width() : img.height();

        QImage result( max, max, QImage::Format_ARGB32 );
        result.fill( 0 ); // Black transparent

        QPainter painter( &result );
        painter.setRenderHint(QPainter::RenderHint::Antialiasing);
        painter.drawImage( QPoint( img.width() == max ? 0 : ( max - img.width() ) / 2,
                                   img.height() == max ? 0 : ( max - img.height() ) / 2 ),
                           img );

        painter.end();

        dictionaryIcon = QIcon( QPixmap::fromImage( result ) );
      }
    }

    if ( dictionaryIcon.isNull() )
      dictionaryIcon = dictionaryNativeIcon = QIcon(":/icons/icon32_bgl.png");
  }

  dictionaryIconLoaded = true;
}

/// Reads the block at the given offset; the block holds three consecutive
/// zero-terminated strings: headword, displayed headword and article text.
void BglDictionary::loadArticle( uint32_t offset, string & headword,
                                 string & displayedHeadword,
                                 string & articleText )
{
  vector< char > chunk;

  Mutex::Lock _( idxMutex );

  char * articleData = chunks.getBlock( offset, chunk );

  headword = articleData;

  displayedHeadword = articleData + headword.size() + 1;

  articleText =
    string( articleData + headword.size() +
            displayedHeadword.size() + 2 );
}

/// Builds (once) and returns the dictionary description from the four
/// zero-terminated strings stored in the description block: copyright,
/// author, e-mail and description.
QString const& BglDictionary::getDescription()
{
  if( !dictionaryDescription.isEmpty() )
    return dictionaryDescription;

  if( idxHeader.descriptionSize == 0 )
    dictionaryDescription = "NONE";
  else
  {
    Mutex::Lock _( idxMutex );
    vector< char > chunk;
    char * dictDescription = chunks.getBlock( idxHeader.descriptionAddress, chunk );
    string str( dictDescription );
    if( !str.empty() )
      dictionaryDescription += QObject::tr( "Copyright: %1%2" )
                                 .arg( Html::unescape( QString::fromUtf8( str.data(), str.size() ) ) )
                                 .arg( "\n\n" );
    dictDescription += str.size() + 1;

    str = string( dictDescription );
    if( !str.empty() )
      dictionaryDescription += QObject::tr( "Author: %1%2" )
                                 .arg( QString::fromUtf8( str.data(), str.size() ) )
                                 .arg( "\n\n" );
    dictDescription += str.size() + 1;

    str = string( dictDescription );
    if( !str.empty() )
      dictionaryDescription += QObject::tr( "E-mail: %1%2" )
                                 .arg( QString::fromUtf8( str.data(), str.size() ) )
                                 .arg( "\n\n" );
    dictDescription += str.size() + 1;

    str = string( dictDescription );
    if( !str.empty() )
      dictionaryDescription += Html::unescape( QString::fromUtf8( str.data(), str.size() ) );
  }

  return dictionaryDescription;
}

/// Loads the article at articleAddress and returns its normalized headword
/// and its text with HTML unescaped. Failures are logged, not propagated.
void BglDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
{
  try
  {
    string headwordStr, displayedHeadwordStr, articleStr;
    loadArticle( articleAddress, headwordStr, displayedHeadwordStr, articleStr );

    // Some headword normalization similar while indexing
    trimWs( headwordStr );

    if ( headwordStr.size() && headwordStr[ 0 ] == '/' )
      headwordStr.erase(); // We will take headword from index later

    if ( headwordStr.size()
         && headwordStr[ headwordStr.size() - 1 ] == '$' )
    {
      headwordStr = removePostfix( headwordStr );
      trimWs( headwordStr );
    }

    headword = QString::fromUtf8( headwordStr.data(), headwordStr.size() );

    wstring wstr = Utf8::decode( articleStr );

    if ( getLangTo() == LangCoder::code2toInt( "he" ) )
    {
      for ( unsigned int i = 0; i < wstr.size(); i++ )
      {
        if ( (wstr[ i ] >= 224 && wstr[ i ] <= 250) || (wstr[ i ] >= 192 && wstr[ i ] <= 210) ) // Hebrew chars encoded as windows-1255 or ISO-8859-8, or as vowel-points of windows-1255
          wstr[ i ] += 1488 - 224; // Convert to Hebrew unicode
      }
    }

    text = Html::unescape( QString::fromStdU32String( wstr ) );
  }
  catch( std::exception const & ex )
  {
    gdWarning( "BGL: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
  }
}

/// Builds the full-text search index unless one is already usable. On
/// failure the partially written index file is removed.
void BglDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
{
  if( !( Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
         || FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) )
    FTS_index_completed.ref();

  if( haveFTSIndex() )
    return;

  if( firstIteration && getArticleCount() > FTS::MaxDictionarySizeForFastSearch )
    return;

  gdDebug( "Bgl: Building the full-text index for dictionary: %s\n",
           getName().c_str() );

  try
  {
    FtsHelpers::makeFTSIndex( this, isCancelled );
    FTS_index_completed.ref();
  }
  catch( std::exception const & ex )
  {
    gdWarning( "Bgl: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );
    QFile::remove( QString::fromStdString( ftsIdxName ) );
  }
}

/// BglDictionary::findHeadwordsForSynonym()

class BglHeadwordsRequest: public Dictionary::WordSearchRequest
{
  wstring str;
  BglDictionary & dict;

  QAtomicInt isCancelled;
  QFuture< void > f;

public:

  BglHeadwordsRequest( wstring const & word_,
                       BglDictionary & dict_ ):
    str( word_ ), dict( dict_ )
  {
    f = QtConcurrent::run( [ this ]() { this->run(); } );
  }

  void run();

  void cancel() override
  {
    isCancelled.ref();
  }

  ~BglHeadwordsRequest() override
  {
    isCancelled.ref();
    f.waitForFinished();
  }
};

/// Collects the headwords of all articles found for the word which differ
/// from the word itself (case-folded), i.e. those it is a synonym of.
void BglHeadwordsRequest::run()
{
  if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
  {
    finish();
    return;
  }

  vector< WordArticleLink > chain = dict.findArticles( str );

  wstring caseFolded = Folding::applySimpleCaseOnly( str );

  for( unsigned x = 0; x < chain.size(); ++x )
  {
    if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
    {
      finish();
      return;
    }

    string headword, displayedHeadword, articleText;

    // The other callers of loadArticle() in this file guard it against
    // std::exception; do the same here so that a bad article neither escapes
    // the worker thread nor prevents finish() from being called.
    try
    {
      dict.loadArticle( chain[ x ].articleOffset,
                        headword, displayedHeadword, articleText );
    }
    catch( std::exception const & ex )
    {
      gdWarning( "BGL: Failed loading article from \"%s\", reason: %s\n", dict.getName().c_str(), ex.what() );
      continue;
    }

    wstring headwordDecoded;
    try
    {
      headwordDecoded = Utf8::decode( removePostfix( headword ) );
    }
    catch( Utf8::exCantDecode & )
    {
    }

    if ( caseFolded != Folding::applySimpleCaseOnly( headwordDecoded ) && !headwordDecoded.empty() )
    {
      // The headword seems to differ from the input word, which makes the
      // input word its synonym.
      Mutex::Lock _( dataMutex );

      matches.push_back( headwordDecoded );
    }
  }

  finish();
}

sptr< Dictionary::WordSearchRequest >
  BglDictionary::findHeadwordsForSynonym( wstring const & word )
{
  // The request type is the one declared above: its constructor takes exactly
  // ( wstring const &, BglDictionary & ).
  return synonymSearchEnabled ? std::make_shared< BglHeadwordsRequest >( word, *this ) :
                                Class::findHeadwordsForSynonym( word );
}

/// Converts a $1$-like postfix to a <sup>1</sup> one.
/// NOTE(review): the two tag literals were empty in this copy of the file;
/// "<sup>"/"</sup>" is inferred from the function's name - confirm.
string postfixToSuperscript( string const & in )
{
  if ( !in.size() || in[ in.size() - 1 ] != '$' )
    return in;

  for( long x = in.size() - 2; x >= 0; x-- )
  {
    if ( in[ x ] == '$' )
    {
      if ( in.size() - x - 2 > 2 )
      {
        // Large postfixes seem like something we wouldn't want to show --
        // some dictionaries seem to have each word numbered using the
        // postfix.
        return in.substr( 0, x );
      }
      else
        return in.substr( 0, x ) + "<sup>" + in.substr( x + 1, in.size() - x - 2 ) + "</sup>";
    }
    // isdigit() requires a value representable as unsigned char.
    else if ( !isdigit( static_cast< unsigned char >( in[ x ] ) ) )
      break;
  }

  return in;
}

/// BglDictionary::getArticle()

class BglArticleRequest: public Dictionary::DataRequest
{
  wstring word;
  vector< wstring > alts;
  BglDictionary & dict;

  QAtomicInt isCancelled;
  bool ignoreDiacritics;
  QFuture< void > f;

public:

  BglArticleRequest( wstring const & word_,
                     vector< wstring > const & alts_,
                     BglDictionary & dict_, bool ignoreDiacritics_ ):
    word( word_ ), alts( alts_ ), dict( dict_ ), ignoreDiacritics( ignoreDiacritics_ )
  {
    f = QtConcurrent::run( [ this ]() { this->run(); } );
  }

  void run();

  void cancel() override
  {
    isCancelled.ref();
  }

  void fixHebString(string & hebStr); // Hebrew support
  void fixHebArticle(string & hebArticle); // Hebrew support

  ~BglArticleRequest()
  {
    isCancelled.ref();
    f.waitForFinished();
  }
};

/// Hebrew support - convert non-unicode to unicode.
/// The loop header was missing in this copy of the file; it is rebuilt to
/// match the identical conversion loop in BglDictionary::getArticleText().
void BglArticleRequest::fixHebString(string & hebStr)
{
  wstring hebWStr;
  try
  {
    hebWStr = Utf8::decode(hebStr);
  }
  catch( Utf8::exCantDecode & )
  {
    hebStr = "Utf-8 decoding error";
    return;
  }

  for (unsigned int i=0; i<hebWStr.size(); i++)
  {
    if ( (hebWStr[ i ] >= 224 && hebWStr[ i ] <= 250) || (hebWStr[ i ] >= 192 && hebWStr[ i ] <= 210) ) // Hebrew chars encoded as windows-1255 or ISO-8859-8, or as vowel-points of windows-1255
      hebWStr[i]+=1488-224; // Convert to Hebrew unicode
  }
  hebStr=Utf8::encode(hebWStr);
}

/// Hebrew support - remove extra chars at the end (bytes 0..32 and A-Z).
void BglArticleRequest::fixHebArticle(string & hebArticle)
{
  unsigned nulls;

  for ( nulls = hebArticle.size(); nulls > 0 &&
        ( ( hebArticle[ nulls - 1 ] <= 32 &&
            hebArticle[ nulls - 1 ] >= 0 ) ||
          ( hebArticle[ nulls - 1 ] >= 65 &&
            hebArticle[ nulls - 1 ] <= 90 ) ); --nulls ) ; //special chars and A-Z

  hebArticle.resize( nulls );
}

/// Finds the articles for the word and its alternates, removes duplicates,
/// and assembles the resulting text into `data`.
///
/// NOTE(review): the markup inside the string literals and regular
/// expressions of this function was lost in this copy of the file. Those
/// literals are kept exactly as found (raw line breaks in ordinary literals
/// written as \n) and must be restored from version control.
void BglArticleRequest::run()
{
  if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
  {
    finish();
    return;
  }

  vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );

  static Language::Id hebrew = LangCoder::code2toInt( "he" ); // Hebrew support

  for( unsigned x = 0; x < alts.size(); ++x )
  {
    /// Make an additional query for each alt

    vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );

    chain.insert( chain.end(), altChain.begin(), altChain.end() );
  }

  multimap< wstring, pair< string, string > > mainArticles, alternateArticles;

  set< uint32_t > articlesIncluded; // Some synonyms make it that the articles
                                    // appear several times. We combat this
                                    // by only allowing them to appear once.
  // Sometimes the articles are physically duplicated. We store hashes of
  // the bodies to account for this.
  set< QByteArray > articleBodiesIncluded;

  wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
  if( ignoreDiacritics )
    wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );

  for( unsigned x = 0; x < chain.size(); ++x )
  {
    if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
    {
      finish();
      return;
    }

    try
    {

      if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
        continue; // We already have this article in the body.

      // Now grab that article

      string headword, displayedHeadword, articleText;

      dict.loadArticle( chain[ x ].articleOffset,
                        headword, displayedHeadword, articleText );

      // Ok. Now, does it go to main articles, or to alternate ones? We list
      // main ones first, and alternates after.

      // We do the case-folded and postfix-less comparison here.

      wstring headwordStripped =
        Folding::applySimpleCaseOnly( removePostfix( headword ) );
      if( ignoreDiacritics )
        headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );

      // Hebrew support - fix Hebrew text
      if (dict.idxHeader.langFrom == hebrew)
      {
        displayedHeadword= displayedHeadword.size() ? displayedHeadword : headword;
        fixHebString(articleText);
        fixHebArticle(articleText);
        fixHebString(displayedHeadword);
      }

      string const & targetHeadword = displayedHeadword.size() ?
                                      displayedHeadword : headword;

      QCryptographicHash hash( QCryptographicHash::Md5 );
      hash.addData( targetHeadword.data(), targetHeadword.size() + 1 ); // with 0
      hash.addData( articleText.data(), articleText.size() );

      if ( !articleBodiesIncluded.insert( hash.result() ).second )
        continue; // Already had this body

      multimap< wstring, pair< string, string > > & mapToUse =
        ( wordCaseFolded == headwordStripped ) ?
          mainArticles : alternateArticles;

      // Element types spelled out to match the multimap declared above.
      mapToUse.insert( pair< wstring, pair< string, string > >(
        Folding::applySimpleCaseOnly( headword ),
        pair< string, string >( targetHeadword, articleText ) ) );

      articlesIncluded.insert( chain[ x ].articleOffset );

    } // try
    catch( std::exception const & ex )
    {
      gdWarning( "BGL: Failed loading article from \"%s\", reason: %s\n", dict.getName().c_str(), ex.what() );
    }
  }

  if ( mainArticles.empty() && alternateArticles.empty() )
  {
    // No such word
    finish();
    return;
  }

  string result;

  multimap< wstring, pair< string, string > >::const_iterator i;

  string cleaner = Utils::Html::getHtmlCleaner();
  for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
  {
    // NOTE(review): the literals in this loop have lost their markup.
    if (dict.isFromLanguageRTL() ) // RTL support
      result += "\n\n";
    else
      result += "\n\n";
    result += postfixToSuperscript( i->second.first );
    result += "\n\n";
    if ( dict.isToLanguageRTL() )
      result += "\n" + i->second.second + "\n";
    else
      result += "\n" + i->second.second + "\n";
    result += cleaner;
  }

  for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
  {
    // NOTE(review): the literals in this loop have lost their markup.
    if (dict.isFromLanguageRTL() ) // RTL support
      result += "\n\n";
    else
      result += "\n\n";
    result += postfixToSuperscript( i->second.first );
    result += "\n\n";
    if ( dict.isToLanguageRTL() )
      result += "\n" + i->second.second + "\n";
    else
      result += "\n" + i->second.second + "\n";
    result += cleaner;
  }

  // Do some cleanups in the text

  BglDictionary::replaceCharsetEntities( result );

  // NOTE(review): the first and the last pattern/replacement below are
  // incomplete (markup lost); kept exactly as found.
  result = QString::fromUtf8( result.c_str() )
    // onclick location to link
    .replace( QRegularExpression( R"(<([a-z0-9]+)\s+[^>]*onclick="[a-z.]*location(?:\.href)\s*=\s*'([^']+)[^>]*>([^<]+))",
                                  QRegularExpression::CaseInsensitiveOption ),
              R"(\3)")
    .replace( QRegularExpression( R"((<\s*a\s+[^>]*href\s*=\s*["']\s*)bword://)",
                                  QRegularExpression::CaseInsensitiveOption ),
              "\\1bword:" )
    //remove invalid width, height attrs
    .replace( QRegularExpression( R"((width|height)\s*=\s*["']\d{7,}["''])" ),
              "" )
    //remove invalid tag
    .replace( QRegularExpression( R"(
(|||||function addScript|var scNode|scNode|var atag|while\(atag|atag=atag|document\.getElementsByTagName|addScript|src="bres||addScript\('JS_FILE_PHONG_VT_45634'\);|appendChild\(scNode\);|atag\.firstChild;)
)",
                                  QRegularExpression::CaseInsensitiveOption ),
              " \\1 " )
    .toUtf8().data();

  Mutex::Lock _( dataMutex );

  data.resize( result.size() );

  memcpy( &data.front(), result.data(), result.size() );

  hasAnyData = true;

  finish();
}

sptr< Dictionary::DataRequest > BglDictionary::getArticle( wstring const & word,
                                                           vector< wstring > const & alts,
                                                           wstring const &,
                                                           bool ignoreDiacritics )
{
  // The request type is the one declared above: its constructor takes exactly
  // these four arguments.
  return std::make_shared< BglArticleRequest >( word, alts, *this, ignoreDiacritics );
}

//// BglDictionary::getResource()

class BglResourceRequest: public Dictionary::DataRequest
{
  Mutex & idxMutex;
  File::Class & idx;
  uint32_t resourceListOffset, resourcesCount;
  string name;

  QAtomicInt isCancelled;
  QFuture< void > f;

public:

  BglResourceRequest( Mutex & idxMutex_,
                      File::Class & idx_,
                      uint32_t resourceListOffset_,
                      uint32_t resourcesCount_,
                      string const & name_ ):
    idxMutex( idxMutex_ ),
    idx( idx_ ),
    resourceListOffset( resourceListOffset_ ),
    resourcesCount( resourcesCount_ ),
    name( name_ )
  {
    f = QtConcurrent::run( [ this ]() { this->run(); } );
  }

  void run();

  void cancel() override
  {
    isCancelled.ref();
  }

  ~BglResourceRequest()
  {
    isCancelled.ref();
    f.waitForFinished();
  }
};

/// Scans the resource list in the index for a name matching `name`
/// (compared lowercased) and decompresses the matching resource into `data`.
void BglResourceRequest::run()
{
  if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
  {
    finish();
    return;
  }

  string nameLowercased = name;

  // tolower() requires a value representable as unsigned char; bytes >= 0x80
  // are negative where char is signed.
  for( char & c : nameLowercased )
    c = static_cast< char >( tolower( static_cast< unsigned char >( c ) ) );

  Mutex::Lock _( idxMutex );

  idx.seek( resourceListOffset );

  for( size_t count = resourcesCount; count--; )
  {
    if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
      break;

    // data() instead of &front(): the stored sizes may be zero, and front()
    // on an empty vector is undefined.
    vector< char > nameData( idx.read< uint32_t >() );
    idx.read( nameData.data(), nameData.size() );

    for( size_t x = nameData.size(); x--; )
      nameData[ x ] = static_cast< char >( tolower( static_cast< unsigned char >( nameData[ x ] ) ) );

    uint32_t offset = idx.read< uint32_t >();

    if ( string( nameData.data(), nameData.size() ) == nameLowercased )
    {
      // We have a match.

      idx.seek( offset );

      Mutex::Lock _( dataMutex );

      data.resize( idx.read< uint32_t >() );

      vector< unsigned char > compressedData( idx.read< uint32_t >() );

      idx.read( compressedData.data(), compressedData.size() );

      unsigned long decompressedLength = data.size();

      if ( uncompress( reinterpret_cast< unsigned char * >( data.data() ),
                       &decompressedLength,
                       compressedData.data(),
                       compressedData.size() ) != Z_OK ||
           decompressedLength != data.size() )
      {
        gdWarning( "Failed to decompress resource \"%s\", ignoring it.\n",
                   name.c_str() );
      }
      else
        hasAnyData = true;

      break;
    }
  }

  finish();
}

sptr< Dictionary::DataRequest > BglDictionary::getResource( string const & name )
{
  return std::make_shared< BglResourceRequest >( idxMutex, idx, idxHeader.resourceListOffset,
                                                 idxHeader.resourcesCount, name );
}

/// Replaces <charset c=t>1234;</charset> occurrences with &#x1234;
void BglDictionary::replaceCharsetEntities( string & text )
{
  QString str = QString::fromUtf8( text.c_str() );

  QRegularExpression charsetExp( R"(<\s*charset\s+c\s*=\s*["']?t["']?\s*>((?:\s*[0-9a-fA-F]+\s*;\s*)*)<\s*/\s*charset\s*>)",
                                 QRegularExpression::CaseInsensitiveOption
                                 | QRegularExpression::InvertedGreedinessOption );

  QRegularExpression oneValueExp( "\\s*([0-9a-fA-F]+)\\s*;" );
  QString result;
  int pos = 0;

  QRegularExpressionMatchIterator it = charsetExp.globalMatch( str );
  while( it.hasNext() )
  {
    QRegularExpressionMatch match = it.next();
    // Copy the text preceding the match, then emit one entity per value.
    result += str.mid( pos, match.capturedStart() - pos );
    pos = match.capturedEnd();

    QRegularExpressionMatchIterator itValue = oneValueExp.globalMatch( match.captured( 1 ) );
    while( itValue.hasNext() )
    {
      QRegularExpressionMatch matchValue = itValue.next();
      result += "&#x" + matchValue.captured( 1 ) + ";";
    }
  }

  if( pos )
  {
    result += str.mid( pos );
    str = result;
  }

  text = str.toUtf8().data();
}

/// Receives resources found while parsing the BGL file, writes them
/// compressed to the index file and remembers their names and offsets.
class ResourceHandler: public Babylon::ResourceHandler
{
  File::Class & idxFile;
  list< pair< string, uint32_t > > resources;

public:

  ResourceHandler( File::Class & idxFile_ ): idxFile( idxFile_ )
  {}

  list< pair< string, uint32_t > > const & getResources() const
  { return resources; }

protected:
  void handleBabylonResource( string const & filename,
                              char const * data, size_t size ) override;
};

void ResourceHandler::handleBabylonResource( string const & filename,
                                             char const * data, size_t size )
{
  //GD_DPRINTF( "Handling resource file %s (%u bytes)\n", filename.c_str(), size );

  vector< unsigned char > compressedData( compressBound( size ) );

  unsigned long compressedSize = compressedData.size();

  if ( compress( &compressedData.front(), &compressedSize,
                 (unsigned char const *) data, size ) != Z_OK )
  {
    gdWarning( "Failed to compress the body of resource \"%s\", dropping it.\n",
               filename.c_str() );
    return;
  }

  resources.push_back( pair< string, uint32_t >( filename, idxFile.tell() ) );

  // Stored layout: uncompressed size, compressed size, compressed bytes.
  idxFile.write< uint32_t >( size );
  idxFile.write< uint32_t >( compressedSize );
  idxFile.write( &compressedData.front(), compressedSize );
}
}

sptr< Dictionary::DataRequest > BglDictionary::getSearchResults( QString const & searchString,
                                                                 int searchMode, bool matchCase,
                                                                 int distanceBetweenWords,
                                                                 int maxResults,
                                                                 bool ignoreWordsOrder,
                                                                 bool ignoreDiacritics )
{
  // NOTE(review): the template argument of make_shared is missing in this
  // copy of the file and the request type is not declared here (presumably
  // it comes from ftshelpers.hh); restore it from version control.
  return std::make_shared( *this, searchString,searchMode, matchCase, distanceBetweenWords, maxResults, ignoreWordsOrder, ignoreDiacritics );
}

/// Creates a dictionary for every *.bgl file in fileNames, (re)building its
/// index file in indicesDir first when it is missing, old or bad.
vector< sptr< Dictionary::Class > > makeDictionaries(
                                      vector< string > const & fileNames,
                                      string const & indicesDir,
                                      Dictionary::Initializing & initializing )

{
  vector< sptr< Dictionary::Class > > dictionaries;

  for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
       ++i )
  {
    // Skip files with the extensions different to .bgl to speed up the
    // scanning
    if ( i->size() < 4 ||
         strcasecmp( i->c_str() + ( i->size() - 4 ), ".bgl" ) != 0 )
      continue;

    // Got the file -- check if we need to rebuild the index

    vector< string > dictFiles( 1, *i );

    string dictId = Dictionary::makeDictionaryId( dictFiles );

    string indexFile = indicesDir + dictId;

    if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
         indexIsOldOrBad( indexFile ) )
    {
      // Building the index

      gdDebug( "Bgl: Building the index for dictionary: %s\n", i->c_str() );

      try
      {
        Babylon b( *i );

        if ( !b.open() )
          continue;

        std::string sourceCharset, targetCharset;

        if ( !b.read( sourceCharset, targetCharset ) )
        {
          gdWarning( "Failed to start reading from %s, skipping it\n", i->c_str() );
          continue;
        }

        initializing.indexingDictionary( b.title() );

        File::Class idx( indexFile, "wb" );

        IdxHeader idxHeader;

        memset( &idxHeader, 0, sizeof( idxHeader ) );

        // We write a dummy header first. At the end of the process the header
        // will be rewritten with the right values.

        idx.write( idxHeader );

        idx.write< uint32_t >( b.title().size() );
        idx.write( b.title().data(), b.title().size() );

        // This is our index data that we accumulate during the loading process.
        // For each new word encountered, we emit the article's body to the file
        // immediately, inserting the word itself and its offset in this map.
        // This map maps folded words to the original words and the corresponding
        // articles' offsets.
        IndexedWords indexedWords;

        // We use this buffer to decode utf8 into it.
        vector< wchar > wcharBuffer;

        ChunkedStorage::Writer chunks( idx );

        uint32_t articleCount = 0, wordCount = 0;

        ResourceHandler resourceHandler( idx );

        b.setResourcePrefix( string( "bres://" ) + dictId + "/" );

        // Save icon if there's one
        if ( size_t sz = b.getIcon().size() )
        {
          idxHeader.iconAddress = chunks.startNewBlock();
          chunks.addToBlock( &b.getIcon().front(), sz );
          idxHeader.iconSize = sz;
        }

        // Save dictionary description if there's one
        idxHeader.descriptionSize = 0;
        idxHeader.descriptionAddress = chunks.startNewBlock();

        chunks.addToBlock( b.copyright().c_str(), b.copyright().size() + 1 );
        idxHeader.descriptionSize += b.copyright().size() + 1;

        chunks.addToBlock( b.author().c_str(), b.author().size() + 1 );
        idxHeader.descriptionSize += b.author().size() + 1;

        chunks.addToBlock( b.email().c_str(), b.email().size() + 1 );
        idxHeader.descriptionSize += b.email().size() + 1;

        chunks.addToBlock( b.description().c_str(), b.description().size() + 1 );
        idxHeader.descriptionSize += b.description().size() + 1;

        for( ; ; )
        {
          bgl_entry e = b.readEntry( &resourceHandler );

          // An entry with an empty headword ends the reading loop.
          if ( e.headword.empty() )
            break;

          // Save the article's body itself first

          uint32_t articleAddress = chunks.startNewBlock();

          chunks.addToBlock( e.headword.c_str(), e.headword.size() + 1 );
          chunks.addToBlock( e.displayedHeadword.c_str(), e.displayedHeadword.size() + 1 );
          chunks.addToBlock( e.definition.c_str(), e.definition.size() + 1 );

          // Add entries to the index

          addEntryToIndex( e.headword, articleAddress, indexedWords, wcharBuffer );

          for( unsigned x = 0; x < e.alternates.size(); ++x )
            addEntryToIndex( e.alternates[ x ], articleAddress, indexedWords, wcharBuffer );

          wordCount += 1 + e.alternates.size();
          ++articleCount;
        }

        // Finish with the chunks

        idxHeader.chunksOffset = chunks.finish();

        GD_DPRINTF( "Writing index...\n" );

        // Good. Now build the index

        IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );

        idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
        idxHeader.indexRootOffset = idxInfo.rootOffset;

        // Save the resource's list.

        idxHeader.resourceListOffset = idx.tell();
        idxHeader.resourcesCount = resourceHandler.getResources().size();

        for( list< pair< string, uint32_t > >::const_iterator j =
               resourceHandler.getResources().begin();
             j != resourceHandler.getResources().end(); ++j )
        {
          idx.write< uint32_t >( j->first.size() );
          idx.write( j->first.data(), j->first.size() );
          idx.write< uint32_t >( j->second );
        }

        // That concludes it. Update the header.

        idxHeader.signature = Signature;
        idxHeader.formatVersion = CurrentFormatVersion;
        idxHeader.parserVersion = Babylon::ParserVersion;
        idxHeader.foldingVersion = Folding::Version;
        idxHeader.articleCount = articleCount;
        idxHeader.wordCount = wordCount;
        idxHeader.langFrom = b.sourceLang();//LangCoder::findIdForLanguage( Utf8::decode( b.sourceLang() ) );
        idxHeader.langTo = b.targetLang();//LangCoder::findIdForLanguage( Utf8::decode( b.targetLang() ) );

        idx.rewind();

        idx.write( &idxHeader, sizeof( idxHeader ) );
      }
      catch( std::exception const & e )
      {
        gdWarning( "BGL dictionary indexing failed: %s, error: %s\n",
                   i->c_str(), e.what() );
      }
    }

    try
    {
      dictionaries.push_back( std::make_shared< BglDictionary >( dictId,
                                                                 indexFile,
                                                                 *i ) );
    }
    catch( std::exception const & e )
    {
      gdWarning( "BGL dictionary initializing failed: %s, error: %s\n",
                 i->c_str(), e.what() );
    }
  }

  return dictionaries;
}

}