"; } QString const& AardDictionary::getDescription() { if( !dictionaryDescription.isEmpty() ) return dictionaryDescription; AAR_header dictHeader; quint32 size; vector< char > data; { Mutex::Lock _( aardMutex ); df.seek( 0 ); df.read( &dictHeader, sizeof(dictHeader) ); size = qFromBigEndian( dictHeader.metaLength ); data.resize( size ); df.read( &data.front(), size ); } string metaStr = decompressBzip2( data.data(), size ); if( metaStr.empty() ) metaStr = decompressZlib( data.data(), size ); map< string, string > meta = parseMetaData( metaStr ); if( !meta.empty() ) { map< string, string >::const_iterator iter = meta.find( "copyright" ); if( iter != meta.end() ) dictionaryDescription = QString( QObject::tr( "Copyright: %1%2" ) ).arg( QString::fromUtf8( iter->second.c_str() ) ).arg( "\n\n" ); iter = meta.find( "version" ); if( iter != meta.end() ) dictionaryDescription = QString( QObject::tr( "Version: %1%2" ) ).arg( QString::fromUtf8( iter->second.c_str() ) ).arg( "\n\n" ); iter = meta.find( "description" ); if( iter != meta.end() ) { QString desc = QString::fromUtf8( iter->second.c_str() ); desc.replace( "\\n", "\n" ); desc.replace( "\\t", "\t" ); dictionaryDescription += desc; } } if( dictionaryDescription.isEmpty() ) dictionaryDescription = "NONE"; return dictionaryDescription; } void AardDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration ) { if( !( Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName ) || FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) ) FTS_index_completed.ref(); if( haveFTSIndex() ) return; if( ensureInitDone().size() ) return; if( firstIteration && getArticleCount() > FTS::MaxDictionarySizeForFastSearch ) return; gdDebug( "Aard: Building the full-text index for dictionary: %s\n", getName().c_str() ); try { FtsHelpers::makeFTSIndex( this, isCancelled ); FTS_index_completed.ref(); } catch( std::exception &ex ) { gdWarning( "Aard: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() ); QFile::remove( FsEncoding::decode( ftsIdxName.c_str() ) ); } } void AardDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text ) { try { headword.clear(); string articleText; loadArticle( articleAddress, articleText ); text = Html::unescape( QString::fromUtf8( articleText.data(), articleText.size() ) ); } catch( std::exception &ex ) { gdWarning( "Aard: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() ); } } sptr< Dictionary::DataRequest > AardDictionary::getSearchResults( QString const & searchString, int searchMode, bool matchCase, int distanceBetweenWords, int maxResults, bool ignoreWordsOrder, bool ignoreDiacritics ) { return std::make_shared( *this, searchString,searchMode, matchCase, distanceBetweenWords, maxResults, ignoreWordsOrder, ignoreDiacritics ); } /// AardDictionary::getArticle() class AardArticleRequest; class AardArticleRequestRunnable: public QRunnable { AardArticleRequest & r; QSemaphore & hasExited; public: AardArticleRequestRunnable( AardArticleRequest & r_, QSemaphore & hasExited_ ): r( r_ ), hasExited( hasExited_ ) {} ~AardArticleRequestRunnable() { hasExited.release(); } virtual void run(); }; class AardArticleRequest: public Dictionary::DataRequest { friend class AardArticleRequestRunnable; wstring word; vector< wstring > alts; AardDictionary & dict; bool ignoreDiacritics; QAtomicInt isCancelled; QSemaphore hasExited; public: AardArticleRequest( wstring const & word_, vector< wstring > const & alts_, AardDictionary & dict_, bool ignoreDiacritics_ ): word( word_ ), alts( alts_ ), dict( dict_ ), ignoreDiacritics( ignoreDiacritics_ ) { QThreadPool::globalInstance()->start( new AardArticleRequestRunnable( *this, hasExited ) ); } void run(); // Run from another thread by DslArticleRequestRunnable virtual void cancel() { isCancelled.ref(); } ~AardArticleRequest() { isCancelled.ref(); hasExited.acquire(); } }; void AardArticleRequestRunnable::run() { r.run(); } void AardArticleRequest::run() { if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) { finish(); return; } vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics ); for( unsigned x = 0; x < alts.size(); ++x ) { /// Make an additional query for each alt vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics ); chain.insert( chain.end(), altChain.begin(), altChain.end() ); } multimap< wstring, pair< string, string > > mainArticles, alternateArticles; set< quint32 > articlesIncluded; // Some synonims make it that the articles // appear several times. We combat this // by only allowing them to appear once. wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); if( ignoreDiacritics ) wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); for( unsigned x = 0; x < chain.size(); ++x ) { if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) { finish(); return; } if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() ) continue; // We already have this article in the body. // Now grab that article string headword, articleText; headword = chain[ x ].word; try { dict.loadArticle( chain[ x ].articleOffset, articleText ); } catch(...) { } // Ok. Now, does it go to main articles, or to alternate ones? We list // main ones first, and alternates after. // We do the case-folded comparison here. wstring headwordStripped = Folding::applySimpleCaseOnly( Utf8::decode( headword ) ); if( ignoreDiacritics ) headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); multimap< wstring, pair< string, string > > & mapToUse = ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; mapToUse.insert( pair< wstring, pair< string, string > >( Folding::applySimpleCaseOnly( Utf8::decode( headword ) ), pair< string, string >( headword, articleText ) ) ); articlesIncluded.insert( chain[ x ].articleOffset ); } if ( mainArticles.empty() && alternateArticles.empty() ) { // No such word finish(); return; } string result; multimap< wstring, pair< string, string > >::const_iterator i; for( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { result += "

"; result += i->second.first; result += "

"; result += i->second.second; } for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i ) { result += "

"; result += i->second.first; result += "

"; result += i->second.second; } Mutex::Lock _( dataMutex ); data.resize( result.size() ); memcpy( &data.front(), result.data(), result.size() ); hasAnyData = true; finish(); } sptr< Dictionary::DataRequest > AardDictionary::getArticle( wstring const & word, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) { return std::make_shared( word, alts, *this, ignoreDiacritics ); } } // anonymous namespace vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, string const & indicesDir, Dictionary::Initializing & initializing, unsigned maxHeadwordsToExpand ) { vector< sptr< Dictionary::Class > > dictionaries; for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end(); ++i ) { // Skip files with the extensions different to .aar to speed up the // scanning if ( i->size() < 4 || strcasecmp( i->c_str() + ( i->size() - 4 ), ".aar" ) != 0 ) continue; // Got the file -- check if we need to rebuid the index vector< string > dictFiles( 1, *i ); string dictId = Dictionary::makeDictionaryId( dictFiles ); string indexFile = indicesDir + dictId; if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) || indexIsOldOrBad( indexFile ) ) { try { gdDebug( "Aard: Building the index for dictionary: %s\n", i->c_str() ); { QFileInfo info( FsEncoding::decode( i->c_str() ) ); if( static_cast< quint64 >( info.size() ) > ULONG_MAX ) { gdWarning( "File %s is too large\n", i->c_str() ); continue; } } File::Class df( *i, "rb" ); AAR_header dictHeader; df.read( &dictHeader, sizeof(dictHeader) ); bool has64bitIndex = !strncmp( dictHeader.indexItemFormat, ">LQ", 4 ); if( strncmp( dictHeader.signature, "aard", 4 ) || ( !has64bitIndex && strncmp( dictHeader.indexItemFormat, ">LL", 4 ) ) || strncmp( dictHeader.keyLengthFormat, ">H", 2 ) || strncmp( dictHeader.articleLengthFormat, ">L", 2) ) { gdWarning( "File %s is not in supported aard format\n", i->c_str() ); continue; } vector< char > data; quint32 size = qFromBigEndian( dictHeader.metaLength ); if( size == 0 ) { gdWarning( "File %s has invalid metadata", i->c_str() ); continue; } data.resize( size ); df.read( &data.front(), size ); string metaStr = decompressBzip2( data.data(), size ); if( metaStr.empty() ) metaStr = decompressZlib( data.data(), size ); map< string, string > meta = parseMetaData( metaStr ); if( meta.empty() ) { gdWarning( "File %s has invalid metadata", i->c_str() ); continue; } string dictName; map< string, string >::const_iterator iter = meta.find( "title" ); if( iter != meta.end() ) dictName = iter->second; string langFrom; iter = meta.find( "index_language" ); if( iter != meta.end() ) langFrom = iter->second; string langTo; iter = meta.find( "article_language" ); if( iter != meta.end() ) langTo = iter->second; if( ( dictName.compare( "Wikipedia") == 0 || dictName.compare( "Wikiquote" ) == 0 || dictName.compare( "Wiktionary" ) == 0 ) && !langTo.empty() ) { string capitalized = langTo.c_str(); capitalized[0] = toupper( capitalized[0] ); dictName = dictName + " (" + capitalized + ")"; } quint16 volumes = qFromBigEndian( dictHeader.totalVolumes ); if( volumes > 1 ) { QString ss=QString( " (%1/%2)").arg( qFromBigEndian( dictHeader.volume ), volumes ); dictName += ss.toLocal8Bit().data(); } initializing.indexingDictionary( dictName ); File::Class idx( indexFile, "wb" ); IdxHeader idxHeader; memset( &idxHeader, 0, sizeof( idxHeader ) ); // We write a dummy header first. At the end of the process the header // will be rewritten with the right values. idx.write( idxHeader ); idx.write( (quint32) dictName.size() ); if( !dictName.empty() ) idx.write( dictName.data(), dictName.size() ); IndexedWords indexedWords; ChunkedStorage::Writer chunks( idx ); quint32 wordCount = qFromBigEndian( dictHeader.wordsCount ); set< quint32 > articleOffsets; quint32 pos = df.tell(); quint32 wordsBase = pos + wordCount * ( has64bitIndex ? sizeof( IndexElement64 ) : sizeof( IndexElement ) ); quint32 articlesBase = qFromBigEndian( dictHeader.articleOffset ); data.clear(); for( quint32 j = 0; j < wordCount; j++ ) { quint32 articleOffset; quint32 wordOffset; if( has64bitIndex ) { IndexElement64 el64; df.seek( pos ); df.read( &el64, sizeof(el64) ); articleOffset = articlesBase + qFromBigEndian( el64.articleOffset ); wordOffset = wordsBase + qFromBigEndian( el64.wordOffset ); } else { IndexElement el; df.seek( pos ); df.read( &el, sizeof(el) ); articleOffset = articlesBase + qFromBigEndian( el.articleOffset ); wordOffset = wordsBase + qFromBigEndian( el.wordOffset ); } df.seek( wordOffset ); quint16 sizeBE; df.read( &sizeBE, sizeof(sizeBE) ); quint16 wordSize = qFromBigEndian( sizeBE ); if( data.size() < wordSize ) data.resize( wordSize ); df.read( &data.front(), wordSize ); if( articleOffsets.find( articleOffset ) == articleOffsets.end() ) articleOffsets.insert( articleOffset ); // Insert new entry wstring word = Utf8::decode( string( data.data(), wordSize ) ); if( maxHeadwordsToExpand && dictHeader.wordsCount >= maxHeadwordsToExpand ) indexedWords.addSingleWord( word, articleOffset); else indexedWords.addWord( word, articleOffset); pos += has64bitIndex ? sizeof( IndexElement64 ) : sizeof( IndexElement ); } data.clear(); idxHeader.articleCount = articleOffsets.size(); articleOffsets.clear(); // Finish with the chunks idxHeader.chunksOffset = chunks.finish(); // Build index IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx ); idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements; idxHeader.indexRootOffset = idxInfo.rootOffset; indexedWords.clear(); // Release memory -- no need for this data // That concludes it. Update the header. idxHeader.signature = Signature; idxHeader.formatVersion = CurrentFormatVersion; idxHeader.wordCount = wordCount; if( langFrom.size() == 3) idxHeader.langFrom = LangCoder::findIdForLanguageCode3( langFrom.c_str() ); else if( langFrom.size() == 2 ) idxHeader.langFrom = LangCoder::code2toInt( langFrom.c_str() ); if( langTo.size() == 3) idxHeader.langTo = LangCoder::findIdForLanguageCode3( langTo.c_str() ); else if( langTo.size() == 2 ) idxHeader.langTo = LangCoder::code2toInt( langTo.c_str() ); idx.rewind(); idx.write( &idxHeader, sizeof( idxHeader ) ); } catch( std::exception & e ) { gdWarning( "Aard dictionary indexing failed: %s, error: %s\n", i->c_str(), e.what() ); continue; } catch( ... ) { gdWarning( "Aard dictionary indexing failed\n" ); continue; } } // if need to rebuild try { dictionaries.push_back( std::make_shared( dictId, indexFile, dictFiles ) ); } catch( std::exception & e ) { gdWarning( "Aard dictionary initializing failed: %s, error: %s\n", i->c_str(), e.what() ); continue; } } return dictionaries; } }