mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-23 20:14:05 +00:00
32304d4b15
All Qt Headers already implicitly included it.
1516 lines
43 KiB
C++
1516 lines
43 KiB
C++
/* This file is (c) 2008-2009 Konstantin Isakov <ikm@goldendict.org>
|
|
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
|
|
|
#include "xdxf.hh"
|
|
#include "btreeidx.hh"
|
|
#include "folding.hh"
|
|
#include "utf8.hh"
|
|
#include "chunkedstorage.hh"
|
|
#include "dictzip.h"
|
|
#include "htmlescape.hh"
|
|
#include "fsencoding.hh"
|
|
#include <map>
|
|
#include <set>
|
|
#include <string>
|
|
#include <vector>
|
|
#include <list>
|
|
#include <wctype.h>
|
|
#include <stdlib.h>
|
|
#include "gddebug.hh"
|
|
#include "wstring_qt.hh"
|
|
#include "xdxf2html.hh"
|
|
#include "ufile.hh"
|
|
#include "dictzip.h"
|
|
#include "langcoder.hh"
|
|
#include "indexedzip.hh"
|
|
#include "filetype.hh"
|
|
#include "tiff.hh"
|
|
#include "ftshelpers.hh"
|
|
|
|
#ifdef _MSC_VER
|
|
#include <stub_msvc.h>
|
|
#endif
|
|
|
|
#include <QIODevice>
|
|
#include <QXmlStreamReader>
|
|
#include <QTextDocument>
|
|
#include <QFileInfo>
|
|
#include <QDir>
|
|
#include <QPainter>
|
|
#include <QRegExp>
|
|
#if (QT_VERSION >= QT_VERSION_CHECK(6,0,0))
|
|
#include <QtCore5Compat>
|
|
#endif
|
|
#include <QSemaphore>
|
|
#include <QThreadPool>
|
|
#include <QAtomicInt>
|
|
|
|
#include "utils.hh"
|
|
|
|
namespace Xdxf {
|
|
|
|
using std::map;
|
|
using std::multimap;
|
|
using std::pair;
|
|
using std::set;
|
|
using std::string;
|
|
using gd::wstring;
|
|
using std::vector;
|
|
using std::list;
|
|
|
|
using BtreeIndexing::WordArticleLink;
|
|
using BtreeIndexing::IndexedWords;
|
|
using BtreeIndexing::IndexInfo;
|
|
|
|
quint32 getLanguageId( const QString & lang )
|
|
{
|
|
QString lstr = lang.left( 3 );
|
|
|
|
if( lstr.endsWith( QChar( '-' ) ) )
|
|
lstr.chop( 1 );
|
|
|
|
switch( lstr.size() )
|
|
{
|
|
case 2: return LangCoder::code2toInt( lstr.toLatin1().data() );
|
|
case 3: return LangCoder::findIdForLanguageCode3( lstr.toLatin1().data() );
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
namespace {
|
|
|
|
DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex )
|
|
DEF_EX_STR( exNotXdxfFile, "The file is not an XDXF file:", Dictionary::Ex )
|
|
DEF_EX( exCorruptedIndex, "The index file is corrupted", Dictionary::Ex )
|
|
DEF_EX_STR( exDictzipError, "DICTZIP error", Dictionary::Ex )
|
|
|
|
enum
|
|
{
|
|
Signature = 0x46584458, // XDXF on little-endian, FXDX on big-endian
|
|
CurrentFormatVersion = 6 + BtreeIndexing::FormatVersion + Folding::Version
|
|
};
|
|
|
|
enum ArticleFormat
|
|
{
|
|
Default = 0,
|
|
Visual = 1,
|
|
Logical = 2
|
|
};
|
|
|
|
struct IdxHeader
|
|
{
|
|
uint32_t signature; // First comes the signature, XDXF
|
|
uint32_t formatVersion; // File format version (CurrentFormatVersion)
|
|
uint32_t articleFormat; // ArticleFormat value, except that 0 = bad file
|
|
uint32_t langFrom; // Source language
|
|
uint32_t langTo; // Target language
|
|
uint32_t articleCount; // Total number of articles
|
|
uint32_t wordCount; // Total number of words
|
|
uint32_t nameAddress; // Address of an utf8 name string, in chunks
|
|
uint32_t nameSize; // And its size
|
|
uint32_t descriptionAddress; // Address of an utf8 description string, in chunks
|
|
uint32_t descriptionSize; // And its size
|
|
uint32_t hasAbrv; // Non-zero means file has abrvs at abrvAddress
|
|
uint32_t abrvAddress; // Address of abrv map in the chunked storage
|
|
uint32_t chunksOffset; // The offset to chunks' storage
|
|
uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
|
|
uint32_t indexRootOffset;
|
|
uint32_t hasZipFile; // Non-zero means there's a zip file with resources
|
|
// present
|
|
uint32_t zipIndexBtreeMaxElements; // Two fields from IndexInfo of the zip
|
|
// resource index.
|
|
uint32_t zipIndexRootOffset;
|
|
uint32_t revisionNumber; // Format revision
|
|
}
|
|
#ifndef _MSC_VER
|
|
__attribute__((packed))
|
|
#endif
|
|
;
|
|
|
|
bool indexIsOldOrBad( string const & indexFile )
|
|
{
|
|
File::Class idx( indexFile, "rb" );
|
|
|
|
IdxHeader header;
|
|
|
|
return idx.readRecords( &header, sizeof( header ), 1 ) != 1 ||
|
|
header.signature != Signature ||
|
|
header.formatVersion != CurrentFormatVersion ||
|
|
!header.articleFormat;
|
|
}
|
|
|
|
|
|
|
|
class XdxfDictionary: public BtreeIndexing::BtreeDictionary
|
|
{
|
|
Mutex idxMutex;
|
|
File::Class idx;
|
|
IdxHeader idxHeader;
|
|
sptr< ChunkedStorage::Reader > chunks;
|
|
Mutex dzMutex;
|
|
dictData * dz;
|
|
Mutex resourceZipMutex;
|
|
IndexedZip resourceZip;
|
|
string dictionaryName;
|
|
map< string, string > abrv;
|
|
|
|
public:
|
|
|
|
XdxfDictionary( string const & id, string const & indexFile,
|
|
vector< string > const & dictionaryFiles );
|
|
|
|
~XdxfDictionary();
|
|
|
|
string getName() noexcept override
|
|
{ return dictionaryName; }
|
|
|
|
map< Dictionary::Property, string > getProperties() noexcept override
|
|
{ return map< Dictionary::Property, string >(); }
|
|
|
|
unsigned long getArticleCount() noexcept override
|
|
{ return idxHeader.articleCount; }
|
|
|
|
unsigned long getWordCount() noexcept override
|
|
{ return idxHeader.wordCount; }
|
|
|
|
inline quint32 getLangFrom() const override
|
|
{ return idxHeader.langFrom; }
|
|
|
|
inline quint32 getLangTo() const override
|
|
{ return idxHeader.langTo; }
|
|
|
|
sptr< Dictionary::DataRequest > getArticle( wstring const &,
|
|
vector< wstring > const & alts,
|
|
wstring const &,
|
|
bool ignoreDiacritics ) override
|
|
;
|
|
|
|
sptr< Dictionary::DataRequest > getResource( string const & name ) override
|
|
;
|
|
|
|
QString const& getDescription() override;
|
|
|
|
QString getMainFilename() override;
|
|
|
|
sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString,
|
|
int searchMode, bool matchCase,
|
|
int distanceBetweenWords,
|
|
int maxResults,
|
|
bool ignoreWordsOrder,
|
|
bool ignoreDiacritics ) override;
|
|
void getArticleText( uint32_t articleAddress, QString & headword, QString & text ) override;
|
|
|
|
void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration ) override;
|
|
|
|
void setFTSParameters( Config::FullTextSearch const & fts ) override
|
|
{
|
|
can_FTS = fts.enabled
|
|
&& !fts.disabledTypes.contains( "XDXF", Qt::CaseInsensitive )
|
|
&& ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
|
|
}
|
|
|
|
uint32_t getFtsIndexVersion() override
|
|
{ return 1; }
|
|
|
|
protected:
|
|
|
|
void loadIcon() noexcept override;
|
|
|
|
private:
|
|
|
|
// Loads the article, storing its headword and formatting article's data into an html.
|
|
void loadArticle( uint32_t address,
|
|
string & articleText, QString * headword = 0 );
|
|
|
|
friend class XdxfArticleRequest;
|
|
friend class XdxfResourceRequest;
|
|
};
|
|
|
|
XdxfDictionary::XdxfDictionary( string const & id,
|
|
string const & indexFile,
|
|
vector< string > const & dictionaryFiles ):
|
|
BtreeDictionary( id, dictionaryFiles ),
|
|
idx( indexFile, "rb" ),
|
|
idxHeader( idx.read< IdxHeader >() )
|
|
{
|
|
// Read the dictionary name
|
|
|
|
chunks = std::shared_ptr<ChunkedStorage::Reader>(new ChunkedStorage::Reader(idx, idxHeader.chunksOffset ));
|
|
|
|
if ( idxHeader.nameSize )
|
|
{
|
|
vector< char > chunk;
|
|
|
|
dictionaryName = string( chunks->getBlock( idxHeader.nameAddress, chunk ),
|
|
idxHeader.nameSize );
|
|
}
|
|
|
|
// Open the file
|
|
|
|
DZ_ERRORS error;
|
|
dz = dict_data_open( dictionaryFiles[ 0 ].c_str(), &error, 0 );
|
|
|
|
if ( !dz )
|
|
throw exDictzipError( string( dz_error_str( error ) )
|
|
+ "(" + dictionaryFiles[ 0 ] + ")" );
|
|
|
|
// Read the abrv, if any
|
|
|
|
if ( idxHeader.hasAbrv )
|
|
{
|
|
vector< char > chunk;
|
|
|
|
char * abrvBlock = chunks->getBlock( idxHeader.abrvAddress, chunk );
|
|
|
|
uint32_t total;
|
|
memcpy( &total, abrvBlock, sizeof( uint32_t ) );
|
|
abrvBlock += sizeof( uint32_t );
|
|
|
|
while( total-- )
|
|
{
|
|
uint32_t keySz;
|
|
memcpy( &keySz, abrvBlock, sizeof( uint32_t ) );
|
|
abrvBlock += sizeof( uint32_t );
|
|
|
|
char * key = abrvBlock;
|
|
|
|
abrvBlock += keySz;
|
|
|
|
uint32_t valueSz;
|
|
memcpy( &valueSz, abrvBlock, sizeof( uint32_t ) );
|
|
abrvBlock += sizeof( uint32_t );
|
|
|
|
abrv[ string( key, keySz ) ] = string( abrvBlock, valueSz );
|
|
|
|
abrvBlock += valueSz;
|
|
}
|
|
|
|
// Open a resource zip file, if there's one
|
|
|
|
if ( idxHeader.hasZipFile &&
|
|
( idxHeader.zipIndexBtreeMaxElements ||
|
|
idxHeader.zipIndexRootOffset ) )
|
|
{
|
|
resourceZip.openIndex( IndexInfo( idxHeader.zipIndexBtreeMaxElements,
|
|
idxHeader.zipIndexRootOffset ),
|
|
idx, idxMutex );
|
|
|
|
QString zipName = QDir::fromNativeSeparators(
|
|
FsEncoding::decode( getDictionaryFilenames().back().c_str() ) );
|
|
|
|
if ( zipName.endsWith( ".zip", Qt::CaseInsensitive ) ) // Sanity check
|
|
resourceZip.openZipFile( zipName );
|
|
}
|
|
}
|
|
|
|
// Initialize the index
|
|
|
|
openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
|
|
idxHeader.indexRootOffset ),
|
|
idx, idxMutex );
|
|
|
|
// Full-text search parameters
|
|
|
|
can_FTS = true;
|
|
|
|
ftsIdxName = indexFile + Dictionary::getFtsSuffix();
|
|
|
|
if( !Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName )
|
|
&& !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) )
|
|
FTS_index_completed.ref();
|
|
}
|
|
|
|
XdxfDictionary::~XdxfDictionary()
|
|
{
|
|
if ( dz )
|
|
dict_data_close( dz );
|
|
}
|
|
|
|
void XdxfDictionary::loadIcon() noexcept
|
|
{
|
|
if ( dictionaryIconLoaded )
|
|
return;
|
|
|
|
QString fileName =
|
|
QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
|
|
|
|
QFileInfo baseInfo( fileName );
|
|
|
|
fileName = baseInfo.absoluteDir().absoluteFilePath( "icon32.png" );
|
|
QFileInfo info( fileName );
|
|
|
|
if( !info.isFile() )
|
|
{
|
|
fileName = baseInfo.absoluteDir().absoluteFilePath( "icon16.png" );
|
|
info = QFileInfo( fileName );
|
|
}
|
|
|
|
if ( info.isFile() )
|
|
loadIconFromFile( fileName, true );
|
|
|
|
if ( dictionaryIcon.isNull() )
|
|
{
|
|
// Load failed -- use default icons
|
|
|
|
dictionaryIcon = QIcon(":/icons/icon32_xdxf.png");
|
|
dictionaryNativeIcon = QIcon(":/icons/icon32_xdxf.png");
|
|
}
|
|
|
|
dictionaryIconLoaded = true;
|
|
}
|
|
|
|
QString const& XdxfDictionary::getDescription()
|
|
{
|
|
if( !dictionaryDescription.isEmpty() )
|
|
return dictionaryDescription;
|
|
|
|
if( idxHeader.descriptionAddress == 0 )
|
|
dictionaryDescription = "NONE";
|
|
else
|
|
{
|
|
try
|
|
{
|
|
vector< char > chunk;
|
|
char * descr;
|
|
{
|
|
Mutex::Lock _( idxMutex );
|
|
descr = chunks->getBlock( idxHeader.descriptionAddress, chunk );
|
|
}
|
|
dictionaryDescription = QString::fromUtf8( descr, idxHeader.descriptionSize );
|
|
}
|
|
catch(...)
|
|
{
|
|
}
|
|
}
|
|
return dictionaryDescription;
|
|
}
|
|
|
|
QString XdxfDictionary::getMainFilename()
|
|
{
|
|
return FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() );
|
|
}
|
|
|
|
void XdxfDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
|
|
{
|
|
if( !( Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
|
|
|| FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) )
|
|
FTS_index_completed.ref();
|
|
|
|
if( haveFTSIndex() )
|
|
return;
|
|
|
|
if( ensureInitDone().size() )
|
|
return;
|
|
|
|
if( firstIteration && getArticleCount() > FTS::MaxDictionarySizeForFastSearch )
|
|
return;
|
|
|
|
gdDebug( "Xdxf: Building the full-text index for dictionary: %s\n",
|
|
getName().c_str() );
|
|
|
|
try
|
|
{
|
|
FtsHelpers::makeFTSIndex( this, isCancelled );
|
|
FTS_index_completed.ref();
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdWarning( "Xdxf: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );
|
|
QFile::remove( FsEncoding::decode( ftsIdxName.c_str() ) );
|
|
}
|
|
}
|
|
|
|
void XdxfDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
|
|
{
|
|
try
|
|
{
|
|
string articleStr;
|
|
loadArticle( articleAddress, articleStr, &headword );
|
|
|
|
wstring wstr = Utf8::decode( articleStr );
|
|
|
|
text = Html::unescape( gd::toQString( wstr ) );
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdWarning( "Xdxf: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
|
|
}
|
|
}
|
|
|
|
sptr< Dictionary::DataRequest > XdxfDictionary::getSearchResults( QString const & searchString,
|
|
int searchMode, bool matchCase,
|
|
int distanceBetweenWords,
|
|
int maxResults,
|
|
bool ignoreWordsOrder,
|
|
bool ignoreDiacritics )
|
|
{
|
|
return std::make_shared<FtsHelpers::FTSResultsRequest>( *this, searchString,searchMode, matchCase, distanceBetweenWords, maxResults, ignoreWordsOrder, ignoreDiacritics );
|
|
}
|
|
|
|
/// XdxfDictionary::getArticle()
|
|
|
|
class XdxfArticleRequest;
|
|
|
|
class XdxfArticleRequestRunnable: public QRunnable
|
|
{
|
|
XdxfArticleRequest & r;
|
|
QSemaphore & hasExited;
|
|
|
|
public:
|
|
|
|
XdxfArticleRequestRunnable( XdxfArticleRequest & r_,
|
|
QSemaphore & hasExited_ ): r( r_ ),
|
|
hasExited( hasExited_ )
|
|
{}
|
|
|
|
~XdxfArticleRequestRunnable()
|
|
{
|
|
hasExited.release();
|
|
}
|
|
|
|
void run() override;
|
|
};
|
|
|
|
class XdxfArticleRequest: public Dictionary::DataRequest
|
|
{
|
|
friend class XdxfArticleRequestRunnable;
|
|
|
|
wstring word;
|
|
vector< wstring > alts;
|
|
XdxfDictionary & dict;
|
|
bool ignoreDiacritics;
|
|
|
|
QAtomicInt isCancelled;
|
|
QSemaphore hasExited;
|
|
|
|
public:
|
|
|
|
XdxfArticleRequest( wstring const & word_,
|
|
vector< wstring > const & alts_,
|
|
XdxfDictionary & dict_, bool ignoreDiacritics_ ):
|
|
word( word_ ), alts( alts_ ), dict( dict_ ), ignoreDiacritics( ignoreDiacritics_ )
|
|
{
|
|
QThreadPool::globalInstance()->start(
|
|
new XdxfArticleRequestRunnable( *this, hasExited ) );
|
|
}
|
|
|
|
void run(); // Run from another thread by XdxfArticleRequestRunnable
|
|
|
|
void cancel() override
|
|
{
|
|
isCancelled.ref();
|
|
}
|
|
|
|
~XdxfArticleRequest()
|
|
{
|
|
isCancelled.ref();
|
|
hasExited.acquire();
|
|
}
|
|
};
|
|
|
|
void XdxfArticleRequestRunnable::run()
|
|
{
|
|
r.run();
|
|
}
|
|
|
|
void XdxfArticleRequest::run()
|
|
{
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
|
|
|
|
for( unsigned x = 0; x < alts.size(); ++x )
|
|
{
|
|
/// Make an additional query for each alt
|
|
|
|
vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
|
|
|
|
chain.insert( chain.end(), altChain.begin(), altChain.end() );
|
|
}
|
|
|
|
multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
|
|
|
|
set< uint32_t > articlesIncluded; // Some synonims make it that the articles
|
|
// appear several times. We combat this
|
|
// by only allowing them to appear once.
|
|
|
|
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
|
|
if( ignoreDiacritics )
|
|
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
|
|
|
|
for( unsigned x = 0; x < chain.size(); ++x )
|
|
{
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
|
|
continue; // We already have this article in the body.
|
|
|
|
// Now grab that article
|
|
|
|
string headword, articleText;
|
|
|
|
headword = chain[ x ].word;
|
|
|
|
try
|
|
{
|
|
dict.loadArticle( chain[ x ].articleOffset, articleText );
|
|
|
|
// Ok. Now, does it go to main articles, or to alternate ones? We list
|
|
// main ones first, and alternates after.
|
|
|
|
// We do the case-folded comparison here.
|
|
|
|
wstring headwordStripped =
|
|
Folding::applySimpleCaseOnly( Utf8::decode( headword ) );
|
|
if( ignoreDiacritics )
|
|
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
|
|
|
|
multimap< wstring, pair< string, string > > & mapToUse =
|
|
( wordCaseFolded == headwordStripped ) ?
|
|
mainArticles : alternateArticles;
|
|
|
|
mapToUse.insert( pair< wstring, pair< string, string > >(
|
|
Folding::applySimpleCaseOnly( Utf8::decode( headword ) ),
|
|
pair< string, string >( headword, articleText ) ) );
|
|
|
|
articlesIncluded.insert( chain[ x ].articleOffset );
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdWarning( "XDXF: Failed loading article from \"%s\", reason: %s\n", dict.getName().c_str(), ex.what() );
|
|
}
|
|
}
|
|
|
|
if ( mainArticles.empty() && alternateArticles.empty() )
|
|
{
|
|
// No such word
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
string result;
|
|
|
|
multimap< wstring, pair< string, string > >::const_iterator i;
|
|
|
|
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
|
|
"</font>""</font>""</font>""</font>""</font>""</font>"
|
|
"</b></b></b></b></b></b></b></b>"
|
|
"</i></i></i></i></i></i></i></i>";
|
|
|
|
for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
|
|
{
|
|
// result += "<h3>";
|
|
// result += i->second.first;
|
|
// result += "</h3>";
|
|
result += i->second.second;
|
|
result += cleaner;
|
|
}
|
|
|
|
for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
|
|
{
|
|
// result += "<h3>";
|
|
// result += i->second.first;
|
|
// result += "</h3>";
|
|
result += i->second.second;
|
|
result += cleaner;
|
|
}
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
data.resize( result.size() );
|
|
|
|
memcpy( &data.front(), result.data(), result.size() );
|
|
|
|
hasAnyData = true;
|
|
|
|
finish();
|
|
}
|
|
|
|
sptr< Dictionary::DataRequest > XdxfDictionary::getArticle( wstring const & word,
|
|
vector< wstring > const & alts,
|
|
wstring const &,
|
|
bool ignoreDiacritics )
|
|
|
|
{
|
|
return std::make_shared<XdxfArticleRequest>( word, alts, *this, ignoreDiacritics );
|
|
}
|
|
|
|
void XdxfDictionary::loadArticle( uint32_t address,
|
|
string & articleText,
|
|
QString * headword )
|
|
{
|
|
// Read the properties
|
|
|
|
vector< char > chunk;
|
|
|
|
char * propertiesData;
|
|
|
|
{
|
|
Mutex::Lock _( idxMutex );
|
|
|
|
propertiesData = chunks->getBlock( address, chunk );
|
|
}
|
|
|
|
if ( &chunk.front() + chunk.size() - propertiesData < 9 )
|
|
{
|
|
articleText = string( "<div class=\"xdxf\">Index seems corrupted</div>" );
|
|
return;
|
|
}
|
|
|
|
unsigned char fType = (unsigned char) *propertiesData;
|
|
|
|
uint32_t articleOffset, articleSize;
|
|
|
|
memcpy( &articleOffset, propertiesData + 1, sizeof( uint32_t ) );
|
|
memcpy( &articleSize, propertiesData + 5, sizeof( uint32_t ) );
|
|
|
|
// Load the article
|
|
|
|
char * articleBody;
|
|
|
|
{
|
|
Mutex::Lock _( dzMutex );
|
|
|
|
// Note that the function always zero-pads the result.
|
|
articleBody = dict_data_read_( dz, articleOffset, articleSize, 0, 0 );
|
|
}
|
|
|
|
if ( !articleBody )
|
|
{
|
|
// throw exCantReadFile( getDictionaryFilenames()[ 0 ] );
|
|
articleText = string( "<div class=\"xdxf\">DICTZIP error: " ) + dict_error_str( dz ) + "</div>";
|
|
return;
|
|
}
|
|
|
|
articleText = Xdxf2Html::convert( string( articleBody ), Xdxf2Html::XDXF, idxHeader.hasAbrv ? &abrv : NULL, this,
|
|
&resourceZip, fType == Logical, idxHeader.revisionNumber, headword );
|
|
|
|
free( articleBody );
|
|
}
|
|
|
|
class GzippedFile: public QIODevice
|
|
{
|
|
gzFile gz;
|
|
|
|
public:
|
|
|
|
GzippedFile( char const * fileName ) ;
|
|
|
|
~GzippedFile();
|
|
|
|
// size_t gzTell();
|
|
|
|
char * readDataArray( unsigned long startPos, unsigned long size );
|
|
|
|
protected:
|
|
|
|
dictData *dz;
|
|
|
|
bool isSequential () const override
|
|
{ return false; } // Which is a lie, but else pos() won't work
|
|
|
|
bool waitForReadyRead ( int ) override
|
|
{ return !gzeof( gz ); }
|
|
|
|
qint64 bytesAvailable() const override
|
|
{
|
|
return ( gzeof( gz ) ? 0 : 1 ) + QIODevice::bytesAvailable();
|
|
}
|
|
|
|
qint64 readData( char * data, qint64 maxSize ) override;
|
|
|
|
bool atEnd() const override;
|
|
|
|
qint64 writeData ( const char * /*data*/, qint64 /*maxSize*/ ) override
|
|
{ return -1; }
|
|
};
|
|
|
|
GzippedFile::GzippedFile( char const * fileName )
|
|
{
|
|
gz = gd_gzopen( fileName );
|
|
if ( !gz )
|
|
throw exCantReadFile( fileName );
|
|
|
|
DZ_ERRORS error;
|
|
dz = dict_data_open( fileName, &error, 0 );
|
|
}
|
|
|
|
GzippedFile::~GzippedFile()
|
|
{
|
|
gzclose( gz );
|
|
if( dz )
|
|
dict_data_close( dz );
|
|
}
|
|
|
|
bool GzippedFile::atEnd() const
|
|
{
|
|
return gzeof( gz );
|
|
}
|
|
|
|
/*
|
|
size_t GzippedFile::gzTell()
|
|
{
|
|
return gztell( gz );
|
|
}
|
|
*/
|
|
|
|
qint64 GzippedFile::readData( char * data, qint64 maxSize )
|
|
{
|
|
if ( maxSize > 1 )
|
|
maxSize = 1;
|
|
|
|
// The returning value translates directly to QIODevice semantics
|
|
int n = gzread( gz, data, maxSize );
|
|
|
|
// With QT 5.x QXmlStreamReader ask one byte instead of one UTF-8 char.
|
|
// We read and return all bytes for char.
|
|
|
|
if( n == 1 )
|
|
{
|
|
char ch = *data;
|
|
int addBytes = 0;
|
|
if( ch & 0x80 )
|
|
{
|
|
if( ( ch & 0xF8 ) == 0xF0 )
|
|
addBytes = 3;
|
|
else if( ( ch & 0xF0 ) == 0xE0 )
|
|
addBytes = 2;
|
|
else if( ( ch & 0xE0 ) == 0xC0 )
|
|
addBytes = 1;
|
|
}
|
|
|
|
if( addBytes )
|
|
n += gzread( gz, data + 1, addBytes );
|
|
}
|
|
|
|
return n;
|
|
}
|
|
|
|
char * GzippedFile::readDataArray( unsigned long startPos, unsigned long size )
|
|
{
|
|
if( dz == NULL )
|
|
return NULL;
|
|
return dict_data_read_( dz, startPos, size, 0, 0 );
|
|
}
|
|
|
|
QString readXhtmlData( QXmlStreamReader & stream )
|
|
{
|
|
QString result;
|
|
|
|
while( !stream.atEnd() )
|
|
{
|
|
stream.readNext();
|
|
|
|
if ( stream.isStartElement() )
|
|
{
|
|
QString name = stream.name().toString();
|
|
|
|
result += "<" + Utils::escape( name ) + " ";
|
|
|
|
QXmlStreamAttributes attrs = stream.attributes();
|
|
|
|
for( int x = 0; x < attrs.size(); ++x )
|
|
{
|
|
result += Utils::escape( attrs[ x ].name().toString() );
|
|
result += "=\"" + Utils::escape( attrs[ x ].value().toString() ) + "\"";
|
|
}
|
|
|
|
result += ">";
|
|
|
|
result += readXhtmlData( stream );
|
|
|
|
result += "</" + Utils::escape( name ) + ">";
|
|
}
|
|
else
|
|
if ( stream.isCharacters() || stream.isWhitespace() || stream.isCDATA() )
|
|
{
|
|
result += stream.text();
|
|
}
|
|
else
|
|
if ( stream.isEndElement() )
|
|
break;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
namespace {
|
|
|
|
/// Deal with Qt 4.5 incompatibility
|
|
QString readElementText( QXmlStreamReader & stream )
|
|
{
|
|
return stream.readElementText( QXmlStreamReader::SkipChildElements );
|
|
}
|
|
|
|
}
|
|
|
|
|
|
void addAllKeyTags( QXmlStreamReader & stream, list< QString > & words )
|
|
{
|
|
// todo implement support for tag <srt>, that overrides the article sorting order
|
|
if ( stream.name() == u"k" )
|
|
{
|
|
words.push_back( readElementText( stream ) );
|
|
return;
|
|
}
|
|
|
|
while( !stream.atEnd() )
|
|
{
|
|
stream.readNext();
|
|
|
|
if ( stream.isStartElement() )
|
|
addAllKeyTags( stream, words );
|
|
else
|
|
if ( stream.isEndElement() )
|
|
return;
|
|
}
|
|
}
|
|
|
|
void checkArticlePosition( GzippedFile & gzFile,
|
|
uint32_t *pOffset,
|
|
uint32_t *pSize )
|
|
{
|
|
char * data = gzFile.readDataArray( *pOffset, *pSize );
|
|
if( data == NULL )
|
|
return;
|
|
QString s = QString::fromUtf8( data );
|
|
free( data );
|
|
int n = s.lastIndexOf( "</ar" );
|
|
if( n > 0 )
|
|
*pSize -= s.size() - n;
|
|
if( s.at( 0 ) == '>')
|
|
{
|
|
*pOffset += 1;
|
|
*pSize -= 1;
|
|
}
|
|
}
|
|
|
|
void indexArticle( GzippedFile & gzFile,
|
|
QXmlStreamReader & stream,
|
|
IndexedWords & indexedWords,
|
|
ChunkedStorage::Writer & chunks,
|
|
unsigned & articleCount,
|
|
unsigned & wordCount,
|
|
ArticleFormat defaultFormat )
|
|
{
|
|
ArticleFormat format( Default );
|
|
|
|
QStringView formatValue = stream.attributes().value( "f" );
|
|
|
|
if ( formatValue == u"v" )
|
|
format = Visual;
|
|
else
|
|
if ( formatValue == u"l" )
|
|
format = Logical;
|
|
if( format == Default )
|
|
format = defaultFormat;
|
|
size_t articleOffset = gzFile.pos() - 1; // stream.characterOffset() is loony
|
|
|
|
// uint32_t lineNumber = stream.lineNumber();
|
|
// uint32_t columnNumber = stream.columnNumber();
|
|
|
|
list< QString > words;
|
|
|
|
while( !stream.atEnd() )
|
|
{
|
|
stream.readNext();
|
|
|
|
// Find any <k> tags and index them
|
|
if ( stream.isEndElement() )
|
|
{
|
|
// End of the <ar> tag
|
|
|
|
if ( words.empty() )
|
|
{
|
|
// Nothing to index, this article didn't have any tags
|
|
gdWarning( "No <k> tags found in an article at offset 0x%x, article skipped.\n",
|
|
(unsigned) articleOffset );
|
|
}
|
|
else
|
|
{
|
|
// Add an entry
|
|
|
|
uint32_t offset = chunks.startNewBlock();
|
|
|
|
uint32_t offs = articleOffset;
|
|
uint32_t size = gzFile.pos() - 1 - articleOffset;
|
|
|
|
checkArticlePosition( gzFile, &offs, &size );
|
|
|
|
unsigned char f = format;
|
|
chunks.addToBlock( &f, 1 );
|
|
chunks.addToBlock( &offs, sizeof( offs ) );
|
|
chunks.addToBlock( &size, sizeof( size ) );
|
|
|
|
// Add also first header - it's needed for full-text search
|
|
chunks.addToBlock( words.begin()->toUtf8().data(), words.begin()->toUtf8().length() + 1 );
|
|
|
|
// GD_DPRINTF( "%x: %s\n", articleOffset, words.begin()->toUtf8().data() );
|
|
|
|
// Add words to index
|
|
|
|
for( list< QString >::const_iterator i = words.begin(); i != words.end();
|
|
++i )
|
|
indexedWords.addWord( gd::toWString( *i ), offset );
|
|
|
|
++articleCount;
|
|
|
|
wordCount += words.size();
|
|
}
|
|
|
|
return;
|
|
}
|
|
else
|
|
if ( stream.isStartElement() )
|
|
{
|
|
addAllKeyTags( stream, words );
|
|
}
|
|
}
|
|
}
|
|
|
|
//// XdxfDictionary::getResource()
|
|
|
|
class XdxfResourceRequest;
|
|
|
|
class XdxfResourceRequestRunnable: public QRunnable
|
|
{
|
|
XdxfResourceRequest & r;
|
|
QSemaphore & hasExited;
|
|
|
|
public:
|
|
|
|
XdxfResourceRequestRunnable( XdxfResourceRequest & r_,
|
|
QSemaphore & hasExited_ ): r( r_ ),
|
|
hasExited( hasExited_ )
|
|
{}
|
|
|
|
~XdxfResourceRequestRunnable()
|
|
{
|
|
hasExited.release();
|
|
}
|
|
|
|
void run() override;
|
|
};
|
|
|
|
class XdxfResourceRequest: public Dictionary::DataRequest
|
|
{
|
|
friend class XdxfResourceRequestRunnable;
|
|
|
|
XdxfDictionary & dict;
|
|
|
|
string resourceName;
|
|
|
|
QAtomicInt isCancelled;
|
|
QSemaphore hasExited;
|
|
|
|
public:
|
|
|
|
XdxfResourceRequest( XdxfDictionary & dict_,
|
|
string const & resourceName_ ):
|
|
dict( dict_ ),
|
|
resourceName( resourceName_ )
|
|
{
|
|
QThreadPool::globalInstance()->start(
|
|
new XdxfResourceRequestRunnable( *this, hasExited ) );
|
|
}
|
|
|
|
void run(); // Run from another thread by XdxfResourceRequestRunnable
|
|
|
|
void cancel() override
|
|
{
|
|
isCancelled.ref();
|
|
}
|
|
|
|
~XdxfResourceRequest()
|
|
{
|
|
isCancelled.ref();
|
|
hasExited.acquire();
|
|
}
|
|
};
|
|
|
|
void XdxfResourceRequestRunnable::run()
|
|
{
|
|
r.run();
|
|
}
|
|
|
|
void XdxfResourceRequest::run()
|
|
{
|
|
// Some runnables linger enough that they are cancelled before they start
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
if ( dict.ensureInitDone().size() )
|
|
{
|
|
setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
string n =
|
|
FsEncoding::dirname( dict.getDictionaryFilenames()[ 0 ] ) +
|
|
FsEncoding::separator() +
|
|
FsEncoding::encode( resourceName );
|
|
|
|
GD_DPRINTF( "n is %s\n", n.c_str() );
|
|
|
|
try
|
|
{
|
|
try
|
|
{
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
File::loadFromFile( n, data );
|
|
}
|
|
catch( File::exCantOpen & )
|
|
{
|
|
n = dict.getDictionaryFilenames()[ 0 ] + ".files" +
|
|
FsEncoding::separator() +
|
|
FsEncoding::encode( resourceName );
|
|
|
|
try
|
|
{
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
File::loadFromFile( n, data );
|
|
}
|
|
catch( File::exCantOpen & )
|
|
{
|
|
// Try reading from zip file
|
|
|
|
if ( dict.resourceZip.isOpen() )
|
|
{
|
|
Mutex::Lock _( dict.resourceZipMutex );
|
|
|
|
Mutex::Lock __( dataMutex );
|
|
|
|
if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) )
|
|
throw; // Make it fail since we couldn't read the archive
|
|
}
|
|
else
|
|
throw;
|
|
}
|
|
}
|
|
|
|
if ( Filetype::isNameOfTiff( resourceName ) )
|
|
{
|
|
// Convert it
|
|
Mutex::Lock _( dataMutex );
|
|
GdTiff::tiff2img( data );
|
|
}
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
hasAnyData = true;
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdWarning( "XDXF: Failed loading resource \"%s\" for \"%s\", reason: %s\n",
|
|
resourceName.c_str(), dict.getName().c_str(), ex.what() );
|
|
// Resource not loaded -- we don't set the hasAnyData flag then
|
|
}
|
|
|
|
finish();
|
|
}
|
|
|
|
sptr< Dictionary::DataRequest > XdxfDictionary::getResource( string const & name )
|
|
|
|
{
|
|
return std::make_shared<XdxfResourceRequest>( *this, name );
|
|
}
|
|
|
|
}
|
|
// anonymous namespace - this section of file is devoted to rebuilding of dictionary articles index
|
|
|
|
vector< sptr< Dictionary::Class > > makeDictionaries(
|
|
vector< string > const & fileNames,
|
|
string const & indicesDir,
|
|
Dictionary::Initializing & initializing )
|
|
|
|
{
|
|
vector< sptr< Dictionary::Class > > dictionaries;
|
|
|
|
for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
|
|
++i )
|
|
{
|
|
// Only allow .xdxf and .xdxf.dz suffixes
|
|
|
|
if ( ( i->size() < 5 || strcasecmp( i->c_str() + ( i->size() - 5 ), ".xdxf" ) != 0 ) &&
|
|
( i->size() < 8 ||
|
|
strcasecmp( i->c_str() + ( i->size() - 8 ), ".xdxf.dz" ) != 0 ) )
|
|
continue;
|
|
|
|
try
|
|
{
|
|
vector< string > dictFiles( 1, *i );
|
|
|
|
string baseName = ( (*i)[ i->size() - 5 ] == '.' ) ?
|
|
string( *i, 0, i->size() - 5 ) : string( *i, 0, i->size() - 8 );
|
|
|
|
// See if there's a zip file with resources present. If so, include it.
|
|
|
|
string zipFileName;
|
|
|
|
if ( File::tryPossibleZipName( baseName + ".xdxf.files.zip", zipFileName ) ||
|
|
File::tryPossibleZipName( baseName + ".xdxf.dz.files.zip", zipFileName ) ||
|
|
File::tryPossibleZipName( baseName + ".XDXF.FILES.ZIP", zipFileName ) ||
|
|
File::tryPossibleZipName( baseName + ".XDXF.DZ.FILES.ZIP", zipFileName ) )
|
|
dictFiles.push_back( zipFileName );
|
|
|
|
string dictId = Dictionary::makeDictionaryId( dictFiles );
|
|
|
|
string indexFile = indicesDir + dictId;
|
|
|
|
if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
|
|
indexIsOldOrBad( indexFile ) )
|
|
{
|
|
// Building the index
|
|
|
|
gdDebug( "Xdxf: Building the index for dictionary: %s\n", i->c_str() );
|
|
|
|
//initializing.indexingDictionary( nameFromFileName( dictFiles[ 0 ] ) );
|
|
|
|
File::Class idx( indexFile, "wb" );
|
|
|
|
IdxHeader idxHeader;
|
|
map< string, string > abrv;
|
|
|
|
memset( &idxHeader, 0, sizeof( idxHeader ) );
|
|
|
|
// We write a dummy header first. At the end of the process the header
|
|
// will be rewritten with the right values.
|
|
|
|
idx.write( idxHeader );
|
|
|
|
IndexedWords indexedWords;
|
|
|
|
GzippedFile gzFile( dictFiles[ 0 ].c_str() );
|
|
|
|
if ( !gzFile.open( QIODevice::ReadOnly ) )
|
|
throw exCantReadFile( dictFiles[ 0 ] );
|
|
|
|
QXmlStreamReader stream( &gzFile );
|
|
|
|
QString dictionaryName, dictionaryDescription;
|
|
|
|
ChunkedStorage::Writer chunks( idx );
|
|
|
|
// Wait for the first element, which must be xdxf
|
|
|
|
bool hadXdxf = false;
|
|
|
|
while( !stream.atEnd() )
|
|
{
|
|
stream.readNext();
|
|
|
|
if ( stream.isStartElement() )
|
|
{
|
|
if ( stream.name() != u"xdxf" )
|
|
throw exNotXdxfFile( dictFiles[ 0 ] );
|
|
else
|
|
{
|
|
// Read the xdxf
|
|
|
|
string str = stream.attributes().value( "lang_from" ).toString().toLatin1().data();
|
|
if( !str.empty() )
|
|
idxHeader.langFrom = getLanguageId( str.c_str() );
|
|
|
|
str = stream.attributes().value( "lang_to" ).toString().toLatin1().data();
|
|
if( !str.empty() )
|
|
idxHeader.langTo = getLanguageId( str.c_str() );
|
|
|
|
QRegExp regNum( "\\d+" );
|
|
regNum.indexIn( stream.attributes().value( "revision" ).toString() );
|
|
idxHeader.revisionNumber = regNum.cap().toUInt();
|
|
|
|
bool isLogical = ( stream.attributes().value( "format" ) == u"logical" || idxHeader.revisionNumber >= 34 );
|
|
|
|
idxHeader.articleFormat = isLogical ? Logical : Visual;
|
|
|
|
unsigned articleCount = 0, wordCount = 0;
|
|
|
|
while( !stream.atEnd() )
|
|
{
|
|
stream.readNext();
|
|
|
|
if ( stream.isStartElement() )
|
|
{
|
|
// todo implement using short <title> for denoting the dictionary in settings or dict list toolbar
|
|
if ( stream.name() == u"full_name" || stream.name() == u"full_title" )
|
|
{
|
|
// That's our name
|
|
|
|
QString name = stream.readElementText();
|
|
|
|
if ( dictionaryName.isEmpty() )
|
|
{
|
|
dictionaryName = name;
|
|
|
|
initializing.indexingDictionary( dictionaryName.toUtf8().data() );
|
|
|
|
idxHeader.nameAddress = chunks.startNewBlock();
|
|
|
|
QByteArray n = dictionaryName.toUtf8();
|
|
|
|
idxHeader.nameSize = n.size();
|
|
|
|
chunks.addToBlock( n.data(), n.size() );
|
|
}
|
|
else
|
|
{
|
|
GD_DPRINTF( "Warning: duplicate full_name in %s\n", dictFiles[ 0 ].c_str() );
|
|
}
|
|
}
|
|
else
|
|
if ( stream.name() == u"description" )
|
|
{
|
|
// todo implement adding other information to the description like <publisher>, <authors>, <file_ver>, <creation_date>, <last_edited_date>, <dict_edition>, <publishing_date>, <dict_src_url>
|
|
QString desc = readXhtmlData( stream );
|
|
|
|
if( isLogical )
|
|
{
|
|
desc = desc.simplified();
|
|
QRegularExpression br( "<br\\s*>\\s*</br>" );
|
|
desc.replace( br, QString("\n") );
|
|
}
|
|
|
|
if ( dictionaryDescription.isEmpty() )
|
|
{
|
|
dictionaryDescription = desc;
|
|
idxHeader.descriptionAddress = chunks.startNewBlock();
|
|
|
|
QByteArray n = dictionaryDescription.toUtf8();
|
|
|
|
idxHeader.descriptionSize = n.size();
|
|
|
|
chunks.addToBlock( n.data(), n.size() );
|
|
}
|
|
else
|
|
{
|
|
GD_DPRINTF( "Warning: duplicate description in %s\n", dictFiles[ 0 ].c_str() );
|
|
}
|
|
}
|
|
else
|
|
if( stream.name() == u"languages" )
|
|
{
|
|
while( !( stream.isEndElement() && stream.name() == u"languages" ) && !stream.atEnd() )
|
|
{
|
|
if( !stream.readNext() )
|
|
break;
|
|
if ( stream.isStartElement() )
|
|
{
|
|
if( stream.name() == u"from" )
|
|
{
|
|
if( idxHeader.langFrom == 0 )
|
|
{
|
|
QString lang = stream.attributes().value( "xml:lang" ).toString();
|
|
idxHeader.langFrom = getLanguageId( lang );
|
|
}
|
|
}
|
|
else if( stream.name() == u"to" )
|
|
{
|
|
if( idxHeader.langTo == 0 )
|
|
{
|
|
QString lang = stream.attributes().value( "xml:lang" ).toString();
|
|
idxHeader.langTo = getLanguageId( lang );
|
|
}
|
|
}
|
|
}
|
|
else if ( stream.isEndElement() && stream.name() == u"languages" )
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
if ( stream.name() == u"abbreviations" )
|
|
{
|
|
QString s;
|
|
string value;
|
|
list < wstring > keys;
|
|
while( !( stream.isEndElement() && stream.name() == u"abbreviations" ) && !stream.atEnd() )
|
|
{
|
|
if( !stream.readNextStartElement() )
|
|
break;
|
|
// abbreviations tag set switch at format revision = 30
|
|
if( idxHeader.revisionNumber >= 30 )
|
|
{
|
|
while ( !( stream.isEndElement() && stream.name() == u"abbr_def" ) || !stream.atEnd() )
|
|
{
|
|
if ( stream.isStartElement() && stream.name() == u"abbr_k" )
|
|
{
|
|
s = readElementText( stream );
|
|
keys.push_back( gd::toWString( s ) );
|
|
}
|
|
else if ( stream.isStartElement() && stream.name() == u"abbr_v" )
|
|
{
|
|
s = readElementText( stream );
|
|
value = Utf8::encode( Folding::trimWhitespace( gd::toWString( s ) ) );
|
|
for( list< wstring >::iterator i = keys.begin(); i != keys.end(); ++i )
|
|
{
|
|
abrv[ Utf8::encode( Folding::trimWhitespace( *i ) ) ] = value;
|
|
}
|
|
keys.clear();
|
|
}
|
|
else if ( stream.isEndElement() && stream.name() == u"abbreviations" )
|
|
break;
|
|
stream.readNext();
|
|
}
|
|
}
|
|
else
|
|
{
|
|
while ( !( stream.isEndElement() && stream.name() == u"abr_def" ) || !stream.atEnd() )
|
|
{
|
|
if ( stream.isStartElement() && stream.name() == u"k" )
|
|
{
|
|
s = readElementText( stream );
|
|
keys.push_back( gd::toWString( s ) );
|
|
}
|
|
else if ( stream.isStartElement() && stream.name() == u"v" )
|
|
{
|
|
s = readElementText( stream );
|
|
value = Utf8::encode( Folding::trimWhitespace( gd::toWString( s ) ) );
|
|
for( list< wstring >::iterator i = keys.begin(); i != keys.end(); ++i )
|
|
{
|
|
abrv[ Utf8::encode( Folding::trimWhitespace( *i ) ) ] = value;
|
|
}
|
|
keys.clear();
|
|
}
|
|
else if ( stream.isEndElement() && stream.name() == u"abbreviations" )
|
|
break;
|
|
stream.readNext();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
if ( stream.name() == u"ar" )
|
|
{
|
|
indexArticle( gzFile, stream, indexedWords, chunks,
|
|
articleCount, wordCount, isLogical ? Logical : Visual );
|
|
}
|
|
}
|
|
}
|
|
|
|
// Write abbreviations if presented
|
|
|
|
if( !abrv.empty() )
|
|
{
|
|
idxHeader.hasAbrv = 1;
|
|
idxHeader.abrvAddress = chunks.startNewBlock();
|
|
|
|
uint32_t sz = abrv.size();
|
|
|
|
chunks.addToBlock( &sz, sizeof( uint32_t ) );
|
|
|
|
for( map< string, string >::const_iterator i = abrv.begin(); i != abrv.end(); ++i )
|
|
{
|
|
sz = i->first.size();
|
|
chunks.addToBlock( &sz, sizeof( uint32_t ) );
|
|
chunks.addToBlock( i->first.data(), sz );
|
|
sz = i->second.size();
|
|
chunks.addToBlock( &sz, sizeof( uint32_t ) );
|
|
chunks.addToBlock( i->second.data(), sz );
|
|
}
|
|
}
|
|
|
|
// Finish with the chunks
|
|
|
|
idxHeader.chunksOffset = chunks.finish();
|
|
|
|
// Build index
|
|
|
|
IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
|
|
|
|
idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
|
|
idxHeader.indexRootOffset = idxInfo.rootOffset;
|
|
|
|
indexedWords.clear(); // Release memory -- no need for this data
|
|
|
|
// If there was a zip file, index it too
|
|
|
|
if ( zipFileName.size() )
|
|
{
|
|
GD_DPRINTF( "Indexing zip file\n" );
|
|
|
|
idxHeader.hasZipFile = 1;
|
|
|
|
IndexedWords zipFileNames;
|
|
IndexedZip zipFile;
|
|
if( zipFile.openZipFile( QDir::fromNativeSeparators(
|
|
FsEncoding::decode( zipFileName.c_str() ) ) ) )
|
|
zipFile.indexFile( zipFileNames );
|
|
|
|
if( !zipFileNames.empty() )
|
|
{
|
|
// Build the resulting zip file index
|
|
|
|
IndexInfo idxInfo = BtreeIndexing::buildIndex( zipFileNames, idx );
|
|
|
|
idxHeader.zipIndexBtreeMaxElements = idxInfo.btreeMaxElements;
|
|
idxHeader.zipIndexRootOffset = idxInfo.rootOffset;
|
|
}
|
|
else
|
|
{
|
|
// Bad zip file -- no index (though the mark that we have one
|
|
// remains)
|
|
idxHeader.zipIndexBtreeMaxElements = 0;
|
|
idxHeader.zipIndexRootOffset = 0;
|
|
}
|
|
}
|
|
else
|
|
idxHeader.hasZipFile = 0;
|
|
// That concludes it. Update the header.
|
|
|
|
idxHeader.signature = Signature;
|
|
idxHeader.formatVersion = CurrentFormatVersion;
|
|
|
|
idxHeader.articleCount = articleCount;
|
|
idxHeader.wordCount = wordCount;
|
|
|
|
idx.rewind();
|
|
|
|
idx.write( &idxHeader, sizeof( idxHeader ) );
|
|
|
|
hadXdxf = true;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
if ( !hadXdxf )
|
|
throw exNotXdxfFile( dictFiles[ 0 ] );
|
|
|
|
if ( stream.hasError() )
|
|
{
|
|
gdWarning( "%s had a parse error %s at line %lu, and therefore was indexed only up to the point of error.",
|
|
dictFiles[ 0 ].c_str(), stream.errorString().toUtf8().data(),
|
|
(unsigned long) stream.lineNumber() );
|
|
}
|
|
}
|
|
|
|
dictionaries.push_back( std::make_shared<XdxfDictionary>( dictId,
|
|
indexFile,
|
|
dictFiles ) );
|
|
}
|
|
catch( std::exception & e )
|
|
{
|
|
gdWarning( "Xdxf dictionary initializing failed: %s, error: %s\n",
|
|
i->c_str(), e.what() );
|
|
}
|
|
}
|
|
|
|
return dictionaries;
|
|
}
|
|
|
|
}
|