mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-24 04:24:09 +00:00
5034348c1a
When a referenced audio resource is not found in a DSL or XDXF dictionary, GoldenDict searches for this resource by filename in all other dictionaries within the current group. Naturally, the file is absent from most dictionaries (see #970). Therefore a "Failed loading resource" warning is printed for almost every dictionary in the current group. These warnings are by far the most frequent on my system. And in the scenario described above there is nothing wrong at all. So the user may want to silence these warnings to help notice less frequent and more important messages. Implement categorized logging to enable this customization. These warnings can now be disabled by adding the following line in the [Rules] section of a logging configuration file (e.g. ~/.config/QtProject/qtlogging.ini on GNU/Linux): goldendict.dictionary.resource.warning=false See also https://doc.qt.io/qt-5/qloggingcategory.html#logging-rules
1503 lines
42 KiB
C++
1503 lines
42 KiB
C++
/* This file is (c) 2008-2009 Konstantin Isakov <ikm@goldendict.org>
|
|
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
|
|
|
#include "xdxf.hh"
|
|
#include "btreeidx.hh"
|
|
#include "folding.hh"
|
|
#include "utf8.hh"
|
|
#include "chunkedstorage.hh"
|
|
#include "dictzip.h"
|
|
#include "htmlescape.hh"
|
|
#include "fsencoding.hh"
|
|
#include <map>
|
|
#include <set>
|
|
#include <string>
|
|
#include <vector>
|
|
#include <list>
|
|
#include <wctype.h>
|
|
#include <stdlib.h>
|
|
#include "categorized_logging.hh"
|
|
#include "gddebug.hh"
|
|
#include "wstring_qt.hh"
|
|
#include "xdxf2html.hh"
|
|
#include "ufile.hh"
|
|
#include "dictzip.h"
|
|
#include "langcoder.hh"
|
|
#include "indexedzip.hh"
|
|
#include "filetype.hh"
|
|
#include "tiff.hh"
|
|
#include "ftshelpers.hh"
|
|
|
|
#ifdef _MSC_VER
|
|
#include <stub_msvc.h>
|
|
#endif
|
|
|
|
#include <QIODevice>
|
|
#include <QXmlStreamReader>
|
|
#include <QTextDocument>
|
|
#include <QFileInfo>
|
|
#include <QDir>
|
|
#include <QPainter>
|
|
#include <QDebug>
|
|
#include <QRegExp>
|
|
|
|
#include <QSemaphore>
|
|
#include <QThreadPool>
|
|
#include <QAtomicInt>
|
|
|
|
#include "qt4x5.hh"
|
|
|
|
namespace Xdxf {
|
|
|
|
using std::map;
|
|
using std::multimap;
|
|
using std::pair;
|
|
using std::set;
|
|
using std::string;
|
|
using gd::wstring;
|
|
using std::vector;
|
|
using std::list;
|
|
|
|
using BtreeIndexing::WordArticleLink;
|
|
using BtreeIndexing::IndexedWords;
|
|
using BtreeIndexing::IndexInfo;
|
|
|
|
namespace {
|
|
|
|
DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex )
|
|
DEF_EX_STR( exNotXdxfFile, "The file is not an XDXF file:", Dictionary::Ex )
|
|
DEF_EX( exCorruptedIndex, "The index file is corrupted", Dictionary::Ex )
|
|
DEF_EX_STR( exDictzipError, "DICTZIP error", Dictionary::Ex )
|
|
|
|
enum
|
|
{
|
|
Signature = 0x46584458, // XDXF on little-endian, FXDX on big-endian
|
|
CurrentFormatVersion = 5 + BtreeIndexing::FormatVersion + Folding::Version
|
|
};
|
|
|
|
enum ArticleFormat
|
|
{
|
|
Default = 0,
|
|
Visual = 1,
|
|
Logical = 2
|
|
};
|
|
|
|
struct IdxHeader
|
|
{
|
|
uint32_t signature; // First comes the signature, XDXF
|
|
uint32_t formatVersion; // File format version (CurrentFormatVersion)
|
|
uint32_t articleFormat; // ArticleFormat value, except that 0 = bad file
|
|
uint32_t langFrom; // Source language
|
|
uint32_t langTo; // Target language
|
|
uint32_t articleCount; // Total number of articles
|
|
uint32_t wordCount; // Total number of words
|
|
uint32_t nameAddress; // Address of an utf8 name string, in chunks
|
|
uint32_t nameSize; // And its size
|
|
uint32_t descriptionAddress; // Address of an utf8 description string, in chunks
|
|
uint32_t descriptionSize; // And its size
|
|
uint32_t hasAbrv; // Non-zero means file has abrvs at abrvAddress
|
|
uint32_t abrvAddress; // Address of abrv map in the chunked storage
|
|
uint32_t chunksOffset; // The offset to chunks' storage
|
|
uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
|
|
uint32_t indexRootOffset;
|
|
uint32_t hasZipFile; // Non-zero means there's a zip file with resources
|
|
// present
|
|
uint32_t zipIndexBtreeMaxElements; // Two fields from IndexInfo of the zip
|
|
// resource index.
|
|
uint32_t zipIndexRootOffset;
|
|
uint32_t revisionNumber; // Format revision
|
|
}
|
|
#ifndef _MSC_VER
|
|
__attribute__((packed))
|
|
#endif
|
|
;
|
|
|
|
bool indexIsOldOrBad( string const & indexFile )
|
|
{
|
|
File::Class idx( indexFile, "rb" );
|
|
|
|
IdxHeader header;
|
|
|
|
return idx.readRecords( &header, sizeof( header ), 1 ) != 1 ||
|
|
header.signature != Signature ||
|
|
header.formatVersion != CurrentFormatVersion ||
|
|
!header.articleFormat;
|
|
}
|
|
|
|
|
|
|
|
class XdxfDictionary: public BtreeIndexing::BtreeDictionary
|
|
{
|
|
Mutex idxMutex;
|
|
File::Class idx;
|
|
IdxHeader idxHeader;
|
|
sptr< ChunkedStorage::Reader > chunks;
|
|
Mutex dzMutex;
|
|
dictData * dz;
|
|
Mutex resourceZipMutex;
|
|
IndexedZip resourceZip;
|
|
string dictionaryName;
|
|
map< string, string > abrv;
|
|
|
|
public:
|
|
|
|
XdxfDictionary( string const & id, string const & indexFile,
|
|
vector< string > const & dictionaryFiles );
|
|
|
|
~XdxfDictionary();
|
|
|
|
virtual string getName() throw()
|
|
{ return dictionaryName; }
|
|
|
|
virtual map< Dictionary::Property, string > getProperties() throw()
|
|
{ return map< Dictionary::Property, string >(); }
|
|
|
|
virtual unsigned long getArticleCount() throw()
|
|
{ return idxHeader.articleCount; }
|
|
|
|
virtual unsigned long getWordCount() throw()
|
|
{ return idxHeader.wordCount; }
|
|
|
|
inline virtual quint32 getLangFrom() const
|
|
{ return idxHeader.langFrom; }
|
|
|
|
inline virtual quint32 getLangTo() const
|
|
{ return idxHeader.langTo; }
|
|
|
|
virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
|
|
vector< wstring > const & alts,
|
|
wstring const &,
|
|
bool ignoreDiacritics )
|
|
THROW_SPEC( std::exception );
|
|
|
|
virtual sptr< Dictionary::DataRequest > getResource( string const & name )
|
|
THROW_SPEC( std::exception );
|
|
|
|
virtual QString const& getDescription();
|
|
|
|
virtual QString getMainFilename();
|
|
|
|
virtual sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString,
|
|
int searchMode, bool matchCase,
|
|
int distanceBetweenWords,
|
|
int maxResults,
|
|
bool ignoreWordsOrder,
|
|
bool ignoreDiacritics,
|
|
QThreadPool * ftsThreadPoolPtr );
|
|
virtual void getArticleText( uint32_t articleAddress, QString & headword, QString & text );
|
|
|
|
virtual void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration );
|
|
|
|
virtual void setFTSParameters( Config::FullTextSearch const & fts )
|
|
{
|
|
can_FTS = fts.enabled
|
|
&& !fts.disabledTypes.contains( "XDXF", Qt::CaseInsensitive )
|
|
&& ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
|
|
}
|
|
|
|
virtual uint32_t getFtsIndexVersion()
|
|
{ return 1; }
|
|
|
|
protected:
|
|
|
|
void loadIcon() throw();
|
|
|
|
private:
|
|
|
|
// Loads the article, storing its headword and formatting article's data into an html.
|
|
void loadArticle( uint32_t address,
|
|
string & articleText, QString * headword = 0 );
|
|
|
|
friend class XdxfArticleRequest;
|
|
friend class XdxfResourceRequest;
|
|
};
|
|
|
|
XdxfDictionary::XdxfDictionary( string const & id,
|
|
string const & indexFile,
|
|
vector< string > const & dictionaryFiles ):
|
|
BtreeDictionary( id, dictionaryFiles ),
|
|
idx( indexFile, "rb" ),
|
|
idxHeader( idx.read< IdxHeader >() )
|
|
{
|
|
// Read the dictionary name
|
|
|
|
chunks = new ChunkedStorage::Reader( idx, idxHeader.chunksOffset );
|
|
|
|
if ( idxHeader.nameSize )
|
|
{
|
|
vector< char > chunk;
|
|
|
|
dictionaryName = string( chunks->getBlock( idxHeader.nameAddress, chunk ),
|
|
idxHeader.nameSize );
|
|
}
|
|
|
|
// Open the file
|
|
|
|
DZ_ERRORS error;
|
|
dz = dict_data_open( dictionaryFiles[ 0 ].c_str(), &error, 0 );
|
|
|
|
if ( !dz )
|
|
throw exDictzipError( string( dz_error_str( error ) )
|
|
+ "(" + dictionaryFiles[ 0 ] + ")" );
|
|
|
|
// Read the abrv, if any
|
|
|
|
if ( idxHeader.hasAbrv )
|
|
{
|
|
vector< char > chunk;
|
|
|
|
char * abrvBlock = chunks->getBlock( idxHeader.abrvAddress, chunk );
|
|
|
|
uint32_t total;
|
|
memcpy( &total, abrvBlock, sizeof( uint32_t ) );
|
|
abrvBlock += sizeof( uint32_t );
|
|
|
|
while( total-- )
|
|
{
|
|
uint32_t keySz;
|
|
memcpy( &keySz, abrvBlock, sizeof( uint32_t ) );
|
|
abrvBlock += sizeof( uint32_t );
|
|
|
|
char * key = abrvBlock;
|
|
|
|
abrvBlock += keySz;
|
|
|
|
uint32_t valueSz;
|
|
memcpy( &valueSz, abrvBlock, sizeof( uint32_t ) );
|
|
abrvBlock += sizeof( uint32_t );
|
|
|
|
abrv[ string( key, keySz ) ] = string( abrvBlock, valueSz );
|
|
|
|
abrvBlock += valueSz;
|
|
}
|
|
|
|
// Open a resource zip file, if there's one
|
|
|
|
if ( idxHeader.hasZipFile &&
|
|
( idxHeader.zipIndexBtreeMaxElements ||
|
|
idxHeader.zipIndexRootOffset ) )
|
|
{
|
|
resourceZip.openIndex( IndexInfo( idxHeader.zipIndexBtreeMaxElements,
|
|
idxHeader.zipIndexRootOffset ),
|
|
idx, idxMutex );
|
|
|
|
QString zipName = QDir::fromNativeSeparators(
|
|
FsEncoding::decode( getDictionaryFilenames().back().c_str() ) );
|
|
|
|
if ( zipName.endsWith( ".zip", Qt::CaseInsensitive ) ) // Sanity check
|
|
resourceZip.openZipFile( zipName );
|
|
}
|
|
}
|
|
|
|
// Initialize the index
|
|
|
|
openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
|
|
idxHeader.indexRootOffset ),
|
|
idx, idxMutex );
|
|
|
|
// Full-text search parameters
|
|
|
|
can_FTS = true;
|
|
|
|
ftsIdxName = indexFile + "_FTS";
|
|
|
|
if( !Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName )
|
|
&& !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) )
|
|
FTS_index_completed.ref();
|
|
}
|
|
|
|
XdxfDictionary::~XdxfDictionary()
|
|
{
|
|
if ( dz )
|
|
dict_data_close( dz );
|
|
}
|
|
|
|
void XdxfDictionary::loadIcon() throw()
|
|
{
|
|
if ( dictionaryIconLoaded )
|
|
return;
|
|
|
|
QString fileName =
|
|
QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
|
|
|
|
QFileInfo baseInfo( fileName );
|
|
|
|
fileName = baseInfo.absoluteDir().absoluteFilePath( "icon32.png" );
|
|
QFileInfo info( fileName );
|
|
|
|
if( !info.isFile() )
|
|
{
|
|
fileName = baseInfo.absoluteDir().absoluteFilePath( "icon16.png" );
|
|
info = QFileInfo( fileName );
|
|
}
|
|
|
|
if ( info.isFile() )
|
|
loadIconFromFile( fileName, true );
|
|
|
|
if ( dictionaryIcon.isNull() )
|
|
{
|
|
// Load failed -- use default icons
|
|
|
|
dictionaryIcon = QIcon(":/icons/icon32_xdxf.png");
|
|
dictionaryNativeIcon = QIcon(":/icons/icon32_xdxf.png");
|
|
}
|
|
|
|
dictionaryIconLoaded = true;
|
|
}
|
|
|
|
QString const& XdxfDictionary::getDescription()
|
|
{
|
|
if( !dictionaryDescription.isEmpty() )
|
|
return dictionaryDescription;
|
|
|
|
if( idxHeader.descriptionAddress == 0 )
|
|
dictionaryDescription = "NONE";
|
|
else
|
|
{
|
|
try
|
|
{
|
|
vector< char > chunk;
|
|
char * descr;
|
|
{
|
|
Mutex::Lock _( idxMutex );
|
|
descr = chunks->getBlock( idxHeader.descriptionAddress, chunk );
|
|
}
|
|
dictionaryDescription = QString::fromUtf8( descr, idxHeader.descriptionSize );
|
|
}
|
|
catch(...)
|
|
{
|
|
}
|
|
}
|
|
return dictionaryDescription;
|
|
}
|
|
|
|
QString XdxfDictionary::getMainFilename()
|
|
{
|
|
return FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() );
|
|
}
|
|
|
|
void XdxfDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
|
|
{
|
|
if( !( Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
|
|
|| FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) )
|
|
FTS_index_completed.ref();
|
|
|
|
if( haveFTSIndex() )
|
|
return;
|
|
|
|
if( ensureInitDone().size() )
|
|
return;
|
|
|
|
if( firstIteration && getArticleCount() > FTS::MaxDictionarySizeForFastSearch )
|
|
return;
|
|
|
|
gdDebug( "Xdxf: Building the full-text index for dictionary: %s\n",
|
|
getName().c_str() );
|
|
|
|
try
|
|
{
|
|
FtsHelpers::makeFTSIndex( this, isCancelled );
|
|
FTS_index_completed.ref();
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdWarning( "Xdxf: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );
|
|
QFile::remove( FsEncoding::decode( ftsIdxName.c_str() ) );
|
|
}
|
|
}
|
|
|
|
void XdxfDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
|
|
{
|
|
try
|
|
{
|
|
string articleStr;
|
|
loadArticle( articleAddress, articleStr, &headword );
|
|
|
|
wstring wstr = Utf8::decode( articleStr );
|
|
|
|
text = Html::unescape( gd::toQString( wstr ) );
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdWarning( "Xdxf: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
|
|
}
|
|
}
|
|
|
|
sptr< Dictionary::DataRequest > XdxfDictionary::getSearchResults( QString const & searchString,
|
|
int searchMode, bool matchCase,
|
|
int distanceBetweenWords,
|
|
int maxResults,
|
|
bool ignoreWordsOrder,
|
|
bool ignoreDiacritics,
|
|
QThreadPool * ftsThreadPoolPtr )
|
|
{
|
|
return new FtsHelpers::FTSResultsRequest( *this, searchString,searchMode, matchCase, distanceBetweenWords, maxResults, ignoreWordsOrder, ignoreDiacritics, ftsThreadPoolPtr );
|
|
}
|
|
|
|
/// XdxfDictionary::getArticle()
|
|
|
|
class XdxfArticleRequest;
|
|
|
|
class XdxfArticleRequestRunnable: public QRunnable
|
|
{
|
|
XdxfArticleRequest & r;
|
|
QSemaphore & hasExited;
|
|
|
|
public:
|
|
|
|
XdxfArticleRequestRunnable( XdxfArticleRequest & r_,
|
|
QSemaphore & hasExited_ ): r( r_ ),
|
|
hasExited( hasExited_ )
|
|
{}
|
|
|
|
~XdxfArticleRequestRunnable()
|
|
{
|
|
hasExited.release();
|
|
}
|
|
|
|
virtual void run();
|
|
};
|
|
|
|
class XdxfArticleRequest: public Dictionary::DataRequest
|
|
{
|
|
friend class XdxfArticleRequestRunnable;
|
|
|
|
wstring word;
|
|
vector< wstring > alts;
|
|
XdxfDictionary & dict;
|
|
bool ignoreDiacritics;
|
|
|
|
QAtomicInt isCancelled;
|
|
QSemaphore hasExited;
|
|
|
|
public:
|
|
|
|
XdxfArticleRequest( wstring const & word_,
|
|
vector< wstring > const & alts_,
|
|
XdxfDictionary & dict_, bool ignoreDiacritics_ ):
|
|
word( word_ ), alts( alts_ ), dict( dict_ ), ignoreDiacritics( ignoreDiacritics_ )
|
|
{
|
|
QThreadPool::globalInstance()->start(
|
|
new XdxfArticleRequestRunnable( *this, hasExited ) );
|
|
}
|
|
|
|
void run(); // Run from another thread by XdxfArticleRequestRunnable
|
|
|
|
virtual void cancel()
|
|
{
|
|
isCancelled.ref();
|
|
}
|
|
|
|
~XdxfArticleRequest()
|
|
{
|
|
isCancelled.ref();
|
|
hasExited.acquire();
|
|
}
|
|
};
|
|
|
|
void XdxfArticleRequestRunnable::run()
|
|
{
|
|
r.run();
|
|
}
|
|
|
|
void XdxfArticleRequest::run()
|
|
{
|
|
if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
|
|
|
|
for( unsigned x = 0; x < alts.size(); ++x )
|
|
{
|
|
/// Make an additional query for each alt
|
|
|
|
vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
|
|
|
|
chain.insert( chain.end(), altChain.begin(), altChain.end() );
|
|
}
|
|
|
|
multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
|
|
|
|
set< uint32_t > articlesIncluded; // Some synonims make it that the articles
|
|
// appear several times. We combat this
|
|
// by only allowing them to appear once.
|
|
|
|
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
|
|
if( ignoreDiacritics )
|
|
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
|
|
|
|
for( unsigned x = 0; x < chain.size(); ++x )
|
|
{
|
|
if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
|
|
continue; // We already have this article in the body.
|
|
|
|
// Now grab that article
|
|
|
|
string headword, articleText;
|
|
|
|
headword = chain[ x ].word;
|
|
|
|
try
|
|
{
|
|
dict.loadArticle( chain[ x ].articleOffset, articleText );
|
|
|
|
// Ok. Now, does it go to main articles, or to alternate ones? We list
|
|
// main ones first, and alternates after.
|
|
|
|
// We do the case-folded comparison here.
|
|
|
|
wstring headwordStripped =
|
|
Folding::applySimpleCaseOnly( Utf8::decode( headword ) );
|
|
if( ignoreDiacritics )
|
|
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
|
|
|
|
multimap< wstring, pair< string, string > > & mapToUse =
|
|
( wordCaseFolded == headwordStripped ) ?
|
|
mainArticles : alternateArticles;
|
|
|
|
mapToUse.insert( pair< wstring, pair< string, string > >(
|
|
Folding::applySimpleCaseOnly( Utf8::decode( headword ) ),
|
|
pair< string, string >( headword, articleText ) ) );
|
|
|
|
articlesIncluded.insert( chain[ x ].articleOffset );
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdWarning( "XDXF: Failed loading article from \"%s\", reason: %s\n", dict.getName().c_str(), ex.what() );
|
|
}
|
|
}
|
|
|
|
if ( mainArticles.empty() && alternateArticles.empty() )
|
|
{
|
|
// No such word
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
string result;
|
|
|
|
multimap< wstring, pair< string, string > >::const_iterator i;
|
|
|
|
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
|
|
"</font>""</font>""</font>""</font>""</font>""</font>"
|
|
"</b></b></b></b></b></b></b></b>"
|
|
"</i></i></i></i></i></i></i></i>";
|
|
|
|
for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
|
|
{
|
|
// result += "<h3>";
|
|
// result += i->second.first;
|
|
// result += "</h3>";
|
|
result += i->second.second;
|
|
result += cleaner;
|
|
}
|
|
|
|
for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
|
|
{
|
|
// result += "<h3>";
|
|
// result += i->second.first;
|
|
// result += "</h3>";
|
|
result += i->second.second;
|
|
result += cleaner;
|
|
}
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
data.resize( result.size() );
|
|
|
|
memcpy( &data.front(), result.data(), result.size() );
|
|
|
|
hasAnyData = true;
|
|
|
|
finish();
|
|
}
|
|
|
|
sptr< Dictionary::DataRequest > XdxfDictionary::getArticle( wstring const & word,
|
|
vector< wstring > const & alts,
|
|
wstring const &,
|
|
bool ignoreDiacritics )
|
|
THROW_SPEC( std::exception )
|
|
{
|
|
return new XdxfArticleRequest( word, alts, *this, ignoreDiacritics );
|
|
}
|
|
|
|
void XdxfDictionary::loadArticle( uint32_t address,
|
|
string & articleText,
|
|
QString * headword )
|
|
{
|
|
// Read the properties
|
|
|
|
vector< char > chunk;
|
|
|
|
char * propertiesData;
|
|
|
|
{
|
|
Mutex::Lock _( idxMutex );
|
|
|
|
propertiesData = chunks->getBlock( address, chunk );
|
|
}
|
|
|
|
if ( &chunk.front() + chunk.size() - propertiesData < 9 )
|
|
{
|
|
articleText = string( "<div class=\"xdxf\">Index seems corrupted</div>" );
|
|
return;
|
|
}
|
|
|
|
unsigned char fType = (unsigned char) *propertiesData;
|
|
|
|
uint32_t articleOffset, articleSize;
|
|
|
|
memcpy( &articleOffset, propertiesData + 1, sizeof( uint32_t ) );
|
|
memcpy( &articleSize, propertiesData + 5, sizeof( uint32_t ) );
|
|
|
|
// Load the article
|
|
|
|
char * articleBody;
|
|
|
|
{
|
|
Mutex::Lock _( dzMutex );
|
|
|
|
// Note that the function always zero-pads the result.
|
|
articleBody = dict_data_read_( dz, articleOffset, articleSize, 0, 0 );
|
|
}
|
|
|
|
if ( !articleBody )
|
|
{
|
|
// throw exCantReadFile( getDictionaryFilenames()[ 0 ] );
|
|
articleText = string( "<div class=\"xdxf\">DICTZIP error: " ) + dict_error_str( dz ) + "</div>";
|
|
return;
|
|
}
|
|
|
|
articleText = Xdxf2Html::convert( string( articleBody ), Xdxf2Html::XDXF, idxHeader.hasAbrv ? &abrv : NULL, this,
|
|
&resourceZip, fType == Logical, idxHeader.revisionNumber, headword );
|
|
|
|
free( articleBody );
|
|
}
|
|
|
|
class GzippedFile: public QIODevice
|
|
{
|
|
gzFile gz;
|
|
|
|
public:
|
|
|
|
GzippedFile( char const * fileName ) THROW_SPEC( exCantReadFile );
|
|
|
|
~GzippedFile();
|
|
|
|
// size_t gzTell();
|
|
|
|
char * readDataArray( unsigned long startPos, unsigned long size );
|
|
|
|
protected:
|
|
|
|
dictData *dz;
|
|
|
|
virtual bool isSequential () const
|
|
{ return false; } // Which is a lie, but else pos() won't work
|
|
|
|
bool waitForReadyRead ( int )
|
|
{ return !gzeof( gz ); }
|
|
|
|
qint64 bytesAvailable() const
|
|
{
|
|
return ( gzeof( gz ) ? 0 : 1 ) + QIODevice::bytesAvailable();
|
|
}
|
|
|
|
virtual qint64 readData( char * data, qint64 maxSize );
|
|
|
|
virtual bool atEnd() const;
|
|
|
|
virtual qint64 writeData ( const char * /*data*/, qint64 /*maxSize*/ )
|
|
{ return -1; }
|
|
};
|
|
|
|
GzippedFile::GzippedFile( char const * fileName ) THROW_SPEC( exCantReadFile )
|
|
{
|
|
gz = gd_gzopen( fileName );
|
|
if ( !gz )
|
|
throw exCantReadFile( fileName );
|
|
|
|
DZ_ERRORS error;
|
|
dz = dict_data_open( fileName, &error, 0 );
|
|
}
|
|
|
|
GzippedFile::~GzippedFile()
|
|
{
|
|
gzclose( gz );
|
|
if( dz )
|
|
dict_data_close( dz );
|
|
}
|
|
|
|
bool GzippedFile::atEnd() const
|
|
{
|
|
return gzeof( gz );
|
|
}
|
|
|
|
/*
|
|
size_t GzippedFile::gzTell()
|
|
{
|
|
return gztell( gz );
|
|
}
|
|
*/
|
|
|
|
qint64 GzippedFile::readData( char * data, qint64 maxSize )
|
|
{
|
|
if ( maxSize > 1 )
|
|
maxSize = 1;
|
|
|
|
// The returning value translates directly to QIODevice semantics
|
|
int n = gzread( gz, data, maxSize );
|
|
|
|
#if QT_VERSION >= QT_VERSION_CHECK(5, 0, 0)
|
|
// With QT 5.x QXmlStreamReader ask one byte instead of one UTF-8 char.
|
|
// We read and return all bytes for char.
|
|
|
|
if( n == 1 )
|
|
{
|
|
char ch = *data;
|
|
int addBytes = 0;
|
|
if( ch & 0x80 )
|
|
{
|
|
if( ( ch & 0xF8 ) == 0xF0 )
|
|
addBytes = 3;
|
|
else if( ( ch & 0xF0 ) == 0xE0 )
|
|
addBytes = 2;
|
|
else if( ( ch & 0xE0 ) == 0xC0 )
|
|
addBytes = 1;
|
|
}
|
|
|
|
if( addBytes )
|
|
n += gzread( gz, data + 1, addBytes );
|
|
}
|
|
#endif
|
|
|
|
return n;
|
|
}
|
|
|
|
char * GzippedFile::readDataArray( unsigned long startPos, unsigned long size )
|
|
{
|
|
if( dz == NULL )
|
|
return NULL;
|
|
return dict_data_read_( dz, startPos, size, 0, 0 );
|
|
}
|
|
|
|
QString readXhtmlData( QXmlStreamReader & stream )
|
|
{
|
|
QString result;
|
|
|
|
while( !stream.atEnd() )
|
|
{
|
|
stream.readNext();
|
|
|
|
if ( stream.isStartElement() )
|
|
{
|
|
QString name = stream.name().toString();
|
|
|
|
result += "<" + Qt4x5::escape( name ) + " ";
|
|
|
|
QXmlStreamAttributes attrs = stream.attributes();
|
|
|
|
for( int x = 0; x < attrs.size(); ++x )
|
|
{
|
|
result += Qt4x5::escape( attrs[ x ].name().toString() );
|
|
result += "=\"" + Qt4x5::escape( attrs[ x ].value().toString() ) + "\"";
|
|
}
|
|
|
|
result += ">";
|
|
|
|
result += readXhtmlData( stream );
|
|
|
|
result += "</" + Qt4x5::escape( name ) + ">";
|
|
}
|
|
else
|
|
if ( stream.isCharacters() || stream.isWhitespace() || stream.isCDATA() )
|
|
{
|
|
result += stream.text();
|
|
}
|
|
else
|
|
if ( stream.isEndElement() )
|
|
break;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
namespace {
|
|
|
|
/// Deal with Qt 4.5 incompatibility
|
|
QString readElementText( QXmlStreamReader & stream )
|
|
{
|
|
#if QT_VERSION >= 0x040600
|
|
return stream.readElementText( QXmlStreamReader::SkipChildElements );
|
|
#else
|
|
return stream.readElementText();
|
|
#endif
|
|
}
|
|
|
|
}
|
|
|
|
|
|
void addAllKeyTags( QXmlStreamReader & stream, list< QString > & words )
|
|
{
|
|
// todo implement support for tag <srt>, that overrides the article sorting order
|
|
if ( stream.name() == "k" )
|
|
{
|
|
words.push_back( readElementText( stream ) );
|
|
return;
|
|
}
|
|
|
|
while( !stream.atEnd() )
|
|
{
|
|
stream.readNext();
|
|
|
|
if ( stream.isStartElement() )
|
|
addAllKeyTags( stream, words );
|
|
else
|
|
if ( stream.isEndElement() )
|
|
return;
|
|
}
|
|
}
|
|
|
|
void checkArticlePosition( GzippedFile & gzFile,
|
|
uint32_t *pOffset,
|
|
uint32_t *pSize )
|
|
{
|
|
char * data = gzFile.readDataArray( *pOffset, *pSize );
|
|
if( data == NULL )
|
|
return;
|
|
QString s = QString::fromUtf8( data );
|
|
free( data );
|
|
int n = s.lastIndexOf( "</ar" );
|
|
if( n > 0 )
|
|
*pSize -= s.size() - n;
|
|
if( s.at( 0 ) == '>')
|
|
{
|
|
*pOffset += 1;
|
|
*pSize -= 1;
|
|
}
|
|
}
|
|
|
|
void indexArticle( GzippedFile & gzFile,
|
|
QXmlStreamReader & stream,
|
|
IndexedWords & indexedWords,
|
|
ChunkedStorage::Writer & chunks,
|
|
unsigned & articleCount,
|
|
unsigned & wordCount,
|
|
ArticleFormat defaultFormat )
|
|
{
|
|
ArticleFormat format( Default );
|
|
|
|
QStringRef formatValue = stream.attributes().value( "f" );
|
|
|
|
if ( formatValue == "v" )
|
|
format = Visual;
|
|
else
|
|
if ( formatValue == "l" )
|
|
format = Logical;
|
|
if( format == Default )
|
|
format = defaultFormat;
|
|
size_t articleOffset = gzFile.pos() - 1; // stream.characterOffset() is loony
|
|
|
|
// uint32_t lineNumber = stream.lineNumber();
|
|
// uint32_t columnNumber = stream.columnNumber();
|
|
|
|
list< QString > words;
|
|
|
|
while( !stream.atEnd() )
|
|
{
|
|
stream.readNext();
|
|
|
|
// Find any <k> tags and index them
|
|
if ( stream.isEndElement() )
|
|
{
|
|
// End of the <ar> tag
|
|
|
|
if ( words.empty() )
|
|
{
|
|
// Nothing to index, this article didn't have any tags
|
|
gdWarning( "No <k> tags found in an article at offset 0x%x, article skipped.\n",
|
|
(unsigned) articleOffset );
|
|
}
|
|
else
|
|
{
|
|
// Add an entry
|
|
|
|
uint32_t offset = chunks.startNewBlock();
|
|
|
|
uint32_t offs = articleOffset;
|
|
uint32_t size = gzFile.pos() - 1 - articleOffset;
|
|
|
|
checkArticlePosition( gzFile, &offs, &size );
|
|
|
|
unsigned char f = format;
|
|
chunks.addToBlock( &f, 1 );
|
|
chunks.addToBlock( &offs, sizeof( offs ) );
|
|
chunks.addToBlock( &size, sizeof( size ) );
|
|
|
|
// Add also first header - it's needed for full-text search
|
|
chunks.addToBlock( words.begin()->toUtf8().data(), words.begin()->toUtf8().length() + 1 );
|
|
|
|
// DPRINTF( "%x: %s\n", articleOffset, words.begin()->toUtf8().data() );
|
|
|
|
// Add words to index
|
|
|
|
for( list< QString >::const_iterator i = words.begin(); i != words.end();
|
|
++i )
|
|
indexedWords.addWord( gd::toWString( *i ), offset );
|
|
|
|
++articleCount;
|
|
|
|
wordCount += words.size();
|
|
}
|
|
|
|
return;
|
|
}
|
|
else
|
|
if ( stream.isStartElement() )
|
|
{
|
|
addAllKeyTags( stream, words );
|
|
}
|
|
}
|
|
}
|
|
|
|
//// XdxfDictionary::getResource()
|
|
|
|
class XdxfResourceRequest;
|
|
|
|
class XdxfResourceRequestRunnable: public QRunnable
|
|
{
|
|
XdxfResourceRequest & r;
|
|
QSemaphore & hasExited;
|
|
|
|
public:
|
|
|
|
XdxfResourceRequestRunnable( XdxfResourceRequest & r_,
|
|
QSemaphore & hasExited_ ): r( r_ ),
|
|
hasExited( hasExited_ )
|
|
{}
|
|
|
|
~XdxfResourceRequestRunnable()
|
|
{
|
|
hasExited.release();
|
|
}
|
|
|
|
virtual void run();
|
|
};
|
|
|
|
class XdxfResourceRequest: public Dictionary::DataRequest
|
|
{
|
|
friend class XdxfResourceRequestRunnable;
|
|
|
|
XdxfDictionary & dict;
|
|
|
|
string resourceName;
|
|
|
|
QAtomicInt isCancelled;
|
|
QSemaphore hasExited;
|
|
|
|
public:
|
|
|
|
XdxfResourceRequest( XdxfDictionary & dict_,
|
|
string const & resourceName_ ):
|
|
dict( dict_ ),
|
|
resourceName( resourceName_ )
|
|
{
|
|
QThreadPool::globalInstance()->start(
|
|
new XdxfResourceRequestRunnable( *this, hasExited ) );
|
|
}
|
|
|
|
void run(); // Run from another thread by XdxfResourceRequestRunnable
|
|
|
|
virtual void cancel()
|
|
{
|
|
isCancelled.ref();
|
|
}
|
|
|
|
~XdxfResourceRequest()
|
|
{
|
|
isCancelled.ref();
|
|
hasExited.acquire();
|
|
}
|
|
};
|
|
|
|
void XdxfResourceRequestRunnable::run()
|
|
{
|
|
r.run();
|
|
}
|
|
|
|
void XdxfResourceRequest::run()
|
|
{
|
|
// Some runnables linger enough that they are cancelled before they start
|
|
if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
if ( dict.ensureInitDone().size() )
|
|
{
|
|
setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
string n =
|
|
FsEncoding::dirname( dict.getDictionaryFilenames()[ 0 ] ) +
|
|
FsEncoding::separator() +
|
|
FsEncoding::encode( resourceName );
|
|
|
|
GD_DPRINTF( "n is %s\n", n.c_str() );
|
|
|
|
try
|
|
{
|
|
try
|
|
{
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
File::loadFromFile( n, data );
|
|
}
|
|
catch( File::exCantOpen & )
|
|
{
|
|
n = dict.getDictionaryFilenames()[ 0 ] + ".files" +
|
|
FsEncoding::separator() +
|
|
FsEncoding::encode( resourceName );
|
|
|
|
try
|
|
{
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
File::loadFromFile( n, data );
|
|
}
|
|
catch( File::exCantOpen & )
|
|
{
|
|
// Try reading from zip file
|
|
|
|
if ( dict.resourceZip.isOpen() )
|
|
{
|
|
Mutex::Lock _( dict.resourceZipMutex );
|
|
|
|
Mutex::Lock __( dataMutex );
|
|
|
|
if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) )
|
|
throw; // Make it fail since we couldn't read the archive
|
|
}
|
|
else
|
|
throw;
|
|
}
|
|
}
|
|
|
|
if ( Filetype::isNameOfTiff( resourceName ) )
|
|
{
|
|
// Convert it
|
|
|
|
dataMutex.lock();
|
|
|
|
QImage img = QImage::fromData( (unsigned char *) &data.front(),
|
|
data.size() );
|
|
|
|
#ifdef MAKE_EXTRA_TIFF_HANDLER
|
|
if( img.isNull() )
|
|
GdTiff::tiffToQImage( &data.front(), data.size(), img );
|
|
#endif
|
|
|
|
dataMutex.unlock();
|
|
|
|
if ( !img.isNull() )
|
|
{
|
|
// Managed to load -- now store it back as BMP
|
|
|
|
QByteArray ba;
|
|
QBuffer buffer( &ba );
|
|
buffer.open( QIODevice::WriteOnly );
|
|
img.save( &buffer, "BMP" );
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
data.resize( buffer.size() );
|
|
|
|
memcpy( &data.front(), buffer.data(), data.size() );
|
|
}
|
|
}
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
hasAnyData = true;
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdCWarning( dictionaryResourceLc, "XDXF: Failed loading resource \"%s\" for \"%s\", reason: %s\n",
|
|
resourceName.c_str(), dict.getName().c_str(), ex.what() );
|
|
// Resource not loaded -- we don't set the hasAnyData flag then
|
|
}
|
|
|
|
finish();
|
|
}
|
|
|
|
sptr< Dictionary::DataRequest > XdxfDictionary::getResource( string const & name )
|
|
THROW_SPEC( std::exception )
|
|
{
|
|
return new XdxfResourceRequest( *this, name );
|
|
}
|
|
|
|
}
|
|
// anonymous namespace - this section of file is devoted to rebuilding of dictionary articles index
|
|
|
|
vector< sptr< Dictionary::Class > > makeDictionaries(
|
|
vector< string > const & fileNames,
|
|
string const & indicesDir,
|
|
Dictionary::Initializing & initializing )
|
|
THROW_SPEC( std::exception )
|
|
{
|
|
vector< sptr< Dictionary::Class > > dictionaries;
|
|
|
|
for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
|
|
++i )
|
|
{
|
|
// Only allow .xdxf and .xdxf.dz suffixes
|
|
|
|
if ( ( i->size() < 5 || strcasecmp( i->c_str() + ( i->size() - 5 ), ".xdxf" ) != 0 ) &&
|
|
( i->size() < 8 ||
|
|
strcasecmp( i->c_str() + ( i->size() - 8 ), ".xdxf.dz" ) != 0 ) )
|
|
continue;
|
|
|
|
try
|
|
{
|
|
vector< string > dictFiles( 1, *i );
|
|
|
|
string baseName = ( (*i)[ i->size() - 5 ] == '.' ) ?
|
|
string( *i, 0, i->size() - 5 ) : string( *i, 0, i->size() - 8 );
|
|
|
|
// See if there's a zip file with resources present. If so, include it.
|
|
|
|
string zipFileName;
|
|
|
|
if ( File::tryPossibleZipName( baseName + ".xdxf.files.zip", zipFileName ) ||
|
|
File::tryPossibleZipName( baseName + ".xdxf.dz.files.zip", zipFileName ) ||
|
|
File::tryPossibleZipName( baseName + ".XDXF.FILES.ZIP", zipFileName ) ||
|
|
File::tryPossibleZipName( baseName + ".XDXF.DZ.FILES.ZIP", zipFileName ) )
|
|
dictFiles.push_back( zipFileName );
|
|
|
|
string dictId = Dictionary::makeDictionaryId( dictFiles );
|
|
|
|
string indexFile = indicesDir + dictId;
|
|
|
|
if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
|
|
indexIsOldOrBad( indexFile ) )
|
|
{
|
|
// Building the index
|
|
|
|
gdDebug( "Xdxf: Building the index for dictionary: %s\n", i->c_str() );
|
|
|
|
//initializing.indexingDictionary( nameFromFileName( dictFiles[ 0 ] ) );
|
|
|
|
File::Class idx( indexFile, "wb" );
|
|
|
|
IdxHeader idxHeader;
|
|
map< string, string > abrv;
|
|
|
|
memset( &idxHeader, 0, sizeof( idxHeader ) );
|
|
|
|
// We write a dummy header first. At the end of the process the header
|
|
// will be rewritten with the right values.
|
|
|
|
idx.write( idxHeader );
|
|
|
|
IndexedWords indexedWords;
|
|
|
|
GzippedFile gzFile( dictFiles[ 0 ].c_str() );
|
|
|
|
if ( !gzFile.open( QIODevice::ReadOnly ) )
|
|
throw exCantReadFile( dictFiles[ 0 ] );
|
|
|
|
QXmlStreamReader stream( &gzFile );
|
|
|
|
QString dictionaryName, dictionaryDescription;
|
|
|
|
ChunkedStorage::Writer chunks( idx );
|
|
|
|
// Wait for the first element, which must be xdxf
|
|
|
|
bool hadXdxf = false;
|
|
|
|
while( !stream.atEnd() )
|
|
{
|
|
stream.readNext();
|
|
|
|
if ( stream.isStartElement() )
|
|
{
|
|
if ( stream.name() != "xdxf" )
|
|
throw exNotXdxfFile( dictFiles[ 0 ] );
|
|
else
|
|
{
|
|
// Read the xdxf
|
|
|
|
string str = stream.attributes().value( "lang_from" ).toString().toLatin1().data();
|
|
|
|
if ( str.size() > 3 )
|
|
str.resize( 3 );
|
|
|
|
idxHeader.langFrom = LangCoder::findIdForLanguageCode3( str.c_str() );
|
|
|
|
str = stream.attributes().value( "lang_to" ).toString().toLatin1().data();
|
|
|
|
if ( str.size() > 3 )
|
|
str.resize( 3 );
|
|
|
|
idxHeader.langTo = LangCoder::findIdForLanguageCode3( str.c_str() );
|
|
|
|
bool isLogical = ( stream.attributes().value( "format" ) == "logical" );
|
|
|
|
QRegExp regNum( "\\d+" );
|
|
regNum.indexIn( stream.attributes().value( "revision" ).toString() );
|
|
idxHeader.revisionNumber = regNum.cap().toUInt();
|
|
|
|
idxHeader.articleFormat = isLogical ? Logical : Visual;
|
|
|
|
unsigned articleCount = 0, wordCount = 0;
|
|
|
|
while( !stream.atEnd() )
|
|
{
|
|
stream.readNext();
|
|
|
|
if ( stream.isStartElement() )
|
|
{
|
|
// todo implement using short <title> for denoting the dictionary in settings or dict list toolbar
|
|
if ( stream.name() == "full_name" || stream.name() == "full_title" )
|
|
{
|
|
// That's our name
|
|
|
|
QString name = stream.readElementText();
|
|
|
|
if ( dictionaryName.isEmpty() )
|
|
{
|
|
dictionaryName = name;
|
|
|
|
initializing.indexingDictionary( dictionaryName.toUtf8().data() );
|
|
|
|
idxHeader.nameAddress = chunks.startNewBlock();
|
|
|
|
QByteArray n = dictionaryName.toUtf8();
|
|
|
|
idxHeader.nameSize = n.size();
|
|
|
|
chunks.addToBlock( n.data(), n.size() );
|
|
}
|
|
else
|
|
{
|
|
GD_DPRINTF( "Warning: duplicate full_name in %s\n", dictFiles[ 0 ].c_str() );
|
|
}
|
|
}
|
|
else
|
|
if ( stream.name() == "description" )
|
|
{
|
|
// todo implement adding other information to the description like <publisher>, <authors>, <file_ver>, <creation_date>, <last_edited_date>, <dict_edition>, <publishing_date>, <dict_src_url>
|
|
QString desc = readXhtmlData( stream );
|
|
|
|
if ( dictionaryDescription.isEmpty() )
|
|
{
|
|
dictionaryDescription = desc;
|
|
idxHeader.descriptionAddress = chunks.startNewBlock();
|
|
|
|
QByteArray n = dictionaryDescription.toUtf8();
|
|
|
|
idxHeader.descriptionSize = n.size();
|
|
|
|
chunks.addToBlock( n.data(), n.size() );
|
|
}
|
|
else
|
|
{
|
|
GD_DPRINTF( "Warning: duplicate description in %s\n", dictFiles[ 0 ].c_str() );
|
|
}
|
|
}
|
|
else
|
|
if ( stream.name() == "abbreviations" )
|
|
{
|
|
QString s;
|
|
string value;
|
|
list < wstring > keys;
|
|
while( !( stream.isEndElement() && stream.name() == "abbreviations" ) && !stream.atEnd() )
|
|
{
|
|
if( !stream.readNextStartElement() )
|
|
break;
|
|
// abbreviations tag set switch at format revision = 30
|
|
if( idxHeader.revisionNumber >= 30 )
|
|
{
|
|
while ( !( stream.isEndElement() && stream.name() == "abbr_def" ) || !stream.atEnd() )
|
|
{
|
|
if ( stream.isStartElement() && stream.name() == "abbr_k" )
|
|
{
|
|
s = readElementText( stream );
|
|
keys.push_back( gd::toWString( s ) );
|
|
}
|
|
else if ( stream.isStartElement() && stream.name() == "abbr_v" )
|
|
{
|
|
s = readElementText( stream );
|
|
value = Utf8::encode( Folding::trimWhitespace( gd::toWString( s ) ) );
|
|
for( list< wstring >::iterator i = keys.begin(); i != keys.end(); ++i )
|
|
{
|
|
abrv[ Utf8::encode( Folding::trimWhitespace( *i ) ) ] = value;
|
|
}
|
|
keys.clear();
|
|
}
|
|
else if ( stream.isEndElement() && stream.name() == "abbreviations" )
|
|
break;
|
|
stream.readNext();
|
|
}
|
|
}
|
|
else
|
|
{
|
|
while ( !( stream.isEndElement() && stream.name() == "abr_def" ) || !stream.atEnd() )
|
|
{
|
|
if ( stream.isStartElement() && stream.name() == "k" )
|
|
{
|
|
s = readElementText( stream );
|
|
keys.push_back( gd::toWString( s ) );
|
|
}
|
|
else if ( stream.isStartElement() && stream.name() == "v" )
|
|
{
|
|
s = readElementText( stream );
|
|
value = Utf8::encode( Folding::trimWhitespace( gd::toWString( s ) ) );
|
|
for( list< wstring >::iterator i = keys.begin(); i != keys.end(); ++i )
|
|
{
|
|
abrv[ Utf8::encode( Folding::trimWhitespace( *i ) ) ] = value;
|
|
}
|
|
keys.clear();
|
|
}
|
|
else if ( stream.isEndElement() && stream.name() == "abbreviations" )
|
|
break;
|
|
stream.readNext();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
if ( stream.name() == "ar" )
|
|
{
|
|
indexArticle( gzFile, stream, indexedWords, chunks,
|
|
articleCount, wordCount, isLogical ? Logical : Visual );
|
|
}
|
|
}
|
|
}
|
|
|
|
// Write abbreviations if presented
|
|
|
|
if( !abrv.empty() )
|
|
{
|
|
idxHeader.hasAbrv = 1;
|
|
idxHeader.abrvAddress = chunks.startNewBlock();
|
|
|
|
uint32_t sz = abrv.size();
|
|
|
|
chunks.addToBlock( &sz, sizeof( uint32_t ) );
|
|
|
|
for( map< string, string >::const_iterator i = abrv.begin(); i != abrv.end(); ++i )
|
|
{
|
|
sz = i->first.size();
|
|
chunks.addToBlock( &sz, sizeof( uint32_t ) );
|
|
chunks.addToBlock( i->first.data(), sz );
|
|
sz = i->second.size();
|
|
chunks.addToBlock( &sz, sizeof( uint32_t ) );
|
|
chunks.addToBlock( i->second.data(), sz );
|
|
}
|
|
}
|
|
|
|
// Finish with the chunks
|
|
|
|
idxHeader.chunksOffset = chunks.finish();
|
|
|
|
// Build index
|
|
|
|
IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
|
|
|
|
idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
|
|
idxHeader.indexRootOffset = idxInfo.rootOffset;
|
|
|
|
indexedWords.clear(); // Release memory -- no need for this data
|
|
|
|
// If there was a zip file, index it too
|
|
|
|
if ( zipFileName.size() )
|
|
{
|
|
GD_DPRINTF( "Indexing zip file\n" );
|
|
|
|
idxHeader.hasZipFile = 1;
|
|
|
|
IndexedWords zipFileNames;
|
|
IndexedZip zipFile;
|
|
if( zipFile.openZipFile( QDir::fromNativeSeparators(
|
|
FsEncoding::decode( zipFileName.c_str() ) ) ) )
|
|
zipFile.indexFile( zipFileNames );
|
|
|
|
if( !zipFileNames.empty() )
|
|
{
|
|
// Build the resulting zip file index
|
|
|
|
IndexInfo idxInfo = BtreeIndexing::buildIndex( zipFileNames, idx );
|
|
|
|
idxHeader.zipIndexBtreeMaxElements = idxInfo.btreeMaxElements;
|
|
idxHeader.zipIndexRootOffset = idxInfo.rootOffset;
|
|
}
|
|
else
|
|
{
|
|
// Bad zip file -- no index (though the mark that we have one
|
|
// remains)
|
|
idxHeader.zipIndexBtreeMaxElements = 0;
|
|
idxHeader.zipIndexRootOffset = 0;
|
|
}
|
|
}
|
|
else
|
|
idxHeader.hasZipFile = 0;
|
|
// That concludes it. Update the header.
|
|
|
|
idxHeader.signature = Signature;
|
|
idxHeader.formatVersion = CurrentFormatVersion;
|
|
|
|
idxHeader.articleCount = articleCount;
|
|
idxHeader.wordCount = wordCount;
|
|
|
|
idx.rewind();
|
|
|
|
idx.write( &idxHeader, sizeof( idxHeader ) );
|
|
|
|
hadXdxf = true;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
if ( !hadXdxf )
|
|
throw exNotXdxfFile( dictFiles[ 0 ] );
|
|
|
|
if ( stream.hasError() )
|
|
{
|
|
gdWarning( "%s had a parse error %s at line %lu, and therefore was indexed only up to the point of error.",
|
|
dictFiles[ 0 ].c_str(), stream.errorString().toUtf8().data(),
|
|
(unsigned long) stream.lineNumber() );
|
|
}
|
|
}
|
|
|
|
dictionaries.push_back( new XdxfDictionary( dictId,
|
|
indexFile,
|
|
dictFiles ) );
|
|
}
|
|
catch( std::exception & e )
|
|
{
|
|
gdWarning( "Xdxf dictionary initializing failed: %s, error: %s\n",
|
|
i->c_str(), e.what() );
|
|
}
|
|
}
|
|
|
|
return dictionaries;
|
|
}
|
|
|
|
}
|