2010-12-09 12:31:50 +00:00
/* This file is (c) 2008-2009 Konstantin Isakov <ikm@goldendict.org>
2009-04-29 13:34:56 +00:00
* Part of GoldenDict . Licensed under GPLv3 or later , see the LICENSE file */
# include "xdxf.hh"
# include "btreeidx.hh"
# include "folding.hh"
# include "utf8.hh"
# include "chunkedstorage.hh"
2023-04-17 20:12:27 +00:00
# include "dictzip.hh"
2009-04-29 13:34:56 +00:00
# include "htmlescape.hh"
# include "fsencoding.hh"
# include <map>
# include <set>
# include <string>
# include <vector>
# include <list>
# include <wctype.h>
# include <stdlib.h>
2013-11-16 18:34:09 +00:00
# include "gddebug.hh"
2012-01-24 12:55:28 +00:00
# include "wstring_qt.hh"
# include "xdxf2html.hh"
# include "ufile.hh"
2023-04-17 20:12:27 +00:00
# include "dictzip.hh"
2012-01-27 11:40:42 +00:00
# include "langcoder.hh"
2012-01-31 12:49:37 +00:00
# include "indexedzip.hh"
# include "filetype.hh"
2014-02-11 14:02:00 +00:00
# include "tiff.hh"
2014-04-16 16:18:28 +00:00
# include "ftshelpers.hh"
2009-04-29 13:34:56 +00:00
2013-08-04 19:19:57 +00:00
# ifdef _MSC_VER
# include <stub_msvc.h>
# endif
2009-04-29 13:34:56 +00:00
# include <QIODevice>
# include <QXmlStreamReader>
# include <QTextDocument>
2012-01-24 12:55:28 +00:00
# include <QFileInfo>
# include <QDir>
# include <QPainter>
2015-07-07 15:02:06 +00:00
# include <QRegExp>
2022-02-27 05:17:37 +00:00
# if (QT_VERSION >= QT_VERSION_CHECK(6,0,0))
# include <QtCore5Compat>
# endif
2009-04-29 13:34:56 +00:00
# include <QSemaphore>
# include <QThreadPool>
# include <QAtomicInt>
2021-11-27 07:17:33 +00:00
# include "utils.hh"
2013-05-30 13:24:21 +00:00
2009-04-29 13:34:56 +00:00
namespace Xdxf {
using std : : map ;
using std : : multimap ;
using std : : pair ;
using std : : set ;
using std : : string ;
using gd : : wstring ;
using std : : vector ;
using std : : list ;
using BtreeIndexing : : WordArticleLink ;
using BtreeIndexing : : IndexedWords ;
using BtreeIndexing : : IndexInfo ;
2023-01-19 14:57:47 +00:00
/// Translates the beginning of a language tag into a numeric language id.
///
/// Only the first three characters of @p lang are examined. When that prefix
/// ends with the character tested below, the last character is dropped.
/// A two-character remainder is resolved with LangCoder::code2toInt(), a
/// three-character one with LangCoder::findIdForLanguageCode3().
///
/// @return the id reported by LangCoder, or 0 when the remainder is neither
///         two nor three characters long.
quint32 getLanguageId(const QString & lang)
{
  QString code = lang.left(3);
  if (code.endsWith(QChar(' - ')))
    code.chop(1);

  const auto length = code.size();

  if (length == 2)
    return LangCoder::code2toInt(code.toLatin1().data());

  if (length == 3)
    return LangCoder::findIdForLanguageCode3(code.toLatin1().data());

  return 0;
}
2009-04-29 13:34:56 +00:00
namespace {
// Exception types used by the XDXF backend. Each one is declared through the
// project's DEF_EX / DEF_EX_STR macros with Dictionary::Ex as the last
// argument (presumably the base class -- confirm against the macro definition).
// The DEF_EX_STR variants are thrown with an extra string argument in this
// file (a file name or an error description).
DEF_EX_STR ( exCantReadFile , " Can't read file " , Dictionary : : Ex )
DEF_EX_STR ( exNotXdxfFile , " The file is not an XDXF file: " , Dictionary : : Ex )
DEF_EX ( exCorruptedIndex , " The index file is corrupted " , Dictionary : : Ex )
2014-04-24 18:50:47 +00:00
// Thrown when the dictionary body cannot be opened through dict_data_open().
DEF_EX_STR ( exDictzipError , " DICTZIP error " , Dictionary : : Ex )
2009-04-29 13:34:56 +00:00
// Constants identifying an index file written by this backend.
// indexIsOldOrBad() rejects any index whose header does not carry exactly
// these two values.
enum
{
Signature = 0x46584458 , // XDXF on little-endian, FXDX on big-endian
2023-01-19 14:57:47 +00:00
// Own revision (6) combined with the versions of the btree index and of the
// folding rules, so a change in any of the three yields a different value.
CurrentFormatVersion = 6 + BtreeIndexing : : FormatVersion + Folding : : Version
2009-04-29 13:34:56 +00:00
} ;
// Formatting flavour of an article. indexArticle() derives it from the "f"
// attribute of the article element and stores it as the first byte of the
// article's record; loadArticle() reads that byte back and tells the converter
// whether the article is Logical.
enum ArticleFormat
{
Default = 0 , // No explicit format; indexArticle() substitutes the caller's default
Visual = 1 ,
Logical = 2
} ;
// Header stored at the beginning of the index file. It is read back as one raw
// record (see indexIsOldOrBad() and the XdxfDictionary constructor), so the
// order and width of the fields below define the file layout and must not be
// changed without bumping CurrentFormatVersion.
struct IdxHeader
{
uint32_t signature ; // First comes the signature, XDXF
uint32_t formatVersion ; // File format version (CurrentFormatVersion)
uint32_t articleFormat ; // ArticleFormat value, except that 0 = bad file
2012-01-27 11:40:42 +00:00
uint32_t langFrom ; // Source language
uint32_t langTo ; // Target language
2009-04-29 13:34:56 +00:00
uint32_t articleCount ; // Total number of articles
uint32_t wordCount ; // Total number of words
uint32_t nameAddress ; // Address of an utf8 name string, in chunks
uint32_t nameSize ; // And its size
uint32_t descriptionAddress ; // Address of an utf8 description string, in chunks
uint32_t descriptionSize ; // And its size
2012-01-24 12:55:28 +00:00
uint32_t hasAbrv ; // Non-zero means file has abrvs at abrvAddress
uint32_t abrvAddress ; // Address of abrv map in the chunked storage
2009-04-29 13:34:56 +00:00
uint32_t chunksOffset ; // The offset to chunks' storage
uint32_t indexBtreeMaxElements ; // Two fields from IndexInfo
uint32_t indexRootOffset ;
2012-01-31 12:49:37 +00:00
uint32_t hasZipFile ; // Non-zero means there's a zip file with resources
// present
uint32_t zipIndexBtreeMaxElements ; // Two fields from IndexInfo of the zip
// resource index.
uint32_t zipIndexRootOffset ;
2013-06-15 08:55:15 +00:00
uint32_t revisionNumber ; // Format revision
2013-08-04 19:19:57 +00:00
}
// The packed attribute is applied on every compiler except MSVC.
# ifndef _MSC_VER
__attribute__ ( ( packed ) )
# endif
;
2009-04-29 13:34:56 +00:00
/// Decides whether the index file at @p indexFile has to be rebuilt.
///
/// @return true when the header cannot be read in full, when its signature or
///         format version differ from the ones this build writes, or when its
///         articleFormat field is zero; false otherwise.
bool indexIsOldOrBad(string const & indexFile)
{
  File::Class indexStream(indexFile, " rb ");

  IdxHeader hdr;

  // A short read means there is no complete header to inspect.
  if (indexStream.readRecords(&hdr, sizeof(hdr), 1) != 1)
    return true;

  if (hdr.signature != Signature)
    return true;

  if (hdr.formatVersion != CurrentFormatVersion)
    return true;

  // A zero articleFormat marks a bad file.
  return hdr.articleFormat == 0;
}
class XdxfDictionary : public BtreeIndexing : : BtreeDictionary
{
Mutex idxMutex ;
File : : Class idx ;
IdxHeader idxHeader ;
2012-01-24 12:55:28 +00:00
sptr < ChunkedStorage : : Reader > chunks ;
2009-04-29 13:34:56 +00:00
Mutex dzMutex ;
dictData * dz ;
2012-01-31 12:49:37 +00:00
Mutex resourceZipMutex ;
IndexedZip resourceZip ;
2009-04-29 13:34:56 +00:00
string dictionaryName ;
2012-01-24 12:55:28 +00:00
map < string , string > abrv ;
2009-04-29 13:34:56 +00:00
public :
XdxfDictionary ( string const & id , string const & indexFile ,
vector < string > const & dictionaryFiles ) ;
~ XdxfDictionary ( ) ;
2022-12-29 07:07:40 +00:00
string getName ( ) noexcept override
2009-04-29 13:34:56 +00:00
{ return dictionaryName ; }
2022-12-29 07:07:40 +00:00
map < Dictionary : : Property , string > getProperties ( ) noexcept override
2009-04-29 13:34:56 +00:00
{ return map < Dictionary : : Property , string > ( ) ; }
2022-12-29 07:07:40 +00:00
unsigned long getArticleCount ( ) noexcept override
2012-01-27 11:40:42 +00:00
{ return idxHeader . articleCount ; }
2009-04-29 13:34:56 +00:00
2022-12-29 07:07:40 +00:00
unsigned long getWordCount ( ) noexcept override
2009-04-29 13:34:56 +00:00
{ return idxHeader . wordCount ; }
2022-12-29 07:07:40 +00:00
inline quint32 getLangFrom ( ) const override
2012-01-27 11:40:42 +00:00
{ return idxHeader . langFrom ; }
2022-12-29 07:07:40 +00:00
inline quint32 getLangTo ( ) const override
2012-01-27 11:40:42 +00:00
{ return idxHeader . langTo ; }
2022-12-29 07:07:40 +00:00
sptr < Dictionary : : DataRequest > getArticle ( wstring const & ,
2012-01-24 12:55:28 +00:00
vector < wstring > const & alts ,
2018-06-13 16:00:42 +00:00
wstring const & ,
2022-12-29 07:07:40 +00:00
bool ignoreDiacritics ) override
2022-01-09 08:35:07 +00:00
;
2009-04-29 13:34:56 +00:00
2022-12-29 07:07:40 +00:00
sptr < Dictionary : : DataRequest > getResource ( string const & name ) override
2022-01-09 08:35:07 +00:00
;
2012-01-31 12:49:37 +00:00
2022-12-29 07:07:40 +00:00
QString const & getDescription ( ) override ;
2012-09-07 11:51:42 +00:00
2022-12-29 07:07:40 +00:00
QString getMainFilename ( ) override ;
2012-11-29 17:12:54 +00:00
2022-12-29 07:07:40 +00:00
sptr < Dictionary : : DataRequest > getSearchResults ( QString const & searchString ,
2014-04-16 16:18:28 +00:00
int searchMode , bool matchCase ,
int distanceBetweenWords ,
2017-07-25 15:28:29 +00:00
int maxResults ,
2018-04-10 14:49:52 +00:00
bool ignoreWordsOrder ,
2022-12-29 07:07:40 +00:00
bool ignoreDiacritics ) override ;
void getArticleText ( uint32_t articleAddress , QString & headword , QString & text ) override ;
2014-04-16 16:18:28 +00:00
2022-12-29 07:07:40 +00:00
void makeFTSIndex ( QAtomicInt & isCancelled , bool firstIteration ) override ;
2014-04-16 16:18:28 +00:00
2022-12-29 07:07:40 +00:00
void setFTSParameters ( Config : : FullTextSearch const & fts ) override
2014-04-17 14:31:51 +00:00
{
can_FTS = fts . enabled
& & ! fts . disabledTypes . contains ( " XDXF " , Qt : : CaseInsensitive )
& & ( fts . maxDictionarySize = = 0 | | getArticleCount ( ) < = fts . maxDictionarySize ) ;
}
2022-12-29 07:07:40 +00:00
uint32_t getFtsIndexVersion ( ) override
2016-05-29 00:56:59 +00:00
{ return 1 ; }
2012-12-03 12:47:43 +00:00
protected :
2009-04-29 13:34:56 +00:00
2022-12-29 07:07:40 +00:00
void loadIcon ( ) noexcept override ;
2012-12-03 12:47:43 +00:00
private :
2012-01-24 12:55:28 +00:00
2013-06-15 08:55:15 +00:00
// Loads the article, storing its headword and formatting article's data into an html.
2009-04-29 13:34:56 +00:00
void loadArticle ( uint32_t address ,
2014-04-16 16:18:28 +00:00
string & articleText , QString * headword = 0 ) ;
2009-04-29 13:34:56 +00:00
friend class XdxfArticleRequest ;
2012-01-31 12:49:37 +00:00
friend class XdxfResourceRequest ;
2009-04-29 13:34:56 +00:00
} ;
/// Opens an already indexed XDXF dictionary.
///
/// @param id              dictionary id, handed to the base class
/// @param indexFile       path of the index file; its header is read at once
/// @param dictionaryFiles dictionary files; the first one is the XDXF body,
///                        the last one is tried as the resource zip
/// @throws exDictzipError when the dictionary body cannot be opened.
///         Exceptions raised while reading the index are passed on unchanged.
XdxfDictionary::XdxfDictionary(string const & id,
                               string const & indexFile,
                               vector<string> const & dictionaryFiles):
  BtreeDictionary(id, dictionaryFiles),
  idx(indexFile, " rb "),
  idxHeader(idx.read<IdxHeader>())
{
  chunks = std::shared_ptr<ChunkedStorage::Reader>(
    new ChunkedStorage::Reader(idx, idxHeader.chunksOffset));

  // Read the dictionary name

  if (idxHeader.nameSize)
  {
    vector<char> chunk;

    dictionaryName = string(chunks->getBlock(idxHeader.nameAddress, chunk),
                            idxHeader.nameSize);
  }

  // Open the file

  DZ_ERRORS error;
  dz = dict_data_open(dictionaryFiles[0].c_str(), &error, 0);

  if (!dz)
    throw exDictzipError(string(dz_error_str(error))
                         + " ( " + dictionaryFiles[0] + " ) ");

  // From here on dz is owned by this object, but the destructor does not run
  // when a constructor exits with an exception, so release it by hand on any
  // failure below.
  try
  {
    // Read the abrv, if any. The block holds a 32-bit entry count followed by
    // (key size, key bytes, value size, value bytes) records.

    if (idxHeader.hasAbrv)
    {
      vector<char> chunk;

      char * abrvBlock = chunks->getBlock(idxHeader.abrvAddress, chunk);

      uint32_t total;
      memcpy(&total, abrvBlock, sizeof(uint32_t));
      abrvBlock += sizeof(uint32_t);

      while (total--)
      {
        uint32_t keySz;
        memcpy(&keySz, abrvBlock, sizeof(uint32_t));
        abrvBlock += sizeof(uint32_t);

        char * key = abrvBlock;
        abrvBlock += keySz;

        uint32_t valueSz;
        memcpy(&valueSz, abrvBlock, sizeof(uint32_t));
        abrvBlock += sizeof(uint32_t);

        abrv[string(key, keySz)] = string(abrvBlock, valueSz);
        abrvBlock += valueSz;
      }
    }

    // Open a resource zip file, if there's one. This does not depend on the
    // abbreviation table, so it is done whether or not the dictionary has one.

    if (idxHeader.hasZipFile &&
        (idxHeader.zipIndexBtreeMaxElements ||
         idxHeader.zipIndexRootOffset))
    {
      resourceZip.openIndex(IndexInfo(idxHeader.zipIndexBtreeMaxElements,
                                      idxHeader.zipIndexRootOffset),
                            idx, idxMutex);

      QString zipName = QDir::fromNativeSeparators(getDictionaryFilenames().back().c_str());

      if (zipName.endsWith(" .zip ", Qt::CaseInsensitive)) // Sanity check
        resourceZip.openZipFile(zipName);
    }

    // Initialize the index

    openIndex(IndexInfo(idxHeader.indexBtreeMaxElements,
                        idxHeader.indexRootOffset),
              idx, idxMutex);

    // Full-text search parameters

    can_FTS = true;

    ftsIdxName = indexFile + Dictionary::getFtsSuffix();

    if (!Dictionary::needToRebuildIndex(dictionaryFiles, ftsIdxName)
        && !FtsHelpers::ftsIndexIsOldOrBad(ftsIdxName, this))
      FTS_index_completed.ref();
  }
  catch (...)
  {
    dict_data_close(dz);
    dz = nullptr;
    throw;
  }
}
/// Closes the dictionary body if it was opened.
XdxfDictionary::~XdxfDictionary()
{
  if (!dz)
    return;

  dict_data_close(dz);
}
2022-06-03 13:28:41 +00:00
void XdxfDictionary : : loadIcon ( ) noexcept
2012-01-24 12:55:28 +00:00
{
if ( dictionaryIconLoaded )
return ;
2023-04-13 10:08:32 +00:00
QString fileName = QDir : : fromNativeSeparators ( getDictionaryFilenames ( ) [ 0 ] . c_str ( ) ) ;
2012-01-24 12:55:28 +00:00
2012-01-31 12:49:37 +00:00
QFileInfo baseInfo ( fileName ) ;
2012-01-24 12:55:28 +00:00
2012-01-31 12:49:37 +00:00
fileName = baseInfo . absoluteDir ( ) . absoluteFilePath ( " icon32.png " ) ;
2012-01-24 12:55:28 +00:00
QFileInfo info ( fileName ) ;
2012-01-31 12:49:37 +00:00
2012-12-03 12:47:43 +00:00
if ( ! info . isFile ( ) )
2012-01-24 12:55:28 +00:00
{
2012-01-31 12:49:37 +00:00
fileName = baseInfo . absoluteDir ( ) . absoluteFilePath ( " icon16.png " ) ;
2012-01-24 12:55:28 +00:00
info = QFileInfo ( fileName ) ;
}
2012-12-03 12:47:43 +00:00
if ( info . isFile ( ) )
loadIconFromFile ( fileName , true ) ;
2012-01-24 12:55:28 +00:00
if ( dictionaryIcon . isNull ( ) )
{
// Load failed -- use default icons
dictionaryIcon = QIcon ( " :/icons/icon32_xdxf.png " ) ;
dictionaryNativeIcon = QIcon ( " :/icons/icon32_xdxf.png " ) ;
}
dictionaryIconLoaded = true ;
}
2012-09-07 11:51:42 +00:00
/// Returns the dictionary description, reading it from the index on first use.
///
/// A non-empty description already held in dictionaryDescription is returned
/// as is. When the header has no description address, the placeholder below is
/// stored and returned. Otherwise the text is read from the chunked storage
/// under idxMutex.
///
/// Reading is best-effort: a failure is logged and leaves the description as
/// it was (empty), it is never propagated to the caller.
QString const & XdxfDictionary::getDescription()
{
  if (!dictionaryDescription.isEmpty())
    return dictionaryDescription;

  if (idxHeader.descriptionAddress == 0)
  {
    dictionaryDescription = " NONE ";
    return dictionaryDescription;
  }

  try
  {
    vector<char> chunk;
    char * descr;
    {
      Mutex::Lock _(idxMutex);
      descr = chunks->getBlock(idxHeader.descriptionAddress, chunk);
    }
    dictionaryDescription = QString::fromUtf8(descr, idxHeader.descriptionSize);
  }
  catch (std::exception & ex)
  {
    // Report the failure instead of dropping it silently
    gdWarning("Xdxf: Failed reading description of \"%s\", reason: %s\n",
              getName().c_str(), ex.what());
  }
  catch (...)
  {
    gdWarning("Xdxf: Failed reading description of \"%s\"\n", getName().c_str());
  }

  return dictionaryDescription;
}
2023-04-13 10:08:32 +00:00
/// Returns the first of the dictionary's file names as a QString.
QString XdxfDictionary::getMainFilename()
{
  return QString(getDictionaryFilenames()[0].c_str());
}
2012-11-29 17:12:54 +00:00
2014-04-16 16:18:28 +00:00
void XdxfDictionary : : makeFTSIndex ( QAtomicInt & isCancelled , bool firstIteration )
{
if ( ! ( Dictionary : : needToRebuildIndex ( getDictionaryFilenames ( ) , ftsIdxName )
2014-11-22 14:22:04 +00:00
| | FtsHelpers : : ftsIndexIsOldOrBad ( ftsIdxName , this ) ) )
2014-04-16 16:18:28 +00:00
FTS_index_completed . ref ( ) ;
if ( haveFTSIndex ( ) )
return ;
if ( ensureInitDone ( ) . size ( ) )
return ;
if ( firstIteration & & getArticleCount ( ) > FTS : : MaxDictionarySizeForFastSearch )
return ;
gdDebug ( " Xdxf: Building the full-text index for dictionary: %s \n " ,
getName ( ) . c_str ( ) ) ;
try
{
FtsHelpers : : makeFTSIndex ( this , isCancelled ) ;
2014-04-17 14:31:51 +00:00
FTS_index_completed . ref ( ) ;
2014-04-16 16:18:28 +00:00
}
catch ( std : : exception & ex )
{
gdWarning ( " Xdxf: Failed building full-text search index for \" %s \" , reason: %s \n " , getName ( ) . c_str ( ) , ex . what ( ) ) ;
2023-04-13 10:08:32 +00:00
QFile : : remove ( ftsIdxName . c_str ( ) ) ;
2014-04-16 16:18:28 +00:00
}
}
void XdxfDictionary : : getArticleText ( uint32_t articleAddress , QString & headword , QString & text )
{
try
{
string articleStr ;
loadArticle ( articleAddress , articleStr , & headword ) ;
wstring wstr = Utf8 : : decode ( articleStr ) ;
text = Html : : unescape ( gd : : toQString ( wstr ) ) ;
}
catch ( std : : exception & ex )
{
gdWarning ( " Xdxf: Failed retrieving article from \" %s \" , reason: %s \n " , getName ( ) . c_str ( ) , ex . what ( ) ) ;
}
}
/// Creates a full-text search request over this dictionary. All arguments are
/// forwarded unchanged to FtsHelpers::FTSResultsRequest.
sptr<Dictionary::DataRequest> XdxfDictionary::getSearchResults(QString const & searchString,
                                                               int searchMode,
                                                               bool matchCase,
                                                               int distanceBetweenWords,
                                                               int maxResults,
                                                               bool ignoreWordsOrder,
                                                               bool ignoreDiacritics)
{
  return std::make_shared<FtsHelpers::FTSResultsRequest>(*this,
                                                         searchString,
                                                         searchMode,
                                                         matchCase,
                                                         distanceBetweenWords,
                                                         maxResults,
                                                         ignoreWordsOrder,
                                                         ignoreDiacritics);
}
2009-04-29 13:34:56 +00:00
/// XdxfDictionary::getArticle()
class XdxfArticleRequest ;
class XdxfArticleRequestRunnable : public QRunnable
{
XdxfArticleRequest & r ;
QSemaphore & hasExited ;
public :
XdxfArticleRequestRunnable ( XdxfArticleRequest & r_ ,
QSemaphore & hasExited_ ) : r ( r_ ) ,
hasExited ( hasExited_ )
{ }
~ XdxfArticleRequestRunnable ( )
{
hasExited . release ( ) ;
}
2022-12-29 07:07:40 +00:00
void run ( ) override ;
2009-04-29 13:34:56 +00:00
} ;
class XdxfArticleRequest : public Dictionary : : DataRequest
{
friend class XdxfArticleRequestRunnable ;
wstring word ;
vector < wstring > alts ;
XdxfDictionary & dict ;
2018-06-13 16:00:42 +00:00
bool ignoreDiacritics ;
2009-04-29 13:34:56 +00:00
QAtomicInt isCancelled ;
QSemaphore hasExited ;
public :
XdxfArticleRequest ( wstring const & word_ ,
vector < wstring > const & alts_ ,
2018-06-13 16:00:42 +00:00
XdxfDictionary & dict_ , bool ignoreDiacritics_ ) :
word ( word_ ) , alts ( alts_ ) , dict ( dict_ ) , ignoreDiacritics ( ignoreDiacritics_ )
2009-04-29 13:34:56 +00:00
{
QThreadPool : : globalInstance ( ) - > start (
new XdxfArticleRequestRunnable ( * this , hasExited ) ) ;
}
void run ( ) ; // Run from another thread by XdxfArticleRequestRunnable
2022-12-29 07:07:40 +00:00
void cancel ( ) override
2009-04-29 13:34:56 +00:00
{
isCancelled . ref ( ) ;
}
~ XdxfArticleRequest ( )
{
isCancelled . ref ( ) ;
hasExited . acquire ( ) ;
}
} ;
void XdxfArticleRequestRunnable : : run ( )
{
r . run ( ) ;
}
void XdxfArticleRequest : : run ( )
{
2021-11-27 07:17:33 +00:00
if ( Utils : : AtomicInt : : loadAcquire ( isCancelled ) )
2009-04-29 13:34:56 +00:00
{
finish ( ) ;
return ;
}
2018-06-13 16:00:42 +00:00
vector < WordArticleLink > chain = dict . findArticles ( word , ignoreDiacritics ) ;
2009-04-29 13:34:56 +00:00
for ( unsigned x = 0 ; x < alts . size ( ) ; + + x )
{
/// Make an additional query for each alt
2018-06-13 16:00:42 +00:00
vector < WordArticleLink > altChain = dict . findArticles ( alts [ x ] , ignoreDiacritics ) ;
2009-04-29 13:34:56 +00:00
chain . insert ( chain . end ( ) , altChain . begin ( ) , altChain . end ( ) ) ;
}
multimap < wstring , pair < string , string > > mainArticles , alternateArticles ;
set < uint32_t > articlesIncluded ; // Some synonims make it that the articles
// appear several times. We combat this
// by only allowing them to appear once.
wstring wordCaseFolded = Folding : : applySimpleCaseOnly ( word ) ;
2018-06-13 16:00:42 +00:00
if ( ignoreDiacritics )
wordCaseFolded = Folding : : applyDiacriticsOnly ( wordCaseFolded ) ;
2009-04-29 13:34:56 +00:00
for ( unsigned x = 0 ; x < chain . size ( ) ; + + x )
{
2021-11-27 07:17:33 +00:00
if ( Utils : : AtomicInt : : loadAcquire ( isCancelled ) )
2009-04-29 13:34:56 +00:00
{
finish ( ) ;
return ;
}
if ( articlesIncluded . find ( chain [ x ] . articleOffset ) ! = articlesIncluded . end ( ) )
continue ; // We already have this article in the body.
// Now grab that article
string headword , articleText ;
2012-02-02 14:01:39 +00:00
headword = chain [ x ] . word ;
2009-04-29 13:34:56 +00:00
2013-09-24 13:56:47 +00:00
try
{
dict . loadArticle ( chain [ x ] . articleOffset , articleText ) ;
2009-04-29 13:34:56 +00:00
2013-09-24 13:56:47 +00:00
// Ok. Now, does it go to main articles, or to alternate ones? We list
// main ones first, and alternates after.
2009-04-29 13:34:56 +00:00
2013-09-24 13:56:47 +00:00
// We do the case-folded comparison here.
2009-04-29 13:34:56 +00:00
2013-09-24 13:56:47 +00:00
wstring headwordStripped =
Folding : : applySimpleCaseOnly ( Utf8 : : decode ( headword ) ) ;
2018-06-13 16:00:42 +00:00
if ( ignoreDiacritics )
headwordStripped = Folding : : applyDiacriticsOnly ( headwordStripped ) ;
2009-04-29 13:34:56 +00:00
2013-09-24 13:56:47 +00:00
multimap < wstring , pair < string , string > > & mapToUse =
( wordCaseFolded = = headwordStripped ) ?
mainArticles : alternateArticles ;
2009-04-29 13:34:56 +00:00
2013-09-24 13:56:47 +00:00
mapToUse . insert ( pair < wstring , pair < string , string > > (
Folding : : applySimpleCaseOnly ( Utf8 : : decode ( headword ) ) ,
pair < string , string > ( headword , articleText ) ) ) ;
2009-04-29 13:34:56 +00:00
2013-09-24 13:56:47 +00:00
articlesIncluded . insert ( chain [ x ] . articleOffset ) ;
}
catch ( std : : exception & ex )
{
2013-11-16 18:34:09 +00:00
gdWarning ( " XDXF: Failed loading article from \" %s \" , reason: %s \n " , dict . getName ( ) . c_str ( ) , ex . what ( ) ) ;
2013-09-24 13:56:47 +00:00
}
2009-04-29 13:34:56 +00:00
}
if ( mainArticles . empty ( ) & & alternateArticles . empty ( ) )
{
// No such word
finish ( ) ;
return ;
}
string result ;
multimap < wstring , pair < string , string > > : : const_iterator i ;
2023-04-15 07:39:49 +00:00
string cleaner = Utils : : Html : : getHtmlCleaner ( ) ;
2009-04-29 13:34:56 +00:00
for ( i = mainArticles . begin ( ) ; i ! = mainArticles . end ( ) ; + + i )
{
2012-02-02 14:01:39 +00:00
// result += "<h3>";
// result += i->second.first;
// result += "</h3>";
2009-04-29 13:34:56 +00:00
result + = i - > second . second ;
result + = cleaner ;
}
for ( i = alternateArticles . begin ( ) ; i ! = alternateArticles . end ( ) ; + + i )
{
2012-02-02 14:01:39 +00:00
// result += "<h3>";
// result += i->second.first;
// result += "</h3>";
2009-04-29 13:34:56 +00:00
result + = i - > second . second ;
result + = cleaner ;
}
Mutex : : Lock _ ( dataMutex ) ;
data . resize ( result . size ( ) ) ;
memcpy ( & data . front ( ) , result . data ( ) , result . size ( ) ) ;
hasAnyData = true ;
finish ( ) ;
}
/// Starts an asynchronous article lookup for @p word and its alternate forms.
/// The third argument is not used by this backend.
sptr<Dictionary::DataRequest> XdxfDictionary::getArticle(wstring const & word,
                                                         vector<wstring> const & alts,
                                                         wstring const &,
                                                         bool ignoreDiacritics)
{
  return std::make_shared<XdxfArticleRequest>(word,
                                              alts,
                                              *this,
                                              ignoreDiacritics);
}
void XdxfDictionary : : loadArticle ( uint32_t address ,
2014-04-16 16:18:28 +00:00
string & articleText ,
QString * headword )
2009-04-29 13:34:56 +00:00
{
// Read the properties
vector < char > chunk ;
char * propertiesData ;
{
Mutex : : Lock _ ( idxMutex ) ;
2012-01-24 12:55:28 +00:00
propertiesData = chunks - > getBlock ( address , chunk ) ;
2009-04-29 13:34:56 +00:00
}
if ( & chunk . front ( ) + chunk . size ( ) - propertiesData < 9 )
2013-09-20 14:25:44 +00:00
{
articleText = string ( " <div class= \" xdxf \" >Index seems corrupted</div> " ) ;
return ;
}
2009-04-29 13:34:56 +00:00
2013-06-15 08:55:15 +00:00
unsigned char fType = ( unsigned char ) * propertiesData ;
2009-04-29 13:34:56 +00:00
uint32_t articleOffset , articleSize ;
memcpy ( & articleOffset , propertiesData + 1 , sizeof ( uint32_t ) ) ;
memcpy ( & articleSize , propertiesData + 5 , sizeof ( uint32_t ) ) ;
// Load the article
char * articleBody ;
{
Mutex : : Lock _ ( dzMutex ) ;
// Note that the function always zero-pads the result.
articleBody = dict_data_read_ ( dz , articleOffset , articleSize , 0 , 0 ) ;
}
if ( ! articleBody )
2013-03-15 12:27:32 +00:00
{
// throw exCantReadFile( getDictionaryFilenames()[ 0 ] );
articleText = string ( " <div class= \" xdxf \" >DICTZIP error: " ) + dict_error_str ( dz ) + " </div> " ;
return ;
}
2009-04-29 13:34:56 +00:00
2013-06-15 08:55:15 +00:00
articleText = Xdxf2Html : : convert ( string ( articleBody ) , Xdxf2Html : : XDXF , idxHeader . hasAbrv ? & abrv : NULL , this ,
2015-08-01 10:38:39 +00:00
& resourceZip , fType = = Logical , idxHeader . revisionNumber , headword ) ;
2009-04-29 13:34:56 +00:00
free ( articleBody ) ;
}
class GzippedFile : public QIODevice
{
gzFile gz ;
public :
2022-01-09 08:35:07 +00:00
GzippedFile ( char const * fileName ) ;
2009-04-29 13:34:56 +00:00
~ GzippedFile ( ) ;
2018-05-21 15:32:04 +00:00
// size_t gzTell();
2009-04-29 13:34:56 +00:00
2012-01-24 12:55:28 +00:00
char * readDataArray ( unsigned long startPos , unsigned long size ) ;
2009-04-29 13:34:56 +00:00
protected :
2012-01-24 12:55:28 +00:00
dictData * dz ;
2022-12-29 07:07:40 +00:00
bool isSequential ( ) const override
2009-04-29 13:34:56 +00:00
{ return false ; } // Which is a lie, but else pos() won't work
2022-12-29 07:07:40 +00:00
bool waitForReadyRead ( int ) override
2009-04-29 13:34:56 +00:00
{ return ! gzeof ( gz ) ; }
2022-12-29 07:07:40 +00:00
qint64 bytesAvailable ( ) const override
2012-01-24 12:55:28 +00:00
{
2009-04-29 13:34:56 +00:00
return ( gzeof ( gz ) ? 0 : 1 ) + QIODevice : : bytesAvailable ( ) ;
2012-01-24 12:55:28 +00:00
}
2009-04-29 13:34:56 +00:00
2022-12-29 07:07:40 +00:00
qint64 readData ( char * data , qint64 maxSize ) override ;
2009-04-29 13:34:56 +00:00
2022-12-29 07:07:40 +00:00
bool atEnd ( ) const override ;
2009-04-29 13:34:56 +00:00
2022-12-29 07:07:40 +00:00
qint64 writeData ( const char * /*data*/ , qint64 /*maxSize*/ ) override
2009-04-29 13:34:56 +00:00
{ return - 1 ; }
} ;
2022-01-09 08:35:07 +00:00
/// Opens @p fileName through zlib and, in addition, through the dictzip
/// reader.
///
/// @throws exCantReadFile when zlib cannot open the file. The result of the
///         dictzip open is stored as is, without being checked here.
GzippedFile::GzippedFile(char const * fileName)
{
  gz = gd_gzopen(fileName);

  if (!gz)
    throw exCantReadFile(fileName);

  DZ_ERRORS dzError;
  dz = dict_data_open(fileName, &dzError, 0);
}
/// Closes the zlib handle and, when present, the dictzip handle.
GzippedFile::~GzippedFile()
{
  gzclose(gz);

  if (dz)
  {
    dict_data_close(dz);
  }
}
2014-05-12 13:46:33 +00:00
bool GzippedFile : : atEnd ( ) const
2009-04-29 13:34:56 +00:00
{
return gzeof ( gz ) ;
}
2018-05-21 15:32:04 +00:00
/*
2009-04-29 13:34:56 +00:00
size_t GzippedFile : : gzTell ( )
{
return gztell ( gz ) ;
}
2018-05-21 15:32:04 +00:00
*/
2009-04-29 13:34:56 +00:00
/// Reads at most one UTF-8 encoded character from the compressed stream.
///
/// A single byte is read first. With Qt 5.x QXmlStreamReader asks for one
/// byte rather than one UTF-8 character, so when that byte is the lead byte of
/// a multi-byte sequence its continuation bytes are read and returned too.
///
/// The continuation bytes are limited to the room the caller announced in
/// @p maxSize, so nothing is ever written past the caller's buffer.
///
/// @return the number of bytes stored in @p data, or the result of the first
///         gzread() when that is not 1.
qint64 GzippedFile::readData(char * data, qint64 maxSize)
{
  // Remember how much room the caller really has before narrowing the request
  qint64 const capacity = maxSize;

  if (maxSize > 1)
    maxSize = 1;

  // The returning value translates directly to QIODevice semantics
  int n = gzread(gz, data, maxSize);

  if (n != 1)
    return n;

  unsigned char const lead = static_cast<unsigned char>(*data);

  // Number of continuation bytes announced by the lead byte
  qint64 addBytes = 0;

  if ((lead & 0xF8) == 0xF0)
    addBytes = 3;
  else if ((lead & 0xF0) == 0xE0)
    addBytes = 2;
  else if ((lead & 0xE0) == 0xC0)
    addBytes = 1;

  // One byte of the caller's buffer is already used
  if (addBytes > capacity - 1)
    addBytes = capacity - 1;

  if (addBytes > 0)
  {
    int const extra = gzread(gz, data + 1, static_cast<unsigned>(addBytes));

    // A failed follow-up read must not cancel out the byte already delivered
    if (extra > 0)
      n += extra;
  }

  return n;
}
2012-01-24 12:55:28 +00:00
char * GzippedFile : : readDataArray ( unsigned long startPos , unsigned long size )
{
if ( dz = = NULL )
return NULL ;
return dict_data_read_ ( dz , startPos , size , 0 , 0 ) ;
}
2009-04-29 13:34:56 +00:00
/// Re-serialises the content of the element the reader is currently inside.
///
/// Tokens are consumed up to and including the end tag of the enclosing
/// element, or up to the end of the stream. Child elements are written out
/// with their attributes (names and values escaped) and handled recursively;
/// character data is appended as delivered by the reader.
QString readXhtmlData(QXmlStreamReader & stream)
{
  QString markup;

  for (;;)
  {
    if (stream.atEnd())
      break;

    stream.readNext();

    if (stream.isStartElement())
    {
      QString const tag = stream.name().toString();

      markup += " < " + Utils::escape(tag) + " ";

      QXmlStreamAttributes const attributes = stream.attributes();

      for (int idx = 0; idx < attributes.size(); ++idx)
      {
        markup += Utils::escape(attributes[idx].name().toString());
        markup += " = \" " + Utils::escape(attributes[idx].value().toString()) + " \" ";
      }

      markup += " > ";

      // Everything up to the matching end tag
      markup += readXhtmlData(stream);

      markup += " </ " + Utils::escape(tag) + " > ";
      continue;
    }

    if (stream.isCharacters() || stream.isWhitespace() || stream.isCDATA())
    {
      markup += stream.text();
      continue;
    }

    // End tag of the element we were called for
    if (stream.isEndElement())
      break;
  }

  return markup;
}
2013-07-18 13:02:39 +00:00
namespace {

/// Returns the text of the current element, skipping any child elements.
/// Kept as a wrapper because of a Qt 4.5 incompatibility.
QString readElementText(QXmlStreamReader & stream)
{
  return stream.readElementText(QXmlStreamReader::SkipChildElements);
}

}
2009-04-29 13:34:56 +00:00
/// Collects the text of every key element found at or below the element the
/// reader is positioned on, appending it to @p words.
///
/// When the current element is itself a key element, its text is taken and
/// the function returns. Otherwise tokens are read until the end tag of the
/// current element (or the end of the stream), descending into each child.
// todo implement support for tag <srt>, that overrides the article sorting order
void addAllKeyTags(QXmlStreamReader & stream, list<QString> & words)
{
  bool const isKeyElement = (stream.name() == u" k ");

  if (isKeyElement)
  {
    words.push_back(readElementText(stream));
    return;
  }

  for (;;)
  {
    if (stream.atEnd())
      return;

    stream.readNext();

    if (stream.isStartElement())
    {
      addAllKeyTags(stream, words);
      continue;
    }

    if (stream.isEndElement())
      return;
  }
}
2012-01-24 12:55:28 +00:00
void checkArticlePosition ( GzippedFile & gzFile ,
2012-02-08 20:21:26 +00:00
uint32_t * pOffset ,
uint32_t * pSize )
2012-01-24 12:55:28 +00:00
{
char * data = gzFile . readDataArray ( * pOffset , * pSize ) ;
if ( data = = NULL )
return ;
QString s = QString : : fromUtf8 ( data ) ;
free ( data ) ;
int n = s . lastIndexOf ( " </ar " ) ;
if ( n > 0 )
* pSize - = s . size ( ) - n ;
if ( s . at ( 0 ) = = ' > ' )
{
* pOffset + = 1 ;
* pSize - = 1 ;
}
}
2009-04-29 13:34:56 +00:00
void indexArticle ( GzippedFile & gzFile ,
QXmlStreamReader & stream ,
IndexedWords & indexedWords ,
ChunkedStorage : : Writer & chunks ,
unsigned & articleCount ,
2013-06-15 08:55:15 +00:00
unsigned & wordCount ,
ArticleFormat defaultFormat )
2009-04-29 13:34:56 +00:00
{
ArticleFormat format ( Default ) ;
2022-02-27 05:17:37 +00:00
QStringView formatValue = stream . attributes ( ) . value ( " f " ) ;
2009-04-29 13:34:56 +00:00
2022-02-27 05:17:37 +00:00
if ( formatValue = = u " v " )
2009-04-29 13:34:56 +00:00
format = Visual ;
else
2022-02-27 05:17:37 +00:00
if ( formatValue = = u " l " )
2009-04-29 13:34:56 +00:00
format = Logical ;
2013-06-15 08:55:15 +00:00
if ( format = = Default )
format = defaultFormat ;
2009-04-29 13:34:56 +00:00
size_t articleOffset = gzFile . pos ( ) - 1 ; // stream.characterOffset() is loony
2012-01-24 12:55:28 +00:00
// uint32_t lineNumber = stream.lineNumber();
// uint32_t columnNumber = stream.columnNumber();
2009-04-29 13:34:56 +00:00
list < QString > words ;
while ( ! stream . atEnd ( ) )
{
stream . readNext ( ) ;
// Find any <k> tags and index them
if ( stream . isEndElement ( ) )
{
// End of the <ar> tag
if ( words . empty ( ) )
{
// Nothing to index, this article didn't have any tags
2017-06-22 15:02:04 +00:00
gdWarning ( " No <k> tags found in an article at offset 0x%x, article skipped. \n " ,
2013-09-20 14:25:44 +00:00
( unsigned ) articleOffset ) ;
2009-04-29 13:34:56 +00:00
}
else
{
// Add an entry
uint32_t offset = chunks . startNewBlock ( ) ;
2012-01-24 12:55:28 +00:00
uint32_t offs = articleOffset ;
uint32_t size = gzFile . pos ( ) - 1 - articleOffset ;
checkArticlePosition ( gzFile , & offs , & size ) ;
2009-04-29 13:34:56 +00:00
unsigned char f = format ;
chunks . addToBlock ( & f , 1 ) ;
2012-01-24 12:55:28 +00:00
chunks . addToBlock ( & offs , sizeof ( offs ) ) ;
chunks . addToBlock ( & size , sizeof ( size ) ) ;
2009-04-29 13:34:56 +00:00
2014-04-16 16:18:28 +00:00
// Add also first header - it's needed for full-text search
chunks . addToBlock ( words . begin ( ) - > toUtf8 ( ) . data ( ) , words . begin ( ) - > toUtf8 ( ) . length ( ) + 1 ) ;
2022-01-15 07:29:20 +00:00
// GD_DPRINTF( "%x: %s\n", articleOffset, words.begin()->toUtf8().data() );
2009-04-29 13:34:56 +00:00
// Add words to index
for ( list < QString > : : const_iterator i = words . begin ( ) ; i ! = words . end ( ) ;
+ + i )
2012-01-24 12:55:28 +00:00
indexedWords . addWord ( gd : : toWString ( * i ) , offset ) ;
2009-04-29 13:34:56 +00:00
+ + articleCount ;
wordCount + = words . size ( ) ;
}
return ;
}
else
if ( stream . isStartElement ( ) )
{
addAllKeyTags ( stream , words ) ;
}
}
}
2012-01-31 12:49:37 +00:00
//// XdxfDictionary::getResource()
class XdxfResourceRequest ;
class XdxfResourceRequestRunnable : public QRunnable
{
XdxfResourceRequest & r ;
QSemaphore & hasExited ;
public :
XdxfResourceRequestRunnable ( XdxfResourceRequest & r_ ,
QSemaphore & hasExited_ ) : r ( r_ ) ,
hasExited ( hasExited_ )
{ }
~ XdxfResourceRequestRunnable ( )
{
hasExited . release ( ) ;
}
2022-12-29 07:07:40 +00:00
void run ( ) override ;
2012-01-31 12:49:37 +00:00
} ;
class XdxfResourceRequest : public Dictionary : : DataRequest
{
friend class XdxfResourceRequestRunnable ;
XdxfDictionary & dict ;
string resourceName ;
QAtomicInt isCancelled ;
QSemaphore hasExited ;
public :
XdxfResourceRequest ( XdxfDictionary & dict_ ,
string const & resourceName_ ) :
dict ( dict_ ) ,
resourceName ( resourceName_ )
{
QThreadPool : : globalInstance ( ) - > start (
new XdxfResourceRequestRunnable ( * this , hasExited ) ) ;
}
void run ( ) ; // Run from another thread by XdxfResourceRequestRunnable
2022-12-29 07:07:40 +00:00
void cancel ( ) override
2012-01-31 12:49:37 +00:00
{
isCancelled . ref ( ) ;
}
~ XdxfResourceRequest ( )
{
isCancelled . ref ( ) ;
hasExited . acquire ( ) ;
}
} ;
void XdxfResourceRequestRunnable : : run ( )
{
r . run ( ) ;
}
void XdxfResourceRequest : : run ( )
{
// Some runnables linger enough that they are cancelled before they start
2021-11-27 07:17:33 +00:00
if ( Utils : : AtomicInt : : loadAcquire ( isCancelled ) )
2012-01-31 12:49:37 +00:00
{
finish ( ) ;
return ;
}
if ( dict . ensureInitDone ( ) . size ( ) )
{
setErrorString ( QString : : fromUtf8 ( dict . ensureInitDone ( ) . c_str ( ) ) ) ;
finish ( ) ;
return ;
}
2023-04-14 03:53:23 +00:00
string n = dict . getContainingFolder ( ) . toStdString ( ) + FsEncoding : : separator ( ) + resourceName ;
2012-01-31 12:49:37 +00:00
2014-05-10 21:02:31 +00:00
GD_DPRINTF ( " n is %s \n " , n . c_str ( ) ) ;
2012-01-31 12:49:37 +00:00
try
{
try
{
Mutex : : Lock _ ( dataMutex ) ;
File : : loadFromFile ( n , data ) ;
}
catch ( File : : exCantOpen & )
{
2023-04-13 10:08:32 +00:00
n = dict . getDictionaryFilenames ( ) [ 0 ] + " .files " + FsEncoding : : separator ( ) + resourceName ;
2012-01-31 12:49:37 +00:00
try
{
Mutex : : Lock _ ( dataMutex ) ;
File : : loadFromFile ( n , data ) ;
}
catch ( File : : exCantOpen & )
{
// Try reading from zip file
if ( dict . resourceZip . isOpen ( ) )
{
Mutex : : Lock _ ( dict . resourceZipMutex ) ;
Mutex : : Lock __ ( dataMutex ) ;
if ( ! dict . resourceZip . loadFile ( Utf8 : : decode ( resourceName ) , data ) )
throw ; // Make it fail since we couldn't read the archive
}
else
throw ;
}
}
if ( Filetype : : isNameOfTiff ( resourceName ) )
{
// Convert it
2022-04-05 13:25:07 +00:00
Mutex : : Lock _ ( dataMutex ) ;
GdTiff : : tiff2img ( data ) ;
2012-01-31 12:49:37 +00:00
}
Mutex : : Lock _ ( dataMutex ) ;
hasAnyData = true ;
}
2013-09-24 13:56:47 +00:00
catch ( std : : exception & ex )
2012-01-31 12:49:37 +00:00
{
2013-11-16 18:34:09 +00:00
gdWarning ( " XDXF: Failed loading resource \" %s \" for \" %s \" , reason: %s \n " ,
resourceName . c_str ( ) , dict . getName ( ) . c_str ( ) , ex . what ( ) ) ;
2013-09-24 13:56:47 +00:00
// Resource not loaded -- we don't set the hasAnyData flag then
2012-01-31 12:49:37 +00:00
}
finish ( ) ;
}
/// Creates a request for the resource called @p name.
///
/// The returned XdxfResourceRequest queues its own loading task on the global
/// thread pool as part of its construction.
///
/// @param name name of the resource to load
/// @return the newly created request
sptr< Dictionary::DataRequest > XdxfDictionary::getResource( string const & name )
{
  return std::make_shared< XdxfResourceRequest >( *this, name );
}
2013-06-15 08:55:15 +00:00
}
// End of the anonymous namespace. The remainder of this file builds (or rebuilds) the dictionary article index.
2009-04-29 13:34:56 +00:00
vector < sptr < Dictionary : : Class > > makeDictionaries (
vector < string > const & fileNames ,
string const & indicesDir ,
Dictionary : : Initializing & initializing )
2022-01-09 08:35:07 +00:00
2009-04-29 13:34:56 +00:00
{
vector < sptr < Dictionary : : Class > > dictionaries ;
for ( vector < string > : : const_iterator i = fileNames . begin ( ) ; i ! = fileNames . end ( ) ;
+ + i )
{
// Only allow .xdxf and .xdxf.dz suffixes
if ( ( i - > size ( ) < 5 | | strcasecmp ( i - > c_str ( ) + ( i - > size ( ) - 5 ) , " .xdxf " ) ! = 0 ) & &
( i - > size ( ) < 8 | |
strcasecmp ( i - > c_str ( ) + ( i - > size ( ) - 8 ) , " .xdxf.dz " ) ! = 0 ) )
continue ;
try
{
vector < string > dictFiles ( 1 , * i ) ;
2012-01-31 12:49:37 +00:00
string baseName = ( ( * i ) [ i - > size ( ) - 5 ] = = ' . ' ) ?
string ( * i , 0 , i - > size ( ) - 5 ) : string ( * i , 0 , i - > size ( ) - 8 ) ;
// See if there's a zip file with resources present. If so, include it.
string zipFileName ;
2017-04-24 14:42:01 +00:00
if ( File : : tryPossibleZipName ( baseName + " .xdxf.files.zip " , zipFileName ) | |
File : : tryPossibleZipName ( baseName + " .xdxf.dz.files.zip " , zipFileName ) | |
File : : tryPossibleZipName ( baseName + " .XDXF.FILES.ZIP " , zipFileName ) | |
File : : tryPossibleZipName ( baseName + " .XDXF.DZ.FILES.ZIP " , zipFileName ) )
2012-01-31 12:49:37 +00:00
dictFiles . push_back ( zipFileName ) ;
2009-04-29 13:34:56 +00:00
string dictId = Dictionary : : makeDictionaryId ( dictFiles ) ;
string indexFile = indicesDir + dictId ;
if ( Dictionary : : needToRebuildIndex ( dictFiles , indexFile ) | |
indexIsOldOrBad ( indexFile ) )
{
// Building the index
2013-11-16 18:34:09 +00:00
gdDebug ( " Xdxf: Building the index for dictionary: %s \n " , i - > c_str ( ) ) ;
2013-09-20 14:25:44 +00:00
2009-04-29 13:34:56 +00:00
//initializing.indexingDictionary( nameFromFileName( dictFiles[ 0 ] ) );
File : : Class idx ( indexFile , " wb " ) ;
IdxHeader idxHeader ;
2012-01-24 12:55:28 +00:00
map < string , string > abrv ;
2009-04-29 13:34:56 +00:00
memset ( & idxHeader , 0 , sizeof ( idxHeader ) ) ;
// We write a dummy header first. At the end of the process the header
// will be rewritten with the right values.
idx . write ( idxHeader ) ;
IndexedWords indexedWords ;
GzippedFile gzFile ( dictFiles [ 0 ] . c_str ( ) ) ;
if ( ! gzFile . open ( QIODevice : : ReadOnly ) )
throw exCantReadFile ( dictFiles [ 0 ] ) ;
QXmlStreamReader stream ( & gzFile ) ;
QString dictionaryName , dictionaryDescription ;
ChunkedStorage : : Writer chunks ( idx ) ;
// Wait for the first element, which must be xdxf
bool hadXdxf = false ;
while ( ! stream . atEnd ( ) )
{
stream . readNext ( ) ;
if ( stream . isStartElement ( ) )
{
2022-02-27 05:17:37 +00:00
if ( stream . name ( ) ! = u " xdxf " )
2009-04-29 13:34:56 +00:00
throw exNotXdxfFile ( dictFiles [ 0 ] ) ;
else
{
// Read the xdxf
2013-02-03 20:19:55 +00:00
string str = stream . attributes ( ) . value ( " lang_from " ) . toString ( ) . toLatin1 ( ) . data ( ) ;
2023-01-19 14:57:47 +00:00
if ( ! str . empty ( ) )
idxHeader . langFrom = getLanguageId ( str . c_str ( ) ) ;
2009-04-29 13:34:56 +00:00
2013-02-03 20:19:55 +00:00
str = stream . attributes ( ) . value ( " lang_to " ) . toString ( ) . toLatin1 ( ) . data ( ) ;
2023-01-19 14:57:47 +00:00
if ( ! str . empty ( ) )
idxHeader . langTo = getLanguageId ( str . c_str ( ) ) ;
2015-07-07 15:02:06 +00:00
QRegExp regNum ( " \\ d+ " ) ;
regNum . indexIn ( stream . attributes ( ) . value ( " revision " ) . toString ( ) ) ;
idxHeader . revisionNumber = regNum . cap ( ) . toUInt ( ) ;
2009-04-29 13:34:56 +00:00
2023-01-24 12:37:34 +00:00
bool isLogical = ( stream . attributes ( ) . value ( " format " ) = = u " logical " | | idxHeader . revisionNumber > = 34 ) ;
2023-01-19 14:57:47 +00:00
2009-04-29 13:34:56 +00:00
idxHeader . articleFormat = isLogical ? Logical : Visual ;
unsigned articleCount = 0 , wordCount = 0 ;
while ( ! stream . atEnd ( ) )
{
stream . readNext ( ) ;
if ( stream . isStartElement ( ) )
{
2013-06-15 08:55:15 +00:00
// todo implement using short <title> for denoting the dictionary in settings or dict list toolbar
2022-02-27 05:17:37 +00:00
if ( stream . name ( ) = = u " full_name " | | stream . name ( ) = = u " full_title " )
2009-04-29 13:34:56 +00:00
{
// That's our name
QString name = stream . readElementText ( ) ;
if ( dictionaryName . isEmpty ( ) )
{
dictionaryName = name ;
initializing . indexingDictionary ( dictionaryName . toUtf8 ( ) . data ( ) ) ;
idxHeader . nameAddress = chunks . startNewBlock ( ) ;
QByteArray n = dictionaryName . toUtf8 ( ) ;
idxHeader . nameSize = n . size ( ) ;
chunks . addToBlock ( n . data ( ) , n . size ( ) ) ;
}
else
2013-02-04 13:46:30 +00:00
{
2014-05-10 21:02:31 +00:00
GD_DPRINTF ( " Warning: duplicate full_name in %s \n " , dictFiles [ 0 ] . c_str ( ) ) ;
2013-02-04 13:46:30 +00:00
}
2009-04-29 13:34:56 +00:00
}
else
2022-02-27 05:17:37 +00:00
if ( stream . name ( ) = = u " description " )
2009-04-29 13:34:56 +00:00
{
2013-06-15 08:55:15 +00:00
// todo implement adding other information to the description like <publisher>, <authors>, <file_ver>, <creation_date>, <last_edited_date>, <dict_edition>, <publishing_date>, <dict_src_url>
2009-04-29 13:34:56 +00:00
QString desc = readXhtmlData ( stream ) ;
2023-01-19 14:57:47 +00:00
if ( isLogical )
{
desc = desc . simplified ( ) ;
2023-01-24 12:37:34 +00:00
QRegularExpression br ( " <br \\ s*> \\ s*</br> " ) ;
desc . replace ( br , QString ( " \n " ) ) ;
2023-01-19 14:57:47 +00:00
}
2009-04-29 13:34:56 +00:00
if ( dictionaryDescription . isEmpty ( ) )
{
dictionaryDescription = desc ;
idxHeader . descriptionAddress = chunks . startNewBlock ( ) ;
QByteArray n = dictionaryDescription . toUtf8 ( ) ;
idxHeader . descriptionSize = n . size ( ) ;
chunks . addToBlock ( n . data ( ) , n . size ( ) ) ;
}
else
2013-02-04 13:46:30 +00:00
{
2014-05-10 21:02:31 +00:00
GD_DPRINTF ( " Warning: duplicate description in %s \n " , dictFiles [ 0 ] . c_str ( ) ) ;
2013-02-04 13:46:30 +00:00
}
2009-04-29 13:34:56 +00:00
}
else
2023-01-24 12:37:34 +00:00
if ( stream . name ( ) = = u " languages " )
2023-01-19 14:57:47 +00:00
{
2023-01-24 12:37:34 +00:00
while ( ! ( stream . isEndElement ( ) & & stream . name ( ) = = u " languages " ) & & ! stream . atEnd ( ) )
2023-01-19 14:57:47 +00:00
{
if ( ! stream . readNext ( ) )
break ;
if ( stream . isStartElement ( ) )
{
2023-01-24 12:37:34 +00:00
if ( stream . name ( ) = = u " from " )
2023-01-19 14:57:47 +00:00
{
if ( idxHeader . langFrom = = 0 )
{
QString lang = stream . attributes ( ) . value ( " xml:lang " ) . toString ( ) ;
idxHeader . langFrom = getLanguageId ( lang ) ;
}
}
2023-01-24 12:37:34 +00:00
else if ( stream . name ( ) = = u " to " )
2023-01-19 14:57:47 +00:00
{
if ( idxHeader . langTo = = 0 )
{
QString lang = stream . attributes ( ) . value ( " xml:lang " ) . toString ( ) ;
idxHeader . langTo = getLanguageId ( lang ) ;
}
}
}
2023-01-24 12:37:34 +00:00
else if ( stream . isEndElement ( ) & & stream . name ( ) = = u " languages " )
2023-01-19 14:57:47 +00:00
break ;
}
}
else
2022-02-27 05:17:37 +00:00
if ( stream . name ( ) = = u " abbreviations " )
2012-01-24 12:55:28 +00:00
{
2013-06-15 08:55:15 +00:00
QString s ;
string value ;
list < wstring > keys ;
2022-02-27 05:17:37 +00:00
while ( ! ( stream . isEndElement ( ) & & stream . name ( ) = = u " abbreviations " ) & & ! stream . atEnd ( ) )
2013-06-15 08:55:15 +00:00
{
2019-03-17 20:04:14 +00:00
if ( ! stream . readNextStartElement ( ) )
break ;
// abbreviations tag set switch at format revision = 30
2013-06-15 08:55:15 +00:00
if ( idxHeader . revisionNumber > = 30 )
2012-01-24 12:55:28 +00:00
{
2022-02-27 05:17:37 +00:00
while ( ! ( stream . isEndElement ( ) & & stream . name ( ) = = u " abbr_def " ) | | ! stream . atEnd ( ) )
2013-06-15 08:55:15 +00:00
{
2022-02-27 05:17:37 +00:00
if ( stream . isStartElement ( ) & & stream . name ( ) = = u " abbr_k " )
2012-01-24 12:55:28 +00:00
{
2013-07-18 13:02:39 +00:00
s = readElementText ( stream ) ;
2013-06-15 08:55:15 +00:00
keys . push_back ( gd : : toWString ( s ) ) ;
2012-01-24 12:55:28 +00:00
}
2022-02-27 05:17:37 +00:00
else if ( stream . isStartElement ( ) & & stream . name ( ) = = u " abbr_v " )
2013-06-15 08:55:15 +00:00
{
2013-07-18 13:02:39 +00:00
s = readElementText ( stream ) ;
2013-06-15 08:55:15 +00:00
value = Utf8 : : encode ( Folding : : trimWhitespace ( gd : : toWString ( s ) ) ) ;
for ( list < wstring > : : iterator i = keys . begin ( ) ; i ! = keys . end ( ) ; + + i )
{
abrv [ Utf8 : : encode ( Folding : : trimWhitespace ( * i ) ) ] = value ;
}
keys . clear ( ) ;
}
2022-02-27 05:17:37 +00:00
else if ( stream . isEndElement ( ) & & stream . name ( ) = = u " abbreviations " )
2013-06-15 08:55:15 +00:00
break ;
2019-03-17 20:04:14 +00:00
stream . readNext ( ) ;
2013-06-15 08:55:15 +00:00
}
2012-01-24 12:55:28 +00:00
}
2013-06-15 08:55:15 +00:00
else
{
2022-02-27 05:17:37 +00:00
while ( ! ( stream . isEndElement ( ) & & stream . name ( ) = = u " abr_def " ) | | ! stream . atEnd ( ) )
2013-06-15 08:55:15 +00:00
{
2022-02-27 05:17:37 +00:00
if ( stream . isStartElement ( ) & & stream . name ( ) = = u " k " )
2013-06-15 08:55:15 +00:00
{
2013-07-18 13:02:39 +00:00
s = readElementText ( stream ) ;
2013-06-15 08:55:15 +00:00
keys . push_back ( gd : : toWString ( s ) ) ;
}
2022-02-27 05:17:37 +00:00
else if ( stream . isStartElement ( ) & & stream . name ( ) = = u " v " )
2013-06-15 08:55:15 +00:00
{
2013-07-18 13:02:39 +00:00
s = readElementText ( stream ) ;
2013-06-15 08:55:15 +00:00
value = Utf8 : : encode ( Folding : : trimWhitespace ( gd : : toWString ( s ) ) ) ;
for ( list < wstring > : : iterator i = keys . begin ( ) ; i ! = keys . end ( ) ; + + i )
{
abrv [ Utf8 : : encode ( Folding : : trimWhitespace ( * i ) ) ] = value ;
}
keys . clear ( ) ;
}
2022-02-27 05:17:37 +00:00
else if ( stream . isEndElement ( ) & & stream . name ( ) = = u " abbreviations " )
2013-06-15 08:55:15 +00:00
break ;
2019-03-17 20:04:14 +00:00
stream . readNext ( ) ;
2013-06-15 08:55:15 +00:00
}
}
}
2012-01-24 12:55:28 +00:00
}
else
2022-02-27 05:17:37 +00:00
if ( stream . name ( ) = = u " ar " )
2009-04-29 13:34:56 +00:00
{
indexArticle ( gzFile , stream , indexedWords , chunks ,
2013-06-15 08:55:15 +00:00
articleCount , wordCount , isLogical ? Logical : Visual ) ;
2009-04-29 13:34:56 +00:00
}
}
}
2012-01-24 12:55:28 +00:00
// Write abbreviations if presented
2013-06-15 08:55:15 +00:00
if ( ! abrv . empty ( ) )
{
idxHeader . hasAbrv = 1 ;
idxHeader . abrvAddress = chunks . startNewBlock ( ) ;
2012-01-24 12:55:28 +00:00
2013-06-15 08:55:15 +00:00
uint32_t sz = abrv . size ( ) ;
2012-01-24 12:55:28 +00:00
2013-06-15 08:55:15 +00:00
chunks . addToBlock ( & sz , sizeof ( uint32_t ) ) ;
2012-01-24 12:55:28 +00:00
2013-06-15 08:55:15 +00:00
for ( map < string , string > : : const_iterator i = abrv . begin ( ) ; i ! = abrv . end ( ) ; + + i )
{
sz = i - > first . size ( ) ;
chunks . addToBlock ( & sz , sizeof ( uint32_t ) ) ;
chunks . addToBlock ( i - > first . data ( ) , sz ) ;
sz = i - > second . size ( ) ;
chunks . addToBlock ( & sz , sizeof ( uint32_t ) ) ;
chunks . addToBlock ( i - > second . data ( ) , sz ) ;
}
2012-01-24 12:55:28 +00:00
}
2009-04-29 13:34:56 +00:00
// Finish with the chunks
idxHeader . chunksOffset = chunks . finish ( ) ;
// Build index
IndexInfo idxInfo = BtreeIndexing : : buildIndex ( indexedWords , idx ) ;
idxHeader . indexBtreeMaxElements = idxInfo . btreeMaxElements ;
idxHeader . indexRootOffset = idxInfo . rootOffset ;
2012-01-31 12:49:37 +00:00
indexedWords . clear ( ) ; // Release memory -- no need for this data
// If there was a zip file, index it too
if ( zipFileName . size ( ) )
{
2014-05-10 21:02:31 +00:00
GD_DPRINTF ( " Indexing zip file \n " ) ;
2012-01-31 12:49:37 +00:00
idxHeader . hasZipFile = 1 ;
IndexedWords zipFileNames ;
IndexedZip zipFile ;
2023-04-13 10:08:32 +00:00
if ( zipFile . openZipFile ( QDir : : fromNativeSeparators ( zipFileName . c_str ( ) ) ) )
zipFile . indexFile ( zipFileNames ) ;
2012-01-31 12:49:37 +00:00
if ( ! zipFileNames . empty ( ) )
{
// Build the resulting zip file index
IndexInfo idxInfo = BtreeIndexing : : buildIndex ( zipFileNames , idx ) ;
idxHeader . zipIndexBtreeMaxElements = idxInfo . btreeMaxElements ;
idxHeader . zipIndexRootOffset = idxInfo . rootOffset ;
}
else
{
// Bad zip file -- no index (though the mark that we have one
// remains)
idxHeader . zipIndexBtreeMaxElements = 0 ;
idxHeader . zipIndexRootOffset = 0 ;
}
}
else
idxHeader . hasZipFile = 0 ;
2009-04-29 13:34:56 +00:00
// That concludes it. Update the header.
idxHeader . signature = Signature ;
idxHeader . formatVersion = CurrentFormatVersion ;
2012-01-27 11:40:42 +00:00
idxHeader . articleCount = articleCount ;
idxHeader . wordCount = wordCount ;
2009-04-29 13:34:56 +00:00
idx . rewind ( ) ;
idx . write ( & idxHeader , sizeof ( idxHeader ) ) ;
hadXdxf = true ;
}
break ;
}
}
if ( ! hadXdxf )
throw exNotXdxfFile ( dictFiles [ 0 ] ) ;
if ( stream . hasError ( ) )
{
2017-06-22 15:02:04 +00:00
gdWarning ( " %s had a parse error %s at line %lu, and therefore was indexed only up to the point of error. " ,
2013-11-16 18:34:09 +00:00
dictFiles [ 0 ] . c_str ( ) , stream . errorString ( ) . toUtf8 ( ) . data ( ) ,
( unsigned long ) stream . lineNumber ( ) ) ;
2009-04-29 13:34:56 +00:00
}
}
2022-11-29 03:54:31 +00:00
dictionaries . push_back ( std : : make_shared < XdxfDictionary > ( dictId ,
2009-04-29 13:34:56 +00:00
indexFile ,
dictFiles ) ) ;
}
catch ( std : : exception & e )
{
2014-04-25 13:13:56 +00:00
gdWarning ( " Xdxf dictionary initializing failed: %s, error: %s \n " ,
2013-11-16 18:34:09 +00:00
i - > c_str ( ) , e . what ( ) ) ;
2009-04-29 13:34:56 +00:00
}
}
return dictionaries ;
}
}