/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
# include "bgl.hh"
# include "btreeidx.hh"
# include "bgl_babylon.hh"
# include "file.hh"
# include "folding.hh"
# include "utf8.hh"
# include "chunkedstorage.hh"
2009-05-06 18:17:13 +00:00
# include "langcoder.hh"
2009-12-27 12:40:20 +00:00
# include "language.hh"
2013-11-16 18:34:09 +00:00
# include "gddebug.hh"
2013-01-31 23:53:45 +00:00
# include "fsencoding.hh"
2012-11-18 10:00:50 +00:00
# include "htmlescape.hh"
2014-04-16 16:18:28 +00:00
# include "ftshelpers.hh"
2009-05-06 18:17:13 +00:00
2009-01-28 20:55:45 +00:00
# include <map>
# include <set>
# include <list>
# include <zlib.h>
# include <ctype.h>
2009-01-30 01:20:37 +00:00
# include <string.h>
2009-01-28 20:55:45 +00:00
2009-04-29 23:18:26 +00:00
# ifdef _MSC_VER
# include <stub_msvc.h>
# endif
2009-04-16 11:33:12 +00:00
# include <QSemaphore>
# include <QThreadPool>
# include <QAtomicInt>
2013-09-20 14:25:44 +00:00
# include <QDebug>
2009-04-16 11:33:12 +00:00
2022-02-27 05:17:37 +00:00
# if (QT_VERSION >= QT_VERSION_CHECK(6,0,0))
2022-02-27 14:42:40 +00:00
# include <QtCore5Compat/QRegExp>
# else
# include <QRegExp>
2022-02-27 05:17:37 +00:00
# endif
2009-05-07 16:14:56 +00:00
2018-02-21 14:43:35 +00:00
# include <QRegularExpression>
2021-11-27 07:17:33 +00:00
# include "utils.hh"
2013-05-30 13:24:21 +00:00
2009-01-28 20:55:45 +00:00
namespace Bgl {
using std : : map ;
using std : : multimap ;
using std : : set ;
2009-04-18 17:20:12 +00:00
using gd : : wstring ;
using gd : : wchar ;
2009-01-28 20:55:45 +00:00
using std : : list ;
using std : : pair ;
2012-11-19 15:30:26 +00:00
using std : : string ;
2009-01-28 20:55:45 +00:00
using BtreeIndexing : : WordArticleLink ;
using BtreeIndexing : : IndexedWords ;
2009-04-14 16:35:47 +00:00
using BtreeIndexing : : IndexInfo ;
2009-01-28 20:55:45 +00:00
namespace
{
enum
{
Signature = 0x584c4742 , // BGLX on little-endian, XLGB on big-endian
2013-01-18 11:50:49 +00:00
CurrentFormatVersion = 19 + BtreeIndexing : : FormatVersion
2009-01-28 20:55:45 +00:00
} ;
/// Header read from the very beginning of the index file. The struct is
/// packed (on non-MSVC compilers) because it is read from disk as raw bytes.
struct IdxHeader
{
  uint32_t signature; // First comes the signature, BGLX
  uint32_t formatVersion; // File format version; an index is rejected unless
                          // this equals CurrentFormatVersion.
  uint32_t parserVersion; // Version of the parser used to parse the BGL file.
                          // If it differs from the current one, the file is to
                          // be re-parsed.
  uint32_t foldingVersion; // Version of the folding algorithm used when building
                           // index. If it's different from the current one,
                           // the file is to be rebuilt.
  uint32_t articleCount; // Total number of articles, for informative purposes only
  uint32_t wordCount; // Total number of words, for informative purposes only
  /// Add more fields here, like name, description, author and such.
  uint32_t chunksOffset; // The offset to chunks' storage
  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
  uint32_t indexRootOffset;
  uint32_t resourceListOffset; // The offset of the list of resources
  uint32_t resourcesCount; // Number of resources stored
  uint32_t langFrom; // Source language
  uint32_t langTo; // Target language
  uint32_t iconAddress; // Address of the icon in the chunks' storage
  uint32_t iconSize; // Size of the icon in the chunks' storage, 0 = no icon
  uint32_t descriptionAddress; // Address of the dictionary description in the chunks' storage
  uint32_t descriptionSize; // Size of the description in the chunks' storage, 0 = no description
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
2009-01-28 20:55:45 +00:00
/// Tells whether the index file at indexFile has to be rebuilt.
///
/// Returns true when a complete header cannot be read, or when the signature,
/// the format version, the parser version or the folding version stored in
/// the header differs from the value this build expects.
bool indexIsOldOrBad( string const & indexFile )
{
  File::Class idx( indexFile, "rb" );

  IdxHeader header;

  // A short read means the file is truncated or not an index at all.
  if ( idx.readRecords( &header, sizeof( header ), 1 ) != 1 )
    return true;

  return header.signature != Signature ||
         header.formatVersion != CurrentFormatVersion ||
         header.parserVersion != Babylon::ParserVersion ||
         header.foldingVersion != Folding::Version;
}
/// Removes a trailing "$<digits>$" postfix (e.g. "$1$") from a headword.
///
/// The input is returned unchanged when it does not end with '$', or when a
/// non-digit character is met before the opening '$' is found.
std::string removePostfix( std::string const & in )
{
  if ( in.empty() || in[ in.size() - 1 ] != '$' )
    return in;

  // Scan backwards from the character preceding the closing '$'.
  for ( std::string::size_type x = in.size() - 1; x-- > 0; )
  {
    if ( in[ x ] == '$' )
      return in.substr( 0, x ); // Found the opening '$' -- cut here

    // isdigit() is only defined for unsigned char values and EOF; headwords
    // are UTF-8, so plain char may be negative here.
    if ( !isdigit( static_cast< unsigned char >( in[ x ] ) ) )
      break;
  }

  return in;
}
/// Strips leading and trailing whitespace (as classified by Utf8::isspace)
/// from word, in place. A string made of whitespace only becomes empty.
void trimWs( string & word )
{
  if ( word.empty() )
    return;

  // Index of the first character that is not whitespace
  unsigned first = 0;

  for ( ; first < word.size(); ++first )
    if ( !Utf8::isspace( word[ first ] ) )
      break;

  if ( first == word.size() )
  {
    // Nothing but whitespace
    word.clear();
    return;
  }

  // One past the last character to keep. There is at least one
  // non-whitespace character, so this loop stops before reaching zero.
  unsigned last = word.size();

  while ( Utf8::isspace( word[ last - 1 ] ) )
    --last;

  // Only rebuild the string when something was actually trimmed
  if ( first || last != word.size() )
    word = string( word, first, last - first );
}
void addEntryToIndex ( string & word ,
uint32_t articleOffset ,
IndexedWords & indexedWords ,
2009-04-18 17:20:12 +00:00
vector < wchar > & wcharBuffer )
2009-01-28 20:55:45 +00:00
{
// Strip any leading or trailing whitespaces
trimWs ( word ) ;
2009-04-27 12:23:10 +00:00
// If the word starts with a slash, we drop it. There are quite a lot
// of them, and they all seem to be redudant duplicates.
if ( word . size ( ) & & word [ 0 ] = = ' / ' )
return ;
2009-01-28 20:55:45 +00:00
// Check the input word for a superscript postfix ($1$, $2$ etc), which
// signifies different meaning in Bgl files. We emit different meaning
// as different articles, but they appear in the index as the same word.
if ( word . size ( ) & & word [ word . size ( ) - 1 ] = = ' $ ' )
{
word = removePostfix ( word ) ;
trimWs ( word ) ;
}
// Convert the word from utf8 to wide chars
2022-03-31 09:51:22 +00:00
indexedWords . addWord ( Utf8 : : decode ( word ) , articleOffset ) ;
2009-01-28 20:55:45 +00:00
}
// Exception types declared through the DEF_EX macro; both derive from
// Dictionary::Ex and carry the message given as the second argument.
DEF_EX( exFailedToDecompressArticle, "Failed to decompress article's body", Dictionary::Ex )
DEF_EX( exChunkIndexOutOfRange, "Chunk index is out of range", Dictionary::Ex )
class BglDictionary : public BtreeIndexing : : BtreeDictionary
{
2009-03-26 19:00:08 +00:00
Mutex idxMutex ;
2009-01-28 20:55:45 +00:00
File : : Class idx ;
IdxHeader idxHeader ;
string dictionaryName ;
ChunkedStorage : : Reader chunks ;
public :
BglDictionary ( string const & id , string const & indexFile ,
string const & dictionaryFile ) ;
virtual string getName ( ) throw ( )
{ return dictionaryName ; }
virtual map < Dictionary : : Property , string > getProperties ( ) throw ( )
{ return map < Dictionary : : Property , string > ( ) ; }
virtual unsigned long getArticleCount ( ) throw ( )
{ return idxHeader . articleCount ; }
virtual unsigned long getWordCount ( ) throw ( )
{ return idxHeader . wordCount ; }
2009-05-06 18:17:13 +00:00
inline virtual quint32 getLangFrom ( ) const
{ return idxHeader . langFrom ; }
inline virtual quint32 getLangTo ( ) const
{ return idxHeader . langTo ; }
2009-03-26 19:00:08 +00:00
virtual sptr < Dictionary : : WordSearchRequest > findHeadwordsForSynonym ( wstring const & )
2022-01-09 08:35:07 +00:00
;
2009-01-28 20:55:45 +00:00
2009-03-26 19:00:08 +00:00
virtual sptr < Dictionary : : DataRequest > getArticle ( wstring const & ,
2009-05-29 19:48:50 +00:00
vector < wstring > const & alts ,
2018-06-13 16:00:42 +00:00
wstring const & ,
bool ignoreDiacritics )
2022-01-09 08:35:07 +00:00
;
2009-01-28 20:55:45 +00:00
2009-03-26 19:00:08 +00:00
virtual sptr < Dictionary : : DataRequest > getResource ( string const & name )
2022-01-09 08:35:07 +00:00
;
2009-01-28 20:55:45 +00:00
2014-04-16 16:18:28 +00:00
virtual sptr < Dictionary : : DataRequest > getSearchResults ( QString const & searchString ,
int searchMode , bool matchCase ,
int distanceBetweenWords ,
2017-07-25 15:28:29 +00:00
int maxResults ,
2018-04-10 14:49:52 +00:00
bool ignoreWordsOrder ,
bool ignoreDiacritics ) ;
2012-11-18 10:00:50 +00:00
virtual QString const & getDescription ( ) ;
2014-04-16 16:18:28 +00:00
virtual void getArticleText ( uint32_t articleAddress , QString & headword , QString & text ) ;
virtual void makeFTSIndex ( QAtomicInt & isCancelled , bool firstIteration ) ;
2014-04-17 14:31:51 +00:00
virtual void setFTSParameters ( Config : : FullTextSearch const & fts )
{
can_FTS = fts . enabled
& & ! fts . disabledTypes . contains ( " BGL " , Qt : : CaseInsensitive )
& & ( fts . maxDictionarySize = = 0 | | getArticleCount ( ) < = fts . maxDictionarySize ) ;
}
2012-12-03 12:47:43 +00:00
protected :
virtual void loadIcon ( ) throw ( ) ;
2009-01-28 20:55:45 +00:00
private :
/// Loads an article with the given offset, filling the given strings.
void loadArticle ( uint32_t offset , string & headword ,
string & displayedHeadword , string & articleText ) ;
2009-04-16 11:33:12 +00:00
static void replaceCharsetEntities ( string & ) ;
friend class BglHeadwordsRequest ;
friend class BglArticleRequest ;
friend class BglResourceRequest ;
2009-01-28 20:55:45 +00:00
} ;
/// Opens the index file, reads its header and the dictionary name, and
/// attaches the B-tree index.
///
/// @param id             dictionary id, forwarded to BtreeDictionary
/// @param indexFile      path of the already built index file
/// @param dictionaryFile path of the source .bgl file
BglDictionary::BglDictionary( string const & id, string const & indexFile,
                              string const & dictionaryFile ):
  BtreeDictionary( id, vector< string >( 1, dictionaryFile ) ),
  idx( indexFile, "rb" ),
  idxHeader( idx.read< IdxHeader >() ),
  chunks( idx, idxHeader.chunksOffset )
{
  // The name record follows the header directly
  idx.seek( sizeof( idxHeader ) );

  // Read the dictionary's name: a 32-bit length followed by that many bytes
  size_t len = idx.read< uint32_t >();

  if ( len )
  {
    vector< char > nameBuf( len );

    idx.read( &nameBuf.front(), len );

    dictionaryName = string( &nameBuf.front(), len );
  }

  // Initialize the index
  openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
                        idxHeader.indexRootOffset ),
             idx, idxMutex );

  // Full-text search
  can_FTS = true;

  ftsIdxName = indexFile + "_FTS";

  // Mark the full-text index as complete if an up-to-date one is present
  if ( !Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
       && !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) )
    FTS_index_completed.ref();
}
2012-12-03 12:47:43 +00:00
void BglDictionary : : loadIcon ( ) throw ( )
2009-05-07 10:59:58 +00:00
{
2013-01-31 23:53:45 +00:00
if ( dictionaryIconLoaded )
return ;
QString fileName =
QDir : : fromNativeSeparators ( FsEncoding : : decode ( getDictionaryFilenames ( ) [ 0 ] . c_str ( ) ) ) ;
// Remove the extension
fileName . chop ( 3 ) ;
if ( ! loadIconFromFile ( fileName ) )
2009-05-07 10:59:58 +00:00
{
2013-01-31 23:53:45 +00:00
if ( idxHeader . iconSize )
{
2012-12-03 12:47:43 +00:00
2013-01-31 23:53:45 +00:00
// Try loading icon now
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
vector < char > chunk ;
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
Mutex : : Lock _ ( idxMutex ) ;
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
char * iconData = chunks . getBlock ( idxHeader . iconAddress , chunk ) ;
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
QImage img ;
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
if ( img . loadFromData ( ( unsigned char * ) iconData , idxHeader . iconSize ) )
{
// Load successful
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
dictionaryNativeIcon = QIcon ( QPixmap : : fromImage ( img ) ) ;
2012-12-03 12:47:43 +00:00
2013-01-31 23:53:45 +00:00
// Transform it to be square
int max = img . width ( ) > img . height ( ) ? img . width ( ) : img . height ( ) ;
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
QImage result ( max , max , QImage : : Format_ARGB32 ) ;
result . fill ( 0 ) ; // Black transparent
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
QPainter painter ( & result ) ;
2022-01-23 03:36:58 +00:00
painter . setRenderHint ( QPainter : : RenderHint : : Antialiasing ) ;
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
painter . drawImage ( QPoint ( img . width ( ) = = max ? 0 : ( max - img . width ( ) ) / 2 ,
img . height ( ) = = max ? 0 : ( max - img . height ( ) ) / 2 ) ,
img ) ;
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
painter . end ( ) ;
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
dictionaryIcon = QIcon ( QPixmap : : fromImage ( result ) ) ;
}
2009-05-07 10:59:58 +00:00
}
2013-01-31 23:53:45 +00:00
if ( dictionaryIcon . isNull ( ) )
dictionaryIcon = dictionaryNativeIcon = QIcon ( " :/icons/icon32_bgl.png " ) ;
}
2012-12-03 12:47:43 +00:00
dictionaryIconLoaded = true ;
2009-05-07 10:59:58 +00:00
}
2009-01-28 20:55:45 +00:00
void BglDictionary : : loadArticle ( uint32_t offset , string & headword ,
string & displayedHeadword ,
string & articleText )
{
vector < char > chunk ;
2009-03-26 19:00:08 +00:00
Mutex : : Lock _ ( idxMutex ) ;
2009-04-21 19:03:16 +00:00
2009-01-28 20:55:45 +00:00
char * articleData = chunks . getBlock ( offset , chunk ) ;
headword = articleData ;
displayedHeadword = articleData + headword . size ( ) + 1 ;
articleText =
2009-04-21 19:03:16 +00:00
string ( articleData + headword . size ( ) +
2009-01-28 20:55:45 +00:00
displayedHeadword . size ( ) + 2 ) ;
}
2012-11-18 10:00:50 +00:00
/// Returns the dictionary description, building it on first use.
///
/// When the index holds no description the text is "NONE". Otherwise the
/// description block is read as four consecutive NUL-terminated strings --
/// copyright, author, e-mail and free-form description -- and every
/// non-empty one is appended to the result.
QString const & BglDictionary::getDescription()
{
  if ( !dictionaryDescription.isEmpty() )
    return dictionaryDescription;

  if ( idxHeader.descriptionSize == 0 )
    dictionaryDescription = "NONE";
  else
  {
    Mutex::Lock _( idxMutex );
    vector< char > chunk;
    char * dictDescription = chunks.getBlock( idxHeader.descriptionAddress, chunk );

    // Copyright
    string str( dictDescription );
    if ( !str.empty() )
      dictionaryDescription += QString( QObject::tr( "Copyright: %1%2" ) )
                                 .arg( Html::unescape( QString::fromUtf8( str.data(), str.size() ) ) )
                                 .arg( "\n\n" );
    dictDescription += str.size() + 1;

    // Author
    str = string( dictDescription );
    if ( !str.empty() )
      dictionaryDescription += QString( QObject::tr( "Author: %1%2" ) )
                                 .arg( QString::fromUtf8( str.data(), str.size() ) )
                                 .arg( "\n\n" );
    dictDescription += str.size() + 1;

    // E-mail
    str = string( dictDescription );
    if ( !str.empty() )
      dictionaryDescription += QString( QObject::tr( "E-mail: %1%2" ) )
                                 .arg( QString::fromUtf8( str.data(), str.size() ) )
                                 .arg( "\n\n" );
    dictDescription += str.size() + 1;

    // Free-form description
    str = string( dictDescription );
    if ( !str.empty() )
      dictionaryDescription += Html::unescape( QString::fromUtf8( str.data(), str.size() ) );
  }

  return dictionaryDescription;
}
2014-04-16 16:18:28 +00:00
void BglDictionary : : getArticleText ( uint32_t articleAddress , QString & headword , QString & text )
{
try
{
string headwordStr , displayedHeadwordStr , articleStr ;
loadArticle ( articleAddress , headwordStr , displayedHeadwordStr , articleStr ) ;
2014-05-08 21:26:03 +00:00
// Some headword normalization similar while indexing
trimWs ( headwordStr ) ;
if ( headwordStr . size ( ) & & headwordStr [ 0 ] = = ' / ' )
headwordStr . erase ( ) ; // We will take headword from index later
if ( headwordStr . size ( )
& & headwordStr [ headwordStr . size ( ) - 1 ] = = ' $ ' )
{
headwordStr = removePostfix ( headwordStr ) ;
trimWs ( headwordStr ) ;
}
2014-04-16 16:18:28 +00:00
headword = QString : : fromUtf8 ( headwordStr . data ( ) , headwordStr . size ( ) ) ;
wstring wstr = Utf8 : : decode ( articleStr ) ;
if ( getLangTo ( ) = = LangCoder : : code2toInt ( " he " ) )
{
for ( unsigned int i = 0 ; i < wstr . size ( ) ; i + + )
{
2015-06-25 20:53:04 +00:00
if ( ( wstr [ i ] > = 224 & & wstr [ i ] < = 250 ) | | ( wstr [ i ] > = 192 & & wstr [ i ] < = 210 ) ) // Hebrew chars encoded ecoded as windows-1255 or ISO-8859-8, or as vowel-points of windows-1255
2014-04-16 16:18:28 +00:00
wstr [ i ] + = 1488 - 224 ; // Convert to Hebrew unicode
}
}
text = Html : : unescape ( gd : : toQString ( wstr ) ) ;
}
catch ( std : : exception & ex )
{
gdWarning ( " BGL: Failed retrieving article from \" %s \" , reason: %s \n " , getName ( ) . c_str ( ) , ex . what ( ) ) ;
}
}
void BglDictionary : : makeFTSIndex ( QAtomicInt & isCancelled , bool firstIteration )
{
if ( ! ( Dictionary : : needToRebuildIndex ( getDictionaryFilenames ( ) , ftsIdxName )
2014-11-22 14:22:04 +00:00
| | FtsHelpers : : ftsIndexIsOldOrBad ( ftsIdxName , this ) ) )
2014-04-16 16:18:28 +00:00
FTS_index_completed . ref ( ) ;
if ( haveFTSIndex ( ) )
return ;
if ( firstIteration & & getArticleCount ( ) > FTS : : MaxDictionarySizeForFastSearch )
return ;
gdDebug ( " Bgl: Building the full-text index for dictionary: %s \n " ,
getName ( ) . c_str ( ) ) ;
try
{
FtsHelpers : : makeFTSIndex ( this , isCancelled ) ;
2014-04-17 14:31:51 +00:00
FTS_index_completed . ref ( ) ;
2014-04-16 16:18:28 +00:00
}
catch ( std : : exception & ex )
{
gdWarning ( " Bgl: Failed building full-text search index for \" %s \" , reason: %s \n " , getName ( ) . c_str ( ) , ex . what ( ) ) ;
QFile : : remove ( FsEncoding : : decode ( ftsIdxName . c_str ( ) ) ) ;
}
}
2009-04-16 11:33:12 +00:00
/// BglDictionary::findHeadwordsForSynonym()
class BglHeadwordsRequest ;
class BglHeadwordsRequestRunnable : public QRunnable
{
BglHeadwordsRequest & r ;
QSemaphore & hasExited ;
2009-04-21 19:03:16 +00:00
2009-04-16 11:33:12 +00:00
public :
BglHeadwordsRequestRunnable ( BglHeadwordsRequest & r_ ,
QSemaphore & hasExited_ ) : r ( r_ ) ,
hasExited ( hasExited_ )
{ }
~ BglHeadwordsRequestRunnable ( )
2009-01-28 20:55:45 +00:00
{
2009-04-16 11:33:12 +00:00
hasExited . release ( ) ;
}
2009-04-21 19:03:16 +00:00
2009-04-16 11:33:12 +00:00
virtual void run ( ) ;
} ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
class BglHeadwordsRequest : public Dictionary : : WordSearchRequest
{
friend class BglHeadwordsRequestRunnable ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
wstring str ;
BglDictionary & dict ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
QAtomicInt isCancelled ;
QSemaphore hasExited ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
public :
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
BglHeadwordsRequest ( wstring const & word_ ,
BglDictionary & dict_ ) :
str ( word_ ) , dict ( dict_ )
{
QThreadPool : : globalInstance ( ) - > start (
new BglHeadwordsRequestRunnable ( * this , hasExited ) ) ;
}
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
void run ( ) ; // Run from another thread by BglHeadwordsRequestRunnable
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
virtual void cancel ( )
{
isCancelled . ref ( ) ;
}
2009-04-21 19:03:16 +00:00
2009-04-16 11:33:12 +00:00
~ BglHeadwordsRequest ( )
{
isCancelled . ref ( ) ;
hasExited . acquire ( ) ;
2009-01-28 20:55:45 +00:00
}
2009-04-16 11:33:12 +00:00
} ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
void BglHeadwordsRequestRunnable : : run ( )
{
r . run ( ) ;
}
void BglHeadwordsRequest : : run ( )
{
2021-11-27 07:17:33 +00:00
if ( Utils : : AtomicInt : : loadAcquire ( isCancelled ) )
2009-01-28 20:55:45 +00:00
{
2009-04-16 11:33:12 +00:00
finish ( ) ;
return ;
}
vector < WordArticleLink > chain = dict . findArticles ( str ) ;
wstring caseFolded = Folding : : applySimpleCaseOnly ( str ) ;
for ( unsigned x = 0 ; x < chain . size ( ) ; + + x )
{
2021-11-27 07:17:33 +00:00
if ( Utils : : AtomicInt : : loadAcquire ( isCancelled ) )
2009-01-28 20:55:45 +00:00
{
2009-04-16 11:33:12 +00:00
finish ( ) ;
return ;
}
string headword , displayedHeadword , articleText ;
dict . loadArticle ( chain [ x ] . articleOffset ,
headword , displayedHeadword , articleText ) ;
2012-12-19 17:34:56 +00:00
wstring headwordDecoded ;
try
{
headwordDecoded = Utf8 : : decode ( removePostfix ( headword ) ) ;
}
2018-05-22 14:48:14 +00:00
catch ( Utf8 : : exCantDecode & )
2012-12-19 17:34:56 +00:00
{
}
2009-04-16 11:33:12 +00:00
2012-12-19 17:34:56 +00:00
if ( caseFolded ! = Folding : : applySimpleCaseOnly ( headwordDecoded ) & & ! headwordDecoded . empty ( ) )
2009-04-16 11:33:12 +00:00
{
// The headword seems to differ from the input word, which makes the
// input word its synonym.
Mutex : : Lock _ ( dataMutex ) ;
matches . push_back ( headwordDecoded ) ;
}
}
finish ( ) ;
}
/// Starts a synonym search for word when synonym search is enabled;
/// otherwise defers to Class::findHeadwordsForSynonym().
sptr< Dictionary::WordSearchRequest >
  BglDictionary::findHeadwordsForSynonym( wstring const & word )
{
  if ( synonymSearchEnabled )
    return new BglHeadwordsRequest( word, *this );

  return Class::findHeadwordsForSynonym( word );
}
/// Converts a trailing "$<digits>$" postfix into "<sup><digits></sup>".
///
/// Postfixes of more than two digits are removed rather than converted. The
/// input is returned unchanged when it does not end with '$', or when a
/// non-digit character is met before the opening '$' is found.
std::string postfixToSuperscript( std::string const & in )
{
  if ( in.empty() || in[ in.size() - 1 ] != '$' )
    return in;

  // Scan backwards from the character preceding the closing '$'.
  for ( std::string::size_type x = in.size() - 1; x-- > 0; )
  {
    if ( in[ x ] == '$' )
    {
      // Number of characters between the two '$' signs
      std::string::size_type const digits = in.size() - x - 2;

      if ( digits > 2 )
      {
        // Large postfixes seem like something we wouldn't want to show --
        // some dictionaries seem to have each word numbered using the
        // postfix.
        return in.substr( 0, x );
      }

      return in.substr( 0, x ) + "<sup>" + in.substr( x + 1, digits ) + "</sup>";
    }

    // isdigit() is only defined for unsigned char values and EOF; headwords
    // are UTF-8, so plain char may be negative here.
    if ( !isdigit( static_cast< unsigned char >( in[ x ] ) ) )
      break;
  }

  return in;
}
/// BglDictionary::getArticle()
class BglArticleRequest ;
class BglArticleRequestRunnable : public QRunnable
{
BglArticleRequest & r ;
QSemaphore & hasExited ;
2009-04-21 19:03:16 +00:00
2009-04-16 11:33:12 +00:00
public :
BglArticleRequestRunnable ( BglArticleRequest & r_ ,
QSemaphore & hasExited_ ) : r ( r_ ) ,
hasExited ( hasExited_ )
{ }
~ BglArticleRequestRunnable ( )
{
hasExited . release ( ) ;
2009-01-28 20:55:45 +00:00
}
2009-04-21 19:03:16 +00:00
2009-04-16 11:33:12 +00:00
virtual void run ( ) ;
} ;
class BglArticleRequest : public Dictionary : : DataRequest
{
friend class BglArticleRequestRunnable ;
wstring word ;
vector < wstring > alts ;
BglDictionary & dict ;
QAtomicInt isCancelled ;
QSemaphore hasExited ;
2018-06-13 16:00:42 +00:00
bool ignoreDiacritics ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
public :
BglArticleRequest ( wstring const & word_ ,
vector < wstring > const & alts_ ,
2018-06-13 16:00:42 +00:00
BglDictionary & dict_ , bool ignoreDiacritics_ ) :
word ( word_ ) , alts ( alts_ ) , dict ( dict_ ) , ignoreDiacritics ( ignoreDiacritics_ )
2009-01-28 20:55:45 +00:00
{
2009-04-16 11:33:12 +00:00
QThreadPool : : globalInstance ( ) - > start (
new BglArticleRequestRunnable ( * this , hasExited ) ) ;
}
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
void run ( ) ; // Run from another thread by BglArticleRequestRunnable
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
virtual void cancel ( )
{
isCancelled . ref ( ) ;
}
2009-04-21 19:03:16 +00:00
2011-06-04 17:51:48 +00:00
void fixHebString ( string & hebStr ) ; // Hebrew support
void fixHebArticle ( string & hebArticle ) ; // Hebrew support
2009-04-16 11:33:12 +00:00
~ BglArticleRequest ( )
{
isCancelled . ref ( ) ;
hasExited . acquire ( ) ;
}
} ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
void BglArticleRequestRunnable : : run ( )
{
r . run ( ) ;
}
2011-06-04 17:51:48 +00:00
void BglArticleRequest : : fixHebString ( string & hebStr ) // Hebrew support - convert non-unicode to unicode
{
2012-12-19 17:34:56 +00:00
wstring hebWStr ;
try
{
hebWStr = Utf8 : : decode ( hebStr ) ;
}
2018-05-22 14:48:14 +00:00
catch ( Utf8 : : exCantDecode & )
2012-12-19 17:34:56 +00:00
{
hebStr = " Utf-8 decoding error " ;
return ;
}
2011-06-04 17:51:48 +00:00
for ( unsigned int i = 0 ; i < hebWStr . size ( ) ; i + + )
{
2015-06-25 20:53:04 +00:00
if ( ( hebWStr [ i ] > = 224 & & hebWStr [ i ] < = 250 ) | | ( hebWStr [ i ] > = 192 & & hebWStr [ i ] < = 210 ) ) // Hebrew chars encoded ecoded as windows-1255 or ISO-8859-8, or as vowel-points of windows-1255
2011-06-04 17:51:48 +00:00
hebWStr [ i ] + = 1488 - 224 ; // Convert to Hebrew unicode
}
hebStr = Utf8 : : encode ( hebWStr ) ;
}
void BglArticleRequest : : fixHebArticle ( string & hebArticle ) // Hebrew support - remove extra chars at the end
{
2011-06-08 02:48:05 +00:00
unsigned nulls ;
for ( nulls = hebArticle . size ( ) ; nulls > 0 & &
( ( hebArticle [ nulls - 1 ] < = 32 & &
hebArticle [ nulls - 1 ] > = 0 ) | |
( hebArticle [ nulls - 1 ] > = 65 & &
hebArticle [ nulls - 1 ] < = 90 ) ) ; - - nulls ) ; //special chars and A-Z
hebArticle . resize ( nulls ) ;
2011-06-04 17:51:48 +00:00
}
2009-04-16 11:33:12 +00:00
void BglArticleRequest : : run ( )
{
2021-11-27 07:17:33 +00:00
if ( Utils : : AtomicInt : : loadAcquire ( isCancelled ) )
2009-04-16 11:33:12 +00:00
{
finish ( ) ;
return ;
}
2009-01-28 20:55:45 +00:00
2018-06-13 16:00:42 +00:00
vector < WordArticleLink > chain = dict . findArticles ( word , ignoreDiacritics ) ;
2009-01-28 20:55:45 +00:00
2011-06-04 17:51:48 +00:00
static Language : : Id hebrew = LangCoder : : code2toInt ( " he " ) ; // Hebrew support
2009-04-16 11:33:12 +00:00
for ( unsigned x = 0 ; x < alts . size ( ) ; + + x )
{
/// Make an additional query for each alt
2018-06-13 16:00:42 +00:00
vector < WordArticleLink > altChain = dict . findArticles ( alts [ x ] , ignoreDiacritics ) ;
2009-04-16 11:33:12 +00:00
chain . insert ( chain . end ( ) , altChain . begin ( ) , altChain . end ( ) ) ;
}
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
multimap < wstring , pair < string , string > > mainArticles , alternateArticles ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
set < uint32_t > articlesIncluded ; // Some synonims make it that the articles
// appear several times. We combat this
// by only allowing them to appear once.
2011-05-22 20:45:06 +00:00
// Sometimes the articles are physically duplicated. We store hashes of
// the bodies to account for this.
set < QByteArray > articleBodiesIncluded ;
2009-04-16 11:33:12 +00:00
wstring wordCaseFolded = Folding : : applySimpleCaseOnly ( word ) ;
2018-06-13 16:00:42 +00:00
if ( ignoreDiacritics )
wordCaseFolded = Folding : : applyDiacriticsOnly ( wordCaseFolded ) ;
2009-04-16 11:33:12 +00:00
for ( unsigned x = 0 ; x < chain . size ( ) ; + + x )
{
2021-11-27 07:17:33 +00:00
if ( Utils : : AtomicInt : : loadAcquire ( isCancelled ) )
2009-01-28 20:55:45 +00:00
{
2009-04-16 11:33:12 +00:00
finish ( ) ;
return ;
}
2009-01-28 20:55:45 +00:00
2012-12-19 17:34:56 +00:00
try
{
2009-04-16 11:33:12 +00:00
if ( articlesIncluded . find ( chain [ x ] . articleOffset ) ! = articlesIncluded . end ( ) )
continue ; // We already have this article in the body.
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
// Now grab that article
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
string headword , displayedHeadword , articleText ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
dict . loadArticle ( chain [ x ] . articleOffset ,
headword , displayedHeadword , articleText ) ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
// Ok. Now, does it go to main articles, or to alternate ones? We list
// main ones first, and alternates after.
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
// We do the case-folded and postfix-less comparison here.
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
wstring headwordStripped =
Folding : : applySimpleCaseOnly ( Utf8 : : decode ( removePostfix ( headword ) ) ) ;
2018-06-13 16:00:42 +00:00
if ( ignoreDiacritics )
headwordStripped = Folding : : applyDiacriticsOnly ( headwordStripped ) ;
2009-01-28 20:55:45 +00:00
2018-06-13 16:00:42 +00:00
// Hebrew support - fix Hebrew text
2011-06-04 17:51:48 +00:00
if ( dict . idxHeader . langFrom = = hebrew )
{
displayedHeadword = displayedHeadword . size ( ) ? displayedHeadword : headword ;
fixHebString ( articleText ) ;
fixHebArticle ( articleText ) ;
fixHebString ( displayedHeadword ) ;
}
2011-05-22 20:45:06 +00:00
string const & targetHeadword = displayedHeadword . size ( ) ?
displayedHeadword : headword ;
QCryptographicHash hash ( QCryptographicHash : : Md5 ) ;
hash . addData ( targetHeadword . data ( ) , targetHeadword . size ( ) + 1 ) ; // with 0
hash . addData ( articleText . data ( ) , articleText . size ( ) ) ;
if ( ! articleBodiesIncluded . insert ( hash . result ( ) ) . second )
continue ; // Already had this body
2009-04-21 19:03:16 +00:00
multimap < wstring , pair < string , string > > & mapToUse =
2009-04-16 11:33:12 +00:00
( wordCaseFolded = = headwordStripped ) ?
mainArticles : alternateArticles ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
mapToUse . insert ( pair < wstring , pair < string , string > > (
Folding : : applySimpleCaseOnly ( Utf8 : : decode ( headword ) ) ,
2011-05-22 20:45:06 +00:00
pair < string , string > ( targetHeadword , articleText ) ) ) ;
2009-04-16 11:33:12 +00:00
articlesIncluded . insert ( chain [ x ] . articleOffset ) ;
2012-12-19 17:34:56 +00:00
} // try
2013-09-24 13:56:47 +00:00
catch ( std : : exception & ex )
2012-12-19 17:34:56 +00:00
{
2013-11-16 18:34:09 +00:00
gdWarning ( " BGL: Failed loading article from \" %s \" , reason: %s \n " , dict . getName ( ) . c_str ( ) , ex . what ( ) ) ;
2012-12-19 17:34:56 +00:00
}
2009-04-16 11:33:12 +00:00
}
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
if ( mainArticles . empty ( ) & & alternateArticles . empty ( ) )
{
// No such word
finish ( ) ;
return ;
}
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
string result ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
multimap < wstring , pair < string , string > > : : const_iterator i ;
2009-01-28 20:55:45 +00:00
2022-01-11 12:33:46 +00:00
// leave the invalid tags at the mercy of modern browsers.(webengine chrome)
// https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
// https://en.wikipedia.org/wiki/Tag_soup#HTML5
string cleaner = " " ;
2009-04-16 11:33:12 +00:00
for ( i = mainArticles . begin ( ) ; i ! = mainArticles . end ( ) ; + + i )
{
2013-07-10 13:48:09 +00:00
if ( dict . isFromLanguageRTL ( ) ) // RTL support
2011-06-04 17:51:48 +00:00
result + = " <h3 style= \" text-align:right;direction:rtl \" > " ;
else
result + = " <h3> " ;
2009-04-16 11:33:12 +00:00
result + = postfixToSuperscript ( i - > second . first ) ;
result + = " </h3> " ;
2013-07-10 13:48:09 +00:00
if ( dict . isToLanguageRTL ( ) )
2009-12-27 12:40:20 +00:00
result + = " <div class= \" bglrtl \" > " + i - > second . second + " </div> " ;
else
2011-07-01 16:25:14 +00:00
result + = " <div> " + i - > second . second + " </div> " ;
2009-04-16 11:33:12 +00:00
result + = cleaner ;
}
2011-06-04 17:51:48 +00:00
2009-12-27 12:40:20 +00:00
2009-04-16 11:33:12 +00:00
for ( i = alternateArticles . begin ( ) ; i ! = alternateArticles . end ( ) ; + + i )
{
2013-07-10 13:48:09 +00:00
if ( dict . isFromLanguageRTL ( ) ) // RTL support
2011-06-04 17:51:48 +00:00
result + = " <h3 style= \" text-align:right;direction:rtl \" > " ;
else
result + = " <h3> " ;
2009-04-16 11:33:12 +00:00
result + = postfixToSuperscript ( i - > second . first ) ;
result + = " </h3> " ;
2013-07-10 13:48:09 +00:00
if ( dict . isToLanguageRTL ( ) )
2009-12-27 12:40:20 +00:00
result + = " <div class= \" bglrtl \" > " + i - > second . second + " </div> " ;
else
2011-07-01 16:25:14 +00:00
result + = " <div> " + i - > second . second + " </div> " ;
2009-04-16 11:33:12 +00:00
result + = cleaner ;
}
// Do some cleanups in the text
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
BglDictionary : : replaceCharsetEntities ( result ) ;
2018-02-21 14:43:35 +00:00
result = QString : : fromUtf8 ( result . c_str ( ) )
// onclick location to link
. replace ( QRegularExpression ( " <([a-z0-9]+) \\ s+[^>]*onclick= \" [a-z.]*location(?: \\ .href) \\ s*= \\ s*'([^']+)[^>]*>([^<]+)</ \\ 1> " ,
2018-02-28 14:15:27 +00:00
QRegularExpression : : CaseInsensitiveOption ) ,
2018-02-21 14:43:35 +00:00
" <a href= \" \\ 2 \" > \\ 3</a> " )
. replace ( QRegularExpression ( " (< \\ s*a \\ s+[^>]*href \\ s*= \\ s*[ \" '] \\ s*)bword:// " ,
2018-02-28 14:15:27 +00:00
QRegularExpression : : CaseInsensitiveOption ) ,
2018-02-21 14:43:35 +00:00
" \\ 1bword: " )
//remove invalid width, height attrs
2018-02-28 14:15:27 +00:00
. replace ( QRegularExpression ( " (width|height) \\ s*= \\ s*[ \" '] \\ d{7,}[ \" ''] " ) ,
2018-02-21 14:43:35 +00:00
" " )
//remove invalid <br> tag
. replace ( QRegularExpression ( " <br>(<div|<table|<tbody|<tr|<td|</div>|</table>|</tbody>|</tr>|</td>|function addScript|var scNode|scNode|var atag|while \\ (atag|atag=atag|document \\ .getElementsByTagName|addScript|src= \" bres|<a onmouseover= \" return overlib|onclick= \" return overlib) " ,
2018-02-28 14:15:27 +00:00
QRegularExpression : : CaseInsensitiveOption ) ,
2018-02-21 14:43:35 +00:00
" \\ 1 " )
. replace ( QRegularExpression ( " (AUTOSTATUS, WRAP \\ ); \" |</DIV>|addScript \\ ('JS_FILE_PHONG_VT_45634' \\ );|appendChild \\ (scNode \\ );|atag \\ .firstChild;)<br> " ,
2018-02-28 14:15:27 +00:00
QRegularExpression : : CaseInsensitiveOption ) ,
2018-02-21 14:43:35 +00:00
" \\ 1 " )
. toUtf8 ( ) . data ( ) ;
2021-11-19 13:47:22 +00:00
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
Mutex : : Lock _ ( dataMutex ) ;
2009-03-26 19:00:08 +00:00
2009-04-16 11:33:12 +00:00
data . resize ( result . size ( ) ) ;
2009-03-26 19:00:08 +00:00
2009-04-16 11:33:12 +00:00
memcpy ( & data . front ( ) , result . data ( ) , result . size ( ) ) ;
hasAnyData = true ;
finish ( ) ;
}
/// Builds a BglArticleRequest for @p word and its alternate spellings @p alts,
/// bound to this dictionary. The third, unnamed argument is accepted but not
/// used by this implementation.
sptr< Dictionary::DataRequest > BglDictionary::getArticle( wstring const & word,
                                                           vector< wstring > const & alts,
                                                           wstring const &,
                                                           bool ignoreDiacritics )
{
  sptr< Dictionary::DataRequest > request( new BglArticleRequest( word, alts, *this, ignoreDiacritics ) );

  return request;
}
//// BglDictionary::getResource()
class BglResourceRequest ;
class BglResourceRequestRunnable : public QRunnable
{
BglResourceRequest & r ;
QSemaphore & hasExited ;
2009-04-21 19:03:16 +00:00
2009-04-16 11:33:12 +00:00
public :
BglResourceRequestRunnable ( BglResourceRequest & r_ ,
QSemaphore & hasExited_ ) : r ( r_ ) ,
hasExited ( hasExited_ )
{ }
~ BglResourceRequestRunnable ( )
{
hasExited . release ( ) ;
2009-01-28 20:55:45 +00:00
}
2009-04-21 19:03:16 +00:00
2009-04-16 11:33:12 +00:00
virtual void run ( ) ;
} ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
class BglResourceRequest : public Dictionary : : DataRequest
{
friend class BglResourceRequestRunnable ;
Mutex & idxMutex ;
File : : Class & idx ;
uint32_t resourceListOffset , resourcesCount ;
string name ;
QAtomicInt isCancelled ;
QSemaphore hasExited ;
public :
BglResourceRequest ( Mutex & idxMutex_ ,
File : : Class & idx_ ,
uint32_t resourceListOffset_ ,
uint32_t resourcesCount_ ,
string const & name_ ) :
idxMutex ( idxMutex_ ) ,
idx ( idx_ ) ,
resourceListOffset ( resourceListOffset_ ) ,
resourcesCount ( resourcesCount_ ) ,
name ( name_ )
2009-01-28 20:55:45 +00:00
{
2009-04-16 11:33:12 +00:00
QThreadPool : : globalInstance ( ) - > start (
new BglResourceRequestRunnable ( * this , hasExited ) ) ;
}
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
void run ( ) ; // Run from another thread by BglResourceRequestRunnable
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
virtual void cancel ( )
{
isCancelled . ref ( ) ;
}
2009-04-21 19:03:16 +00:00
2009-04-16 11:33:12 +00:00
~ BglResourceRequest ( )
{
isCancelled . ref ( ) ;
hasExited . acquire ( ) ;
}
} ;
2009-03-26 19:00:08 +00:00
2009-04-16 11:33:12 +00:00
void BglResourceRequestRunnable : : run ( )
{
r . run ( ) ;
}
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
void BglResourceRequest : : run ( )
{
2021-11-27 07:17:33 +00:00
if ( Utils : : AtomicInt : : loadAcquire ( isCancelled ) )
2009-04-16 11:33:12 +00:00
{
finish ( ) ;
return ;
}
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
string nameLowercased = name ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
for ( string : : iterator i = nameLowercased . begin ( ) ; i ! = nameLowercased . end ( ) ;
+ + i )
* i = tolower ( * i ) ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
Mutex : : Lock _ ( idxMutex ) ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
idx . seek ( resourceListOffset ) ;
for ( size_t count = resourcesCount ; count - - ; )
{
2021-11-27 07:17:33 +00:00
if ( Utils : : AtomicInt : : loadAcquire ( isCancelled ) )
2009-04-16 11:33:12 +00:00
break ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
vector < char > nameData ( idx . read < uint32_t > ( ) ) ;
idx . read ( & nameData . front ( ) , nameData . size ( ) ) ;
for ( size_t x = nameData . size ( ) ; x - - ; )
nameData [ x ] = tolower ( nameData [ x ] ) ;
uint32_t offset = idx . read < uint32_t > ( ) ;
if ( string ( & nameData . front ( ) , nameData . size ( ) ) = = nameLowercased )
{
// We have a match.
2009-03-26 19:00:08 +00:00
2009-04-16 11:33:12 +00:00
idx . seek ( offset ) ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
Mutex : : Lock _ ( dataMutex ) ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
data . resize ( idx . read < uint32_t > ( ) ) ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
vector < unsigned char > compressedData ( idx . read < uint32_t > ( ) ) ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
idx . read ( & compressedData . front ( ) , compressedData . size ( ) ) ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
unsigned long decompressedLength = data . size ( ) ;
if ( uncompress ( ( unsigned char * ) & data . front ( ) ,
& decompressedLength ,
& compressedData . front ( ) ,
compressedData . size ( ) ) ! = Z_OK | |
decompressedLength ! = data . size ( ) )
{
2013-11-16 18:34:09 +00:00
gdWarning ( " Failed to decompress resource \" %s \" , ignoring it. \n " , name . c_str ( ) ) ;
2009-01-28 20:55:45 +00:00
}
2009-04-16 11:33:12 +00:00
else
hasAnyData = true ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
break ;
}
2009-01-28 20:55:45 +00:00
}
2009-04-16 11:33:12 +00:00
finish ( ) ;
}
/// Creates a BglResourceRequest that searches this dictionary's index file for
/// the resource called @p name, using the resource list location and entry
/// count recorded in the index header.
sptr< Dictionary::DataRequest > BglDictionary::getResource( string const & name )
{
  sptr< Dictionary::DataRequest > request( new BglResourceRequest( idxMutex,
                                                                   idx,
                                                                   idxHeader.resourceListOffset,
                                                                   idxHeader.resourcesCount,
                                                                   name ) );

  return request;
}
2018-07-07 09:33:15 +00:00
/// Replaces <CHARSET c="t">1234;</CHARSET> occurrences with ሴ
2009-01-28 20:55:45 +00:00
void BglDictionary : : replaceCharsetEntities ( string & text )
{
2018-02-21 14:43:35 +00:00
QString str = QString : : fromUtf8 ( text . c_str ( ) ) ;
QRegularExpression charsetExp ( " < \\ s*charset \\ s+c \\ s*= \\ s*[ \" ']?t[ \" ']? \\ s*>((?: \\ s*[0-9a-fA-F]+ \\ s*; \\ s*)*)< \\ s*/ \\ s*charset \\ s*> " ,
QRegularExpression : : CaseInsensitiveOption
| QRegularExpression : : InvertedGreedinessOption ) ;
2018-02-28 14:15:27 +00:00
QRegularExpression oneValueExp ( " \\ s*([0-9a-fA-F]+) \ \ s * ; " );
2018-02-21 14:43:35 +00:00
QString result ;
int pos = 0 ;
QRegularExpressionMatchIterator it = charsetExp . globalMatch ( str ) ;
while ( it . hasNext ( ) )
{
QRegularExpressionMatch match = it . next ( ) ;
2022-02-27 05:17:37 +00:00
result + = str . mid ( pos , match . capturedStart ( ) - pos ) ;
2018-02-21 14:43:35 +00:00
pos = match . capturedEnd ( ) ;
QRegularExpressionMatchIterator itValue = oneValueExp . globalMatch ( match . captured ( 1 ) ) ;
while ( itValue . hasNext ( ) )
{
QRegularExpressionMatch matchValue = itValue . next ( ) ;
result + = " &#x " + matchValue . captured ( 1 ) + " ; " ;
}
}
if ( pos )
{
2022-02-27 05:17:37 +00:00
result + = str . mid ( pos ) ;
2018-02-21 14:43:35 +00:00
str = result ;
}
2009-01-28 20:55:45 +00:00
2009-05-07 16:14:56 +00:00
text = str . toUtf8 ( ) . data ( ) ;
2009-01-28 20:55:45 +00:00
}
class ResourceHandler : public Babylon : : ResourceHandler
{
File : : Class & idxFile ;
list < pair < string , uint32_t > > resources ;
public :
ResourceHandler ( File : : Class & idxFile_ ) : idxFile ( idxFile_ )
{ }
list < pair < string , uint32_t > > const & getResources ( ) const
{ return resources ; }
protected :
virtual void handleBabylonResource ( string const & filename ,
char const * data , size_t size ) ;
} ;
void ResourceHandler : : handleBabylonResource ( string const & filename ,
char const * data , size_t size )
{
2022-01-15 07:29:20 +00:00
//GD_DPRINTF( "Handling resource file %s (%u bytes)\n", filename.c_str(), size );
2009-01-28 20:55:45 +00:00
vector < unsigned char > compressedData ( compressBound ( size ) ) ;
unsigned long compressedSize = compressedData . size ( ) ;
if ( compress ( & compressedData . front ( ) , & compressedSize ,
( unsigned char const * ) data , size ) ! = Z_OK )
{
2013-11-16 18:34:09 +00:00
gdWarning ( " Failed to compress the body of resource \" %s \" , dropping it. \n " , filename . c_str ( ) ) ;
2009-01-28 20:55:45 +00:00
return ;
}
resources . push_back ( pair < string , uint32_t > ( filename , idxFile . tell ( ) ) ) ;
idxFile . write < uint32_t > ( size ) ;
idxFile . write < uint32_t > ( compressedSize ) ;
idxFile . write ( & compressedData . front ( ) , compressedSize ) ;
}
}
2014-04-16 16:18:28 +00:00
/// Creates an FtsHelpers::FTSResultsRequest for this dictionary, passing all
/// search parameters through unchanged.
sptr< Dictionary::DataRequest > BglDictionary::getSearchResults( QString const & searchString,
                                                                 int searchMode,
                                                                 bool matchCase,
                                                                 int distanceBetweenWords,
                                                                 int maxResults,
                                                                 bool ignoreWordsOrder,
                                                                 bool ignoreDiacritics )
{
  sptr< Dictionary::DataRequest > results( new FtsHelpers::FTSResultsRequest( *this,
                                                                              searchString,
                                                                              searchMode,
                                                                              matchCase,
                                                                              distanceBetweenWords,
                                                                              maxResults,
                                                                              ignoreWordsOrder,
                                                                              ignoreDiacritics ) );

  return results;
}
2009-01-28 20:55:45 +00:00
2009-03-26 19:00:08 +00:00
/// Creates a BglDictionary for every ".bgl" file listed in @p fileNames.
///
/// For each such file the index kept under @p indicesDir is rebuilt first if
/// Dictionary::needToRebuildIndex() or indexIsOldOrBad() says so; progress is
/// reported through @p initializing. Files that cannot be opened or read are
/// skipped. Exceptions derived from std::exception raised while indexing or
/// while constructing the dictionary are logged as warnings.
///
/// @return the dictionaries that were constructed successfully
vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames,
                                                      string const & indicesDir,
                                                      Dictionary::Initializing & initializing )
{
  vector< sptr< Dictionary::Class > > dictionaries;

  for( string const & fileName : fileNames )
  {
    // Files whose extension is not .bgl are skipped right away to speed up
    // the scanning
    size_t const nameLength = fileName.size();
    bool const hasBglExtension =
      nameLength >= 4 && strcasecmp( fileName.c_str() + ( nameLength - 4 ), ".bgl" ) == 0;

    if( !hasBglExtension )
      continue;

    // Got the file -- check if we need to rebuild the index

    vector< string > dictFiles( 1, fileName );

    string dictId = Dictionary::makeDictionaryId( dictFiles );
    string indexFile = indicesDir + dictId;

    bool const mustRebuild =
      Dictionary::needToRebuildIndex( dictFiles, indexFile ) || indexIsOldOrBad( indexFile );

    if( mustRebuild )
    {
      gdDebug( "Bgl: Building the index for dictionary: %s\n", fileName.c_str() );

      try
      {
        Babylon babylon( fileName );

        if( !babylon.open() )
          continue;

        std::string sourceCharset, targetCharset;

        if( !babylon.read( sourceCharset, targetCharset ) )
        {
          gdWarning( "Failed to start reading from %s, skipping it\n", fileName.c_str() );
          continue;
        }

        initializing.indexingDictionary( babylon.title() );

        File::Class idx( indexFile, "wb" );

        IdxHeader idxHeader;
        memset( &idxHeader, 0, sizeof( idxHeader ) );

        // A zeroed placeholder header goes first; it is overwritten with the
        // real values once everything else has been written.
        idx.write( idxHeader );

        idx.write< uint32_t >( babylon.title().size() );
        idx.write( babylon.title().data(), babylon.title().size() );

        // Index data accumulated while loading. Each article body is emitted
        // to the file as soon as it is read, and the word together with the
        // article's address is recorded here.
        IndexedWords indexedWords;

        // Scratch buffer handed to addEntryToIndex()
        vector< wchar > wcharBuffer;

        ChunkedStorage::Writer chunks( idx );

        uint32_t articleCount = 0, wordCount = 0;

        ResourceHandler resourceHandler( idx );

        babylon.setResourcePrefix( string( "bres://" ) + dictId + "/" );

        // Save icon if there's one
        if( size_t iconBytes = babylon.getIcon().size() )
        {
          idxHeader.iconAddress = chunks.startNewBlock();
          chunks.addToBlock( &babylon.getIcon().front(), iconBytes );
          idxHeader.iconSize = iconBytes;
        }

        // Save the description block: copyright, author, e-mail and
        // description, each stored with its terminating zero byte.
        idxHeader.descriptionSize = 0;
        idxHeader.descriptionAddress = chunks.startNewBlock();

        auto appendDescriptionPart = [ & ]( string const & part )
        {
          chunks.addToBlock( part.c_str(), part.size() + 1 );
          idxHeader.descriptionSize += part.size() + 1;
        };

        appendDescriptionPart( babylon.copyright() );
        appendDescriptionPart( babylon.author() );
        appendDescriptionPart( babylon.email() );
        appendDescriptionPart( babylon.description() );

        while( true )
        {
          bgl_entry entry = babylon.readEntry( &resourceHandler );

          // An entry with an empty headword ends the reading loop
          if( entry.headword.empty() )
            break;

          // Save the article's body itself first

          uint32_t articleAddress = chunks.startNewBlock();

          chunks.addToBlock( entry.headword.c_str(), entry.headword.size() + 1 );
          chunks.addToBlock( entry.displayedHeadword.c_str(), entry.displayedHeadword.size() + 1 );
          chunks.addToBlock( entry.definition.c_str(), entry.definition.size() + 1 );

          // Add entries to the index: the headword, then every alternate

          addEntryToIndex( entry.headword, articleAddress, indexedWords, wcharBuffer );

          for( auto const & alternate : entry.alternates )
            addEntryToIndex( alternate, articleAddress, indexedWords, wcharBuffer );

          wordCount += 1 + entry.alternates.size();
          ++articleCount;
        }

        // Finish with the chunks

        idxHeader.chunksOffset = chunks.finish();

        GD_DPRINTF( "Writing index...\n" );

        // Good. Now build the index

        IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );

        idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
        idxHeader.indexRootOffset = idxInfo.rootOffset;

        // Save the resource list: name length, name bytes and offset per entry

        idxHeader.resourceListOffset = idx.tell();
        idxHeader.resourcesCount = resourceHandler.getResources().size();

        for( pair< string, uint32_t > const & resource : resourceHandler.getResources() )
        {
          idx.write< uint32_t >( resource.first.size() );
          idx.write( resource.first.data(), resource.first.size() );
          idx.write< uint32_t >( resource.second );
        }

        // That concludes it. Update the header.

        idxHeader.signature = Signature;
        idxHeader.formatVersion = CurrentFormatVersion;
        idxHeader.parserVersion = Babylon::ParserVersion;
        idxHeader.foldingVersion = Folding::Version;
        idxHeader.articleCount = articleCount;
        idxHeader.wordCount = wordCount;
        idxHeader.langFrom = babylon.sourceLang(); //LangCoder::findIdForLanguage( Utf8::decode( b.sourceLang() ) );
        idxHeader.langTo = babylon.targetLang(); //LangCoder::findIdForLanguage( Utf8::decode( b.targetLang() ) );

        idx.rewind();

        idx.write( &idxHeader, sizeof( idxHeader ) );
      }
      catch( std::exception & e )
      {
        gdWarning( "BGL dictionary indexing failed: %s, error: %s\n",
                   fileName.c_str(), e.what() );
      }
    }

    try
    {
      dictionaries.push_back( new BglDictionary( dictId, indexFile, fileName ) );
    }
    catch( std::exception & e )
    {
      gdWarning( "BGL dictionary initializing failed: %s, error: %s\n",
                 fileName.c_str(), e.what() );
    }
  }

  return dictionaries;
}
}