/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
# include "bgl.hh"
# include "bgl_babylon.hh"
2023-05-30 06:31:07 +00:00
# include "btreeidx.hh"
# include "chunkedstorage.hh"
2009-01-28 20:55:45 +00:00
# include "file.hh"
# include "folding.hh"
2023-05-30 06:31:07 +00:00
# include "ftshelpers.hh"
2013-11-16 18:34:09 +00:00
# include "gddebug.hh"
2012-11-18 10:00:50 +00:00
# include "htmlescape.hh"
2023-05-30 06:31:07 +00:00
# include "langcoder.hh"
# include "language.hh"
# include "utf8.hh"
# include "utils.hh"
2009-05-06 18:17:13 +00:00
2023-05-30 06:31:07 +00:00
# include <ctype.h>
# include <list>
2009-01-28 20:55:45 +00:00
# include <map>
# include <set>
2009-01-30 01:20:37 +00:00
# include <string.h>
2023-05-30 06:31:07 +00:00
# include <zlib.h>
2009-01-28 20:55:45 +00:00
2009-04-29 23:18:26 +00:00
# ifdef _MSC_VER
# include <stub_msvc.h>
# endif
2023-05-30 06:31:07 +00:00
# include <QAtomicInt>
# include <QPainter>
# include <QRegularExpression>
2009-04-16 11:33:12 +00:00
# include <QSemaphore>
# include <QThreadPool>
2022-02-27 05:17:37 +00:00
# if (QT_VERSION >= QT_VERSION_CHECK(6,0,0))
2022-02-27 14:42:40 +00:00
# include <QtCore5Compat/QRegExp>
# else
# include <QRegExp>
2022-02-27 05:17:37 +00:00
# endif
2009-05-07 16:14:56 +00:00
2009-01-28 20:55:45 +00:00
namespace Bgl {
using std : : map ;
using std : : multimap ;
using std : : set ;
2009-04-18 17:20:12 +00:00
using gd : : wstring ;
using gd : : wchar ;
2009-01-28 20:55:45 +00:00
using std : : list ;
using std : : pair ;
2012-11-19 15:30:26 +00:00
using std : : string ;
2009-01-28 20:55:45 +00:00
using BtreeIndexing : : WordArticleLink ;
using BtreeIndexing : : IndexedWords ;
2009-04-14 16:35:47 +00:00
using BtreeIndexing : : IndexInfo ;
2009-01-28 20:55:45 +00:00
namespace
{
enum
{
Signature = 0x584c4742 , // BGLX on little-endian, XLGB on big-endian
2013-01-18 11:50:49 +00:00
CurrentFormatVersion = 19 + BtreeIndexing : : FormatVersion
2009-01-28 20:55:45 +00:00
} ;
// Fixed-size header stored at the very beginning of the index file.
// Every field is a 32-bit unsigned integer; the struct is declared packed on
// non-MSVC compilers so it can be read and written as one raw record.
struct IdxHeader
{
  // --- identification / versioning (checked by indexIsOldOrBad()) ---
  uint32_t signature;      // First comes the signature, BGLX
  uint32_t formatVersion;  // File format version, compared against CurrentFormatVersion
  uint32_t parserVersion;  // Version of the parser used to parse the BGL file.
                           // If it differs from the current one, the file is to
                           // be re-parsed.
  uint32_t foldingVersion; // Version of the folding algorithm used when building
                           // index. If it's different from the current one,
                           // the file is to be rebuilt.

  // --- statistics ---
  uint32_t articleCount; // Total number of articles, for informative purposes only
  uint32_t wordCount;    // Total number of words, for informative purposes only

  /// Add more fields here, like name, description, author and such.

  // --- locations inside the index file ---
  uint32_t chunksOffset;          // The offset to chunks' storage
  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
  uint32_t indexRootOffset;
  uint32_t resourceListOffset; // The offset of the list of resources
  uint32_t resourcesCount;     // Number of resources stored

  // --- languages ---
  uint32_t langFrom; // Source language
  uint32_t langTo;   // Target language

  // --- blobs kept in the chunks' storage ---
  uint32_t iconAddress;        // Address of the icon in the chunks' storage
  uint32_t iconSize;           // Size of the icon in the chunks' storage, 0 = no icon
  uint32_t descriptionAddress; // Address of the dictionary description in the chunks' storage
  uint32_t descriptionSize;    // Size of the description in the chunks' storage, 0 = no description
}
#ifndef _MSC_VER
__attribute__( ( packed ) )
#endif
;
2009-01-28 20:55:45 +00:00
// Tells whether the index file must be (re)built.
//
// Returns true when a complete IdxHeader cannot be read from indexFile, or when
// any of its signature / format / parser / folding version fields differs from
// the values this build expects. Returns false only for an up-to-date index.
// Whatever File::Class throws on open failure propagates to the caller.
bool indexIsOldOrBad( string const & indexFile )
{
  // The mode literal must not carry surrounding blanks.
  File::Class idx( indexFile, "rb" );

  IdxHeader header;

  // A short read means the file is truncated or not an index at all.
  if ( idx.readRecords( &header, sizeof( header ), 1 ) != 1 )
    return true;

  return header.signature != Signature
    || header.formatVersion != CurrentFormatVersion
    || header.parserVersion != Babylon::ParserVersion
    || header.foldingVersion != Folding::Version;
}
// Removes the $1$-like postfix.
//
// If `in` ends with '$' and the characters before it are decimal digits
// preceded by another '$' (e.g. "word$12$"), returns everything before that
// opening '$'. In every other case the input is returned unchanged.
std::string removePostfix( std::string const & in )
{
  if ( in.empty() || in.back() != '$' )
    return in;

  // Walk backwards from the character just before the closing '$'.
  // An unsigned index with a post-decrement test avoids the size_t -> long
  // wrap-around the signed version relied on for one-character inputs.
  for ( std::string::size_type pos = in.size() - 1; pos-- > 0; ) {
    if ( in[ pos ] == '$' )
      return in.substr( 0, pos );

    // isdigit() is undefined for negative values, which is what the bytes of a
    // UTF-8 multi-byte sequence are when char is signed.
    if ( !isdigit( static_cast< unsigned char >( in[ pos ] ) ) )
      break; // Something unexpected -- leave the word alone
  }

  return in;
}
// Strips leading and trailing whitespace from `word`, in place.
// A word made of whitespace only becomes empty; a word without surrounding
// whitespace is left untouched (no reassignment happens).
void trimWs( string & word )
{
  if ( word.empty() )
    return;

  // Skip over the leading whitespace
  unsigned first = 0;
  for ( ; first < word.size() && Utf8::isspace( word[ first ] ); ++first ) {
  }

  if ( first == word.size() ) {
    // Nothing but whitespace
    word.clear();
    return;
  }

  // At least one non-space character exists at or after `first`, so this
  // backwards scan stops before running off the front of the string.
  unsigned last = word.size();
  for ( ; Utf8::isspace( word[ last - 1 ] ); --last ) {
  }

  if ( first || last != word.size() )
    word = string( word, first, last - first );
}
void addEntryToIndex ( string & word ,
uint32_t articleOffset ,
IndexedWords & indexedWords ,
2009-04-18 17:20:12 +00:00
vector < wchar > & wcharBuffer )
2009-01-28 20:55:45 +00:00
{
// Strip any leading or trailing whitespaces
trimWs ( word ) ;
2009-04-27 12:23:10 +00:00
// If the word starts with a slash, we drop it. There are quite a lot
// of them, and they all seem to be redudant duplicates.
if ( word . size ( ) & & word [ 0 ] = = ' / ' )
return ;
2009-01-28 20:55:45 +00:00
// Check the input word for a superscript postfix ($1$, $2$ etc), which
// signifies different meaning in Bgl files. We emit different meaning
// as different articles, but they appear in the index as the same word.
if ( word . size ( ) & & word [ word . size ( ) - 1 ] = = ' $ ' )
{
word = removePostfix ( word ) ;
trimWs ( word ) ;
}
// Convert the word from utf8 to wide chars
2022-03-31 09:51:22 +00:00
indexedWords . addWord ( Utf8 : : decode ( word ) , articleOffset ) ;
2009-01-28 20:55:45 +00:00
}
// Exception types for this dictionary format, declared through the DEF_EX macro
// (defined outside this file). Presumably each invocation takes the new type's
// name, its message text and its base class -- confirm against the macro.
// NOTE(review): neither type is thrown in the part of the file reviewed here;
// check the remainder before relying on them.
DEF_EX ( exFailedToDecompressArticle , " Failed to decompress article's body " , Dictionary : : Ex )
DEF_EX ( exChunkIndexOutOfRange , " Chunk index is out of range " , Dictionary : : Ex )
class BglDictionary : public BtreeIndexing : : BtreeDictionary
{
2023-05-29 13:56:04 +00:00
QMutex idxMutex ;
2009-01-28 20:55:45 +00:00
File : : Class idx ;
IdxHeader idxHeader ;
ChunkedStorage : : Reader chunks ;
public :
BglDictionary ( string const & id , string const & indexFile ,
string const & dictionaryFile ) ;
2022-12-29 07:07:40 +00:00
map < Dictionary : : Property , string > getProperties ( ) noexcept override
2009-01-28 20:55:45 +00:00
{ return map < Dictionary : : Property , string > ( ) ; }
2022-12-29 07:07:40 +00:00
unsigned long getArticleCount ( ) noexcept override
2009-01-28 20:55:45 +00:00
{ return idxHeader . articleCount ; }
2022-12-29 07:07:40 +00:00
unsigned long getWordCount ( ) noexcept override
2009-01-28 20:55:45 +00:00
{ return idxHeader . wordCount ; }
2022-12-29 07:07:40 +00:00
inline quint32 getLangFrom ( ) const override
2009-05-06 18:17:13 +00:00
{ return idxHeader . langFrom ; }
2022-12-29 07:07:40 +00:00
inline quint32 getLangTo ( ) const override
2009-05-06 18:17:13 +00:00
{ return idxHeader . langTo ; }
2022-12-29 07:07:40 +00:00
sptr < Dictionary : : WordSearchRequest > findHeadwordsForSynonym ( wstring const & ) override
2022-01-09 08:35:07 +00:00
;
2009-01-28 20:55:45 +00:00
2022-12-29 07:07:40 +00:00
sptr < Dictionary : : DataRequest > getArticle ( wstring const & ,
2009-05-29 19:48:50 +00:00
vector < wstring > const & alts ,
2018-06-13 16:00:42 +00:00
wstring const & ,
2022-12-29 07:07:40 +00:00
bool ignoreDiacritics ) override
2022-01-09 08:35:07 +00:00
;
2009-01-28 20:55:45 +00:00
2022-12-29 07:07:40 +00:00
sptr < Dictionary : : DataRequest > getResource ( string const & name ) override
2022-01-09 08:35:07 +00:00
;
2009-01-28 20:55:45 +00:00
2023-05-30 23:42:31 +00:00
sptr < Dictionary : : DataRequest >
getSearchResults ( QString const & searchString , int searchMode , bool matchCase , bool ignoreDiacritics ) override ;
2022-12-29 07:07:40 +00:00
QString const & getDescription ( ) override ;
2012-11-18 10:00:50 +00:00
2022-12-29 07:07:40 +00:00
void getArticleText ( uint32_t articleAddress , QString & headword , QString & text ) override ;
2014-04-16 16:18:28 +00:00
2022-12-29 07:07:40 +00:00
void makeFTSIndex ( QAtomicInt & isCancelled , bool firstIteration ) override ;
2014-04-16 16:18:28 +00:00
2022-12-29 07:07:40 +00:00
void setFTSParameters ( Config : : FullTextSearch const & fts ) override
2014-04-17 14:31:51 +00:00
{
can_FTS = fts . enabled
& & ! fts . disabledTypes . contains ( " BGL " , Qt : : CaseInsensitive )
& & ( fts . maxDictionarySize = = 0 | | getArticleCount ( ) < = fts . maxDictionarySize ) ;
}
2012-12-03 12:47:43 +00:00
protected :
2022-12-29 07:07:40 +00:00
void loadIcon ( ) noexcept override ;
2012-12-03 12:47:43 +00:00
2009-01-28 20:55:45 +00:00
private :
/// Loads an article with the given offset, filling the given strings.
void loadArticle ( uint32_t offset , string & headword ,
string & displayedHeadword , string & articleText ) ;
2009-04-16 11:33:12 +00:00
static void replaceCharsetEntities ( string & ) ;
friend class BglHeadwordsRequest ;
friend class BglArticleRequest ;
friend class BglResourceRequest ;
2009-01-28 20:55:45 +00:00
} ;
// Opens an already built index file and prepares the dictionary for lookups.
//
// id             - dictionary id, forwarded to the base class
// indexFile      - path of the index file to open
// dictionaryFile - path of the source dictionary file; becomes the only entry
//                  of the dictionary's file list
BglDictionary::BglDictionary( string const & id, string const & indexFile, string const & dictionaryFile ):
  BtreeDictionary( id, vector< string >( 1, dictionaryFile ) ),
  idx( indexFile, "rb" ), // mode literal must not carry surrounding blanks
  idxHeader( idx.read< IdxHeader >() ),
  chunks( idx, idxHeader.chunksOffset )
{
  // The length-prefixed dictionary name follows the header directly.
  idx.seek( sizeof( idxHeader ) );

  // Read the dictionary's name
  size_t len = idx.read< uint32_t >();

  if ( len ) {
    vector< char > nameBuf( len );

    idx.read( nameBuf.data(), len );

    dictionaryName = string( nameBuf.data(), len );
  }

  // Initialize the index
  openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, idxHeader.indexRootOffset ), idx, idxMutex );

  can_FTS    = true;
  ftsIdxName = indexFile + Dictionary::getFtsSuffix();

  // An existing, current full-text index is marked as completed right away.
  if ( !Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
       && !FtsHelpers::ftsIndexIsOldOrBad( this ) )
    FTS_index_completed.ref();
}
2022-06-03 13:28:41 +00:00
void BglDictionary : : loadIcon ( ) noexcept
2009-05-07 10:59:58 +00:00
{
2013-01-31 23:53:45 +00:00
if ( dictionaryIconLoaded )
return ;
2023-04-13 10:08:32 +00:00
QString fileName = QDir : : fromNativeSeparators ( QString : : fromStdString ( getDictionaryFilenames ( ) [ 0 ] ) ) ;
2013-01-31 23:53:45 +00:00
// Remove the extension
fileName . chop ( 3 ) ;
if ( ! loadIconFromFile ( fileName ) )
2009-05-07 10:59:58 +00:00
{
2013-01-31 23:53:45 +00:00
if ( idxHeader . iconSize )
{
2012-12-03 12:47:43 +00:00
2013-01-31 23:53:45 +00:00
// Try loading icon now
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
vector < char > chunk ;
2009-05-07 10:59:58 +00:00
2023-05-29 13:56:04 +00:00
QMutexLocker _ ( & idxMutex ) ;
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
char * iconData = chunks . getBlock ( idxHeader . iconAddress , chunk ) ;
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
QImage img ;
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
if ( img . loadFromData ( ( unsigned char * ) iconData , idxHeader . iconSize ) )
{
2012-12-03 12:47:43 +00:00
2013-01-31 23:53:45 +00:00
// Transform it to be square
int max = img . width ( ) > img . height ( ) ? img . width ( ) : img . height ( ) ;
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
QImage result ( max , max , QImage : : Format_ARGB32 ) ;
result . fill ( 0 ) ; // Black transparent
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
QPainter painter ( & result ) ;
2022-01-23 03:36:58 +00:00
painter . setRenderHint ( QPainter : : RenderHint : : Antialiasing ) ;
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
painter . drawImage ( QPoint ( img . width ( ) = = max ? 0 : ( max - img . width ( ) ) / 2 ,
img . height ( ) = = max ? 0 : ( max - img . height ( ) ) / 2 ) ,
img ) ;
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
painter . end ( ) ;
2009-05-07 10:59:58 +00:00
2013-01-31 23:53:45 +00:00
dictionaryIcon = QIcon ( QPixmap : : fromImage ( result ) ) ;
}
2009-05-07 10:59:58 +00:00
}
2013-01-31 23:53:45 +00:00
if ( dictionaryIcon . isNull ( ) )
2023-06-19 02:34:08 +00:00
dictionaryIcon = QIcon ( " :/icons/icon32_bgl.png " ) ;
2013-01-31 23:53:45 +00:00
}
2012-12-03 12:47:43 +00:00
dictionaryIconLoaded = true ;
2009-05-07 10:59:58 +00:00
}
2009-01-28 20:55:45 +00:00
void BglDictionary : : loadArticle ( uint32_t offset , string & headword ,
string & displayedHeadword ,
string & articleText )
{
vector < char > chunk ;
2023-05-29 13:56:04 +00:00
QMutexLocker _ ( & idxMutex ) ;
2009-04-21 19:03:16 +00:00
2009-01-28 20:55:45 +00:00
char * articleData = chunks . getBlock ( offset , chunk ) ;
headword = articleData ;
displayedHeadword = articleData + headword . size ( ) + 1 ;
articleText =
2009-04-21 19:03:16 +00:00
string ( articleData + headword . size ( ) +
2009-01-28 20:55:45 +00:00
displayedHeadword . size ( ) + 2 ) ;
}
2012-11-18 10:00:50 +00:00
// Returns the dictionary description, building and caching it on first use.
//
// When the index holds no description the text is "NONE". Otherwise the stored
// block is read as four consecutive NUL-terminated UTF-8 strings -- copyright,
// author, e-mail, free-form description -- and every non-empty one is appended.
QString const & BglDictionary::getDescription()
{
  if ( !dictionaryDescription.isEmpty() )
    return dictionaryDescription;

  if ( idxHeader.descriptionSize == 0 ) {
    dictionaryDescription = "NONE";
    return dictionaryDescription;
  }

  QMutexLocker _( &idxMutex );
  vector< char > chunk;
  char const * cursor = chunks.getBlock( idxHeader.descriptionAddress, chunk );

  // Copies the NUL-terminated string under the cursor and steps past its NUL.
  auto const nextField = [ &cursor ]() {
    string field( cursor );
    cursor += field.size() + 1;
    return field;
  };

  auto const toQString = []( string const & s ) {
    return QString::fromUtf8( s.data(), s.size() );
  };

  // Both placeholders are filled by a single multi-argument arg() call, so a
  // "%2" occurring inside the dictionary-supplied text is not substituted.
  QString const separator( "\n\n" );

  string field = nextField();
  if ( !field.empty() )
    dictionaryDescription += QObject::tr( "Copyright: %1%2" ).arg( Html::unescape( toQString( field ) ), separator );

  field = nextField();
  if ( !field.empty() )
    dictionaryDescription += QObject::tr( "Author: %1%2" ).arg( toQString( field ), separator );

  field = nextField();
  if ( !field.empty() )
    dictionaryDescription += QObject::tr( "E-mail: %1%2" ).arg( toQString( field ), separator );

  field = nextField();
  if ( !field.empty() )
    dictionaryDescription += Html::unescape( toQString( field ) );

  return dictionaryDescription;
}
2014-04-16 16:18:28 +00:00
void BglDictionary : : getArticleText ( uint32_t articleAddress , QString & headword , QString & text )
{
try
{
string headwordStr , displayedHeadwordStr , articleStr ;
loadArticle ( articleAddress , headwordStr , displayedHeadwordStr , articleStr ) ;
2014-05-08 21:26:03 +00:00
// Some headword normalization similar while indexing
trimWs ( headwordStr ) ;
if ( headwordStr . size ( ) & & headwordStr [ 0 ] = = ' / ' )
headwordStr . erase ( ) ; // We will take headword from index later
if ( headwordStr . size ( )
& & headwordStr [ headwordStr . size ( ) - 1 ] = = ' $ ' )
{
headwordStr = removePostfix ( headwordStr ) ;
trimWs ( headwordStr ) ;
}
2014-04-16 16:18:28 +00:00
headword = QString : : fromUtf8 ( headwordStr . data ( ) , headwordStr . size ( ) ) ;
wstring wstr = Utf8 : : decode ( articleStr ) ;
if ( getLangTo ( ) = = LangCoder : : code2toInt ( " he " ) )
{
for ( unsigned int i = 0 ; i < wstr . size ( ) ; i + + )
{
2015-06-25 20:53:04 +00:00
if ( ( wstr [ i ] > = 224 & & wstr [ i ] < = 250 ) | | ( wstr [ i ] > = 192 & & wstr [ i ] < = 210 ) ) // Hebrew chars encoded ecoded as windows-1255 or ISO-8859-8, or as vowel-points of windows-1255
2014-04-16 16:18:28 +00:00
wstr [ i ] + = 1488 - 224 ; // Convert to Hebrew unicode
}
}
2023-04-16 09:07:07 +00:00
text = Html : : unescape ( QString : : fromStdU32String ( wstr ) ) ;
2014-04-16 16:18:28 +00:00
}
catch ( std : : exception & ex )
{
gdWarning ( " BGL: Failed retrieving article from \" %s \" , reason: %s \n " , getName ( ) . c_str ( ) , ex . what ( ) ) ;
}
}
void BglDictionary : : makeFTSIndex ( QAtomicInt & isCancelled , bool firstIteration )
{
2023-06-03 00:29:19 +00:00
if ( ! ( Dictionary : : needToRebuildIndex ( getDictionaryFilenames ( ) , ftsIdxName )
| | FtsHelpers : : ftsIndexIsOldOrBad ( this ) ) )
2014-04-16 16:18:28 +00:00
FTS_index_completed . ref ( ) ;
if ( haveFTSIndex ( ) )
return ;
if ( firstIteration & & getArticleCount ( ) > FTS : : MaxDictionarySizeForFastSearch )
return ;
gdDebug ( " Bgl: Building the full-text index for dictionary: %s \n " ,
getName ( ) . c_str ( ) ) ;
try
{
FtsHelpers : : makeFTSIndex ( this , isCancelled ) ;
2014-04-17 14:31:51 +00:00
FTS_index_completed . ref ( ) ;
2014-04-16 16:18:28 +00:00
}
catch ( std : : exception & ex )
{
gdWarning ( " Bgl: Failed building full-text search index for \" %s \" , reason: %s \n " , getName ( ) . c_str ( ) , ex . what ( ) ) ;
2023-04-13 10:08:32 +00:00
QFile : : remove ( QString : : fromStdString ( ftsIdxName ) ) ;
2014-04-16 16:18:28 +00:00
}
}
2009-04-16 11:33:12 +00:00
/// BglDictionary::findHeadwordsForSynonym()
class BglHeadwordsRequest : public Dictionary : : WordSearchRequest
{
wstring str ;
BglDictionary & dict ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
QAtomicInt isCancelled ;
2023-04-29 04:12:49 +00:00
QFuture < void > f ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
public :
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
BglHeadwordsRequest ( wstring const & word_ ,
BglDictionary & dict_ ) :
2023-04-29 04:12:49 +00:00
str ( word_ ) ,
dict ( dict_ )
2009-04-16 11:33:12 +00:00
{
2023-04-29 04:12:49 +00:00
f = QtConcurrent : : run ( [ this ] ( ) {
this - > run ( ) ;
} ) ;
2009-04-16 11:33:12 +00:00
}
2009-01-28 20:55:45 +00:00
2023-05-08 08:41:54 +00:00
void run ( ) ;
2009-01-28 20:55:45 +00:00
2022-12-29 07:07:40 +00:00
void cancel ( ) override
2009-04-16 11:33:12 +00:00
{
isCancelled . ref ( ) ;
}
2009-04-21 19:03:16 +00:00
2023-04-29 04:12:49 +00:00
~ BglHeadwordsRequest ( ) override
2009-04-16 11:33:12 +00:00
{
isCancelled . ref ( ) ;
2023-04-29 04:12:49 +00:00
f . waitForFinished ( ) ;
2009-01-28 20:55:45 +00:00
}
2009-04-16 11:33:12 +00:00
} ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
void BglHeadwordsRequest : : run ( )
{
2021-11-27 07:17:33 +00:00
if ( Utils : : AtomicInt : : loadAcquire ( isCancelled ) )
2009-01-28 20:55:45 +00:00
{
2009-04-16 11:33:12 +00:00
finish ( ) ;
return ;
}
vector < WordArticleLink > chain = dict . findArticles ( str ) ;
wstring caseFolded = Folding : : applySimpleCaseOnly ( str ) ;
for ( unsigned x = 0 ; x < chain . size ( ) ; + + x )
{
2021-11-27 07:17:33 +00:00
if ( Utils : : AtomicInt : : loadAcquire ( isCancelled ) )
2009-01-28 20:55:45 +00:00
{
2009-04-16 11:33:12 +00:00
finish ( ) ;
return ;
}
string headword , displayedHeadword , articleText ;
dict . loadArticle ( chain [ x ] . articleOffset ,
headword , displayedHeadword , articleText ) ;
2012-12-19 17:34:56 +00:00
wstring headwordDecoded ;
try
{
headwordDecoded = Utf8 : : decode ( removePostfix ( headword ) ) ;
}
2018-05-22 14:48:14 +00:00
catch ( Utf8 : : exCantDecode & )
2012-12-19 17:34:56 +00:00
{
}
2009-04-16 11:33:12 +00:00
2012-12-19 17:34:56 +00:00
if ( caseFolded ! = Folding : : applySimpleCaseOnly ( headwordDecoded ) & & ! headwordDecoded . empty ( ) )
2009-04-16 11:33:12 +00:00
{
// The headword seems to differ from the input word, which makes the
// input word its synonym.
2023-05-29 13:56:04 +00:00
QMutexLocker _ ( & dataMutex ) ;
2009-04-16 11:33:12 +00:00
matches . push_back ( headwordDecoded ) ;
}
}
finish ( ) ;
}
// Starts a synonym lookup for `word` when synonym search is enabled;
// otherwise defers to the base-class implementation.
sptr< Dictionary::WordSearchRequest > BglDictionary::findHeadwordsForSynonym( wstring const & word )
{
  if ( !synonymSearchEnabled )
    return Class::findHeadwordsForSynonym( word );

  return std::make_shared< BglHeadwordsRequest >( word, *this );
}
// Converts a $1$-like postfix to a <sup>1</sup> one.
//
// "word$2$"   -> "word<sup>2</sup>"
// "word$123$" -> "word"  (postfixes longer than two digits are dropped)
// Anything that does not end in a well-formed $digits$ postfix is returned
// unchanged.
std::string postfixToSuperscript( std::string const & in )
{
  if ( in.empty() || in.back() != '$' )
    return in;

  // Scan backwards from the character before the closing '$'. The unsigned
  // index with a post-decrement test replaces the size_t -> long wrap-around.
  for ( std::string::size_type pos = in.size() - 1; pos-- > 0; ) {
    if ( in[ pos ] == '$' ) {
      // Number of characters between the two '$'
      std::string::size_type const digits = in.size() - pos - 2;

      if ( digits > 2 ) {
        // Large postfixes seem like something we wouldn't want to show --
        // some dictionaries seem to have each word numbered using the
        // postfix.
        return in.substr( 0, pos );
      }

      return in.substr( 0, pos ) + "<sup>" + in.substr( pos + 1, digits ) + "</sup>";
    }

    // isdigit() is undefined for negative values (bytes >= 0x80 when char is
    // signed), hence the cast.
    if ( !isdigit( static_cast< unsigned char >( in[ pos ] ) ) )
      break;
  }

  return in;
}
/// BglDictionary::getArticle()
class BglArticleRequest : public Dictionary : : DataRequest
{
wstring word ;
vector < wstring > alts ;
BglDictionary & dict ;
QAtomicInt isCancelled ;
2018-06-13 16:00:42 +00:00
bool ignoreDiacritics ;
2023-04-29 04:12:49 +00:00
QFuture < void > f ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
public :
BglArticleRequest ( wstring const & word_ ,
vector < wstring > const & alts_ ,
2018-06-13 16:00:42 +00:00
BglDictionary & dict_ , bool ignoreDiacritics_ ) :
word ( word_ ) , alts ( alts_ ) , dict ( dict_ ) , ignoreDiacritics ( ignoreDiacritics_ )
2009-01-28 20:55:45 +00:00
{
2023-04-29 04:12:49 +00:00
f = QtConcurrent : : run ( [ this ] ( ) {
this - > run ( ) ;
} ) ;
2009-04-16 11:33:12 +00:00
}
2009-01-28 20:55:45 +00:00
2023-05-08 08:41:54 +00:00
void run ( ) ;
2009-01-28 20:55:45 +00:00
2022-12-29 07:07:40 +00:00
void cancel ( ) override
2009-04-16 11:33:12 +00:00
{
isCancelled . ref ( ) ;
}
2009-04-21 19:03:16 +00:00
2011-06-04 17:51:48 +00:00
void fixHebString ( string & hebStr ) ; // Hebrew support
void fixHebArticle ( string & hebArticle ) ; // Hebrew support
2009-04-16 11:33:12 +00:00
~ BglArticleRequest ( )
{
isCancelled . ref ( ) ;
2023-04-29 04:12:49 +00:00
f . waitForFinished ( ) ;
2009-04-16 11:33:12 +00:00
}
} ;
2009-01-28 20:55:45 +00:00
2011-06-04 17:51:48 +00:00
void BglArticleRequest : : fixHebString ( string & hebStr ) // Hebrew support - convert non-unicode to unicode
{
2012-12-19 17:34:56 +00:00
wstring hebWStr ;
try
{
hebWStr = Utf8 : : decode ( hebStr ) ;
}
2018-05-22 14:48:14 +00:00
catch ( Utf8 : : exCantDecode & )
2012-12-19 17:34:56 +00:00
{
hebStr = " Utf-8 decoding error " ;
return ;
}
2011-06-04 17:51:48 +00:00
for ( unsigned int i = 0 ; i < hebWStr . size ( ) ; i + + )
{
2015-06-25 20:53:04 +00:00
if ( ( hebWStr [ i ] > = 224 & & hebWStr [ i ] < = 250 ) | | ( hebWStr [ i ] > = 192 & & hebWStr [ i ] < = 210 ) ) // Hebrew chars encoded ecoded as windows-1255 or ISO-8859-8, or as vowel-points of windows-1255
2011-06-04 17:51:48 +00:00
hebWStr [ i ] + = 1488 - 224 ; // Convert to Hebrew unicode
}
hebStr = Utf8 : : encode ( hebWStr ) ;
}
void BglArticleRequest : : fixHebArticle ( string & hebArticle ) // Hebrew support - remove extra chars at the end
{
2011-06-08 02:48:05 +00:00
unsigned nulls ;
for ( nulls = hebArticle . size ( ) ; nulls > 0 & &
( ( hebArticle [ nulls - 1 ] < = 32 & &
hebArticle [ nulls - 1 ] > = 0 ) | |
( hebArticle [ nulls - 1 ] > = 65 & &
hebArticle [ nulls - 1 ] < = 90 ) ) ; - - nulls ) ; //special chars and A-Z
hebArticle . resize ( nulls ) ;
2011-06-04 17:51:48 +00:00
}
2009-04-16 11:33:12 +00:00
void BglArticleRequest : : run ( )
{
2021-11-27 07:17:33 +00:00
if ( Utils : : AtomicInt : : loadAcquire ( isCancelled ) )
2009-04-16 11:33:12 +00:00
{
finish ( ) ;
return ;
}
2009-01-28 20:55:45 +00:00
2018-06-13 16:00:42 +00:00
vector < WordArticleLink > chain = dict . findArticles ( word , ignoreDiacritics ) ;
2009-01-28 20:55:45 +00:00
2011-06-04 17:51:48 +00:00
static Language : : Id hebrew = LangCoder : : code2toInt ( " he " ) ; // Hebrew support
2009-04-16 11:33:12 +00:00
for ( unsigned x = 0 ; x < alts . size ( ) ; + + x )
{
/// Make an additional query for each alt
2018-06-13 16:00:42 +00:00
vector < WordArticleLink > altChain = dict . findArticles ( alts [ x ] , ignoreDiacritics ) ;
2009-04-16 11:33:12 +00:00
chain . insert ( chain . end ( ) , altChain . begin ( ) , altChain . end ( ) ) ;
}
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
multimap < wstring , pair < string , string > > mainArticles , alternateArticles ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
set < uint32_t > articlesIncluded ; // Some synonims make it that the articles
// appear several times. We combat this
// by only allowing them to appear once.
2011-05-22 20:45:06 +00:00
// Sometimes the articles are physically duplicated. We store hashes of
// the bodies to account for this.
set < QByteArray > articleBodiesIncluded ;
2009-04-16 11:33:12 +00:00
wstring wordCaseFolded = Folding : : applySimpleCaseOnly ( word ) ;
2018-06-13 16:00:42 +00:00
if ( ignoreDiacritics )
wordCaseFolded = Folding : : applyDiacriticsOnly ( wordCaseFolded ) ;
2009-04-16 11:33:12 +00:00
for ( unsigned x = 0 ; x < chain . size ( ) ; + + x )
{
2021-11-27 07:17:33 +00:00
if ( Utils : : AtomicInt : : loadAcquire ( isCancelled ) )
2009-01-28 20:55:45 +00:00
{
2009-04-16 11:33:12 +00:00
finish ( ) ;
return ;
}
2009-01-28 20:55:45 +00:00
2012-12-19 17:34:56 +00:00
try
{
2009-04-16 11:33:12 +00:00
if ( articlesIncluded . find ( chain [ x ] . articleOffset ) ! = articlesIncluded . end ( ) )
continue ; // We already have this article in the body.
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
// Now grab that article
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
string headword , displayedHeadword , articleText ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
dict . loadArticle ( chain [ x ] . articleOffset ,
headword , displayedHeadword , articleText ) ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
// Ok. Now, does it go to main articles, or to alternate ones? We list
// main ones first, and alternates after.
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
// We do the case-folded and postfix-less comparison here.
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
wstring headwordStripped =
2023-04-29 02:35:56 +00:00
Folding : : applySimpleCaseOnly ( removePostfix ( headword ) ) ;
2018-06-13 16:00:42 +00:00
if ( ignoreDiacritics )
headwordStripped = Folding : : applyDiacriticsOnly ( headwordStripped ) ;
2009-01-28 20:55:45 +00:00
2018-06-13 16:00:42 +00:00
// Hebrew support - fix Hebrew text
2011-06-04 17:51:48 +00:00
if ( dict . idxHeader . langFrom = = hebrew )
{
displayedHeadword = displayedHeadword . size ( ) ? displayedHeadword : headword ;
fixHebString ( articleText ) ;
fixHebArticle ( articleText ) ;
fixHebString ( displayedHeadword ) ;
}
2011-05-22 20:45:06 +00:00
string const & targetHeadword = displayedHeadword . size ( ) ?
displayedHeadword : headword ;
QCryptographicHash hash ( QCryptographicHash : : Md5 ) ;
hash . addData ( targetHeadword . data ( ) , targetHeadword . size ( ) + 1 ) ; // with 0
hash . addData ( articleText . data ( ) , articleText . size ( ) ) ;
if ( ! articleBodiesIncluded . insert ( hash . result ( ) ) . second )
continue ; // Already had this body
2009-04-21 19:03:16 +00:00
multimap < wstring , pair < string , string > > & mapToUse =
2009-04-16 11:33:12 +00:00
( wordCaseFolded = = headwordStripped ) ?
mainArticles : alternateArticles ;
2009-01-28 20:55:45 +00:00
2023-04-29 02:35:56 +00:00
mapToUse . insert ( pair (
Folding : : applySimpleCaseOnly ( headword ) ,
pair ( targetHeadword , articleText ) ) ) ;
2009-04-16 11:33:12 +00:00
articlesIncluded . insert ( chain [ x ] . articleOffset ) ;
2012-12-19 17:34:56 +00:00
} // try
2013-09-24 13:56:47 +00:00
catch ( std : : exception & ex )
2012-12-19 17:34:56 +00:00
{
2013-11-16 18:34:09 +00:00
gdWarning ( " BGL: Failed loading article from \" %s \" , reason: %s \n " , dict . getName ( ) . c_str ( ) , ex . what ( ) ) ;
2012-12-19 17:34:56 +00:00
}
2009-04-16 11:33:12 +00:00
}
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
if ( mainArticles . empty ( ) & & alternateArticles . empty ( ) )
{
// No such word
finish ( ) ;
return ;
}
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
string result ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
multimap < wstring , pair < string , string > > : : const_iterator i ;
2009-01-28 20:55:45 +00:00
2023-04-15 07:39:49 +00:00
string cleaner = Utils : : Html : : getHtmlCleaner ( ) ;
2009-04-16 11:33:12 +00:00
for ( i = mainArticles . begin ( ) ; i ! = mainArticles . end ( ) ; + + i )
{
2013-07-10 13:48:09 +00:00
if ( dict . isFromLanguageRTL ( ) ) // RTL support
2011-06-04 17:51:48 +00:00
result + = " <h3 style= \" text-align:right;direction:rtl \" > " ;
else
result + = " <h3> " ;
2009-04-16 11:33:12 +00:00
result + = postfixToSuperscript ( i - > second . first ) ;
result + = " </h3> " ;
2013-07-10 13:48:09 +00:00
if ( dict . isToLanguageRTL ( ) )
2009-12-27 12:40:20 +00:00
result + = " <div class= \" bglrtl \" > " + i - > second . second + " </div> " ;
else
2011-07-01 16:25:14 +00:00
result + = " <div> " + i - > second . second + " </div> " ;
2009-04-16 11:33:12 +00:00
result + = cleaner ;
}
2011-06-04 17:51:48 +00:00
2009-12-27 12:40:20 +00:00
2009-04-16 11:33:12 +00:00
for ( i = alternateArticles . begin ( ) ; i ! = alternateArticles . end ( ) ; + + i )
{
2013-07-10 13:48:09 +00:00
if ( dict . isFromLanguageRTL ( ) ) // RTL support
2011-06-04 17:51:48 +00:00
result + = " <h3 style= \" text-align:right;direction:rtl \" > " ;
else
result + = " <h3> " ;
2009-04-16 11:33:12 +00:00
result + = postfixToSuperscript ( i - > second . first ) ;
result + = " </h3> " ;
2013-07-10 13:48:09 +00:00
if ( dict . isToLanguageRTL ( ) )
2009-12-27 12:40:20 +00:00
result + = " <div class= \" bglrtl \" > " + i - > second . second + " </div> " ;
else
2011-07-01 16:25:14 +00:00
result + = " <div> " + i - > second . second + " </div> " ;
2009-04-16 11:33:12 +00:00
result + = cleaner ;
}
// Do some cleanups in the text
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
BglDictionary : : replaceCharsetEntities ( result ) ;
2018-02-21 14:43:35 +00:00
result = QString : : fromUtf8 ( result . c_str ( ) )
// onclick location to link
2022-12-24 22:01:50 +00:00
. replace ( QRegularExpression ( R " (<([a-z0-9]+) \ s+[^>]*onclick= " [ a - z . ] * location ( ? : \ . href ) \ s * = \ s * ' ( [ ^ ' ] + ) [ ^ > ] * > ( [ ^ < ] + ) < / \ 1 > ) " ,
2018-02-28 14:15:27 +00:00
QRegularExpression : : CaseInsensitiveOption ) ,
2022-12-24 22:01:50 +00:00
R " (<a href= " \ 2 " > \3 </a>) " )
. replace ( QRegularExpression ( R " ((< \ s*a \ s+[^>]*href \ s*= \ s*[ " ' ] \ s * ) bword : //)",
2018-02-28 14:15:27 +00:00
QRegularExpression : : CaseInsensitiveOption ) ,
2018-02-21 14:43:35 +00:00
" \\ 1bword: " )
//remove invalid width, height attrs
2022-12-24 22:01:50 +00:00
. replace ( QRegularExpression ( R " ((width|height) \ s*= \ s*[ " ' ] \ d { 7 , } [ " '']) " ) ,
2018-02-21 14:43:35 +00:00
" " )
//remove invalid <br> tag
2022-12-24 22:01:50 +00:00
. replace ( QRegularExpression ( R " (<br>(<div|<table|<tbody|<tr|<td|</div>|</table>|</tbody>|</tr>|</td>|function addScript|var scNode|scNode|var atag|while \ (atag|atag=atag|document \ .getElementsByTagName|addScript|src= " bres | < a onmouseover = " return overlib|onclick= " return overlib ) ) " ,
2018-02-28 14:15:27 +00:00
QRegularExpression : : CaseInsensitiveOption ) ,
2018-02-21 14:43:35 +00:00
" \\ 1 " )
2022-12-24 22:01:50 +00:00
. replace ( QRegularExpression ( R " ((AUTOSTATUS, WRAP \ ); " | < / DIV > | addScript \ ( ' JS_FILE_PHONG_VT_45634 ' \ ) ; | appendChild \ ( scNode \ ) ; | atag \ . firstChild ; ) < br > ) " ,
2018-02-28 14:15:27 +00:00
QRegularExpression : : CaseInsensitiveOption ) ,
2018-02-21 14:43:35 +00:00
" \\ 1 " )
. toUtf8 ( ) . data ( ) ;
2021-11-19 13:47:22 +00:00
2009-01-28 20:55:45 +00:00
2023-06-23 15:09:31 +00:00
appendString ( result ) ;
2009-04-16 11:33:12 +00:00
hasAnyData = true ;
finish ( ) ;
}
/// Builds a BglArticleRequest for @p word and its alternate spellings @p alts.
/// The third (unnamed) parameter is accepted for interface compatibility and
/// is not used by this implementation.
sptr< Dictionary::DataRequest > BglDictionary::getArticle( wstring const & word,
                                                           vector< wstring > const & alts,
                                                           wstring const &,
                                                           bool ignoreDiacritics )
{
  sptr< Dictionary::DataRequest > request =
    std::make_shared< BglArticleRequest >( word, alts, *this, ignoreDiacritics );

  return request;
}
//// BglDictionary::getResource()
class BglResourceRequest : public Dictionary : : DataRequest
{
2023-05-29 13:56:04 +00:00
QMutex & idxMutex ;
2009-04-16 11:33:12 +00:00
File : : Class & idx ;
uint32_t resourceListOffset , resourcesCount ;
string name ;
QAtomicInt isCancelled ;
2023-04-29 04:12:49 +00:00
QFuture < void > f ;
2009-04-16 11:33:12 +00:00
public :
2023-05-29 13:56:04 +00:00
BglResourceRequest ( QMutex & idxMutex_ ,
2009-04-16 11:33:12 +00:00
File : : Class & idx_ ,
uint32_t resourceListOffset_ ,
uint32_t resourcesCount_ ,
string const & name_ ) :
idxMutex ( idxMutex_ ) ,
idx ( idx_ ) ,
resourceListOffset ( resourceListOffset_ ) ,
resourcesCount ( resourcesCount_ ) ,
name ( name_ )
2009-01-28 20:55:45 +00:00
{
2023-04-29 04:12:49 +00:00
f = QtConcurrent : : run ( [ this ] ( ) {
this - > run ( ) ;
} ) ;
2009-04-16 11:33:12 +00:00
}
2009-01-28 20:55:45 +00:00
2023-05-08 08:41:54 +00:00
void run ( ) ;
2009-01-28 20:55:45 +00:00
2022-12-29 07:07:40 +00:00
void cancel ( ) override
2009-04-16 11:33:12 +00:00
{
isCancelled . ref ( ) ;
}
2009-04-21 19:03:16 +00:00
2009-04-16 11:33:12 +00:00
~ BglResourceRequest ( )
{
isCancelled . ref ( ) ;
2023-04-29 04:12:49 +00:00
f . waitForFinished ( ) ;
2009-04-16 11:33:12 +00:00
}
} ;
2009-03-26 19:00:08 +00:00
2009-04-16 11:33:12 +00:00
void BglResourceRequest : : run ( )
{
2021-11-27 07:17:33 +00:00
if ( Utils : : AtomicInt : : loadAcquire ( isCancelled ) )
2009-04-16 11:33:12 +00:00
{
finish ( ) ;
return ;
}
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
string nameLowercased = name ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
for ( string : : iterator i = nameLowercased . begin ( ) ; i ! = nameLowercased . end ( ) ;
+ + i )
* i = tolower ( * i ) ;
2009-01-28 20:55:45 +00:00
2023-05-29 13:56:04 +00:00
QMutexLocker _ ( & idxMutex ) ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
idx . seek ( resourceListOffset ) ;
for ( size_t count = resourcesCount ; count - - ; )
{
2021-11-27 07:17:33 +00:00
if ( Utils : : AtomicInt : : loadAcquire ( isCancelled ) )
2009-04-16 11:33:12 +00:00
break ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
vector < char > nameData ( idx . read < uint32_t > ( ) ) ;
idx . read ( & nameData . front ( ) , nameData . size ( ) ) ;
for ( size_t x = nameData . size ( ) ; x - - ; )
nameData [ x ] = tolower ( nameData [ x ] ) ;
uint32_t offset = idx . read < uint32_t > ( ) ;
if ( string ( & nameData . front ( ) , nameData . size ( ) ) = = nameLowercased )
{
// We have a match.
2009-03-26 19:00:08 +00:00
2009-04-16 11:33:12 +00:00
idx . seek ( offset ) ;
2009-01-28 20:55:45 +00:00
2023-05-29 13:56:04 +00:00
QMutexLocker _ ( & dataMutex ) ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
data . resize ( idx . read < uint32_t > ( ) ) ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
vector < unsigned char > compressedData ( idx . read < uint32_t > ( ) ) ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
idx . read ( & compressedData . front ( ) , compressedData . size ( ) ) ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
unsigned long decompressedLength = data . size ( ) ;
if ( uncompress ( ( unsigned char * ) & data . front ( ) ,
& decompressedLength ,
& compressedData . front ( ) ,
compressedData . size ( ) ) ! = Z_OK | |
decompressedLength ! = data . size ( ) )
{
2013-11-16 18:34:09 +00:00
gdWarning ( " Failed to decompress resource \" %s \" , ignoring it. \n " , name . c_str ( ) ) ;
2009-01-28 20:55:45 +00:00
}
2009-04-16 11:33:12 +00:00
else
hasAnyData = true ;
2009-01-28 20:55:45 +00:00
2009-04-16 11:33:12 +00:00
break ;
}
2009-01-28 20:55:45 +00:00
}
2009-04-16 11:33:12 +00:00
finish ( ) ;
}
/// Creates a BglResourceRequest that looks up the resource called @p name in
/// this dictionary's index file.
sptr< Dictionary::DataRequest > BglDictionary::getResource( string const & name )
{
  // std::make_shared takes its arguments by forwarding reference. Copy the
  // header fields into plain locals so no reference is ever bound directly to
  // a struct member.
  // NOTE(review): IdxHeader may be a packed struct, in which case binding a
  // reference to its fields would not compile -- confirm.
  uint32_t const listOffset = idxHeader.resourceListOffset;
  uint32_t const listCount  = idxHeader.resourcesCount;

  return std::make_shared< BglResourceRequest >( idxMutex, idx, listOffset, listCount, name );
}
2018-07-07 09:33:15 +00:00
/// Replaces <CHARSET c="t">1234;</CHARSET> occurrences with ሴ
2009-01-28 20:55:45 +00:00
void BglDictionary : : replaceCharsetEntities ( string & text )
{
2018-02-21 14:43:35 +00:00
QString str = QString : : fromUtf8 ( text . c_str ( ) ) ;
2022-12-24 22:01:50 +00:00
QRegularExpression charsetExp ( R " (< \ s*charset \ s+c \ s*= \ s*[ " ' ] ? t [ " ']? \ s*>((?: \ s*[0-9a-fA-F]+ \ s*; \ s*)*)< \ s*/ \ s*charset \ s*>) " ,
2018-02-21 14:43:35 +00:00
QRegularExpression : : CaseInsensitiveOption
| QRegularExpression : : InvertedGreedinessOption ) ;
2018-02-28 14:15:27 +00:00
QRegularExpression oneValueExp ( " \\ s*([0-9a-fA-F]+) \ \ s * ; " );
2018-02-21 14:43:35 +00:00
QString result ;
int pos = 0 ;
QRegularExpressionMatchIterator it = charsetExp . globalMatch ( str ) ;
while ( it . hasNext ( ) )
{
QRegularExpressionMatch match = it . next ( ) ;
2022-02-27 05:17:37 +00:00
result + = str . mid ( pos , match . capturedStart ( ) - pos ) ;
2018-02-21 14:43:35 +00:00
pos = match . capturedEnd ( ) ;
QRegularExpressionMatchIterator itValue = oneValueExp . globalMatch ( match . captured ( 1 ) ) ;
while ( itValue . hasNext ( ) )
{
QRegularExpressionMatch matchValue = itValue . next ( ) ;
result + = " &#x " + matchValue . captured ( 1 ) + " ; " ;
}
}
if ( pos )
{
2022-02-27 05:17:37 +00:00
result + = str . mid ( pos ) ;
2018-02-21 14:43:35 +00:00
str = result ;
}
2009-01-28 20:55:45 +00:00
2009-05-07 16:14:56 +00:00
text = str . toUtf8 ( ) . data ( ) ;
2009-01-28 20:55:45 +00:00
}
class ResourceHandler : public Babylon : : ResourceHandler
{
File : : Class & idxFile ;
list < pair < string , uint32_t > > resources ;
public :
ResourceHandler ( File : : Class & idxFile_ ) : idxFile ( idxFile_ )
{ }
list < pair < string , uint32_t > > const & getResources ( ) const
{ return resources ; }
protected :
2022-12-29 07:07:40 +00:00
void handleBabylonResource ( string const & filename ,
char const * data , size_t size ) override ;
2009-01-28 20:55:45 +00:00
} ;
void ResourceHandler : : handleBabylonResource ( string const & filename ,
char const * data , size_t size )
{
2022-01-15 07:29:20 +00:00
//GD_DPRINTF( "Handling resource file %s (%u bytes)\n", filename.c_str(), size );
2009-01-28 20:55:45 +00:00
vector < unsigned char > compressedData ( compressBound ( size ) ) ;
unsigned long compressedSize = compressedData . size ( ) ;
if ( compress ( & compressedData . front ( ) , & compressedSize ,
( unsigned char const * ) data , size ) ! = Z_OK )
{
2013-11-16 18:34:09 +00:00
gdWarning ( " Failed to compress the body of resource \" %s \" , dropping it. \n " , filename . c_str ( ) ) ;
2009-01-28 20:55:45 +00:00
return ;
}
resources . push_back ( pair < string , uint32_t > ( filename , idxFile . tell ( ) ) ) ;
idxFile . write < uint32_t > ( size ) ;
idxFile . write < uint32_t > ( compressedSize ) ;
idxFile . write ( & compressedData . front ( ) , compressedSize ) ;
}
}
2014-04-16 16:18:28 +00:00
/// Creates an FtsHelpers::FTSResultsRequest for this dictionary, passing the
/// search parameters through unchanged.
sptr< Dictionary::DataRequest > BglDictionary::getSearchResults( QString const & searchString,
                                                                 int searchMode,
                                                                 bool matchCase,
                                                                 bool ignoreDiacritics )
{
  sptr< Dictionary::DataRequest > request =
    std::make_shared< FtsHelpers::FTSResultsRequest >( *this, searchString, searchMode, matchCase, ignoreDiacritics );

  return request;
}
2009-01-28 20:55:45 +00:00
2009-03-26 19:00:08 +00:00
/// Creates a BglDictionary for every file in @p fileNames whose name ends in
/// ".bgl" (case-insensitive). The index file for a dictionary lives at
/// @p indicesDir + dictionary id and is (re)built first when
/// Dictionary::needToRebuildIndex() or indexIsOldOrBad() says so.
/// Failures while indexing or while constructing a dictionary are logged and
/// do not stop the processing of the remaining files.
vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames,
                                                      string const & indicesDir,
                                                      Dictionary::Initializing & initializing )
{
  vector< sptr< Dictionary::Class > > dictionaries;

  for ( string const & fileName : fileNames )
  {
    // Skip anything that does not carry the .bgl extension to speed up the
    // scanning.
    bool const hasBglExtension =
      fileName.size() >= 4 && strcasecmp( fileName.c_str() + ( fileName.size() - 4 ), ".bgl" ) == 0;

    if ( !hasBglExtension )
      continue;

    // Got the file -- check if we need to rebuild the index

    vector< string > dictFiles( 1, fileName );

    string dictId    = Dictionary::makeDictionaryId( dictFiles );
    string indexFile = indicesDir + dictId;

    if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) || indexIsOldOrBad( indexFile ) )
    {
      // Building the index

      gdDebug( "Bgl: Building the index for dictionary: %s\n", fileName.c_str() );

      try
      {
        Babylon b( fileName );

        if ( !b.open() )
          continue;

        std::string sourceCharset, targetCharset;

        if ( !b.read( sourceCharset, targetCharset ) )
        {
          gdWarning( "Failed to start reading from %s, skipping it\n", fileName.c_str() );
          continue;
        }

        initializing.indexingDictionary( b.title() );

        File::Class idx( indexFile, "wb" );

        IdxHeader idxHeader;

        memset( &idxHeader, 0, sizeof( idxHeader ) );

        // A placeholder header goes in first; once everything else is in
        // place it gets overwritten with the real values.

        idx.write( idxHeader );

        // Dictionary title: length followed by the bytes.
        idx.write< uint32_t >( b.title().size() );
        idx.write( b.title().data(), b.title().size() );

        // Index data accumulated while loading. Each article body is emitted
        // to the file as soon as it is read, and the word plus the body's
        // offset are recorded here. The map goes from folded words to the
        // original words and the offsets of the corresponding articles.
        IndexedWords indexedWords;

        // Scratch buffer that utf8 gets decoded into.
        vector< wchar > wcharBuffer;

        ChunkedStorage::Writer chunks( idx );

        uint32_t articleCount = 0, wordCount = 0;

        ResourceHandler resourceHandler( idx );

        b.setResourcePrefix( string( "bres://" ) + dictId + "/" );

        // Save icon if there's one
        size_t const iconBytes = b.getIcon().size();

        if ( iconBytes != 0 )
        {
          idxHeader.iconAddress = chunks.startNewBlock();
          chunks.addToBlock( &b.getIcon().front(), iconBytes );
          idxHeader.iconSize = iconBytes;
        }

        // Save the dictionary description: copyright, author, e-mail and
        // description text, each stored with its terminating NUL, all inside
        // one block.
        idxHeader.descriptionSize    = 0;
        idxHeader.descriptionAddress = chunks.startNewBlock();

        auto appendDescriptionPart = [ &chunks, &idxHeader ]( std::string const & part ) {
          chunks.addToBlock( part.c_str(), part.size() + 1 );
          idxHeader.descriptionSize += part.size() + 1;
        };

        appendDescriptionPart( b.copyright() );
        appendDescriptionPart( b.author() );
        appendDescriptionPart( b.email() );
        appendDescriptionPart( b.description() );

        // Read entries until one comes back with an empty headword.
        while ( true )
        {
          bgl_entry e = b.readEntry( &resourceHandler );

          if ( e.headword.empty() )
            break;

          // Save the article's body itself first

          uint32_t articleAddress = chunks.startNewBlock();

          chunks.addToBlock( e.headword.c_str(), e.headword.size() + 1 );
          chunks.addToBlock( e.displayedHeadword.c_str(), e.displayedHeadword.size() + 1 );
          chunks.addToBlock( e.definition.c_str(), e.definition.size() + 1 );

          // Add entries to the index: the headword and all its alternates
          // point at the same article.

          addEntryToIndex( e.headword, articleAddress, indexedWords, wcharBuffer );

          for ( unsigned alt = 0; alt < e.alternates.size(); ++alt )
            addEntryToIndex( e.alternates[ alt ], articleAddress, indexedWords, wcharBuffer );

          wordCount += 1 + e.alternates.size();
          ++articleCount;
        }

        // Finish with the chunks

        idxHeader.chunksOffset = chunks.finish();

        GD_DPRINTF( "Writing index...\n" );

        // Good. Now build the index

        IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );

        idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
        idxHeader.indexRootOffset       = idxInfo.rootOffset;

        // Save the resource's list: for each resource its name (length +
        // bytes) followed by the offset of its stored body.

        idxHeader.resourceListOffset = idx.tell();
        idxHeader.resourcesCount     = resourceHandler.getResources().size();

        for ( auto const & resource : resourceHandler.getResources() )
        {
          idx.write< uint32_t >( resource.first.size() );
          idx.write( resource.first.data(), resource.first.size() );
          idx.write< uint32_t >( resource.second );
        }

        // That concludes it. Update the header.

        idxHeader.signature      = Signature;
        idxHeader.formatVersion  = CurrentFormatVersion;
        idxHeader.parserVersion  = Babylon::ParserVersion;
        idxHeader.foldingVersion = Folding::Version;
        idxHeader.articleCount   = articleCount;
        idxHeader.wordCount      = wordCount;
        idxHeader.langFrom       = b.sourceLang(); //LangCoder::findIdForLanguage( Utf8::decode( b.sourceLang() ) );
        idxHeader.langTo         = b.targetLang(); //LangCoder::findIdForLanguage( Utf8::decode( b.targetLang() ) );

        idx.rewind();

        idx.write( &idxHeader, sizeof( idxHeader ) );
      }
      catch ( std::exception & e )
      {
        gdWarning( "BGL dictionary indexing failed: %s, error: %s\n", fileName.c_str(), e.what() );
      }
    }

    // Whether or not the index was just rebuilt, try to open the dictionary.
    try
    {
      dictionaries.push_back( std::make_shared< BglDictionary >( dictId, indexFile, fileName ) );
    }
    catch ( std::exception & e )
    {
      gdWarning( "BGL dictionary initializing failed: %s, error: %s\n", fileName.c_str(), e.what() );
    }
  }

  return dictionaries;
}
}