2017-03-06 15:07:39 +00:00
|
|
|
/* This file is (c) 2008-2017 Abs62
|
|
|
|
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
|
|
|
|
|
|
|
#include <zlib.h>

#include "gls.hh"
#include "iconv.hh"
#include "dictionary.hh"
#include "ufile.hh"
#include "btreeidx.hh"
#include "folding.hh"
#include "gddebug.hh"
#include "utf8.hh"
#include "wstring_qt.hh"
#include "chunkedstorage.hh"
#include "langcoder.hh"
#include "dictzip.hh"
#include "indexedzip.hh"
#include "ftshelpers.hh"
#include "fsencoding.hh"
#include "htmlescape.hh"
#include "filetype.hh"
#include "tiff.hh"
#include "audiolink.hh"

#include <QString>
#include <QSemaphore>
#include <QThreadPool>
#include <QAtomicInt>
// For TIFF conversion
#include <QImage>
#include <QByteArray>
#include <QBuffer>

#include <QRegularExpression>
#if (QT_VERSION >= QT_VERSION_CHECK(6,0,0))
#include <QtCore5Compat/QTextCodec>
#else
#include <QTextCodec>
#endif

#include <cstdlib>
#include <string>
#include <list>
#include <map>
#include <set>

#ifdef _MSC_VER
#include <stub_msvc.h>
#endif
|
|
|
|
|
|
|
|
namespace Gls {
|
|
|
|
|
|
|
|
using std::list;
|
|
|
|
using std::map;
|
|
|
|
using std::set;
|
2017-03-07 17:47:47 +00:00
|
|
|
using std::multimap;
|
|
|
|
using std::pair;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
|
|
|
using gd::wstring;
|
|
|
|
using gd::wchar;
|
|
|
|
|
|
|
|
using BtreeIndexing::WordArticleLink;
|
|
|
|
using BtreeIndexing::IndexedWords;
|
|
|
|
using BtreeIndexing::IndexInfo;
|
2021-11-06 08:26:30 +00:00
|
|
|
using Utf8::Encoding;
|
2021-11-06 08:55:51 +00:00
|
|
|
using Utf8::LineFeed;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
|
|
|
/////////////// GlsScanner
|
|
|
|
|
|
|
|
class GlsScanner
|
|
|
|
{
|
|
|
|
gzFile f;
|
|
|
|
Encoding encoding;
|
2021-10-23 09:37:29 +00:00
|
|
|
QTextCodec* codec;
|
2017-03-06 15:07:39 +00:00
|
|
|
wstring dictionaryName;
|
|
|
|
wstring dictionaryDecription, dictionaryAuthor;
|
|
|
|
wstring langFrom, langTo;
|
2021-10-23 09:37:29 +00:00
|
|
|
char readBuffer[ 10000 ];
|
2017-03-06 15:07:39 +00:00
|
|
|
char * readBufferPtr;
|
|
|
|
size_t readBufferLeft;
|
2021-11-06 08:55:51 +00:00
|
|
|
LineFeed lineFeed;
|
2017-03-06 15:07:39 +00:00
|
|
|
unsigned linesRead;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
DEF_EX( Ex, "Gls scanner exception", Dictionary::Ex )
|
|
|
|
DEF_EX_STR( exCantOpen, "Can't open .gls file", Ex )
|
|
|
|
DEF_EX( exCantReadGlsFile, "Can't read .gls file", Ex )
|
|
|
|
DEF_EX_STR( exMalformedGlsFile, "The .gls file is malformed:", Ex )
|
|
|
|
DEF_EX( exEncodingError, "Encoding error", Ex ) // Should never happen really
|
|
|
|
|
2022-01-09 08:35:07 +00:00
|
|
|
GlsScanner( string const & fileName ) ;
|
2022-06-03 13:28:41 +00:00
|
|
|
~GlsScanner() noexcept;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
|
|
|
/// Returns the detected encoding of this file.
|
|
|
|
Encoding getEncoding() const
|
|
|
|
{ return encoding; }
|
|
|
|
|
|
|
|
/// Returns the dictionary's name, as was read from file's headers.
|
|
|
|
wstring const & getDictionaryName() const
|
|
|
|
{ return dictionaryName; }
|
|
|
|
|
|
|
|
/// Returns the dictionary's author, as was read from file's headers.
|
|
|
|
wstring const & getDictionaryAuthor() const
|
|
|
|
{ return dictionaryAuthor; }
|
|
|
|
|
|
|
|
/// Returns the dictionary's description, as was read from file's headers.
|
|
|
|
wstring const & getDictionaryDescription() const
|
|
|
|
{ return dictionaryDecription; }
|
|
|
|
|
|
|
|
/// Returns the dictionary's source language, as was read from file's headers.
|
|
|
|
wstring const & getLangFrom() const
|
|
|
|
{ return langFrom; }
|
|
|
|
|
|
|
|
/// Returns the dictionary's target language, as was read from file's headers.
|
|
|
|
wstring const & getLangTo() const
|
|
|
|
{ return langTo; }
|
|
|
|
|
|
|
|
/// Reads next line from the file. Returns true if reading succeeded --
|
|
|
|
/// the string gets stored in the one passed, along with its physical
|
|
|
|
/// file offset in the file (the uncompressed one if the file is compressed).
|
|
|
|
/// If end of file is reached, false is returned.
|
|
|
|
/// Reading begins from the first line after the headers (ones which end
|
|
|
|
/// by the "### Glossary section:" line).
|
2022-01-09 08:35:07 +00:00
|
|
|
bool readNextLine( wstring &, size_t & offset ) ;
|
2017-03-06 15:07:39 +00:00
|
|
|
/// Returns the number of lines read so far from the file.
|
|
|
|
unsigned getLinesRead() const
|
|
|
|
{ return linesRead; }
|
|
|
|
};
|
|
|
|
|
2022-01-09 08:35:07 +00:00
|
|
|
/// Opens a .gls or .gls.dz file, detects its encoding and parses the headers.
///
/// Throws exCantOpen when the file cannot be opened or rewound,
/// exMalformedGlsFile when it is too short, carries a broken UTF-8 BOM or ends
/// before the "### Glossary section:" line, and exEncodingError when no text
/// codec is available for the detected encoding. Read errors raised by
/// readNextLine() propagate unchanged. The file handle is always closed before
/// an exception leaves the constructor.
GlsScanner::GlsScanner( string const & fileName ):
  encoding( Utf8::Utf8 ), readBufferPtr( readBuffer ),
  readBufferLeft( 0 ), linesRead( 0 )
{
  // Since .dz is backwards-compatible with .gz, we use gz- functions to
  // read it -- they are much nicer than the dict_data- ones.
  f = gd_gzopen( fileName.c_str() );
  if ( !f )
    throw exCantOpen( fileName );

  // The destructor is not run when a constructor throws, so every failure
  // below (including exceptions coming out of readNextLine()) is routed
  // through a single handler that closes the handle.
  try
  {
    // Guess the encoding by reading the first two bytes
    unsigned char firstBytes[ 2 ];

    if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) )
    {
      // Apparently the file's too short
      throw exMalformedGlsFile( fileName );
    }

    // If the file begins with the dedicated Unicode marker, we just consume
    // it. If, on the other hand, it's not, we return the bytes back
    if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
      encoding = Utf8::Utf16LE;
    else if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
      encoding = Utf8::Utf16BE;
    else if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
    {
      // Looks like Utf8, read one more byte
      if ( gzread( f, firstBytes, 1 ) != 1 || firstBytes[ 0 ] != 0xBF )
      {
        // Either the file's too short, or the BOM is weird
        throw exMalformedGlsFile( fileName );
      }
      encoding = Utf8::Utf8;
    }
    else
    {
      // No BOM: put the two probed bytes back and assume UTF-8
      if ( gzrewind( f ) )
        throw exCantOpen( fileName );
      encoding = Utf8::Utf8;
    }

    codec = QTextCodec::codecForName( Utf8::getEncodingNameFor( encoding ) );
    if ( !codec )
    {
      // readNextLine() dereferences the codec unconditionally
      throw exEncodingError();
    }

    // We now can use our own readNextLine() function
    lineFeed = Utf8::initLineFeed( encoding );

    wstring str;
    wstring * currentField = nullptr; // header field that continuation lines extend
    wstring mark = U"###" ;
    wstring titleMark = U"### Glossary title:" ;
    wstring authorMark = U"### Author:" ;
    wstring descriptionMark = U"### Description:" ;
    wstring langFromMark = U"### Source language:" ;
    wstring langToMark = U"### Target language:" ;
    wstring endOfHeaderMark = U"### Glossary section:" ;
    size_t offset;

    for( ; ; )
    {
      if ( !readNextLine( str, offset ) )
      {
        // The file ended before the glossary section started
        throw exMalformedGlsFile( fileName );
      }

      if ( str.compare( 0, 3, mark.c_str(), 3 ) != 0 )
      {
        /// Handle multiline headers
        if ( currentField )
          *currentField += str;
        continue;
      }

      // Any "###" line ends the previous multiline field
      currentField = nullptr;

      if ( str.compare( 0, titleMark.size(), titleMark ) == 0 )
      {
        dictionaryName = str.substr( titleMark.size() );
        currentField = &dictionaryName;
      }
      else if ( str.compare( 0, authorMark.size(), authorMark ) == 0 )
      {
        dictionaryAuthor = str.substr( authorMark.size() );
        currentField = &dictionaryAuthor;
      }
      else if ( str.compare( 0, descriptionMark.size(), descriptionMark ) == 0 )
      {
        dictionaryDecription = str.substr( descriptionMark.size() );
        currentField = &dictionaryDecription;
      }
      else if ( str.compare( 0, langFromMark.size(), langFromMark ) == 0 )
        langFrom = str.substr( langFromMark.size() );
      else if ( str.compare( 0, langToMark.size(), langToMark ) == 0 )
        langTo = str.substr( langToMark.size() );
      else if ( str.compare( 0, endOfHeaderMark.size(), endOfHeaderMark ) == 0 )
        break;
    }
  }
  catch( ... )
  {
    gzclose( f );
    throw;
  }
}
|
2021-11-06 08:26:30 +00:00
|
|
|
|
2022-01-09 08:35:07 +00:00
|
|
|
/// Reads the next line of the file into 'out' and stores in 'offset' the
/// position (in the uncompressed stream) at which that line starts.
///
/// Returns false when no buffered data is left or when the line search reports
/// -1; returns true otherwise. Throws exCantReadGlsFile when gzread() fails.
bool GlsScanner::readNextLine( wstring & out, size_t & offset )
{
  // gztell() is the position just past everything already pulled into the
  // buffer; subtracting the unconsumed bytes gives the position of the line.
  offset = (size_t)( gztell( f ) - readBufferLeft );

  // Check that we have bytes to read
  if ( readBufferLeft < 5000 && !gzeof( f ) )
  {
    // To avoid having to deal with ring logic, we move the remaining bytes
    // to the beginning
    memmove( readBuffer, readBufferPtr, readBufferLeft );
    // Reset the pointer right away so the state stays consistent even if the
    // read below throws.
    readBufferPtr = readBuffer;

    // Read some more bytes to readBuffer
    int result = gzread( f, readBuffer + readBufferLeft,
                         sizeof( readBuffer ) - readBufferLeft );

    if ( result == -1 )
      throw exCantReadGlsFile();

    readBufferLeft += (size_t) result;
  }

  if ( readBufferLeft == 0 )
    return false;

  int pos = Utf8::findFirstLinePosition( readBufferPtr, readBufferLeft,
                                         lineFeed.lineFeed, lineFeed.length );
  if ( pos == -1 )
    return false;

  // Never consume (or decode) more than what is actually buffered. This has to
  // happen before the decode below, otherwise the codec could be handed a
  // length that runs past the valid data.
  if ( (size_t) pos > readBufferLeft )
    pos = (int) readBufferLeft;

  QString line = codec->toUnicode( readBufferPtr, pos );

  line = Utils::rstrip( line );

  readBufferLeft -= pos;
  readBufferPtr += pos;
  linesRead++;

  out = line.toStdU32String();
  return true;
}
|
|
|
|
|
2022-06-03 13:28:41 +00:00
|
|
|
/// Closes the file handle held in 'f'.
GlsScanner::~GlsScanner() noexcept
{
  // The return value of gzclose() is not inspected.
  gzclose( f );
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
////////////////// GLS Dictionary
|
|
|
|
|
|
|
|
DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex )
|
|
|
|
DEF_EX( exUserAbort, "User abort", Dictionary::Ex )
|
|
|
|
DEF_EX_STR( exDictzipError, "DICTZIP error", Dictionary::Ex )
|
|
|
|
|
|
|
|
enum
|
|
|
|
{
|
|
|
|
Signature = 0x58534c47, // GLSX on little-endian, XSLG on big-endian
|
|
|
|
CurrentFormatVersion = 1 + BtreeIndexing::FormatVersion + Folding::Version,
|
|
|
|
CurrentZipSupportVersion = 2,
|
|
|
|
CurrentFtsIndexVersion = 1
|
|
|
|
};
|
|
|
|
|
|
|
|
/// Fixed-size header stored at the very beginning of the index file.
/// The field order and sizes define the file layout and must not change.
struct IdxHeader
{
  // First comes the signature, GLSX
  uint32_t signature;
  // File format version (CurrentFormatVersion)
  uint32_t formatVersion;
  // Zip support version -- narrows down reindexing when it changes only
  // for dictionaries with the zip files
  uint32_t zipSupportVersion;
  // Which encoding is used for the file indexed
  int glsEncoding;
  // The offset to chunks' storage
  uint32_t chunksOffset;
  // Two fields from IndexInfo
  uint32_t indexBtreeMaxElements;
  uint32_t indexRootOffset;
  // Number of articles this dictionary has
  uint32_t articleCount;
  // Number of headwords this dictionary has
  uint32_t wordCount;
  // Source language
  uint32_t langFrom;
  // Target language
  uint32_t langTo;
  // Non-zero means there's a zip file with resources present
  uint32_t hasZipFile;
  // Two fields from IndexInfo of the zip resource index.
  uint32_t zipIndexBtreeMaxElements;
  uint32_t zipIndexRootOffset;
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
|
|
|
|
|
|
|
|
/// Tells whether the index file has to be rebuilt.
///
/// Returns true when the header cannot be read, when its signature or format
/// version differ from the current ones, when the stored "has zip file" flag
/// disagrees with 'hasZipFile', or -- for dictionaries with a zip file -- when
/// the stored zip support version is not the current one.
bool indexIsOldOrBad( string const & indexFile, bool hasZipFile )
{
  File::Class idx( indexFile, "rb" );

  IdxHeader header;

  if ( idx.readRecords( &header, sizeof( header ), 1 ) != 1 )
    return true; // Header is missing or truncated

  if ( header.signature != Signature )
    return true;

  if ( header.formatVersion != CurrentFormatVersion )
    return true;

  const bool indexedWithZip = (bool) header.hasZipFile;
  if ( indexedWithZip != hasZipFile )
    return true;

  // The zip support version matters only when a zip file is involved
  return hasZipFile && header.zipSupportVersion != CurrentZipSupportVersion;
}
|
|
|
|
|
|
|
|
class GlsDictionary: public BtreeIndexing::BtreeDictionary
|
|
|
|
{
|
|
|
|
Mutex idxMutex;
|
|
|
|
File::Class idx;
|
|
|
|
IdxHeader idxHeader;
|
|
|
|
dictData * dz;
|
|
|
|
ChunkedStorage::Reader chunks;
|
|
|
|
Mutex dzMutex;
|
|
|
|
Mutex resourceZipMutex;
|
|
|
|
IndexedZip resourceZip;
|
|
|
|
string dictionaryName;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
GlsDictionary( string const & id, string const & indexFile,
|
|
|
|
vector< string > const & dictionaryFiles );
|
|
|
|
|
|
|
|
~GlsDictionary();
|
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
string getName() noexcept override
|
2017-03-06 15:07:39 +00:00
|
|
|
{ return dictionaryName; }
|
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
map< Dictionary::Property, string > getProperties() noexcept override
|
2017-03-06 15:07:39 +00:00
|
|
|
{ return map< Dictionary::Property, string >(); }
|
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
unsigned long getArticleCount() noexcept override
|
2017-03-15 14:44:21 +00:00
|
|
|
{ return idxHeader.articleCount; }
|
2017-03-06 15:07:39 +00:00
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
unsigned long getWordCount() noexcept override
|
2017-03-06 15:07:39 +00:00
|
|
|
{ return idxHeader.wordCount; }
|
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
inline quint32 getLangFrom() const override
|
2017-03-06 15:07:39 +00:00
|
|
|
{ return idxHeader.langFrom; }
|
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
inline quint32 getLangTo() const override
|
2017-03-06 15:07:39 +00:00
|
|
|
{ return idxHeader.langTo; }
|
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & ) override
|
2022-01-09 08:35:07 +00:00
|
|
|
;
|
2017-03-07 17:47:47 +00:00
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
sptr< Dictionary::DataRequest > getArticle( wstring const &,
|
2017-03-06 15:07:39 +00:00
|
|
|
vector< wstring > const & alts,
|
2018-06-13 16:00:42 +00:00
|
|
|
wstring const &,
|
2022-12-29 07:07:40 +00:00
|
|
|
bool ignoreDiacritics ) override
|
2022-01-09 08:35:07 +00:00
|
|
|
;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
sptr< Dictionary::DataRequest > getResource( string const & name ) override
|
2022-01-09 08:35:07 +00:00
|
|
|
;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
QString const& getDescription() override;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
QString getMainFilename() override;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString,
|
2017-03-06 15:07:39 +00:00
|
|
|
int searchMode, bool matchCase,
|
|
|
|
int distanceBetweenWords,
|
2017-07-25 15:28:29 +00:00
|
|
|
int maxResults,
|
2018-04-10 14:49:52 +00:00
|
|
|
bool ignoreWordsOrder,
|
2022-12-29 07:07:40 +00:00
|
|
|
bool ignoreDiacritics ) override;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
void getArticleText( uint32_t articleAddress, QString & headword, QString & text ) override;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration ) override;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
void setFTSParameters( Config::FullTextSearch const & fts ) override
|
2017-03-06 15:07:39 +00:00
|
|
|
{
|
|
|
|
can_FTS = fts.enabled
|
|
|
|
&& !fts.disabledTypes.contains( "GLS", Qt::CaseInsensitive )
|
|
|
|
&& ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
|
|
|
|
}
|
|
|
|
protected:
|
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
void loadIcon() noexcept override;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
|
|
|
|
/// Loads the article, storing its headword and formatting the data it has
|
|
|
|
/// into an html.
|
|
|
|
void loadArticle( uint32_t address,
|
2017-03-07 17:47:47 +00:00
|
|
|
string & headword,
|
2017-03-06 15:07:39 +00:00
|
|
|
string & articleText );
|
|
|
|
|
|
|
|
/// Loads the article
|
|
|
|
void loadArticleText( uint32_t address,
|
2017-03-20 14:34:42 +00:00
|
|
|
vector< string > & headwords,
|
2017-03-06 15:07:39 +00:00
|
|
|
string & articleText );
|
|
|
|
|
|
|
|
/// Process resource links (images, audios, etc)
|
|
|
|
QString & filterResource( QString & article );
|
|
|
|
|
|
|
|
friend class GlsResourceRequest;
|
|
|
|
friend class GlsArticleRequest;
|
2017-03-07 17:47:47 +00:00
|
|
|
friend class GlsHeadwordsRequest;
|
2017-03-06 15:07:39 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/// Opens the index file and the dictionary file, reads the dictionary name
/// from the index, initializes the word index and, when the header says so,
/// the resource zip index.
///
/// Throws exDictzipError when the dictionary file cannot be opened. Exceptions
/// raised by the later steps propagate unchanged; the dictzip handle is closed
/// before they leave the constructor.
GlsDictionary::GlsDictionary( string const & id,
                              string const & indexFile,
                              vector< string > const & dictionaryFiles ):
  BtreeDictionary( id, dictionaryFiles ),
  idx( indexFile, "rb" ),
  idxHeader( idx.read< IdxHeader >() ),
  dz( nullptr ),
  chunks( idx, idxHeader.chunksOffset )
{
  // Open the .gls file

  DZ_ERRORS error;
  dz = dict_data_open( getDictionaryFilenames()[ 0 ].c_str(), &error, 0 );

  if ( !dz )
    throw exDictzipError( string( dz_error_str( error ) )
                          + "(" + getDictionaryFilenames()[ 0 ] + ")" );

  // The destructor does not run if the constructor throws, so the dictzip
  // handle has to be released here when any of the steps below fails.
  try
  {
    // Read the dictionary name

    idx.seek( sizeof( idxHeader ) );

    vector< char > dName( idx.read< uint32_t >() );
    if( dName.size() > 0 )
    {
      idx.read( &dName.front(), dName.size() );
      dictionaryName = string( &dName.front(), dName.size() );
    }

    // Initialize the index

    openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
                          idxHeader.indexRootOffset ),
               idx, idxMutex );

    // Open a resource zip file, if there's one

    if ( idxHeader.hasZipFile &&
         ( idxHeader.zipIndexBtreeMaxElements ||
           idxHeader.zipIndexRootOffset ) )
    {
      resourceZip.openIndex( IndexInfo( idxHeader.zipIndexBtreeMaxElements,
                                        idxHeader.zipIndexRootOffset ),
                             idx, idxMutex );

      QString zipName = QDir::fromNativeSeparators( getDictionaryFilenames().back().c_str() );

      if ( zipName.endsWith( ".zip", Qt::CaseInsensitive ) ) // Sanity check
        resourceZip.openZipFile( zipName );
    }

    // Full-text search parameters

    can_FTS = true;

    ftsIdxName = indexFile + Dictionary::getFtsSuffix();

    // Mark the full-text index as complete when it is already present and
    // up to date
    if( !Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName )
        && !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) )
      FTS_index_completed.ref();
  }
  catch( ... )
  {
    dict_data_close( dz );
    dz = nullptr;
    throw;
  }
}
|
|
|
|
|
|
|
|
/// Closes the dictzip handle if one is open.
GlsDictionary::~GlsDictionary()
{
  if ( dz != nullptr )
  {
    dict_data_close( dz );
  }
}
|
|
|
|
|
2022-06-03 13:28:41 +00:00
|
|
|
void GlsDictionary::loadIcon() noexcept
|
2017-03-06 15:07:39 +00:00
|
|
|
{
|
|
|
|
if ( dictionaryIconLoaded )
|
|
|
|
return;
|
|
|
|
|
2023-04-13 10:08:32 +00:00
|
|
|
QString fileName = QDir::fromNativeSeparators( getDictionaryFilenames()[ 0 ].c_str() );
|
2017-03-06 15:07:39 +00:00
|
|
|
|
|
|
|
// Remove the extension
|
|
|
|
if ( fileName.endsWith( ".gls.dz", Qt::CaseInsensitive ) )
|
|
|
|
fileName.chop( 6 );
|
|
|
|
else
|
|
|
|
fileName.chop( 3 );
|
|
|
|
|
|
|
|
if ( !loadIconFromFile( fileName ) )
|
|
|
|
{
|
|
|
|
// Load failed -- use default icon
|
|
|
|
dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/icon32_gls.png");
|
|
|
|
}
|
|
|
|
|
|
|
|
dictionaryIconLoaded = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the dictionary description, building it on first use from the
/// author and description headers of the dictionary file.
///
/// Failures while reading the file are logged and do not propagate. When
/// nothing could be gathered the description becomes "NONE".
QString const& GlsDictionary::getDescription()
{
  // Already computed by an earlier call
  if( !dictionaryDescription.isEmpty() )
    return dictionaryDescription;

  try
  {
    GlsScanner scanner( getDictionaryFilenames()[ 0 ] );

    string const author = Utf8::encode( scanner.getDictionaryAuthor() );
    if( !author.empty() )
    {
      dictionaryDescription = QString( QObject::tr( "Author: %1%2" ) )
                                .arg( QString::fromUtf8( author.c_str() ) )
                                .arg( "\n\n" );
    }

    string const body = Utf8::encode( scanner.getDictionaryDescription() );
    if( !body.empty() )
    {
      QString desc = QString::fromUtf8( body.c_str() );

      // Tabs, literal "\n" sequences and <br> tags all become <br/>
      desc.replace( "\t", "<br/>" );
      desc.replace( "\\n", "<br/>" );
      desc.replace( "<br>", "<br/>", Qt::CaseInsensitive );

      dictionaryDescription += Html::unescape( desc, true );
    }
  }
  catch( std::exception & e )
  {
    gdWarning( "GLS dictionary description reading failed: %s, error: %s\n",
               getName().c_str(), e.what() );
  }

  if( dictionaryDescription.isEmpty() )
    dictionaryDescription = "NONE";

  return dictionaryDescription;
}
|
|
|
|
|
2023-04-13 10:08:32 +00:00
|
|
|
/// Returns the first of the dictionary's file names.
QString GlsDictionary::getMainFilename()
{
  return getDictionaryFilenames()[ 0 ].c_str();
}
|
2017-03-06 15:07:39 +00:00
|
|
|
|
|
|
|
/// Builds the full-text search index for this dictionary unless one is
/// already available.
///
/// Nothing is built when the index already exists, when ensureInitDone()
/// returns a non-empty string, or when this is the first iteration and the
/// dictionary has more than FTS::MaxDictionarySizeForFastSearch articles.
/// A failed build is logged and the index file is removed.
void GlsDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
{
  const bool indexIsCurrent =
    !Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
    && !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this );

  if( indexIsCurrent )
    FTS_index_completed.ref();

  if( haveFTSIndex()
      || ensureInitDone().size()
      || ( firstIteration && getArticleCount() > FTS::MaxDictionarySizeForFastSearch ) )
    return;

  gdDebug( "Gls: Building the full-text index for dictionary: %s\n",
           getName().c_str() );

  try
  {
    FtsHelpers::makeFTSIndex( this, isCancelled );
    FTS_index_completed.ref();
  }
  catch( std::exception &ex )
  {
    gdWarning( "Gls: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );

    // Do not leave a partially written index behind
    QFile::remove( ftsIdxName.c_str() );
  }
}
|
|
|
|
|
|
|
|
void GlsDictionary::loadArticleText( uint32_t address,
|
2017-03-20 14:34:42 +00:00
|
|
|
vector< string > & headwords,
|
2017-03-06 15:07:39 +00:00
|
|
|
string & articleText )
|
|
|
|
{
|
|
|
|
vector< char > chunk;
|
|
|
|
char * articleProps;
|
|
|
|
{
|
|
|
|
Mutex::Lock _( idxMutex );
|
|
|
|
|
|
|
|
articleProps = chunks.getBlock( address, chunk );
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t articleOffset, articleSize;
|
|
|
|
|
|
|
|
memcpy( &articleOffset, articleProps, sizeof( articleOffset ) );
|
|
|
|
memcpy( &articleSize, articleProps + sizeof( articleOffset ),
|
|
|
|
sizeof( articleSize ) );
|
|
|
|
|
|
|
|
char * articleBody;
|
|
|
|
|
|
|
|
{
|
|
|
|
Mutex::Lock _( dzMutex );
|
|
|
|
|
|
|
|
articleBody = dict_data_read_( dz, articleOffset, articleSize, 0, 0 );
|
|
|
|
}
|
|
|
|
|
|
|
|
headwords.clear();
|
|
|
|
articleText.clear();
|
|
|
|
string headword;
|
|
|
|
|
|
|
|
if ( !articleBody )
|
|
|
|
{
|
|
|
|
articleText = string( "\n\tDICTZIP error: " ) + dict_error_str( dz );
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2021-11-06 08:26:30 +00:00
|
|
|
string articleData = Iconv::toUtf8( Utf8::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize );
|
2017-03-06 15:07:39 +00:00
|
|
|
string::size_type start_pos = 0, end_pos = 0;
|
|
|
|
|
|
|
|
for( ; ; )
|
|
|
|
{
|
|
|
|
// Replace all "\r\n" by "\n"
|
|
|
|
end_pos = articleData.find( "\r\n", start_pos );
|
|
|
|
if( end_pos == string::npos )
|
|
|
|
{
|
|
|
|
articleText += articleData.substr( start_pos, end_pos );
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
articleText += articleData.substr( start_pos, end_pos - start_pos ) + "\n";
|
|
|
|
start_pos = end_pos + 2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Find headword
|
|
|
|
start_pos = articleText.find( '\n' );
|
|
|
|
if( start_pos != string::npos )
|
|
|
|
{
|
|
|
|
headword = articleText.substr( 0, start_pos );
|
|
|
|
articleText = articleText.substr( start_pos + 1, string::npos );
|
|
|
|
}
|
|
|
|
|
|
|
|
// Parse headwords
|
|
|
|
|
|
|
|
start_pos = 0;
|
|
|
|
end_pos = 0;
|
|
|
|
for( ; ; )
|
|
|
|
{
|
|
|
|
end_pos = headword.find( '|', start_pos );
|
|
|
|
if( end_pos == wstring::npos )
|
|
|
|
{
|
|
|
|
string hw = headword.substr( start_pos );
|
|
|
|
if( !hw.empty() )
|
|
|
|
headwords.push_back( hw );
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
headwords.push_back( headword.substr( start_pos, end_pos - start_pos ) );
|
|
|
|
start_pos = end_pos + 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void GlsDictionary::loadArticle( uint32_t address,
|
2017-03-07 17:47:47 +00:00
|
|
|
string & headword,
|
2017-03-06 15:07:39 +00:00
|
|
|
string & articleText )
|
|
|
|
{
|
|
|
|
string articleBody;
|
2017-03-20 14:34:42 +00:00
|
|
|
vector< string > headwords;
|
2017-03-06 15:07:39 +00:00
|
|
|
loadArticleText( address, headwords, articleBody );
|
|
|
|
|
|
|
|
QString article = QString::fromLatin1( "<div class=\"glsdict\">" );
|
|
|
|
if( headwords.size() )
|
|
|
|
{
|
|
|
|
// Headwords
|
|
|
|
article += "<div class=\"glsdict_headwords\"";
|
|
|
|
if( isFromLanguageRTL() )
|
|
|
|
article += " dir=\"rtl\"";
|
2017-03-20 14:34:42 +00:00
|
|
|
if( headwords.size() > 1 )
|
|
|
|
{
|
|
|
|
QString altHeadwords;
|
|
|
|
for( vector< string >::size_type i = 1; i < headwords.size(); i++ )
|
|
|
|
{
|
|
|
|
if( i > 1 )
|
|
|
|
altHeadwords += ", ";
|
|
|
|
altHeadwords += QString::fromUtf8( headwords[ i ].c_str(), headwords[ i ].size() );
|
|
|
|
}
|
|
|
|
article += " title=\"" + altHeadwords + "\"";
|
|
|
|
}
|
2017-03-06 15:07:39 +00:00
|
|
|
article += ">";
|
|
|
|
|
2017-03-07 17:47:47 +00:00
|
|
|
headword = headwords.front();
|
|
|
|
article += QString::fromUtf8( headword.c_str(), headword.size() );
|
|
|
|
|
2017-03-06 15:07:39 +00:00
|
|
|
article += "</div>";
|
|
|
|
}
|
|
|
|
|
|
|
|
if( isToLanguageRTL() )
|
2022-12-24 22:01:50 +00:00
|
|
|
article += R"(<div style="display:inline;" dir="rtl">)";
|
2017-03-06 15:07:39 +00:00
|
|
|
|
|
|
|
QString text = QString::fromUtf8( articleBody.c_str(), articleBody.size() );
|
2018-02-27 16:42:21 +00:00
|
|
|
|
2017-03-06 15:07:39 +00:00
|
|
|
article += filterResource( text );
|
|
|
|
|
|
|
|
if( isToLanguageRTL() )
|
2017-06-09 12:38:23 +00:00
|
|
|
article += "</div>";
|
2017-03-06 15:07:39 +00:00
|
|
|
|
|
|
|
article +="</div>";
|
|
|
|
|
|
|
|
articleText = string( article.toUtf8().data() );
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Rewrites resource references inside an article, in place, and returns a
/// reference to the same string.
///
/// Three passes are made:
///  - <img src=...> and <link href=...> targets that do not start with one of
///    the schemes excluded by the regular expressions get a
///    "bres://<dictionary id>/" prefix;
///  - <a href=...> targets without a ':' become "bword:" links, or
///    "gdlookup://localhost/" links with "?gdanchor=" when they contain a '#'
///    past the first character;
///  - <audio src=...> tags whose source has no "://" become "gdau://" links.
QString & GlsDictionary::filterResource( QString & article )
{
  QRegularExpression imgRe( R"((<\s*img\s+[^>]*src\s*=\s*["']+)(?!(?:data|https?|ftp|qrcx):))",
                            QRegularExpression::CaseInsensitiveOption
                            | QRegularExpression::InvertedGreedinessOption );
  QRegularExpression linkRe( R"((<\s*link\s+[^>]*href\s*=\s*["']+)(?!(?:data|https?|ftp):))",
                             QRegularExpression::CaseInsensitiveOption
                             | QRegularExpression::InvertedGreedinessOption );

  article.replace( imgRe, "\\1bres://" + QString::fromStdString( getId() ) + "/" );
  article.replace( linkRe, "\\1bres://" + QString::fromStdString( getId() ) + "/" );

  // Handle links to articles

  QRegularExpression linksReg( R"(<a(\s+[^>]*)href\s*=\s*['"](bword://)?([^'"]+)['"])",
                               QRegularExpression::CaseInsensitiveOption );

  int pos = 0;
  QString rebuilt;
  QRegularExpressionMatchIterator it = linksReg.globalMatch( article );
  while( it.hasNext() )
  {
    QRegularExpressionMatch match = it.next();

    // Copy the text between the previous match and this one
    rebuilt += article.mid( pos, match.capturedStart() - pos );
    pos = match.capturedEnd();

    const QString target = match.captured( 3 );
    QString replacement;

    // Targets carrying a scheme are left untouched
    if( target.indexOf( ':' ) < 0 )
    {
      const auto hashAt = target.indexOf( '#' );
      if( hashAt < 0 )
      {
        replacement = QString( "<a" ) + match.captured( 1 ) + "href=\"bword:" + target + "\"";
      }
      else if( hashAt > 0 )
      {
        // Anchors
        replacement = QString( "<a" ) + match.captured( 1 ) + "href=\"gdlookup://localhost/" + target + "\"";
        replacement.replace( "#", "?gdanchor=" );
      }
    }

    rebuilt += replacement.isEmpty() ? match.captured() : replacement;
  }
  if( pos )
  {
    // At least one match was seen: append the tail and take the new text
    rebuilt += article.mid( pos );
    article = rebuilt;
    rebuilt.clear();
  }

  // Handle "audio" tags

  QRegularExpression audioRe( R"(<\s*audio\s+src\s*=\s*(["']+)([^"']+)(["'])\s*>(.*)</audio>)",
                              QRegularExpression::CaseInsensitiveOption
                              | QRegularExpression::DotMatchesEverythingOption
                              | QRegularExpression::InvertedGreedinessOption );

  pos = 0;

  it = audioRe.globalMatch( article );
  while( it.hasNext() )
  {
    QRegularExpressionMatch match = it.next();

    rebuilt += article.mid( pos, match.capturedStart() - pos );
    pos = match.capturedEnd();

    const QString src = match.captured( 2 );

    if( src.indexOf( "://" ) >= 0 )
    {
      // Sources with an explicit scheme are kept as they are
      rebuilt += match.captured();
      continue;
    }

    std::string href = "\"gdau://" + getId() + "/" + src.toUtf8().data() + "\"";
    QString newTag = QString::fromUtf8( ( addAudioLink( href, getId() ) + "<span class=\"gls_wav\"><a href=" + href + ">" ).c_str() );

    const QString inner = match.captured( 4 );
    newTag += inner;
    // Add the play icon unless the tag content already brings an image
    if( inner.indexOf( "<img " ) < 0 )
      newTag += R"( <img src="qrc:///icons/playsound.png" border="0" alt="Play">)";
    newTag += "</a></span>";

    rebuilt += newTag;
  }
  if( pos )
  {
    rebuilt += article.mid( pos );
    article = rebuilt;
    rebuilt.clear();
  }

  return article;
}
|
|
|
|
|
2017-03-06 15:07:39 +00:00
|
|
|
void GlsDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
|
|
|
|
{
|
|
|
|
try
|
|
|
|
{
|
2017-03-20 14:34:42 +00:00
|
|
|
vector< string > headwords;
|
2017-03-06 15:07:39 +00:00
|
|
|
string articleStr;
|
|
|
|
loadArticleText( articleAddress, headwords, articleStr );
|
|
|
|
|
|
|
|
if( !headwords.empty() )
|
|
|
|
headword = QString::fromUtf8( headwords.front().data(), headwords.front().size() );
|
|
|
|
|
|
|
|
wstring wstr = Utf8::decode( articleStr );
|
|
|
|
|
2023-04-16 09:07:07 +00:00
|
|
|
text = Html::unescape( QString::fromStdU32String( wstr ) );
|
2017-03-06 15:07:39 +00:00
|
|
|
}
|
|
|
|
catch( std::exception &ex )
|
|
|
|
{
|
|
|
|
gdWarning( "Gls: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-07 17:47:47 +00:00
|
|
|
/// GlsDictionary::findHeadwordsForSynonym()
|
|
|
|
|
|
|
|
class GlsHeadwordsRequest;
|
|
|
|
|
|
|
|
class GlsHeadwordsRequestRunnable: public QRunnable
|
|
|
|
{
|
|
|
|
GlsHeadwordsRequest & r;
|
|
|
|
QSemaphore & hasExited;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
GlsHeadwordsRequestRunnable( GlsHeadwordsRequest & r_,
|
|
|
|
QSemaphore & hasExited_ ): r( r_ ),
|
|
|
|
hasExited( hasExited_ )
|
|
|
|
{}
|
|
|
|
|
|
|
|
~GlsHeadwordsRequestRunnable()
|
|
|
|
{
|
|
|
|
hasExited.release();
|
|
|
|
}
|
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
void run() override;
|
2017-03-07 17:47:47 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
class GlsHeadwordsRequest: public Dictionary::WordSearchRequest
|
|
|
|
{
|
|
|
|
friend class GlsHeadwordsRequestRunnable;
|
|
|
|
|
|
|
|
wstring word;
|
|
|
|
GlsDictionary & dict;
|
|
|
|
|
|
|
|
QAtomicInt isCancelled;
|
|
|
|
QSemaphore hasExited;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
GlsHeadwordsRequest( wstring const & word_, GlsDictionary & dict_ ):
|
|
|
|
word( word_ ), dict( dict_ )
|
|
|
|
{
|
|
|
|
QThreadPool::globalInstance()->start(
|
|
|
|
new GlsHeadwordsRequestRunnable( *this, hasExited ) );
|
|
|
|
}
|
|
|
|
|
|
|
|
void run(); // Run from another thread by StardictHeadwordsRequestRunnable
|
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
void cancel() override
|
2017-03-07 17:47:47 +00:00
|
|
|
{
|
|
|
|
isCancelled.ref();
|
|
|
|
}
|
|
|
|
|
|
|
|
~GlsHeadwordsRequest()
|
|
|
|
{
|
|
|
|
isCancelled.ref();
|
|
|
|
hasExited.acquire();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
void GlsHeadwordsRequestRunnable::run()
|
|
|
|
{
|
|
|
|
r.run();
|
|
|
|
}
|
|
|
|
|
|
|
|
void GlsHeadwordsRequest::run()
|
|
|
|
{
|
2021-11-27 07:17:33 +00:00
|
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
2017-03-07 17:47:47 +00:00
|
|
|
{
|
|
|
|
finish();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
try
|
|
|
|
{
|
|
|
|
vector< WordArticleLink > chain = dict.findArticles( word );
|
|
|
|
|
|
|
|
wstring caseFolded = Folding::applySimpleCaseOnly( word );
|
|
|
|
|
|
|
|
for( unsigned x = 0; x < chain.size(); ++x )
|
|
|
|
{
|
2021-11-27 07:17:33 +00:00
|
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
2017-03-07 17:47:47 +00:00
|
|
|
{
|
|
|
|
finish();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
string articleText;
|
2017-03-20 14:34:42 +00:00
|
|
|
vector< string > headwords;
|
2017-03-07 17:47:47 +00:00
|
|
|
|
|
|
|
dict.loadArticleText( chain[ x ].articleOffset,
|
|
|
|
headwords, articleText );
|
|
|
|
|
|
|
|
wstring headwordDecoded = Utf8::decode( headwords.front() );
|
|
|
|
|
|
|
|
if ( caseFolded != Folding::applySimpleCaseOnly( headwordDecoded ) )
|
|
|
|
{
|
|
|
|
// The headword seems to differ from the input word, which makes the
|
|
|
|
// input word its synonym.
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
|
|
|
|
matches.push_back( headwordDecoded );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
catch( std::exception & e )
|
|
|
|
{
|
|
|
|
setErrorString( QString::fromUtf8( e.what() ) );
|
|
|
|
}
|
|
|
|
|
|
|
|
finish();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns a GlsHeadwordsRequest for @p word when synonym search is enabled;
/// otherwise defers to the base-class implementation.
sptr< Dictionary::WordSearchRequest >
  GlsDictionary::findHeadwordsForSynonym( wstring const & word )
{
  if( !synonymSearchEnabled )
    return Class::findHeadwordsForSynonym( word );

  return std::make_shared< GlsHeadwordsRequest >( word, *this );
}
|
|
|
|
|
|
|
|
|
2017-03-06 15:07:39 +00:00
|
|
|
/// GlsDictionary::getArticle()
|
|
|
|
|
|
|
|
class GlsArticleRequest;
|
|
|
|
|
|
|
|
class GlsArticleRequestRunnable: public QRunnable
|
|
|
|
{
|
|
|
|
GlsArticleRequest & r;
|
|
|
|
QSemaphore & hasExited;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
GlsArticleRequestRunnable( GlsArticleRequest & r_,
|
|
|
|
QSemaphore & hasExited_ ): r( r_ ),
|
|
|
|
hasExited( hasExited_ )
|
|
|
|
{}
|
|
|
|
|
|
|
|
~GlsArticleRequestRunnable()
|
|
|
|
{
|
|
|
|
hasExited.release();
|
|
|
|
}
|
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
void run() override;
|
2017-03-06 15:07:39 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
class GlsArticleRequest: public Dictionary::DataRequest
|
|
|
|
{
|
|
|
|
friend class GlsArticleRequestRunnable;
|
|
|
|
|
|
|
|
wstring word;
|
|
|
|
vector< wstring > alts;
|
|
|
|
GlsDictionary & dict;
|
2018-06-13 16:00:42 +00:00
|
|
|
bool ignoreDiacritics;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
|
|
|
QAtomicInt isCancelled;
|
|
|
|
QSemaphore hasExited;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
GlsArticleRequest( wstring const & word_,
|
|
|
|
vector< wstring > const & alts_,
|
2018-06-13 16:00:42 +00:00
|
|
|
GlsDictionary & dict_, bool ignoreDiacritics_ ):
|
|
|
|
word( word_ ), alts( alts_ ), dict( dict_ ), ignoreDiacritics( ignoreDiacritics_ )
|
2017-03-06 15:07:39 +00:00
|
|
|
{
|
|
|
|
QThreadPool::globalInstance()->start(
|
|
|
|
new GlsArticleRequestRunnable( *this, hasExited ) );
|
|
|
|
}
|
|
|
|
|
|
|
|
void run(); // Run from another thread by GlsArticleRequestRunnable
|
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
void cancel() override
|
2017-03-06 15:07:39 +00:00
|
|
|
{
|
|
|
|
isCancelled.ref();
|
|
|
|
}
|
|
|
|
|
|
|
|
~GlsArticleRequest()
|
|
|
|
{
|
|
|
|
isCancelled.ref();
|
|
|
|
hasExited.acquire();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
void GlsArticleRequestRunnable::run()
|
|
|
|
{
|
|
|
|
r.run();
|
|
|
|
}
|
|
|
|
|
|
|
|
void GlsArticleRequest::run()
|
|
|
|
{
|
2021-11-27 07:17:33 +00:00
|
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
2017-03-06 15:07:39 +00:00
|
|
|
{
|
|
|
|
finish();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
try
|
|
|
|
{
|
2018-06-13 16:00:42 +00:00
|
|
|
vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
|
2017-03-06 15:07:39 +00:00
|
|
|
|
|
|
|
for( unsigned x = 0; x < alts.size(); ++x )
|
|
|
|
{
|
|
|
|
/// Make an additional query for each alt
|
|
|
|
|
2018-06-13 16:00:42 +00:00
|
|
|
vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
|
2017-03-06 15:07:39 +00:00
|
|
|
|
|
|
|
chain.insert( chain.end(), altChain.begin(), altChain.end() );
|
|
|
|
}
|
|
|
|
|
2017-03-07 17:47:47 +00:00
|
|
|
multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
|
|
|
|
|
|
|
|
set< uint32_t > articlesIncluded; // Some synonims make it that the articles
|
|
|
|
// appear several times. We combat this
|
|
|
|
// by only allowing them to appear once.
|
|
|
|
|
|
|
|
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
|
2018-06-13 16:00:42 +00:00
|
|
|
if( ignoreDiacritics )
|
|
|
|
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
|
2017-03-06 15:07:39 +00:00
|
|
|
|
|
|
|
for( unsigned x = 0; x < chain.size(); ++x )
|
|
|
|
{
|
2021-11-27 07:17:33 +00:00
|
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
2017-03-06 15:07:39 +00:00
|
|
|
{
|
|
|
|
finish();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
|
|
|
|
continue; // We already have this article in the body.
|
|
|
|
|
|
|
|
// Now grab that article
|
|
|
|
|
2017-03-07 17:47:47 +00:00
|
|
|
string headword, articleText;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
2017-03-07 17:47:47 +00:00
|
|
|
dict.loadArticle( chain[ x ].articleOffset, headword, articleText );
|
2017-03-06 15:07:39 +00:00
|
|
|
|
2017-03-07 17:47:47 +00:00
|
|
|
// Ok. Now, does it go to main articles, or to alternate ones? We list
|
|
|
|
// main ones first, and alternates after.
|
2017-03-06 15:07:39 +00:00
|
|
|
|
2017-03-07 17:47:47 +00:00
|
|
|
// We do the case-folded comparison here.
|
|
|
|
|
|
|
|
wstring headwordStripped =
|
|
|
|
Folding::applySimpleCaseOnly( Utf8::decode( headword ) );
|
2018-06-13 16:00:42 +00:00
|
|
|
if( ignoreDiacritics )
|
|
|
|
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
|
2017-03-07 17:47:47 +00:00
|
|
|
|
|
|
|
multimap< wstring, pair< string, string > > & mapToUse =
|
|
|
|
( wordCaseFolded == headwordStripped ) ?
|
|
|
|
mainArticles : alternateArticles;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
2017-03-07 17:47:47 +00:00
|
|
|
mapToUse.insert( pair< wstring, pair< string, string > >(
|
|
|
|
Folding::applySimpleCaseOnly( Utf8::decode( headword ) ),
|
|
|
|
pair< string, string >( headword, articleText ) ) );
|
|
|
|
|
|
|
|
articlesIncluded.insert( chain[ x ].articleOffset );
|
2017-03-06 15:07:39 +00:00
|
|
|
}
|
|
|
|
|
2017-03-07 17:47:47 +00:00
|
|
|
if ( mainArticles.empty() && alternateArticles.empty() )
|
2017-03-06 15:07:39 +00:00
|
|
|
{
|
2017-03-07 17:47:47 +00:00
|
|
|
// No such word
|
|
|
|
finish();
|
|
|
|
return;
|
|
|
|
}
|
2017-03-06 15:07:39 +00:00
|
|
|
|
2017-03-07 17:47:47 +00:00
|
|
|
string result;
|
|
|
|
|
|
|
|
multimap< wstring, pair< string, string > >::const_iterator i;
|
|
|
|
|
|
|
|
for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
|
|
|
|
{
|
|
|
|
result += i->second.second;
|
|
|
|
}
|
2017-03-06 15:07:39 +00:00
|
|
|
|
2017-03-07 17:47:47 +00:00
|
|
|
for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
|
|
|
|
{
|
|
|
|
result += i->second.second;
|
2017-03-06 15:07:39 +00:00
|
|
|
}
|
2017-03-07 17:47:47 +00:00
|
|
|
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
|
|
|
|
data.resize( result.size() );
|
|
|
|
|
|
|
|
memcpy( &data.front(), result.data(), result.size() );
|
|
|
|
|
|
|
|
hasAnyData = true;
|
2017-03-06 15:07:39 +00:00
|
|
|
}
|
|
|
|
catch( std::exception & e )
|
|
|
|
{
|
|
|
|
setErrorString( QString::fromUtf8( e.what() ) );
|
|
|
|
}
|
|
|
|
|
|
|
|
finish();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a GlsArticleRequest for @p word and its alternates @p alts.
/// The third (unnamed) parameter is not used by this implementation.
sptr< Dictionary::DataRequest > GlsDictionary::getArticle( wstring const & word,
                                                           vector< wstring > const & alts,
                                                           wstring const &,
                                                           bool ignoreDiacritics )
{
  return std::make_shared< GlsArticleRequest >( word,
                                                alts,
                                                *this,
                                                ignoreDiacritics );
}
|
|
|
|
|
|
|
|
//////////////// GlsDictionary::getResource()
|
|
|
|
|
|
|
|
class GlsResourceRequest;
|
|
|
|
|
|
|
|
class GlsResourceRequestRunnable: public QRunnable
|
|
|
|
{
|
|
|
|
GlsResourceRequest & r;
|
|
|
|
QSemaphore & hasExited;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
GlsResourceRequestRunnable( GlsResourceRequest & r_,
|
|
|
|
QSemaphore & hasExited_ ): r( r_ ),
|
|
|
|
hasExited( hasExited_ )
|
|
|
|
{}
|
|
|
|
|
|
|
|
~GlsResourceRequestRunnable()
|
|
|
|
{
|
|
|
|
hasExited.release();
|
|
|
|
}
|
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
void run() override;
|
2017-03-06 15:07:39 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
class GlsResourceRequest: public Dictionary::DataRequest
|
|
|
|
{
|
|
|
|
friend class GlsResourceRequestRunnable;
|
|
|
|
|
|
|
|
GlsDictionary & dict;
|
|
|
|
|
|
|
|
string resourceName;
|
|
|
|
|
|
|
|
QAtomicInt isCancelled;
|
|
|
|
QSemaphore hasExited;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
GlsResourceRequest( GlsDictionary & dict_,
|
|
|
|
string const & resourceName_ ):
|
|
|
|
dict( dict_ ),
|
|
|
|
resourceName( resourceName_ )
|
|
|
|
{
|
|
|
|
QThreadPool::globalInstance()->start(
|
|
|
|
new GlsResourceRequestRunnable( *this, hasExited ) );
|
|
|
|
}
|
|
|
|
|
|
|
|
void run(); // Run from another thread by GlsResourceRequestRunnable
|
|
|
|
|
2022-12-29 07:07:40 +00:00
|
|
|
void cancel() override
|
2017-03-06 15:07:39 +00:00
|
|
|
{
|
|
|
|
isCancelled.ref();
|
|
|
|
}
|
|
|
|
|
|
|
|
~GlsResourceRequest()
|
|
|
|
{
|
|
|
|
isCancelled.ref();
|
|
|
|
hasExited.acquire();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
void GlsResourceRequestRunnable::run()
|
|
|
|
{
|
|
|
|
r.run();
|
|
|
|
}
|
|
|
|
|
|
|
|
void GlsResourceRequest::run()
|
|
|
|
{
|
|
|
|
// Some runnables linger enough that they are cancelled before they start
|
2021-11-27 07:17:33 +00:00
|
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
2017-03-06 15:07:39 +00:00
|
|
|
{
|
|
|
|
finish();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
try
|
|
|
|
{
|
2023-04-14 03:53:23 +00:00
|
|
|
string n = dict.getContainingFolder().toStdString() + FsEncoding::separator() + resourceName;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
|
|
|
GD_DPRINTF( "n is %s\n", n.c_str() );
|
|
|
|
|
|
|
|
try
|
|
|
|
{
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
|
|
|
|
File::loadFromFile( n, data );
|
|
|
|
}
|
|
|
|
catch( File::exCantOpen & )
|
|
|
|
{
|
2023-04-13 10:08:32 +00:00
|
|
|
n = dict.getDictionaryFilenames()[ 0 ] + ".files" + FsEncoding::separator() + resourceName;
|
2017-03-06 15:07:39 +00:00
|
|
|
|
|
|
|
try
|
|
|
|
{
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
|
|
|
|
File::loadFromFile( n, data );
|
|
|
|
}
|
|
|
|
catch( File::exCantOpen & )
|
|
|
|
{
|
|
|
|
// Try reading from zip file
|
|
|
|
|
|
|
|
if ( dict.resourceZip.isOpen() )
|
|
|
|
{
|
|
|
|
Mutex::Lock _( dict.resourceZipMutex );
|
|
|
|
|
|
|
|
Mutex::Lock __( dataMutex );
|
|
|
|
|
|
|
|
if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) )
|
|
|
|
throw; // Make it fail since we couldn't read the archive
|
|
|
|
}
|
|
|
|
else
|
|
|
|
throw;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( Filetype::isNameOfTiff( resourceName ) )
|
|
|
|
{
|
|
|
|
// Convert it
|
|
|
|
|
2022-04-05 13:25:07 +00:00
|
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
GdTiff::tiff2img( data );
|
2017-03-06 15:07:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if( Filetype::isNameOfCSS( resourceName ) )
|
|
|
|
{
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
|
|
|
|
QString css = QString::fromUtf8( data.data(), data.size() );
|
|
|
|
|
|
|
|
// Correct some url's
|
|
|
|
|
|
|
|
QString id = QString::fromUtf8( dict.getId().c_str() );
|
|
|
|
int pos = 0;
|
2018-02-27 16:42:21 +00:00
|
|
|
|
2022-12-24 22:01:50 +00:00
|
|
|
QRegularExpression links( R"(url\(\s*(['"]?)([^'"]*)(['"]?)\s*\))",
|
2018-02-28 14:15:27 +00:00
|
|
|
QRegularExpression::CaseInsensitiveOption );
|
2018-02-27 16:42:21 +00:00
|
|
|
|
|
|
|
QString newCSS;
|
|
|
|
QRegularExpressionMatchIterator it = links.globalMatch( css );
|
|
|
|
while( it.hasNext() )
|
|
|
|
{
|
|
|
|
QRegularExpressionMatch match = it.next();
|
2022-02-27 05:17:37 +00:00
|
|
|
newCSS += css.mid( pos, match.capturedStart() - pos );
|
2018-02-27 16:42:21 +00:00
|
|
|
pos = match.capturedEnd();
|
|
|
|
|
|
|
|
QString url = match.captured( 2 );
|
|
|
|
|
|
|
|
if( url.indexOf( ":/" ) >= 0 || url.indexOf( "data:" ) >= 0)
|
|
|
|
{
|
|
|
|
// External link
|
|
|
|
newCSS += match.captured();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
QString newUrl = QString( "url(" ) + match.captured( 1 ) + "bres://"
|
|
|
|
+ id + "/" + url + match.captured( 3 ) + ")";
|
|
|
|
newCSS += newUrl;
|
|
|
|
}
|
|
|
|
if( pos )
|
|
|
|
{
|
2022-02-27 05:17:37 +00:00
|
|
|
newCSS += css.mid( pos );
|
2018-02-27 16:42:21 +00:00
|
|
|
css = newCSS;
|
|
|
|
newCSS.clear();
|
|
|
|
}
|
2017-03-06 15:07:39 +00:00
|
|
|
|
|
|
|
dict.isolateCSS( css );
|
|
|
|
QByteArray bytes = css.toUtf8();
|
|
|
|
data.resize( bytes.size() );
|
|
|
|
memcpy( &data.front(), bytes.constData(), bytes.size() );
|
|
|
|
}
|
|
|
|
|
2019-11-19 16:20:44 +00:00
|
|
|
Mutex::Lock _( dataMutex );
|
2017-03-06 15:07:39 +00:00
|
|
|
hasAnyData = true;
|
|
|
|
}
|
|
|
|
catch( std::exception &ex )
|
|
|
|
{
|
|
|
|
gdWarning( "GLS: Failed loading resource \"%s\" for \"%s\", reason: %s\n",
|
|
|
|
resourceName.c_str(), dict.getName().c_str(), ex.what() );
|
|
|
|
// Resource not loaded -- we don't set the hasAnyData flag then
|
|
|
|
}
|
|
|
|
|
|
|
|
finish();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a GlsResourceRequest that loads the resource called @p name for
/// this dictionary.
sptr< Dictionary::DataRequest > GlsDictionary::getResource( string const & name )
{
  return std::make_shared< GlsResourceRequest >( *this,
                                                 name );
}
|
|
|
|
|
|
|
|
/// Creates an FtsHelpers::FTSResultsRequest for this dictionary, forwarding
/// all search parameters unchanged.
sptr< Dictionary::DataRequest > GlsDictionary::getSearchResults( QString const & searchString,
                                                                 int searchMode, bool matchCase,
                                                                 int distanceBetweenWords,
                                                                 int maxResults,
                                                                 bool ignoreWordsOrder,
                                                                 bool ignoreDiacritics )
{
  return std::make_shared< FtsHelpers::FTSResultsRequest >( *this,
                                                            searchString,
                                                            searchMode,
                                                            matchCase,
                                                            distanceBetweenWords,
                                                            maxResults,
                                                            ignoreWordsOrder,
                                                            ignoreDiacritics );
}
|
|
|
|
|
|
|
|
} // anonymous namespace
|
|
|
|
|
|
|
|
/// makeDictionaries
|
|
|
|
|
|
|
|
/// Creates a GlsDictionary for every entry of @p fileNames that ends in
/// ".gls" or ".gls.dz" (compared case-insensitively).
///
/// For each such file the index file "<indicesDir><dictionary id>" is
/// (re)built when Dictionary::needToRebuildIndex() or indexIsOldOrBad() says
/// so. A companion "*.files.zip" archive, if found, is added to the
/// dictionary's file list and indexed as well.
///
/// @param fileNames     candidate file names; non-matching ones are skipped.
/// @param indicesDir    prefix the dictionary id is appended to in order to
///                      form the index file name.
/// @param initializing  notified through indexingDictionary() before an index
///                      is built.
/// @return the dictionaries that were created. A file whose processing throws
///         a std::exception is logged with gdWarning() and left out.
vector< sptr< Dictionary::Class > > makeDictionaries(
                                      vector< string > const & fileNames,
                                      string const & indicesDir,
                                      Dictionary::Initializing & initializing )
{
  vector< sptr< Dictionary::Class > > dictionaries;

  for( string const & fileName : fileNames )
  {
    // Try .gls and .gls.dz suffixes

    size_t const nameLen = fileName.size();

    bool const isPlainGls =
      nameLen >= 4 && strcasecmp( fileName.c_str() + ( nameLen - 4 ), ".gls" ) == 0;
    bool const isCompressedGls =
      nameLen >= 7 && strcasecmp( fileName.c_str() + ( nameLen - 7 ), ".gls.dz" ) == 0;

    if( !isPlainGls && !isCompressedGls )
      continue;

    unsigned atLine = 0; // Indicates current line in .gls, for debug purposes

    try
    {
      vector< string > dictFiles( 1, fileName );

      string dictId = Dictionary::makeDictionaryId( dictFiles );

      // See if there's a zip file with resources present. If so, include it.

      // Strip the suffix that matched above: 4 characters for ".gls",
      // 7 characters for ".gls.dz".
      string baseName = isPlainGls ? fileName.substr( 0, nameLen - 4 )
                                   : fileName.substr( 0, nameLen - 7 );

      string zipFileName;

      bool const zipFound =
        File::tryPossibleZipName( baseName + ".gls.files.zip", zipFileName ) ||
        File::tryPossibleZipName( baseName + ".gls.dz.files.zip", zipFileName ) ||
        File::tryPossibleZipName( baseName + ".GLS.FILES.ZIP", zipFileName ) ||
        File::tryPossibleZipName( baseName + ".GLS.DZ.FILES.ZIP", zipFileName );

      if( zipFound )
        dictFiles.push_back( zipFileName );

      string indexFile = indicesDir + dictId;

      if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
           indexIsOldOrBad( indexFile, zipFileName.size() ) )
      {
        GlsScanner scanner( fileName );

        try
        {
          // Errors raised while reading are intercepted here so that the line
          // at which they happened can be recorded; the scanner must still be
          // alive for that.

          // Building the index
          initializing.indexingDictionary( Utf8::encode( scanner.getDictionaryName() ) );

          gdDebug( "Gls: Building the index for dictionary: %s\n",
                   QString::fromStdU32String( scanner.getDictionaryName() ).toUtf8().data() );

          File::Class idx( indexFile, "wb" );

          IdxHeader idxHeader;

          memset( &idxHeader, 0, sizeof( idxHeader ) );

          // We write a dummy header first. At the end of the process the header
          // will be rewritten with the right values.

          idx.write( idxHeader );

          string dictionaryName = Utf8::encode( scanner.getDictionaryName() );

          idx.write( (uint32_t) dictionaryName.size() );
          idx.write( dictionaryName.data(), dictionaryName.size() );

          idxHeader.glsEncoding = scanner.getEncoding();

          IndexedWords indexedWords;

          ChunkedStorage::Writer chunks( idx );

          wstring curString;
          size_t curOffset;

          uint32_t articleCount = 0, wordCount = 0;

          // Each iteration consumes one article: a headword line followed by
          // body lines up to the next empty line or the end of the file.
          while( scanner.readNextLine( curString, curOffset ) )
          {
            // Find the headwords: skip empty lines until one has content
            if( curString.empty() )
              continue;

            uint32_t articleOffset = curOffset;

            // Parse headwords: the line is a '|'-separated list

            list< wstring > allEntryWords;
            wstring::size_type segmentStart = 0;
            wstring::size_type barPos;

            while( ( barPos = curString.find( '|', segmentStart ) ) != wstring::npos )
            {
              allEntryWords.push_back( curString.substr( segmentStart, barPos - segmentStart ) );
              segmentStart = barPos + 1;
            }

            // Text after the last separator is a headword only if non-empty
            wstring lastHeadword = curString.substr( segmentStart );
            if( !lastHeadword.empty() )
              allEntryWords.push_back( lastHeadword );

            // Skip article body

            while( scanner.readNextLine( curString, curOffset ) && !curString.empty() )
            {
            }

            // Insert new entry

            uint32_t descOffset = chunks.startNewBlock();
            chunks.addToBlock( &articleOffset, sizeof( articleOffset ) );

            uint32_t articleSize = curOffset - articleOffset;
            chunks.addToBlock( &articleSize, sizeof( articleSize ) );

            for( wstring & entryWord : allEntryWords )
              indexedWords.addWord( entryWord, descOffset );

            ++articleCount;
            wordCount += allEntryWords.size();
          }

          // Finish with the chunks

          idxHeader.chunksOffset = chunks.finish();

          // Build index

          IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );

          idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
          idxHeader.indexRootOffset = idxInfo.rootOffset;

          indexedWords.clear(); // Release memory -- no need for this data

          // If there was a zip file, index it too

          if ( !zipFileName.empty() )
          {
            GD_DPRINTF( "Indexing zip file\n" );

            idxHeader.hasZipFile = 1;

            IndexedWords zipFileNames;
            IndexedZip zipFile;
            if ( zipFile.openZipFile( QDir::fromNativeSeparators( zipFileName.c_str() ) ) )
              zipFile.indexFile( zipFileNames );

            if( zipFileNames.empty() )
            {
              // Bad zip file -- no index (though the mark that we have one
              // remains)
              idxHeader.zipIndexBtreeMaxElements = 0;
              idxHeader.zipIndexRootOffset = 0;
            }
            else
            {
              // Build the resulting zip file index

              IndexInfo zipIdxInfo = BtreeIndexing::buildIndex( zipFileNames, idx );

              idxHeader.zipIndexBtreeMaxElements = zipIdxInfo.btreeMaxElements;
              idxHeader.zipIndexRootOffset = zipIdxInfo.rootOffset;
            }
          }
          else
            idxHeader.hasZipFile = 0;

          // That concludes it. Update the header.

          idxHeader.signature = Signature;
          idxHeader.formatVersion = CurrentFormatVersion;
          idxHeader.zipSupportVersion = CurrentZipSupportVersion;

          idxHeader.articleCount = articleCount;
          idxHeader.wordCount = wordCount;

          idxHeader.langFrom = LangCoder::findIdForLanguage( scanner.getLangFrom() );
          idxHeader.langTo = LangCoder::findIdForLanguage( scanner.getLangTo() );
          if( idxHeader.langFrom == 0 && idxHeader.langTo == 0 )
          {
            // if no languages found, try dictionary's file name
            QPair<quint32,quint32> langs =
              LangCoder::findIdsForFilename( QString::fromStdString( dictFiles[ 0 ] ) );

            // if no languages found, try dictionary's name
            if ( langs.first == 0 || langs.second == 0 )
              langs = LangCoder::findIdsForFilename( QString::fromStdString( dictionaryName ) );

            idxHeader.langFrom = langs.first;
            idxHeader.langTo = langs.second;
          }

          // Overwrite the dummy header at the start of the file
          idx.rewind();

          idx.write( &idxHeader, sizeof( idxHeader ) );
        } // In-place try for saving line count
        catch( ... )
        {
          atLine = scanner.getLinesRead();
          throw;
        }

      } // if need to rebuild

      dictionaries.push_back( std::make_shared< GlsDictionary >( dictId,
                                                                 indexFile,
                                                                 dictFiles ) );
    }
    catch( std::exception & e )
    {
      gdWarning( "GLS dictionary reading failed: %s:%u, error: %s\n",
                 fileName.c_str(), atLine, e.what() );
    }
  }

  return dictionaries;
}
|
|
|
|
|
|
|
|
} // namespace Gls
|