mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-27 23:34:06 +00:00
5034348c1a
When a referenced audio resource is not found in a DSL or XDXF dictionary, GoldenDict searches for this resource by filename in all other dictionaries within the current group. Naturally, the file is absent from most dictionaries (see #970). Therefore a "Failed loading resource" warning is printed for almost every dictionary in the current group. These warnings are by far the most frequent on my system. And in the scenario described above there is nothing wrong at all. So the user may want to silence these warnings to help notice less frequent and more important messages. Implement categorized logging to enable this customization. These warnings can now be disabled by adding the following line in the [Rules] section of a logging configuration file (e.g. ~/.config/QtProject/qtlogging.ini on GNU/Linux): goldendict.dictionary.resource.warning=false See also https://doc.qt.io/qt-5/qloggingcategory.html#logging-rules
1811 lines
51 KiB
C++
1811 lines
51 KiB
C++
/* This file is (c) 2008-2017 Abs62
|
|
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
|
|
|
#include <zlib.h>
|
|
#include "gls.hh"
|
|
#include "iconv.hh"
|
|
#include "dictionary.hh"
|
|
#include "ufile.hh"
|
|
#include "btreeidx.hh"
|
|
#include "folding.hh"
|
|
#include "categorized_logging.hh"
|
|
#include "gddebug.hh"
|
|
#include "utf8.hh"
|
|
#include "wstring_qt.hh"
|
|
#include "chunkedstorage.hh"
|
|
#include "langcoder.hh"
|
|
#include "dictzip.h"
|
|
#include "indexedzip.hh"
|
|
#include "ftshelpers.hh"
|
|
#include "fsencoding.hh"
|
|
#include "htmlescape.hh"
|
|
#include "filetype.hh"
|
|
#include "tiff.hh"
|
|
#include "audiolink.hh"
|
|
|
|
#include <QString>
|
|
#include <QSemaphore>
|
|
#include <QThreadPool>
|
|
#include <QAtomicInt>
|
|
// For TIFF conversion
|
|
#include <QImage>
|
|
#include <QByteArray>
|
|
#include <QBuffer>
|
|
|
|
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
|
|
#include <QRegularExpression>
|
|
#endif
|
|
|
|
#include <string>
|
|
#include <list>
|
|
#include <map>
|
|
#include <set>
|
|
|
|
#ifdef _MSC_VER
|
|
#include <stub_msvc.h>
|
|
#endif
|
|
|
|
namespace Gls {
|
|
|
|
using std::list;
|
|
using std::map;
|
|
using std::set;
|
|
using std::multimap;
|
|
using std::pair;
|
|
|
|
using gd::wstring;
|
|
using gd::wchar;
|
|
|
|
using BtreeIndexing::WordArticleLink;
|
|
using BtreeIndexing::IndexedWords;
|
|
using BtreeIndexing::IndexInfo;
|
|
|
|
enum Encoding
|
|
{
|
|
Utf8,
|
|
Utf16LE,
|
|
Utf16BE
|
|
};
|
|
|
|
/////////////// GlsScanner
|
|
|
|
class GlsScanner
|
|
{
|
|
gzFile f;
|
|
Encoding encoding;
|
|
Iconv iconv;
|
|
wstring dictionaryName;
|
|
wstring dictionaryDecription, dictionaryAuthor;
|
|
wstring langFrom, langTo;
|
|
char readBuffer[ 65536 ];
|
|
char * readBufferPtr;
|
|
size_t readBufferLeft;
|
|
vector< wchar > wcharBuffer;
|
|
unsigned linesRead;
|
|
|
|
public:
|
|
|
|
DEF_EX( Ex, "Gls scanner exception", Dictionary::Ex )
|
|
DEF_EX_STR( exCantOpen, "Can't open .gls file", Ex )
|
|
DEF_EX( exCantReadGlsFile, "Can't read .gls file", Ex )
|
|
DEF_EX_STR( exMalformedGlsFile, "The .gls file is malformed:", Ex )
|
|
DEF_EX( exEncodingError, "Encoding error", Ex ) // Should never happen really
|
|
|
|
GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex );
|
|
~GlsScanner() throw();
|
|
|
|
/// Returns the detected encoding of this file.
|
|
Encoding getEncoding() const
|
|
{ return encoding; }
|
|
|
|
/// Returns the dictionary's name, as was read from file's headers.
|
|
wstring const & getDictionaryName() const
|
|
{ return dictionaryName; }
|
|
|
|
/// Returns the dictionary's author, as was read from file's headers.
|
|
wstring const & getDictionaryAuthor() const
|
|
{ return dictionaryAuthor; }
|
|
|
|
/// Returns the dictionary's description, as was read from file's headers.
|
|
wstring const & getDictionaryDescription() const
|
|
{ return dictionaryDecription; }
|
|
|
|
/// Returns the dictionary's source language, as was read from file's headers.
|
|
wstring const & getLangFrom() const
|
|
{ return langFrom; }
|
|
|
|
/// Returns the dictionary's target language, as was read from file's headers.
|
|
wstring const & getLangTo() const
|
|
{ return langTo; }
|
|
|
|
/// Reads next line from the file. Returns true if reading succeeded --
|
|
/// the string gets stored in the one passed, along with its physical
|
|
/// file offset in the file (the uncompressed one if the file is compressed).
|
|
/// If end of file is reached, false is returned.
|
|
/// Reading begins from the first line after the headers (ones which end
|
|
/// by the "### Glossary section:" line).
|
|
bool readNextLine( wstring &, size_t & offset ) THROW_SPEC( Ex, Iconv::Ex );
|
|
|
|
/// Returns the number of lines read so far from the file.
|
|
unsigned getLinesRead() const
|
|
{ return linesRead; }
|
|
|
|
/// Returns a name to be passed to iconv for the given encoding.
|
|
static char const * getEncodingNameFor( Encoding e )
|
|
{
|
|
switch( e )
|
|
{
|
|
case Utf16LE:
|
|
return Iconv::Utf16Le;
|
|
case Utf16BE:
|
|
return "UTF-16BE";
|
|
case Utf8:
|
|
default:
|
|
return Iconv::Utf8;
|
|
}
|
|
}
|
|
};
|
|
|
|
GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
|
encoding( Utf8 ), iconv( Iconv::GdWchar, Iconv::Utf8 ), readBufferPtr( readBuffer ),
|
|
readBufferLeft( 0 ), wcharBuffer( 64 ), linesRead( 0 )
|
|
{
|
|
// Since .dz is backwards-compatible with .gz, we use gz- functions to
|
|
// read it -- they are much nicer than the dict_data- ones.
|
|
|
|
f = gd_gzopen( fileName.c_str() );
|
|
if ( !f )
|
|
throw exCantOpen( fileName );
|
|
|
|
// Now try guessing the encoding by reading the first two bytes
|
|
|
|
unsigned char firstBytes[ 2 ];
|
|
|
|
if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) )
|
|
{
|
|
// Apparently the file's too short
|
|
gzclose( f );
|
|
throw exMalformedGlsFile( fileName );
|
|
}
|
|
|
|
// If the file begins with the dedicated Unicode marker, we just consume
|
|
// it. If, on the other hand, it's not, we return the bytes back
|
|
if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
|
|
encoding = Utf16LE;
|
|
else
|
|
if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
|
|
encoding = Utf16BE;
|
|
else
|
|
if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
|
|
{
|
|
// Looks like Utf8, read one more byte
|
|
if ( gzread( f, firstBytes, 1 ) != 1 || firstBytes[ 0 ] != 0xBF )
|
|
{
|
|
// Either the file's too short, or the BOM is weird
|
|
gzclose( f );
|
|
throw exMalformedGlsFile( fileName );
|
|
}
|
|
encoding = Utf8;
|
|
}
|
|
else
|
|
{
|
|
if ( gzrewind( f ) )
|
|
{
|
|
gzclose( f );
|
|
throw exCantOpen( fileName );
|
|
}
|
|
encoding = Utf8;
|
|
}
|
|
|
|
if( encoding != Utf8 )
|
|
iconv.reinit( Iconv::GdWchar, getEncodingNameFor( encoding ) );
|
|
|
|
// We now can use our own readNextLine() function
|
|
|
|
wstring str;
|
|
wstring *currentField = 0;
|
|
wstring mark = GD_NATIVE_TO_WS( L"###" );
|
|
wstring titleMark = GD_NATIVE_TO_WS( L"### Glossary title:" );
|
|
wstring authorMark = GD_NATIVE_TO_WS( L"### Author:" );
|
|
wstring descriptionMark = GD_NATIVE_TO_WS( L"### Description:" );
|
|
wstring langFromMark = GD_NATIVE_TO_WS( L"### Source language:" );
|
|
wstring langToMark = GD_NATIVE_TO_WS( L"### Target language:" );
|
|
wstring endOfHeaderMark = GD_NATIVE_TO_WS( L"### Glossary section:" );
|
|
size_t offset;
|
|
|
|
for( ; ; )
|
|
{
|
|
if ( !readNextLine( str, offset ) )
|
|
{
|
|
gzclose( f );
|
|
throw exMalformedGlsFile( fileName );
|
|
}
|
|
|
|
if( str.compare( 0, 3, mark.c_str(), 3 ) == 0 )
|
|
{
|
|
currentField = 0;
|
|
|
|
if( str.compare( 0, titleMark.size(), titleMark ) == 0 )
|
|
{
|
|
dictionaryName = wstring( str, titleMark.size(), str.size() - titleMark.size() );
|
|
currentField = &dictionaryName;
|
|
}
|
|
else
|
|
if( str.compare( 0, authorMark.size(), authorMark ) == 0 )
|
|
{
|
|
dictionaryAuthor = wstring( str, authorMark.size(), str.size() - authorMark.size() );
|
|
currentField = &dictionaryAuthor;
|
|
}
|
|
else
|
|
if( str.compare( 0, descriptionMark.size(), descriptionMark ) == 0 )
|
|
{
|
|
dictionaryDecription = wstring( str, descriptionMark.size(), str.size() - descriptionMark.size() );
|
|
currentField = &dictionaryDecription;
|
|
}
|
|
else
|
|
if( str.compare( 0, langFromMark.size(), langFromMark ) == 0 )
|
|
{
|
|
langFrom = wstring( str, langFromMark.size(), str.size() - langFromMark.size() );
|
|
}
|
|
else
|
|
if( str.compare( 0, langToMark.size(), langToMark ) == 0 )
|
|
{
|
|
langTo = wstring( str, langToMark.size(), str.size() - langToMark.size() );
|
|
}
|
|
else
|
|
if( str.compare( 0, endOfHeaderMark.size(), endOfHeaderMark ) == 0 )
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/// Handle multiline headers
|
|
if( currentField )
|
|
*currentField += str;
|
|
}
|
|
}
|
|
}
|
|
|
|
bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex,
|
|
Iconv::Ex )
|
|
{
|
|
offset = (size_t)( gztell( f ) - readBufferLeft );
|
|
|
|
// For now we just read one char at a time
|
|
size_t readMultiple = ( encoding == Utf16LE || encoding == Utf16BE ) ? 2 : 1;
|
|
|
|
size_t leftInOut = wcharBuffer.size();
|
|
|
|
wchar * outPtr = &wcharBuffer.front();
|
|
|
|
for( ; ; )
|
|
{
|
|
// Check that we have bytes to read
|
|
if ( readBufferLeft < 4 ) // To convert one char, we need at most 4 bytes
|
|
{
|
|
if ( !gzeof( f ) )
|
|
{
|
|
// To avoid having to deal with ring logic, we move the remaining bytes
|
|
// to the beginning
|
|
memmove( readBuffer, readBufferPtr, readBufferLeft );
|
|
|
|
// Read some more bytes to readBuffer
|
|
int result = gzread( f, readBuffer + readBufferLeft,
|
|
sizeof( readBuffer ) - readBufferLeft );
|
|
|
|
if ( result == -1 )
|
|
throw exCantReadGlsFile();
|
|
|
|
readBufferPtr = readBuffer;
|
|
readBufferLeft += (size_t) result;
|
|
}
|
|
}
|
|
|
|
if ( readBufferLeft < readMultiple )
|
|
{
|
|
// No more data. Return what we've got so far, forget the last byte if
|
|
// it was a 16-bit Unicode and a file had an odd number of bytes.
|
|
readBufferLeft = 0;
|
|
|
|
if ( outPtr != &wcharBuffer.front() )
|
|
{
|
|
// If there was a stray \r, remove it
|
|
if ( outPtr[ -1 ] == L'\r' )
|
|
--outPtr;
|
|
|
|
out = wstring( &wcharBuffer.front(), outPtr - &wcharBuffer.front() );
|
|
|
|
++linesRead;
|
|
|
|
return true;
|
|
}
|
|
else
|
|
return false;
|
|
}
|
|
|
|
// Check that we have chars to write
|
|
if ( leftInOut < 2 ) // With 16-bit wchars, 2 is needed for a surrogate pair
|
|
{
|
|
wcharBuffer.resize( wcharBuffer.size() + 64 );
|
|
outPtr = &wcharBuffer.front() + wcharBuffer.size() - 64 - leftInOut;
|
|
leftInOut += 64;
|
|
}
|
|
|
|
// Ok, now convert one char
|
|
size_t outBytesLeft = sizeof( wchar );
|
|
|
|
Iconv::Result r =
|
|
iconv.convert( (void const *&)readBufferPtr, readBufferLeft,
|
|
(void *&)outPtr, outBytesLeft );
|
|
|
|
if ( r == Iconv::NeedMoreOut && outBytesLeft == sizeof( wchar ) )
|
|
{
|
|
// Seems to be a surrogate pair with a 16-bit target wchar
|
|
|
|
outBytesLeft *= 2;
|
|
r = iconv.convert( (void const *&)readBufferPtr, readBufferLeft,
|
|
(void *&)outPtr, outBytesLeft );
|
|
--leftInOut; // Complements the next decremention
|
|
}
|
|
|
|
if ( outBytesLeft )
|
|
throw exEncodingError();
|
|
|
|
--leftInOut;
|
|
|
|
// Have we got \n?
|
|
if ( outPtr[ -1 ] == L'\n' )
|
|
{
|
|
--outPtr;
|
|
|
|
// Now kill a \r if there is one, and return the result.
|
|
if ( outPtr != &wcharBuffer.front() && outPtr[ -1 ] == L'\r' )
|
|
--outPtr;
|
|
|
|
out = wstring( &wcharBuffer.front(), outPtr - &wcharBuffer.front() );
|
|
|
|
++linesRead;
|
|
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
GlsScanner::~GlsScanner() throw()
|
|
{
|
|
gzclose( f );
|
|
}
|
|
|
|
namespace {
|
|
|
|
////////////////// GLS Dictionary
|
|
|
|
DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex )
|
|
DEF_EX( exUserAbort, "User abort", Dictionary::Ex )
|
|
DEF_EX_STR( exDictzipError, "DICTZIP error", Dictionary::Ex )
|
|
|
|
enum
|
|
{
|
|
Signature = 0x58534c47, // GLSX on little-endian, XSLG on big-endian
|
|
CurrentFormatVersion = 1 + BtreeIndexing::FormatVersion + Folding::Version,
|
|
CurrentZipSupportVersion = 2,
|
|
CurrentFtsIndexVersion = 1
|
|
};
|
|
|
|
struct IdxHeader
|
|
{
|
|
uint32_t signature; // First comes the signature, GLSX
|
|
uint32_t formatVersion; // File format version (CurrentFormatVersion)
|
|
uint32_t zipSupportVersion; // Zip support version -- narrows down reindexing
|
|
// when it changes only for dictionaries with the
|
|
// zip files
|
|
int glsEncoding; // Which encoding is used for the file indexed
|
|
uint32_t chunksOffset; // The offset to chunks' storage
|
|
uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
|
|
uint32_t indexRootOffset;
|
|
uint32_t articleCount; // Number of articles this dictionary has
|
|
uint32_t wordCount; // Number of headwords this dictionary has
|
|
uint32_t langFrom; // Source language
|
|
uint32_t langTo; // Target language
|
|
uint32_t hasZipFile; // Non-zero means there's a zip file with resources
|
|
// present
|
|
uint32_t zipIndexBtreeMaxElements; // Two fields from IndexInfo of the zip
|
|
// resource index.
|
|
uint32_t zipIndexRootOffset;
|
|
}
|
|
#ifndef _MSC_VER
|
|
__attribute__((packed))
|
|
#endif
|
|
;
|
|
|
|
bool indexIsOldOrBad( string const & indexFile, bool hasZipFile )
|
|
{
|
|
File::Class idx( indexFile, "rb" );
|
|
|
|
IdxHeader header;
|
|
|
|
return idx.readRecords( &header, sizeof( header ), 1 ) != 1 ||
|
|
header.signature != Signature ||
|
|
header.formatVersion != CurrentFormatVersion ||
|
|
(bool) header.hasZipFile != hasZipFile ||
|
|
( hasZipFile && header.zipSupportVersion != CurrentZipSupportVersion );
|
|
}
|
|
|
|
class GlsDictionary: public BtreeIndexing::BtreeDictionary
|
|
{
|
|
Mutex idxMutex;
|
|
File::Class idx;
|
|
IdxHeader idxHeader;
|
|
dictData * dz;
|
|
ChunkedStorage::Reader chunks;
|
|
Mutex dzMutex;
|
|
Mutex resourceZipMutex;
|
|
IndexedZip resourceZip;
|
|
string dictionaryName;
|
|
|
|
public:
|
|
|
|
GlsDictionary( string const & id, string const & indexFile,
|
|
vector< string > const & dictionaryFiles );
|
|
|
|
~GlsDictionary();
|
|
|
|
virtual string getName() throw()
|
|
{ return dictionaryName; }
|
|
|
|
virtual map< Dictionary::Property, string > getProperties() throw()
|
|
{ return map< Dictionary::Property, string >(); }
|
|
|
|
virtual unsigned long getArticleCount() throw()
|
|
{ return idxHeader.articleCount; }
|
|
|
|
virtual unsigned long getWordCount() throw()
|
|
{ return idxHeader.wordCount; }
|
|
|
|
inline virtual quint32 getLangFrom() const
|
|
{ return idxHeader.langFrom; }
|
|
|
|
inline virtual quint32 getLangTo() const
|
|
{ return idxHeader.langTo; }
|
|
|
|
virtual sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & )
|
|
THROW_SPEC( std::exception );
|
|
|
|
virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
|
|
vector< wstring > const & alts,
|
|
wstring const &,
|
|
bool ignoreDiacritics )
|
|
THROW_SPEC( std::exception );
|
|
|
|
virtual sptr< Dictionary::DataRequest > getResource( string const & name )
|
|
THROW_SPEC( std::exception );
|
|
|
|
virtual QString const& getDescription();
|
|
|
|
virtual QString getMainFilename();
|
|
|
|
virtual sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString,
|
|
int searchMode, bool matchCase,
|
|
int distanceBetweenWords,
|
|
int maxResults,
|
|
bool ignoreWordsOrder,
|
|
bool ignoreDiacritics,
|
|
QThreadPool * ftsThreadPoolPtr );
|
|
|
|
virtual void getArticleText( uint32_t articleAddress, QString & headword, QString & text );
|
|
|
|
virtual void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration );
|
|
|
|
virtual void setFTSParameters( Config::FullTextSearch const & fts )
|
|
{
|
|
can_FTS = fts.enabled
|
|
&& !fts.disabledTypes.contains( "GLS", Qt::CaseInsensitive )
|
|
&& ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
|
|
}
|
|
protected:
|
|
|
|
void loadIcon() throw();
|
|
|
|
private:
|
|
|
|
/// Loads the article, storing its headword and formatting the data it has
|
|
/// into an html.
|
|
void loadArticle( uint32_t address,
|
|
string & headword,
|
|
string & articleText );
|
|
|
|
/// Loads the article
|
|
void loadArticleText( uint32_t address,
|
|
vector< string > & headwords,
|
|
string & articleText );
|
|
|
|
/// Process resource links (images, audios, etc)
|
|
QString & filterResource( QString & article );
|
|
|
|
friend class GlsResourceRequest;
|
|
friend class GlsArticleRequest;
|
|
friend class GlsHeadwordsRequest;
|
|
};
|
|
|
|
GlsDictionary::GlsDictionary( string const & id,
|
|
string const & indexFile,
|
|
vector< string > const & dictionaryFiles ):
|
|
BtreeDictionary( id, dictionaryFiles ),
|
|
idx( indexFile, "rb" ),
|
|
idxHeader( idx.read< IdxHeader >() ),
|
|
dz( 0 ),
|
|
chunks( idx, idxHeader.chunksOffset )
|
|
{
|
|
// Open the .gls file
|
|
|
|
DZ_ERRORS error;
|
|
dz = dict_data_open( getDictionaryFilenames()[ 0 ].c_str(), &error, 0 );
|
|
|
|
if ( !dz )
|
|
throw exDictzipError( string( dz_error_str( error ) )
|
|
+ "(" + getDictionaryFilenames()[ 0 ] + ")" );
|
|
|
|
// Read the dictionary name
|
|
|
|
idx.seek( sizeof( idxHeader ) );
|
|
|
|
vector< char > dName( idx.read< uint32_t >() );
|
|
if( dName.size() > 0 )
|
|
{
|
|
idx.read( &dName.front(), dName.size() );
|
|
dictionaryName = string( &dName.front(), dName.size() );
|
|
}
|
|
|
|
// Initialize the index
|
|
|
|
openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
|
|
idxHeader.indexRootOffset ),
|
|
idx, idxMutex );
|
|
|
|
// Open a resource zip file, if there's one
|
|
|
|
if ( idxHeader.hasZipFile &&
|
|
( idxHeader.zipIndexBtreeMaxElements ||
|
|
idxHeader.zipIndexRootOffset ) )
|
|
{
|
|
resourceZip.openIndex( IndexInfo( idxHeader.zipIndexBtreeMaxElements,
|
|
idxHeader.zipIndexRootOffset ),
|
|
idx, idxMutex );
|
|
|
|
QString zipName = QDir::fromNativeSeparators(
|
|
FsEncoding::decode( getDictionaryFilenames().back().c_str() ) );
|
|
|
|
if ( zipName.endsWith( ".zip", Qt::CaseInsensitive ) ) // Sanity check
|
|
resourceZip.openZipFile( zipName );
|
|
}
|
|
|
|
// Full-text search parameters
|
|
|
|
can_FTS = true;
|
|
|
|
ftsIdxName = indexFile + "_FTS";
|
|
|
|
if( !Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName )
|
|
&& !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) )
|
|
FTS_index_completed.ref();
|
|
}
|
|
|
|
GlsDictionary::~GlsDictionary()
|
|
{
|
|
if ( dz )
|
|
dict_data_close( dz );
|
|
}
|
|
|
|
void GlsDictionary::loadIcon() throw()
|
|
{
|
|
if ( dictionaryIconLoaded )
|
|
return;
|
|
|
|
QString fileName =
|
|
QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
|
|
|
|
// Remove the extension
|
|
if ( fileName.endsWith( ".gls.dz", Qt::CaseInsensitive ) )
|
|
fileName.chop( 6 );
|
|
else
|
|
fileName.chop( 3 );
|
|
|
|
if ( !loadIconFromFile( fileName ) )
|
|
{
|
|
// Load failed -- use default icon
|
|
dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/icon32_gls.png");
|
|
}
|
|
|
|
dictionaryIconLoaded = true;
|
|
}
|
|
|
|
QString const& GlsDictionary::getDescription()
|
|
{
|
|
if( !dictionaryDescription.isEmpty() )
|
|
return dictionaryDescription;
|
|
|
|
try {
|
|
GlsScanner scanner( getDictionaryFilenames()[ 0 ] );
|
|
string str = Utf8::encode( scanner.getDictionaryAuthor() );
|
|
if( !str.empty() )
|
|
dictionaryDescription = QString( QObject::tr( "Author: %1%2" ) )
|
|
.arg( QString::fromUtf8( str.c_str() ) )
|
|
.arg( "\n\n" );
|
|
str = Utf8::encode( scanner.getDictionaryDescription() );
|
|
if( !str.empty() )
|
|
{
|
|
QString desc = QString::fromUtf8( str.c_str() );
|
|
desc.replace( "\t", "<br/>" );
|
|
desc.replace( "\\n", "<br/>" );
|
|
desc.replace( "<br>", "<br/>", Qt::CaseInsensitive );
|
|
dictionaryDescription += Html::unescape( desc, true );
|
|
}
|
|
}
|
|
catch( std::exception & e )
|
|
{
|
|
gdWarning( "GLS dictionary description reading failed: %s, error: %s\n",
|
|
getName().c_str(), e.what() );
|
|
}
|
|
|
|
if( dictionaryDescription.isEmpty() )
|
|
dictionaryDescription = "NONE";
|
|
|
|
return dictionaryDescription;
|
|
}
|
|
|
|
QString GlsDictionary::getMainFilename()
|
|
{
|
|
return FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() );
|
|
}
|
|
|
|
void GlsDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
|
|
{
|
|
if( !( Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
|
|
|| FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) )
|
|
FTS_index_completed.ref();
|
|
|
|
if( haveFTSIndex() )
|
|
return;
|
|
|
|
if( ensureInitDone().size() )
|
|
return;
|
|
|
|
if( firstIteration && getArticleCount() > FTS::MaxDictionarySizeForFastSearch )
|
|
return;
|
|
|
|
gdDebug( "Gls: Building the full-text index for dictionary: %s\n",
|
|
getName().c_str() );
|
|
|
|
try
|
|
{
|
|
FtsHelpers::makeFTSIndex( this, isCancelled );
|
|
FTS_index_completed.ref();
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdWarning( "Gls: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );
|
|
QFile::remove( FsEncoding::decode( ftsIdxName.c_str() ) );
|
|
}
|
|
}
|
|
|
|
void GlsDictionary::loadArticleText( uint32_t address,
|
|
vector< string > & headwords,
|
|
string & articleText )
|
|
{
|
|
vector< char > chunk;
|
|
char * articleProps;
|
|
{
|
|
Mutex::Lock _( idxMutex );
|
|
|
|
articleProps = chunks.getBlock( address, chunk );
|
|
}
|
|
|
|
uint32_t articleOffset, articleSize;
|
|
|
|
memcpy( &articleOffset, articleProps, sizeof( articleOffset ) );
|
|
memcpy( &articleSize, articleProps + sizeof( articleOffset ),
|
|
sizeof( articleSize ) );
|
|
|
|
char * articleBody;
|
|
|
|
{
|
|
Mutex::Lock _( dzMutex );
|
|
|
|
articleBody = dict_data_read_( dz, articleOffset, articleSize, 0, 0 );
|
|
}
|
|
|
|
headwords.clear();
|
|
articleText.clear();
|
|
string headword;
|
|
|
|
if ( !articleBody )
|
|
{
|
|
articleText = string( "\n\tDICTZIP error: " ) + dict_error_str( dz );
|
|
}
|
|
else
|
|
{
|
|
string articleData = Iconv::toUtf8( GlsScanner::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize );
|
|
string::size_type start_pos = 0, end_pos = 0;
|
|
|
|
for( ; ; )
|
|
{
|
|
// Replace all "\r\n" by "\n"
|
|
end_pos = articleData.find( "\r\n", start_pos );
|
|
if( end_pos == string::npos )
|
|
{
|
|
articleText += articleData.substr( start_pos, end_pos );
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
articleText += articleData.substr( start_pos, end_pos - start_pos ) + "\n";
|
|
start_pos = end_pos + 2;
|
|
}
|
|
}
|
|
|
|
// Find headword
|
|
start_pos = articleText.find( '\n' );
|
|
if( start_pos != string::npos )
|
|
{
|
|
headword = articleText.substr( 0, start_pos );
|
|
articleText = articleText.substr( start_pos + 1, string::npos );
|
|
}
|
|
|
|
// Parse headwords
|
|
|
|
start_pos = 0;
|
|
end_pos = 0;
|
|
for( ; ; )
|
|
{
|
|
end_pos = headword.find( '|', start_pos );
|
|
if( end_pos == wstring::npos )
|
|
{
|
|
string hw = headword.substr( start_pos );
|
|
if( !hw.empty() )
|
|
headwords.push_back( hw );
|
|
break;
|
|
}
|
|
headwords.push_back( headword.substr( start_pos, end_pos - start_pos ) );
|
|
start_pos = end_pos + 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
void GlsDictionary::loadArticle( uint32_t address,
|
|
string & headword,
|
|
string & articleText )
|
|
{
|
|
string articleBody;
|
|
vector< string > headwords;
|
|
loadArticleText( address, headwords, articleBody );
|
|
|
|
QString article = QString::fromLatin1( "<div class=\"glsdict\">" );
|
|
if( headwords.size() )
|
|
{
|
|
// Headwords
|
|
article += "<div class=\"glsdict_headwords\"";
|
|
if( isFromLanguageRTL() )
|
|
article += " dir=\"rtl\"";
|
|
if( headwords.size() > 1 )
|
|
{
|
|
QString altHeadwords;
|
|
for( vector< string >::size_type i = 1; i < headwords.size(); i++ )
|
|
{
|
|
if( i > 1 )
|
|
altHeadwords += ", ";
|
|
altHeadwords += QString::fromUtf8( headwords[ i ].c_str(), headwords[ i ].size() );
|
|
}
|
|
article += " title=\"" + altHeadwords + "\"";
|
|
}
|
|
article += ">";
|
|
|
|
headword = headwords.front();
|
|
article += QString::fromUtf8( headword.c_str(), headword.size() );
|
|
|
|
article += "</div>";
|
|
}
|
|
|
|
if( isToLanguageRTL() )
|
|
article += "<div style=\"display:inline;\" dir=\"rtl\">";
|
|
|
|
QString text = QString::fromUtf8( articleBody.c_str(), articleBody.size() );
|
|
|
|
article += filterResource( text );
|
|
|
|
if( isToLanguageRTL() )
|
|
article += "</div>";
|
|
|
|
article +="</div>";
|
|
|
|
articleText = string( article.toUtf8().data() );
|
|
}
|
|
|
|
QString & GlsDictionary::filterResource( QString & article )
|
|
{
|
|
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
|
|
QRegularExpression imgRe( "(<\\s*img\\s+[^>]*src\\s*=\\s*[\"']+)(?!(?:data|https?|ftp|qrcx):)",
|
|
QRegularExpression::CaseInsensitiveOption
|
|
| QRegularExpression::InvertedGreedinessOption );
|
|
QRegularExpression linkRe( "(<\\s*link\\s+[^>]*href\\s*=\\s*[\"']+)(?!(?:data|https?|ftp):)",
|
|
QRegularExpression::CaseInsensitiveOption
|
|
| QRegularExpression::InvertedGreedinessOption );
|
|
#else
|
|
QRegExp imgRe( "(<\\s*img\\s+[^>]*src\\s*=\\s*[\"']+)(?!(?:data|https?|ftp|qrcx):)", Qt::CaseInsensitive );
|
|
imgRe.setMinimal( true );
|
|
QRegExp linkRe( "(<\\s*link\\s+[^>]*href\\s*=\\s*[\"']+)(?!(?:data|https?|ftp):)", Qt::CaseInsensitive );
|
|
linkRe.setMinimal( true );
|
|
#endif
|
|
article.replace( imgRe , "\\1bres://" + QString::fromStdString( getId() ) + "/" )
|
|
.replace( linkRe, "\\1bres://" + QString::fromStdString( getId() ) + "/" );
|
|
|
|
// Handle links to articles
|
|
|
|
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
|
|
QRegularExpression linksReg( "<a(\\s+[^>]*)href\\s*=\\s*['\"](bword://)?([^'\"]+)['\"]",
|
|
QRegularExpression::CaseInsensitiveOption );
|
|
#else
|
|
QRegExp linksReg( "<a(\\s*[^>]*)href\\s*=\\s*['\"](bword://)?([^'\"]+)['\"]" );
|
|
linksReg.setMinimal( true );
|
|
#endif
|
|
|
|
int pos = 0;
|
|
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
|
|
QString articleNewText;
|
|
QRegularExpressionMatchIterator it = linksReg.globalMatch( article );
|
|
while( it.hasNext() )
|
|
{
|
|
QRegularExpressionMatch match = it.next();
|
|
articleNewText += article.midRef( pos, match.capturedStart() - pos );
|
|
pos = match.capturedEnd();
|
|
|
|
QString link = match.captured( 3 );
|
|
#else
|
|
while( pos >= 0 )
|
|
{
|
|
pos = linksReg.indexIn( article, pos );
|
|
if( pos < 0 )
|
|
break;
|
|
|
|
QString link = linksReg.cap( 3 );
|
|
#endif
|
|
if( link.indexOf( ':' ) < 0 )
|
|
{
|
|
QString newLink;
|
|
if( link.indexOf( '#' ) < 0 )
|
|
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
|
|
newLink = QString( "<a" ) + match.captured( 1 ) + "href=\"bword:" + link + "\"";
|
|
#else
|
|
newLink = QString( "<a" ) + linksReg.cap( 1 ) + "href=\"bword:" + link + "\"";
|
|
#endif
|
|
|
|
// Anchors
|
|
|
|
if( link.indexOf( '#' ) > 0 )
|
|
{
|
|
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
|
|
newLink = QString( "<a" ) + match.captured( 1 ) + "href=\"gdlookup://localhost/" + link + "\"";
|
|
#else
|
|
newLink = QString( "<a" ) + linksReg.cap( 1 ) + "href=\"gdlookup://localhost/" + link + "\"";
|
|
#endif
|
|
newLink.replace( "#", "?gdanchor=" );
|
|
}
|
|
|
|
if( !newLink.isEmpty() )
|
|
{
|
|
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
|
|
articleNewText += newLink;
|
|
#else
|
|
article.replace( pos, linksReg.cap( 0 ).size(), newLink );
|
|
pos += newLink.size();
|
|
#endif
|
|
}
|
|
else
|
|
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
|
|
articleNewText += match.captured();
|
|
#else
|
|
pos += linksReg.cap( 0 ).size();
|
|
#endif
|
|
}
|
|
else
|
|
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
|
|
articleNewText += match.captured();
|
|
}
|
|
if( pos )
|
|
{
|
|
articleNewText += article.midRef( pos );
|
|
article = articleNewText;
|
|
articleNewText.clear();
|
|
}
|
|
#else
|
|
pos += linksReg.cap( 0 ).size();
|
|
}
|
|
#endif
|
|
|
|
// Handle "audio" tags
|
|
|
|
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
|
|
QRegularExpression audioRe( "<\\s*audio\\s+src\\s*=\\s*([\"']+)([^\"']+)([\"'])\\s*>(.*)</audio>",
|
|
QRegularExpression::CaseInsensitiveOption
|
|
| QRegularExpression::DotMatchesEverythingOption
|
|
| QRegularExpression::InvertedGreedinessOption );
|
|
#else
|
|
QRegExp audioRe( "<\\s*audio\\s+src\\s*=\\s*([\"']+)([^\"']+)([\"'])\\s*>(.*)</audio>", Qt::CaseInsensitive );
|
|
audioRe.setMinimal( true );
|
|
#endif
|
|
|
|
pos = 0;
|
|
|
|
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
|
|
it = audioRe.globalMatch( article );
|
|
while( it.hasNext() )
|
|
{
|
|
QRegularExpressionMatch match = it.next();
|
|
articleNewText += article.midRef( pos, match.capturedStart() - pos );
|
|
pos = match.capturedEnd();
|
|
|
|
QString src = match.captured( 2 );
|
|
#else
|
|
while( pos >= 0 )
|
|
{
|
|
pos = audioRe.indexIn( article, pos );
|
|
if( pos < 0 )
|
|
break;
|
|
|
|
QString src = audioRe.cap( 2 );
|
|
#endif
|
|
if( src.indexOf( "://" ) >= 0 )
|
|
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
|
|
articleNewText += match.captured();
|
|
#else
|
|
pos += audioRe.cap( 0 ).length();
|
|
#endif
|
|
else
|
|
{
|
|
std::string href = "\"gdau://" + getId() + "/" + src.toUtf8().data() + "\"";
|
|
QString newTag = QString::fromUtf8( ( addAudioLink( href, getId() ) + "<span class=\"gls_wav\"><a href=" + href + ">" ).c_str() );
|
|
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
|
|
newTag += match.captured( 4 );
|
|
if( match.captured( 4 ).indexOf( "<img " ) < 0 )
|
|
newTag += " <img src=\"qrcx://localhost/icons/playsound.png\" border=\"0\" alt=\"Play\">";
|
|
newTag += "</a></span>";
|
|
|
|
articleNewText += newTag;
|
|
#else
|
|
newTag += audioRe.cap( 4 );
|
|
if( audioRe.cap( 4 ).indexOf( "<img " ) < 0 )
|
|
newTag += " <img src=\"qrcx://localhost/icons/playsound.png\" border=\"0\" alt=\"Play\">";
|
|
newTag += "</a></span>";
|
|
|
|
article.replace( pos, audioRe.cap( 0 ).length(), newTag );
|
|
pos += newTag.length();
|
|
#endif
|
|
}
|
|
}
|
|
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
|
|
if( pos )
|
|
{
|
|
articleNewText += article.midRef( pos );
|
|
article = articleNewText;
|
|
articleNewText.clear();
|
|
}
|
|
#endif
|
|
|
|
return article;
|
|
}
|
|
|
|
void GlsDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
|
|
{
|
|
try
|
|
{
|
|
vector< string > headwords;
|
|
string articleStr;
|
|
loadArticleText( articleAddress, headwords, articleStr );
|
|
|
|
if( !headwords.empty() )
|
|
headword = QString::fromUtf8( headwords.front().data(), headwords.front().size() );
|
|
|
|
wstring wstr = Utf8::decode( articleStr );
|
|
|
|
text = Html::unescape( gd::toQString( wstr ) );
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdWarning( "Gls: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
|
|
}
|
|
}
|
|
|
|
/// GlsDictionary::findHeadwordsForSynonym()
|
|
|
|
class GlsHeadwordsRequest;
|
|
|
|
class GlsHeadwordsRequestRunnable: public QRunnable
|
|
{
|
|
GlsHeadwordsRequest & r;
|
|
QSemaphore & hasExited;
|
|
|
|
public:
|
|
|
|
GlsHeadwordsRequestRunnable( GlsHeadwordsRequest & r_,
|
|
QSemaphore & hasExited_ ): r( r_ ),
|
|
hasExited( hasExited_ )
|
|
{}
|
|
|
|
~GlsHeadwordsRequestRunnable()
|
|
{
|
|
hasExited.release();
|
|
}
|
|
|
|
virtual void run();
|
|
};
|
|
|
|
class GlsHeadwordsRequest: public Dictionary::WordSearchRequest
|
|
{
|
|
friend class GlsHeadwordsRequestRunnable;
|
|
|
|
wstring word;
|
|
GlsDictionary & dict;
|
|
|
|
QAtomicInt isCancelled;
|
|
QSemaphore hasExited;
|
|
|
|
public:
|
|
|
|
GlsHeadwordsRequest( wstring const & word_, GlsDictionary & dict_ ):
|
|
word( word_ ), dict( dict_ )
|
|
{
|
|
QThreadPool::globalInstance()->start(
|
|
new GlsHeadwordsRequestRunnable( *this, hasExited ) );
|
|
}
|
|
|
|
void run(); // Run from another thread by StardictHeadwordsRequestRunnable
|
|
|
|
virtual void cancel()
|
|
{
|
|
isCancelled.ref();
|
|
}
|
|
|
|
~GlsHeadwordsRequest()
|
|
{
|
|
isCancelled.ref();
|
|
hasExited.acquire();
|
|
}
|
|
};
|
|
|
|
void GlsHeadwordsRequestRunnable::run()
|
|
{
|
|
r.run();
|
|
}
|
|
|
|
void GlsHeadwordsRequest::run()
|
|
{
|
|
if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
try
|
|
{
|
|
vector< WordArticleLink > chain = dict.findArticles( word );
|
|
|
|
wstring caseFolded = Folding::applySimpleCaseOnly( word );
|
|
|
|
for( unsigned x = 0; x < chain.size(); ++x )
|
|
{
|
|
if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
string articleText;
|
|
vector< string > headwords;
|
|
|
|
dict.loadArticleText( chain[ x ].articleOffset,
|
|
headwords, articleText );
|
|
|
|
wstring headwordDecoded = Utf8::decode( headwords.front() );
|
|
|
|
if ( caseFolded != Folding::applySimpleCaseOnly( headwordDecoded ) )
|
|
{
|
|
// The headword seems to differ from the input word, which makes the
|
|
// input word its synonym.
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
matches.push_back( headwordDecoded );
|
|
}
|
|
}
|
|
}
|
|
catch( std::exception & e )
|
|
{
|
|
setErrorString( QString::fromUtf8( e.what() ) );
|
|
}
|
|
|
|
finish();
|
|
}
|
|
|
|
sptr< Dictionary::WordSearchRequest >
|
|
GlsDictionary::findHeadwordsForSynonym( wstring const & word )
|
|
THROW_SPEC( std::exception )
|
|
{
|
|
return synonymSearchEnabled ? new GlsHeadwordsRequest( word, *this ) :
|
|
Class::findHeadwordsForSynonym( word );
|
|
}
|
|
|
|
|
|
/// GlsDictionary::getArticle()
|
|
|
|
class GlsArticleRequest;
|
|
|
|
class GlsArticleRequestRunnable: public QRunnable
|
|
{
|
|
GlsArticleRequest & r;
|
|
QSemaphore & hasExited;
|
|
|
|
public:
|
|
|
|
GlsArticleRequestRunnable( GlsArticleRequest & r_,
|
|
QSemaphore & hasExited_ ): r( r_ ),
|
|
hasExited( hasExited_ )
|
|
{}
|
|
|
|
~GlsArticleRequestRunnable()
|
|
{
|
|
hasExited.release();
|
|
}
|
|
|
|
virtual void run();
|
|
};
|
|
|
|
class GlsArticleRequest: public Dictionary::DataRequest
|
|
{
|
|
friend class GlsArticleRequestRunnable;
|
|
|
|
wstring word;
|
|
vector< wstring > alts;
|
|
GlsDictionary & dict;
|
|
bool ignoreDiacritics;
|
|
|
|
QAtomicInt isCancelled;
|
|
QSemaphore hasExited;
|
|
|
|
public:
|
|
|
|
GlsArticleRequest( wstring const & word_,
|
|
vector< wstring > const & alts_,
|
|
GlsDictionary & dict_, bool ignoreDiacritics_ ):
|
|
word( word_ ), alts( alts_ ), dict( dict_ ), ignoreDiacritics( ignoreDiacritics_ )
|
|
{
|
|
QThreadPool::globalInstance()->start(
|
|
new GlsArticleRequestRunnable( *this, hasExited ) );
|
|
}
|
|
|
|
void run(); // Run from another thread by GlsArticleRequestRunnable
|
|
|
|
virtual void cancel()
|
|
{
|
|
isCancelled.ref();
|
|
}
|
|
|
|
~GlsArticleRequest()
|
|
{
|
|
isCancelled.ref();
|
|
hasExited.acquire();
|
|
}
|
|
};
|
|
|
|
void GlsArticleRequestRunnable::run()
|
|
{
|
|
r.run();
|
|
}
|
|
|
|
void GlsArticleRequest::run()
|
|
{
|
|
if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
try
|
|
{
|
|
vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
|
|
|
|
for( unsigned x = 0; x < alts.size(); ++x )
|
|
{
|
|
/// Make an additional query for each alt
|
|
|
|
vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
|
|
|
|
chain.insert( chain.end(), altChain.begin(), altChain.end() );
|
|
}
|
|
|
|
multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
|
|
|
|
set< uint32_t > articlesIncluded; // Some synonims make it that the articles
|
|
// appear several times. We combat this
|
|
// by only allowing them to appear once.
|
|
|
|
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
|
|
if( ignoreDiacritics )
|
|
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
|
|
|
|
for( unsigned x = 0; x < chain.size(); ++x )
|
|
{
|
|
if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
|
|
continue; // We already have this article in the body.
|
|
|
|
// Now grab that article
|
|
|
|
string headword, articleText;
|
|
|
|
dict.loadArticle( chain[ x ].articleOffset, headword, articleText );
|
|
|
|
// Ok. Now, does it go to main articles, or to alternate ones? We list
|
|
// main ones first, and alternates after.
|
|
|
|
// We do the case-folded comparison here.
|
|
|
|
wstring headwordStripped =
|
|
Folding::applySimpleCaseOnly( Utf8::decode( headword ) );
|
|
if( ignoreDiacritics )
|
|
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
|
|
|
|
multimap< wstring, pair< string, string > > & mapToUse =
|
|
( wordCaseFolded == headwordStripped ) ?
|
|
mainArticles : alternateArticles;
|
|
|
|
mapToUse.insert( pair< wstring, pair< string, string > >(
|
|
Folding::applySimpleCaseOnly( Utf8::decode( headword ) ),
|
|
pair< string, string >( headword, articleText ) ) );
|
|
|
|
articlesIncluded.insert( chain[ x ].articleOffset );
|
|
}
|
|
|
|
if ( mainArticles.empty() && alternateArticles.empty() )
|
|
{
|
|
// No such word
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
string result;
|
|
|
|
multimap< wstring, pair< string, string > >::const_iterator i;
|
|
|
|
for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
|
|
{
|
|
result += i->second.second;
|
|
}
|
|
|
|
for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
|
|
{
|
|
result += i->second.second;
|
|
}
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
data.resize( result.size() );
|
|
|
|
memcpy( &data.front(), result.data(), result.size() );
|
|
|
|
hasAnyData = true;
|
|
}
|
|
catch( std::exception & e )
|
|
{
|
|
setErrorString( QString::fromUtf8( e.what() ) );
|
|
}
|
|
|
|
finish();
|
|
}
|
|
|
|
sptr< Dictionary::DataRequest > GlsDictionary::getArticle( wstring const & word,
|
|
vector< wstring > const & alts,
|
|
wstring const &,
|
|
bool ignoreDiacritics )
|
|
THROW_SPEC( std::exception )
|
|
{
|
|
return new GlsArticleRequest( word, alts, *this, ignoreDiacritics );
|
|
}
|
|
|
|
//////////////// GlsDictionary::getResource()
|
|
|
|
class GlsResourceRequest;
|
|
|
|
class GlsResourceRequestRunnable: public QRunnable
|
|
{
|
|
GlsResourceRequest & r;
|
|
QSemaphore & hasExited;
|
|
|
|
public:
|
|
|
|
GlsResourceRequestRunnable( GlsResourceRequest & r_,
|
|
QSemaphore & hasExited_ ): r( r_ ),
|
|
hasExited( hasExited_ )
|
|
{}
|
|
|
|
~GlsResourceRequestRunnable()
|
|
{
|
|
hasExited.release();
|
|
}
|
|
|
|
virtual void run();
|
|
};
|
|
|
|
class GlsResourceRequest: public Dictionary::DataRequest
|
|
{
|
|
friend class GlsResourceRequestRunnable;
|
|
|
|
GlsDictionary & dict;
|
|
|
|
string resourceName;
|
|
|
|
QAtomicInt isCancelled;
|
|
QSemaphore hasExited;
|
|
|
|
public:
|
|
|
|
GlsResourceRequest( GlsDictionary & dict_,
|
|
string const & resourceName_ ):
|
|
dict( dict_ ),
|
|
resourceName( resourceName_ )
|
|
{
|
|
QThreadPool::globalInstance()->start(
|
|
new GlsResourceRequestRunnable( *this, hasExited ) );
|
|
}
|
|
|
|
void run(); // Run from another thread by GlsResourceRequestRunnable
|
|
|
|
virtual void cancel()
|
|
{
|
|
isCancelled.ref();
|
|
}
|
|
|
|
~GlsResourceRequest()
|
|
{
|
|
isCancelled.ref();
|
|
hasExited.acquire();
|
|
}
|
|
};
|
|
|
|
void GlsResourceRequestRunnable::run()
|
|
{
|
|
r.run();
|
|
}
|
|
|
|
void GlsResourceRequest::run()
|
|
{
|
|
// Some runnables linger enough that they are cancelled before they start
|
|
if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
try
|
|
{
|
|
string n =
|
|
FsEncoding::dirname( dict.getDictionaryFilenames()[ 0 ] ) +
|
|
FsEncoding::separator() +
|
|
FsEncoding::encode( resourceName );
|
|
|
|
GD_DPRINTF( "n is %s\n", n.c_str() );
|
|
|
|
try
|
|
{
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
File::loadFromFile( n, data );
|
|
}
|
|
catch( File::exCantOpen & )
|
|
{
|
|
n = dict.getDictionaryFilenames()[ 0 ] + ".files" +
|
|
FsEncoding::separator() +
|
|
FsEncoding::encode( resourceName );
|
|
|
|
try
|
|
{
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
File::loadFromFile( n, data );
|
|
}
|
|
catch( File::exCantOpen & )
|
|
{
|
|
// Try reading from zip file
|
|
|
|
if ( dict.resourceZip.isOpen() )
|
|
{
|
|
Mutex::Lock _( dict.resourceZipMutex );
|
|
|
|
Mutex::Lock __( dataMutex );
|
|
|
|
if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) )
|
|
throw; // Make it fail since we couldn't read the archive
|
|
}
|
|
else
|
|
throw;
|
|
}
|
|
}
|
|
|
|
if ( Filetype::isNameOfTiff( resourceName ) )
|
|
{
|
|
// Convert it
|
|
|
|
dataMutex.lock();
|
|
|
|
QImage img = QImage::fromData( (unsigned char *) &data.front(),
|
|
data.size() );
|
|
|
|
#ifdef MAKE_EXTRA_TIFF_HANDLER
|
|
if( img.isNull() )
|
|
GdTiff::tiffToQImage( &data.front(), data.size(), img );
|
|
#endif
|
|
|
|
dataMutex.unlock();
|
|
|
|
if ( !img.isNull() )
|
|
{
|
|
// Managed to load -- now store it back as BMP
|
|
|
|
QByteArray ba;
|
|
QBuffer buffer( &ba );
|
|
buffer.open( QIODevice::WriteOnly );
|
|
img.save( &buffer, "BMP" );
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
data.resize( buffer.size() );
|
|
|
|
memcpy( &data.front(), buffer.data(), data.size() );
|
|
}
|
|
}
|
|
|
|
if( Filetype::isNameOfCSS( resourceName ) )
|
|
{
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
QString css = QString::fromUtf8( data.data(), data.size() );
|
|
|
|
// Correct some url's
|
|
|
|
QString id = QString::fromUtf8( dict.getId().c_str() );
|
|
int pos = 0;
|
|
|
|
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
|
|
QRegularExpression links( "url\\(\\s*(['\"]?)([^'\"]*)(['\"]?)\\s*\\)",
|
|
QRegularExpression::CaseInsensitiveOption );
|
|
|
|
QString newCSS;
|
|
QRegularExpressionMatchIterator it = links.globalMatch( css );
|
|
while( it.hasNext() )
|
|
{
|
|
QRegularExpressionMatch match = it.next();
|
|
newCSS += css.midRef( pos, match.capturedStart() - pos );
|
|
pos = match.capturedEnd();
|
|
|
|
QString url = match.captured( 2 );
|
|
|
|
if( url.indexOf( ":/" ) >= 0 || url.indexOf( "data:" ) >= 0)
|
|
{
|
|
// External link
|
|
newCSS += match.captured();
|
|
continue;
|
|
}
|
|
|
|
QString newUrl = QString( "url(" ) + match.captured( 1 ) + "bres://"
|
|
+ id + "/" + url + match.captured( 3 ) + ")";
|
|
newCSS += newUrl;
|
|
}
|
|
if( pos )
|
|
{
|
|
newCSS += css.midRef( pos );
|
|
css = newCSS;
|
|
newCSS.clear();
|
|
}
|
|
#else
|
|
QRegExp links( "url\\(\\s*(['\"]?)([^'\"]*)(['\"]?)\\s*\\)", Qt::CaseInsensitive, QRegExp::RegExp );
|
|
for( ; ; )
|
|
{
|
|
pos = links.indexIn( css, pos );
|
|
if( pos < 0 )
|
|
break;
|
|
QString url = links.cap( 2 );
|
|
|
|
if( url.indexOf( ":/" ) >= 0 || url.indexOf( "data:" ) >= 0)
|
|
{
|
|
// External link
|
|
pos += links.cap().size();
|
|
continue;
|
|
}
|
|
|
|
QString newUrl = QString( "url(" ) + links.cap( 1 ) + "bres://"
|
|
+ id + "/" + url + links.cap( 3 ) + ")";
|
|
css.replace( pos, links.cap().size(), newUrl );
|
|
pos += newUrl.size();
|
|
}
|
|
#endif
|
|
|
|
dict.isolateCSS( css );
|
|
QByteArray bytes = css.toUtf8();
|
|
data.resize( bytes.size() );
|
|
memcpy( &data.front(), bytes.constData(), bytes.size() );
|
|
}
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
hasAnyData = true;
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdCWarning( dictionaryResourceLc, "GLS: Failed loading resource \"%s\" for \"%s\", reason: %s\n",
|
|
resourceName.c_str(), dict.getName().c_str(), ex.what() );
|
|
// Resource not loaded -- we don't set the hasAnyData flag then
|
|
}
|
|
|
|
finish();
|
|
}
|
|
|
|
sptr< Dictionary::DataRequest > GlsDictionary::getResource( string const & name )
|
|
THROW_SPEC( std::exception )
|
|
{
|
|
return new GlsResourceRequest( *this, name );
|
|
}
|
|
|
|
sptr< Dictionary::DataRequest > GlsDictionary::getSearchResults( QString const & searchString,
|
|
int searchMode, bool matchCase,
|
|
int distanceBetweenWords,
|
|
int maxResults,
|
|
bool ignoreWordsOrder,
|
|
bool ignoreDiacritics,
|
|
QThreadPool * ftsThreadPoolPtr )
|
|
{
|
|
return new FtsHelpers::FTSResultsRequest( *this, searchString,searchMode, matchCase, distanceBetweenWords, maxResults, ignoreWordsOrder, ignoreDiacritics, ftsThreadPoolPtr );
|
|
}
|
|
|
|
} // anonymous namespace
|
|
|
|
/// makeDictionaries
|
|
|
|
vector< sptr< Dictionary::Class > > makeDictionaries(
|
|
vector< string > const & fileNames,
|
|
string const & indicesDir,
|
|
Dictionary::Initializing & initializing )
|
|
THROW_SPEC( std::exception )
|
|
{
|
|
vector< sptr< Dictionary::Class > > dictionaries;
|
|
|
|
for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
|
|
++i )
|
|
{
|
|
// Try .gls and .gls.dz suffixes
|
|
|
|
if( !( i->size() >= 4 && strcasecmp( i->c_str() + ( i->size() - 4 ), ".gls" ) == 0 )
|
|
&& !( i->size() >= 7 && strcasecmp( i->c_str() + ( i->size() - 7 ), ".gls.dz" ) == 0 ) )
|
|
continue;
|
|
|
|
unsigned atLine = 0; // Indicates current line in .gls, for debug purposes
|
|
|
|
try
|
|
{
|
|
vector< string > dictFiles( 1, *i );
|
|
|
|
string dictId = Dictionary::makeDictionaryId( dictFiles );
|
|
|
|
// See if there's a zip file with resources present. If so, include it.
|
|
|
|
string baseName = ( (*i)[ i->size() - 4 ] == '.' ) ?
|
|
string( *i, 0, i->size() - 4 ) : string( *i, 0, i->size() - 7 );
|
|
|
|
string zipFileName;
|
|
|
|
if ( File::tryPossibleZipName( baseName + ".gls.files.zip", zipFileName ) ||
|
|
File::tryPossibleZipName( baseName + ".gls.dz.files.zip", zipFileName ) ||
|
|
File::tryPossibleZipName( baseName + ".GLS.FILES.ZIP", zipFileName ) ||
|
|
File::tryPossibleZipName( baseName + ".GLS.DZ.FILES.ZIP", zipFileName ) )
|
|
dictFiles.push_back( zipFileName );
|
|
|
|
string indexFile = indicesDir + dictId;
|
|
|
|
if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
|
|
indexIsOldOrBad( indexFile, zipFileName.size() ) )
|
|
{
|
|
GlsScanner scanner( *i );
|
|
|
|
try { // Here we intercept any errors during the read to save line at
|
|
// which the incident happened. We need alive scanner for that.
|
|
|
|
// Building the index
|
|
initializing.indexingDictionary( Utf8::encode( scanner.getDictionaryName() ) );
|
|
|
|
gdDebug( "Gls: Building the index for dictionary: %s\n",
|
|
gd::toQString( scanner.getDictionaryName() ).toUtf8().data() );
|
|
|
|
File::Class idx( indexFile, "wb" );
|
|
|
|
IdxHeader idxHeader;
|
|
|
|
memset( &idxHeader, 0, sizeof( idxHeader ) );
|
|
|
|
// We write a dummy header first. At the end of the process the header
|
|
// will be rewritten with the right values.
|
|
|
|
idx.write( idxHeader );
|
|
|
|
string dictionaryName = Utf8::encode( scanner.getDictionaryName() );
|
|
|
|
idx.write( (uint32_t) dictionaryName.size() );
|
|
idx.write( dictionaryName.data(), dictionaryName.size() );
|
|
|
|
idxHeader.glsEncoding = scanner.getEncoding();
|
|
|
|
IndexedWords indexedWords;
|
|
|
|
ChunkedStorage::Writer chunks( idx );
|
|
|
|
wstring curString;
|
|
size_t curOffset;
|
|
|
|
uint32_t articleCount = 0, wordCount = 0;
|
|
|
|
for( ; ; )
|
|
{
|
|
// Find the headwords
|
|
|
|
if ( !scanner.readNextLine( curString, curOffset ) )
|
|
break; // Clean end of file
|
|
|
|
if( curString.empty() )
|
|
continue;
|
|
|
|
uint32_t articleOffset = curOffset;
|
|
|
|
// Parse headwords
|
|
|
|
list< wstring > allEntryWords;
|
|
wstring::size_type start_pos = 0, end_pos = 0;
|
|
for( ; ; )
|
|
{
|
|
end_pos = curString.find( '|', start_pos );
|
|
if( end_pos == wstring::npos )
|
|
{
|
|
wstring headword = curString.substr( start_pos );
|
|
if( !headword.empty() )
|
|
allEntryWords.push_back( headword );
|
|
break;
|
|
}
|
|
allEntryWords.push_back( curString.substr( start_pos, end_pos - start_pos ) );
|
|
start_pos = end_pos + 1;
|
|
}
|
|
|
|
// Skip article body
|
|
|
|
for( ; ; )
|
|
{
|
|
if( !scanner.readNextLine( curString, curOffset ) )
|
|
break;
|
|
if( curString.empty() )
|
|
break;
|
|
}
|
|
|
|
// Insert new entry
|
|
|
|
uint32_t descOffset = chunks.startNewBlock();
|
|
chunks.addToBlock( &articleOffset, sizeof( articleOffset ) );
|
|
|
|
uint32_t articleSize = curOffset - articleOffset;
|
|
chunks.addToBlock( &articleSize, sizeof( articleSize ) );
|
|
|
|
for( list< wstring >::iterator j = allEntryWords.begin();
|
|
j != allEntryWords.end(); ++j )
|
|
indexedWords.addWord( *j, descOffset );
|
|
|
|
++articleCount;
|
|
wordCount += allEntryWords.size();
|
|
}
|
|
|
|
// Finish with the chunks
|
|
|
|
idxHeader.chunksOffset = chunks.finish();
|
|
|
|
// Build index
|
|
|
|
IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
|
|
|
|
idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
|
|
idxHeader.indexRootOffset = idxInfo.rootOffset;
|
|
|
|
indexedWords.clear(); // Release memory -- no need for this data
|
|
|
|
// If there was a zip file, index it too
|
|
|
|
if ( zipFileName.size() )
|
|
{
|
|
GD_DPRINTF( "Indexing zip file\n" );
|
|
|
|
idxHeader.hasZipFile = 1;
|
|
|
|
IndexedWords zipFileNames;
|
|
IndexedZip zipFile;
|
|
if( zipFile.openZipFile( QDir::fromNativeSeparators(
|
|
FsEncoding::decode( zipFileName.c_str() ) ) ) )
|
|
zipFile.indexFile( zipFileNames );
|
|
|
|
if( !zipFileNames.empty() )
|
|
{
|
|
// Build the resulting zip file index
|
|
|
|
IndexInfo idxInfo = BtreeIndexing::buildIndex( zipFileNames, idx );
|
|
|
|
idxHeader.zipIndexBtreeMaxElements = idxInfo.btreeMaxElements;
|
|
idxHeader.zipIndexRootOffset = idxInfo.rootOffset;
|
|
}
|
|
else
|
|
{
|
|
// Bad zip file -- no index (though the mark that we have one
|
|
// remains)
|
|
idxHeader.zipIndexBtreeMaxElements = 0;
|
|
idxHeader.zipIndexRootOffset = 0;
|
|
}
|
|
}
|
|
else
|
|
idxHeader.hasZipFile = 0;
|
|
|
|
// That concludes it. Update the header.
|
|
|
|
idxHeader.signature = Signature;
|
|
idxHeader.formatVersion = CurrentFormatVersion;
|
|
idxHeader.zipSupportVersion = CurrentZipSupportVersion;
|
|
|
|
idxHeader.articleCount = articleCount;
|
|
idxHeader.wordCount = wordCount;
|
|
|
|
idxHeader.langFrom = LangCoder::findIdForLanguage( scanner.getLangFrom() );
|
|
idxHeader.langTo = LangCoder::findIdForLanguage( scanner.getLangTo() );
|
|
if( idxHeader.langFrom == 0 && idxHeader.langTo == 0 )
|
|
{
|
|
// if no languages found, try dictionary's file name
|
|
QPair<quint32,quint32> langs =
|
|
LangCoder::findIdsForFilename( QString::fromStdString( dictFiles[ 0 ] ) );
|
|
|
|
// if no languages found, try dictionary's name
|
|
if ( langs.first == 0 || langs.second == 0 )
|
|
{
|
|
langs =
|
|
LangCoder::findIdsForFilename( QString::fromStdString( dictionaryName ) );
|
|
}
|
|
idxHeader.langFrom = langs.first;
|
|
idxHeader.langTo = langs.second;
|
|
}
|
|
|
|
idx.rewind();
|
|
|
|
idx.write( &idxHeader, sizeof( idxHeader ) );
|
|
} // In-place try for saving line count
|
|
catch( ... )
|
|
{
|
|
atLine = scanner.getLinesRead();
|
|
throw;
|
|
}
|
|
|
|
} // if need to rebuild
|
|
dictionaries.push_back( new GlsDictionary( dictId,
|
|
indexFile,
|
|
dictFiles ) );
|
|
}
|
|
catch( std::exception & e )
|
|
{
|
|
gdWarning( "GLS dictionary reading failed: %s:%u, error: %s\n",
|
|
i->c_str(), atLine, e.what() );
|
|
}
|
|
}
|
|
|
|
return dictionaries;
|
|
}
|
|
|
|
} // namespace Gls
|