Mirror of https://github.com/xiaoyifang/goldendict-ng.git (synced 2024-11-27 23:34:06 +00:00)
Commit 540dda26ed

The replacement command:

    git grep -l 'qrcx://localhost' | xargs sed -i 's/qrcx:\/\/localhost/qrc:\/\//g'

The qrcx:// URL scheme was introduced in 2009 or earlier - it is present in the first commit in GoldenDict's git history. Back then GoldenDict supported Qt versions earlier than 4.6, in which QWebSecurityOrigin::addLocalScheme() was introduced. Adding the qrc URL scheme as local obsoletes the qrcx URL scheme. GoldenDict does not compile against Qt versions earlier than 4.6, so there is no reason to use this custom URL scheme anymore.

Co-authored-by: Igor Kushnir <igorkuo@gmail.com>
2197 lines
68 KiB
C++
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */

#include "stardict.hh"
#include "btreeidx.hh"
#include "folding.hh"
#include "utf8.hh"
#include "chunkedstorage.hh"
#include "dictzip.h"
#include "xdxf2html.hh"
#include "htmlescape.hh"
#include "langcoder.hh"
#include "gddebug.hh"
#include "fsencoding.hh"
#include "filetype.hh"
#include "indexedzip.hh"
#include "tiff.hh"
#include "ftshelpers.hh"
#include "wstring_qt.hh"
#include "audiolink.hh"

#include <zlib.h>
#include <map>
#include <set>
#include <string>

// msvc defines _WIN32 https://learn.microsoft.com/en-us/cpp/preprocessor/predefined-macros?view=msvc-170
// gcc also defines __WIN32, _WIN32, __WIN32__
// todo: unify how windows are detected on headers
#ifndef _WIN32
#include <arpa/inet.h>
#else
#include <winsock.h>
#endif
#include <stdlib.h>

#ifdef _MSC_VER
#include <stub_msvc.h>
#endif

#include <QString>
#include <QSemaphore>
#include <QThreadPool>
#include <QAtomicInt>
#include <QDebug>
#if (QT_VERSION >= QT_VERSION_CHECK(6,0,0))
#include <QtCore5Compat/QRegExp>
#else
#include <QRegExp>
#endif
#include <QStringList>
#include <QDomDocument>
#include <QDomNode>
#include "ufile.hh"
#include "utils.hh"

#include <QRegularExpression>

namespace Stardict {

using std::map;
using std::multimap;
using std::pair;
using std::set;
using std::string;
using gd::wstring;

using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo;

namespace {

DEF_EX( exNotAnIfoFile, "Not an .ifo file", Dictionary::Ex )
DEF_EX_STR( exBadFieldInIfo, "Bad field in .ifo file encountered:", Dictionary::Ex )
DEF_EX_STR( exNoIdxFile, "No corresponding .idx file was found for", Dictionary::Ex )
DEF_EX_STR( exNoDictFile, "No corresponding .dict file was found for", Dictionary::Ex )
DEF_EX_STR( exNoSynFile, "No corresponding .syn file was found for", Dictionary::Ex )

DEF_EX( ex64BitsNotSupported, "64-bit indices are not presently supported, sorry", Dictionary::Ex )
DEF_EX( exDicttypeNotSupported, "Dictionaries with dicttypes are not supported, sorry", Dictionary::Ex )

DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex )
DEF_EX_STR( exWordIsTooLarge, "Encountered a word that is too large:", Dictionary::Ex )
DEF_EX_STR( exSuddenEndOfFile, "Sudden end of file", Dictionary::Ex )
DEF_EX_STR( exDictzipError, "DICTZIP error", Dictionary::Ex )

DEF_EX_STR( exIncorrectOffset, "Incorrect offset encountered in file", Dictionary::Ex )

/// Contents of an ifo file
struct Ifo
{
  string version;
  string bookname;
  uint32_t wordcount, synwordcount, idxfilesize, idxoffsetbits;
  string sametypesequence, dicttype, description;
  string copyright, author, email, website, date;

  Ifo( File::Class & );
};
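
// A typical .ifo file (parsed by Ifo::Ifo() further below) looks roughly like
// this; the field values are purely illustrative, and the set of optional
// fields varies between dictionaries:
//
//   StarDict's dict ifo file
//   version=2.4.2
//   bookname=Some Dictionary
//   wordcount=12345
//   idxfilesize=263401
//   sametypesequence=tm
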
enum
{
  Signature = 0x58444953, // SIDX on little-endian, XDIS on big-endian
  CurrentFormatVersion = 9 + BtreeIndexing::FormatVersion + Folding::Version
};

struct IdxHeader
{
  uint32_t signature; // First comes the signature, SIDX
  uint32_t formatVersion; // File format version (CurrentFormatVersion)
  uint32_t chunksOffset; // The offset to chunks' storage
  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
  uint32_t indexRootOffset;
  uint32_t wordCount; // Saved from Ifo::wordcount
  uint32_t synWordCount; // Saved from Ifo::synwordcount
  uint32_t bookNameSize; // Book name's length. Used to read it then.
  uint32_t sameTypeSequenceSize; // That string's size. Used to read it then.
  uint32_t langFrom; // Source language
  uint32_t langTo; // Target language
  uint32_t hasZipFile; // Non-zero means there's a zip file with resources present
  uint32_t zipIndexBtreeMaxElements; // Two fields from IndexInfo of the zip
                                     // resource index.
  uint32_t zipIndexRootOffset;
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;

bool indexIsOldOrBad( string const & indexFile )
{
  File::Class idx( indexFile, "rb" );

  IdxHeader header;

  return idx.readRecords( &header, sizeof( header ), 1 ) != 1 ||
         header.signature != Signature ||
         header.formatVersion != CurrentFormatVersion;
}

class StardictDictionary: public BtreeIndexing::BtreeDictionary
{
  Mutex idxMutex;
  File::Class idx;
  IdxHeader idxHeader;
  string bookName;
  string sameTypeSequence;
  ChunkedStorage::Reader chunks;
  Mutex dzMutex;
  dictData * dz;
  Mutex resourceZipMutex;
  IndexedZip resourceZip;

public:

  StardictDictionary( string const & id, string const & indexFile,
                      vector< string > const & dictionaryFiles );

  ~StardictDictionary();

  string getName() noexcept override
  { return bookName; }

  map< Dictionary::Property, string > getProperties() noexcept override
  { return map< Dictionary::Property, string >(); }

  unsigned long getArticleCount() noexcept override
  { return idxHeader.wordCount; }

  unsigned long getWordCount() noexcept override
  { return idxHeader.wordCount + idxHeader.synWordCount; }

  inline quint32 getLangFrom() const override
  { return idxHeader.langFrom; }

  inline quint32 getLangTo() const override
  { return idxHeader.langTo; }

  sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & ) override;

  sptr< Dictionary::DataRequest > getArticle( wstring const &,
                                              vector< wstring > const & alts,
                                              wstring const &,
                                              bool ignoreDiacritics ) override;

  sptr< Dictionary::DataRequest > getResource( string const & name ) override;

  QString const & getDescription() override;

  QString getMainFilename() override;

  sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString,
                                                    int searchMode, bool matchCase,
                                                    int distanceBetweenWords,
                                                    int maxResults,
                                                    bool ignoreWordsOrder,
                                                    bool ignoreDiacritics ) override;

  void getArticleText( uint32_t articleAddress, QString & headword, QString & text ) override;

  void makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration ) override;

  void setFTSParameters( Config::FullTextSearch const & fts ) override
  {
    can_FTS = fts.enabled
              && !fts.disabledTypes.contains( "STARDICT", Qt::CaseInsensitive )
              && ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
  }

protected:

  void loadIcon() noexcept override;

private:

  /// Retrieves the article's offset/size in the .dict file, and its headword.
  void getArticleProps( uint32_t articleAddress,
                        string & headword,
                        uint32_t & offset, uint32_t & size );

  /// Loads the article, storing its headword and formatting its data into html.
  void loadArticle( uint32_t address,
                    string & headword,
                    string & articleText );

  string loadString( size_t size );

  string handleResource( char type, char const * resource, size_t size );

  void pangoToHtml( QString & text );

  friend class StardictResourceRequest;
  friend class StardictArticleRequest;
  friend class StardictHeadwordsRequest;
};

StardictDictionary::StardictDictionary( string const & id,
                                        string const & indexFile,
                                        vector< string > const & dictionaryFiles ):
  BtreeDictionary( id, dictionaryFiles ),
  idx( indexFile, "rb" ),
  idxHeader( idx.read< IdxHeader >() ),
  bookName( loadString( idxHeader.bookNameSize ) ),
  sameTypeSequence( loadString( idxHeader.sameTypeSequenceSize ) ),
  chunks( idx, idxHeader.chunksOffset )
{
  // Open the .dict file

  DZ_ERRORS error;
  dz = dict_data_open( dictionaryFiles[ 2 ].c_str(), &error, 0 );

  if ( !dz )
    throw exDictzipError( string( dz_error_str( error ) )
                          + "(" + dictionaryFiles[ 2 ] + ")" );

  // Initialize the index

  openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
                        idxHeader.indexRootOffset ),
             idx, idxMutex );

  // Open a resource zip file, if there's one

  if ( idxHeader.hasZipFile &&
       ( idxHeader.zipIndexBtreeMaxElements ||
         idxHeader.zipIndexRootOffset ) )
  {
    resourceZip.openIndex( IndexInfo( idxHeader.zipIndexBtreeMaxElements,
                                      idxHeader.zipIndexRootOffset ),
                           idx, idxMutex );

    QString zipName = QDir::fromNativeSeparators(
                        FsEncoding::decode( getDictionaryFilenames().back().c_str() ) );

    if ( zipName.endsWith( ".zip", Qt::CaseInsensitive ) ) // Sanity check
      resourceZip.openZipFile( zipName );
  }

  // Full-text search parameters

  can_FTS = true;

  ftsIdxName = indexFile + Dictionary::getFtsSuffix();

  if( !Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName )
      && !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) )
    FTS_index_completed.ref();
}

StardictDictionary::~StardictDictionary()
{
  if ( dz )
    dict_data_close( dz );
}

void StardictDictionary::loadIcon() noexcept
{
  if ( dictionaryIconLoaded )
    return;

  QString fileName =
    QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );

  // Remove the extension
  fileName.chop( 3 );

  if( !loadIconFromFile( fileName ) )
  {
    // Load failed -- use default icons
    dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/icon32_stardict.png");
  }

  dictionaryIconLoaded = true;
}

string StardictDictionary::loadString( size_t size )
{
  if( size == 0 )
    return string();

  vector< char > data( size );

  idx.read( &data.front(), data.size() );

  return string( &data.front(), data.size() );
}

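// Each chunk record addressed by articleAddress below is laid out as:
// a uint32_t offset into the .dict file, a uint32_t size of the article body,
// then the zero-terminated headword (written by handleIdxSynFile()).
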
void StardictDictionary::getArticleProps( uint32_t articleAddress,
                                          string & headword,
                                          uint32_t & offset, uint32_t & size )
{
  vector< char > chunk;

  Mutex::Lock _( idxMutex );

  char * articleData = chunks.getBlock( articleAddress, chunk );

  memcpy( &offset, articleData, sizeof( uint32_t ) );
  articleData += sizeof( uint32_t );
  memcpy( &size, articleData, sizeof( uint32_t ) );
  articleData += sizeof( uint32_t );

  headword = articleData;
}

class PowerWordDataProcessor{
|
|
class PWSyntaxTranslate{
|
|
public:
|
|
PWSyntaxTranslate(const char* re, const char* replacement)
|
|
: _re(re, QRegularExpression::UseUnicodePropertiesOption )
|
|
|
|
, _replacement(replacement)
|
|
{
|
|
}
|
|
const QRegularExpression & re() const {
|
|
|
|
return _re;
|
|
}
|
|
const QString & replacement() const {
|
|
return _replacement;
|
|
}
|
|
private:
|
|
QRegularExpression _re;
|
|
|
|
QString _replacement;
|
|
};
|
|
public:
|
|
PowerWordDataProcessor(const char* resource, size_t size)
|
|
: _data(QString::fromUtf8(resource, size))
|
|
{
|
|
}
|
|
|
|
string process() {
|
|
QDomDocument doc;
|
|
QString ss;
|
|
ss = "<div class=\"sdct_k\">";
|
|
if (!doc.setContent(_data)) {
|
|
ss += _data ;
|
|
} else {
|
|
QStringList sl;
|
|
walkNode(doc.firstChild(), sl);
|
|
|
|
QStringListIterator itr(sl);
|
|
while (itr.hasNext()) {
|
|
QString s = itr.next();
|
|
translatePW(s);
|
|
ss += s;
|
|
ss += "<br>";
|
|
}
|
|
}
|
|
ss += "</div>";
|
|
QByteArray ba = ss.toUtf8();
|
|
return string(ba.data(), ba.size());
|
|
}
|
|
private:
|
|
void walkNode(const QDomNode& e, QStringList& sl) {
|
|
if (e.isNull()) {
|
|
return;
|
|
}
|
|
if (e.isText()) {
|
|
sl.append(e.toText().data());
|
|
} else {
|
|
QDomNodeList l = e.childNodes();
|
|
for (int i = 0; i < l.size(); ++i) {
|
|
QDomNode n = l.at(i);
|
|
if (n.isText()) {
|
|
sl.append(n.toText().data());
|
|
} else {
|
|
walkNode(n, sl);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void translatePW(QString& s){
|
|
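// e.g. "&b{word}" is rewritten to "<B>word</B>"; the loop below reapplies the
// rules until the string stops changing, so nested markup is unwrapped too.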
const int TRANSLATE_TBL_SIZE=5;
|
|
static PWSyntaxTranslate t[TRANSLATE_TBL_SIZE]={
|
|
PWSyntaxTranslate(R"(&[bB]\s*\{([^\{}&]+)\})", "<B>\\1</B>"),
|
|
PWSyntaxTranslate(R"(&[iI]\s*\{([^\{}&]+)\})", "<I>\\1</I>"),
|
|
PWSyntaxTranslate(R"(&[uU]\s*\{([^\{}&]+)\})", "<U>\\1</U>"),
|
|
PWSyntaxTranslate(R"(&[lL]\s*\{([^\{}&]+)\})", R"(<SPAN style="color:#0000ff">\1</SPAN>)"),
|
|
PWSyntaxTranslate(R"(&[2]\s*\{([^\{}&]+)\})", R"(<SPAN style="color:#0000ff">\1</SPAN>)")
|
|
};
|
|
|
|
QString old;
|
|
while (s.compare(old) != 0) {
|
|
for (int i = 0; i < TRANSLATE_TBL_SIZE; ++i) {
|
|
PWSyntaxTranslate& a = t[i];
|
|
s.replace(a.re(), a.replacement());
|
|
}
|
|
old = s;
|
|
}
|
|
s.replace(QRegularExpression( "&.\\s*\\{",
|
|
QRegularExpression::UseUnicodePropertiesOption
|
|
| QRegularExpression::DotMatchesEverythingOption),
|
|
"");
|
|
s.replace("}", "");
|
|
}
|
|
private:
|
|
QString _data;
|
|
};
|
|
|
|
|
|
/// This function tries to make an html of the Stardict's resource typed
|
|
/// 'type', contained in a block pointed to by 'resource', 'size' bytes long.
|
|
string StardictDictionary::handleResource( char type, char const * resource, size_t size )
|
|
{
|
|
QString text;
|
|
switch( type )
|
|
{
|
|
case 'x': // Xdxf content
|
|
return Xdxf2Html::convert( string( resource, size ), Xdxf2Html::STARDICT, NULL, this, &resourceZip );
|
|
case 'h': // Html content
|
|
{
|
|
QString articleText = QString( "<div class=\"sdct_h\">" ) + QString::fromUtf8( resource, size ) + "</div>";
|
|
|
|
QRegularExpression imgRe( R"((<\s*img\s+[^>]*src\s*=\s*["']+)(?!(?:data|https?|ftp):))",
|
|
QRegularExpression::CaseInsensitiveOption
|
|
| QRegularExpression::InvertedGreedinessOption );
|
|
QRegularExpression linkRe( R"((<\s*link\s+[^>]*href\s*=\s*["']+)(?!(?:data|https?|ftp):))",
|
|
QRegularExpression::CaseInsensitiveOption
|
|
| QRegularExpression::InvertedGreedinessOption );
|
|
|
|
|
|
articleText.replace( imgRe , "\\1bres://" + QString::fromStdString( getId() ) + "/" )
|
|
.replace( linkRe, "\\1bres://" + QString::fromStdString( getId() ) + "/" );
|
|
|
|
// Handle links to articles
|
|
|
|
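// e.g. <a href="bword://foo"> and plain relative <a href="foo"> both become
// <a href="bword:foo">, while <a href="foo#bar"> becomes
// <a href="gdlookup://localhost/foo?gdanchor=bar">.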
QRegularExpression linksReg( R"(<a(\s*[^>]*)href\s*=\s*['"](bword://)?([^'"]+)['"])",
|
|
QRegularExpression::CaseInsensitiveOption );
|
|
|
|
|
|
int pos = 0;
|
|
QString articleNewText;
|
|
QRegularExpressionMatchIterator it = linksReg.globalMatch( articleText );
|
|
while( it.hasNext() )
|
|
{
|
|
QRegularExpressionMatch match = it.next();
|
|
articleNewText += articleText.mid( pos, match.capturedStart() - pos );
|
|
pos = match.capturedEnd();
|
|
|
|
QString link = match.captured( 3 );
|
|
|
|
if( link.indexOf( ':' ) < 0 )
|
|
{
|
|
QString newLink;
|
|
if( link.indexOf( '#' ) < 0 )
|
|
newLink = QString( "<a" ) + match.captured( 1 ) + "href=\"bword:" + link + "\"";
|
|
|
|
|
|
// Anchors
|
|
|
|
if( link.indexOf( '#' ) > 0 )
|
|
{
|
|
newLink = QString( "<a" ) + match.captured( 1 ) + "href=\"gdlookup://localhost/" + link + "\"";
|
|
|
|
newLink.replace( "#", "?gdanchor=" );
|
|
}
|
|
|
|
if( !newLink.isEmpty() )
|
|
{
|
|
articleNewText += newLink;
|
|
|
|
}
|
|
else
|
|
articleNewText += match.captured();
|
|
|
|
}
|
|
else
|
|
articleNewText += match.captured();
|
|
}
|
|
if( pos )
|
|
{
|
|
articleNewText += articleText.mid( pos );
|
|
articleText = articleNewText;
|
|
articleNewText.clear();
|
|
}
|
|
|
|
|
|
// Handle "audio" tags
|
|
|
|
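// e.g. <audio src="pron.wav">listen</audio> is rewritten into a gdau:// link
// played through GoldenDict's audio player, roughly:
//   <a href="gdau://<dictionary id>/pron.wav">listen <img src="qrc:///icons/playsound.png" ...></a>
// (the play icon is only appended when the tag body has no <img> of its own).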
QRegularExpression audioRe( R"(<\s*audio\s*src\s*=\s*(["']+)([^"']+)(["'])\s*>(.*)</audio>)",
|
|
QRegularExpression::CaseInsensitiveOption
|
|
| QRegularExpression::DotMatchesEverythingOption
|
|
| QRegularExpression::InvertedGreedinessOption );
|
|
|
|
|
|
pos = 0;
|
|
|
|
it = audioRe.globalMatch( articleText );
|
|
while( it.hasNext() )
|
|
{
|
|
QRegularExpressionMatch match = it.next();
|
|
articleNewText += articleText.mid( pos, match.capturedStart() - pos );
|
|
pos = match.capturedEnd();
|
|
|
|
QString src = match.captured( 2 );
|
|
|
|
if( src.indexOf( "://" ) >= 0 )
|
|
articleNewText += match.captured();
|
|
|
|
else
|
|
{
|
|
std::string href = "\"gdau://" + getId() + "/" + src.toUtf8().data() + "\"";
|
|
QString newTag = QString::fromUtf8( ( addAudioLink( href, getId() ) + "<span class=\"sdict_h_wav\"><a href=" + href + ">" ).c_str() );
|
|
newTag += match.captured( 4 );
|
|
if( match.captured( 4 ).indexOf( "<img " ) < 0 )
|
|
|
|
newTag += R"( <img src="qrc:///icons/playsound.png" border="0" alt="Play">)";
|
|
newTag += "</a></span>";
|
|
|
|
articleNewText += newTag;
|
|
|
|
}
|
|
}
|
|
if( pos )
|
|
{
|
|
articleNewText += articleText.mid( pos );
|
|
articleText = articleNewText;
|
|
articleNewText.clear();
|
|
}
|
|
|
|
return ( articleText.toUtf8().data() );
|
|
}
|
|
case 'm': // Pure meaning, usually means preformatted text
|
|
return "<div class=\"sdct_m\">" + Html::preformat( string( resource, size ), isToLanguageRTL() ) + "</div>";
|
|
case 'l': // Same as 'm', but not in utf8, instead in current locale's
|
|
// encoding.
|
|
// We just use Qt here, it should know better about system's
|
|
// locale.
|
|
return "<div class=\"sdct_l\">" + Html::preformat( QString::fromLocal8Bit( resource, size ).toUtf8().data(),
|
|
isToLanguageRTL() )
|
|
+ "</div>";
|
|
case 'g': // Pango markup.
|
|
text = QString::fromUtf8( resource, size );
|
|
pangoToHtml( text );
|
|
return "<div class=\"sdct_g\">" + string( text.toUtf8().data() ) + "</div>";
|
|
case 't': // Transcription
|
|
return "<div class=\"sdct_t\">" + Html::escape( string( resource, size ) ) + "</div>";
|
|
case 'y': // Chinese YinBiao or Japanese KANA. Examples are needed. For now,
|
|
// just output as pure escaped utf8.
|
|
return "<div class=\"sdct_y\">" + Html::escape( string( resource, size ) ) + "</div>";
|
|
case 'k': // KingSoft PowerWord data.
|
|
{
|
|
PowerWordDataProcessor pwdp(resource, size);
|
|
return pwdp.process();
|
|
}
|
|
case 'w': // MediaWiki markup. We don't handle this right now.
|
|
return "<div class=\"sdct_w\">" + Html::escape( string( resource, size ) ) + "</div>";
|
|
case 'n': // WordNet data. We don't know anything about it.
|
|
return "<div class=\"sdct_n\">" + Html::escape( string( resource, size ) ) + "</div>";
|
|
|
|
case 'r': // Resource file list. For now, resources aren't handled.
|
|
return "<div class=\"sdct_r\">" + Html::escape( string( resource, size ) ) + "</div>";
|
|
|
|
case 'W': // An embedded Wav file. Unhandled yet.
|
|
return "<div class=\"sdct_W\">(an embedded .wav file)</div>";
|
|
case 'P': // An embedded picture file. Unhandled yet.
|
|
return "<div class=\"sdct_P\">(an embedded picture file)</div>";
|
|
}
|
|
|
|
if ( islower( type ) )
|
|
{
|
|
return string( "<b>Unknown textual entry type " ) + string( 1, type ) + ":</b> " + Html::escape( string( resource, size ) ) + "<br>";
|
|
}
|
|
else
|
|
return string( "<b>Unknown blob entry type " ) + string( 1, type ) + "</b><br>";
|
|
}
|
|
|
|
void StardictDictionary::pangoToHtml( QString & text )
|
|
{
|
|
/*
|
|
* Partial support for the Pango Markup Language.
* The attributes "fallback", "lang", "gravity" and "gravity_hint" are simply ignored.
|
|
*/
|
|
|
|
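// e.g. <span font_desc="Serif italic 12">x</span> becomes roughly
// <span style="font-family:Serif;font-style:italic;font-size:12;">x</span>
// after the rewriting below.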
QRegExp spanRegex( "<span\\s*([^>]*)>", Qt::CaseInsensitive );
|
|
QRegExp styleRegex( "(\\w+)=\"([^\"]*)\"" );
|
|
|
|
text.replace( "\n", "<br>" );
|
|
|
|
int pos = 0;
|
|
do
|
|
{
|
|
pos = spanRegex.indexIn( text, pos );
|
|
if( pos >= 0 )
|
|
{
|
|
QString styles = spanRegex.cap( 1 );
|
|
QString newSpan( "<span style=\"" );
|
|
int stylePos = 0;
|
|
do
|
|
{
|
|
stylePos = styleRegex.indexIn( styles, stylePos );
|
|
QString style = styleRegex.cap( 1 );
|
|
if( stylePos >= 0 )
|
|
{
|
|
if( style.compare( "font_desc", Qt::CaseInsensitive ) == 0
|
|
|| style.compare( "font", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
// Parse font description
|
|
|
|
QStringList list = styleRegex.cap( 2 ).split( " ", Qt::SkipEmptyParts );
|
|
int n;
|
|
QString sizeStr, stylesStr, familiesStr;
|
|
for( n = list.size() - 1; n >= 0; n-- )
|
|
{
|
|
QString str = list.at( n );
|
|
|
|
// font size
|
|
if( str[ 0 ].isNumber() )
|
|
{
|
|
sizeStr = QString( "font-size:" ) + str + ";";
|
|
continue;
|
|
}
|
|
|
|
// font style
|
|
if( str.compare( "normal", Qt::CaseInsensitive ) == 0
|
|
|| str.compare( "oblique", Qt::CaseInsensitive ) == 0
|
|
|| str.compare( "italic", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
if( !stylesStr.contains( "font-style:" ) )
|
|
stylesStr += QString( "font-style:" ) + str + ";";
|
|
continue;
|
|
}
|
|
|
|
// font variant
|
|
if( str.compare( "smallcaps", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
stylesStr += QString( "font-variant:small-caps" ) ;
|
|
continue;
|
|
}
|
|
|
|
// font weight
|
|
if( str.compare( "ultralight", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
stylesStr += QString( "font-weight:100;" );
|
|
continue;
|
|
}
|
|
if( str.compare( "light", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
stylesStr += QString( "font-weight:200;" );
|
|
continue;
|
|
}
|
|
if( str.compare( "bold", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
stylesStr += QString( "font-weight:bold;" );
|
|
continue;
|
|
}
|
|
if( str.compare( "ultrabold", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
stylesStr += QString( "font-weight:800;" );
|
|
continue;
|
|
}
|
|
if( str.compare( "heavy", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
stylesStr += QString( "font-weight:900" );
|
|
continue;
|
|
}
|
|
|
|
// font stretch
|
|
if( str.compare( "ultracondensed", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
stylesStr += QString( "font-stretch:ultra-condensed;" );
|
|
continue;
|
|
}
|
|
if( str.compare( "extracondensed", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
stylesStr += QString( "font-stretch:extra-condensed;" );
|
|
continue;
|
|
}
|
|
if( str.compare( "semicondensed", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
stylesStr += QString( "font-stretch:semi-condensed;" );
|
|
continue;
|
|
}
|
|
if( str.compare( "semiexpanded", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
stylesStr += QString( "font-stretch:semi-expanded;" );
|
|
continue;
|
|
}
|
|
if( str.compare( "extraexpanded", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
stylesStr += QString( "font-stretch:extra-expanded;" );
|
|
continue;
|
|
}
|
|
if( str.compare( "ultraexpanded", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
stylesStr += QString( "font-stretch:ultra-expanded;" );
|
|
continue;
|
|
}
|
|
if( str.compare( "condensed", Qt::CaseInsensitive ) == 0
|
|
|| str.compare( "expanded", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
stylesStr += QString( "font-stretch:" ) + str + ";";
|
|
continue;
|
|
}
|
|
|
|
// gravity
|
|
if( str.compare( "south", Qt::CaseInsensitive ) == 0
|
|
|| str.compare( "east", Qt::CaseInsensitive ) == 0
|
|
|| str.compare( "north", Qt::CaseInsensitive ) == 0
|
|
|| str.compare( "west", Qt::CaseInsensitive ) == 0
|
|
|| str.compare( "auto", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
|
|
// the remaining leading words form the font family list
|
|
if( n >= 0 )
|
|
{
|
|
familiesStr = QString( "font-family:" );
|
|
for( int i = 0; i <= n; i++ )
|
|
{
|
|
if( i > 0 && !familiesStr.endsWith( ',' ) )
|
|
familiesStr += ",";
|
|
familiesStr += list.at( i );
|
|
}
|
|
familiesStr += ";";
|
|
}
|
|
|
|
newSpan += familiesStr + stylesStr + sizeStr;
|
|
}
|
|
else if( style.compare( "font_family", Qt::CaseInsensitive ) == 0
|
|
|| style.compare( "face", Qt::CaseInsensitive ) == 0 )
|
|
newSpan += QString( "font-family:" ) + styleRegex.cap( 2 ) + ";";
|
|
else if( style.compare( "font_size", Qt::CaseInsensitive ) == 0
|
|
|| style.compare( "size", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
if( styleRegex.cap( 2 )[ 0 ].isLetter()
|
|
|| styleRegex.cap( 2 ).endsWith( "px", Qt::CaseInsensitive )
|
|
|| styleRegex.cap( 2 ).endsWith( "pt", Qt::CaseInsensitive )
|
|
|| styleRegex.cap( 2 ).endsWith( "em", Qt::CaseInsensitive )
|
|
|| styleRegex.cap( 2 ).endsWith( "%" ) )
|
|
newSpan += QString( "font-size:" ) + styleRegex.cap( 2 ) +";";
|
|
else
|
|
{
|
|
int size = styleRegex.cap( 2 ).toInt();
|
|
if( size )
|
|
newSpan += QString( "font-size:%1pt;" ).arg( size / 1024.0, 0, 'f', 3 );
|
|
}
|
|
}
|
|
else if( style.compare( "font_style", Qt::CaseInsensitive ) == 0
|
|
|| style.compare( "style", Qt::CaseInsensitive ) == 0)
|
|
newSpan += QString( "font-style:" ) + styleRegex.cap( 2 ) + ";";
|
|
else if( style.compare( "weight", Qt::CaseInsensitive ) == 0
|
|
|| style.compare( "weight", Qt::CaseInsensitive ) == 0)
|
|
{
|
|
QString str = styleRegex.cap( 2 );
|
|
if( str.compare( "ultralight", Qt::CaseInsensitive ) == 0 )
|
|
newSpan += QString( "font-weight:100;" );
|
|
else if( str.compare( "light", Qt::CaseInsensitive ) == 0 )
|
|
newSpan += QString( "font-weight:200;" );
|
|
else if( str.compare( "ultrabold", Qt::CaseInsensitive ) == 0 )
|
|
newSpan += QString( "font-weight:800;" );
|
|
else if( str.compare( "heavy", Qt::CaseInsensitive ) == 0 )
|
|
newSpan += QString( "font-weight:900" );
|
|
else
|
|
newSpan += QString( "font-weight:" ) + str + ";";
|
|
}
|
|
else if( style.compare( "font_variant", Qt::CaseInsensitive ) == 0
|
|
|| style.compare( "variant", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
if( styleRegex.cap( 2 ).compare( "smallcaps", Qt::CaseInsensitive ) == 0 )
|
|
newSpan += QString( "font-variant:small-caps" );
|
|
else
|
|
newSpan += QString( "font-variant:" ) + styleRegex.cap( 2 ) + ";";
|
|
}
|
|
else if( style.compare( "font_stretch", Qt::CaseInsensitive ) == 0
|
|
|| style.compare( "stretch", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
QString str = styleRegex.cap( 2 );
|
|
if( str.compare( "ultracondensed", Qt::CaseInsensitive ) == 0 )
|
|
newSpan += QString( "font-stretch:ultra-condensed;" );
|
|
else if( str.compare( "extracondensed", Qt::CaseInsensitive ) == 0 )
|
|
newSpan += QString( "font-stretch:extra-condensed;" );
|
|
else if( str.compare( "semicondensed", Qt::CaseInsensitive ) == 0 )
|
|
newSpan += QString( "font-stretch:semi-condensed;" );
|
|
else if( str.compare( "semiexpanded", Qt::CaseInsensitive ) == 0 )
|
|
newSpan += QString( "font-stretch:semi-expanded;" );
|
|
else if( str.compare( "extraexpanded", Qt::CaseInsensitive ) == 0 )
|
|
newSpan += QString( "font-stretch:extra-expanded;" );
|
|
else if( str.compare( "ultraexpanded", Qt::CaseInsensitive ) == 0 )
|
|
newSpan += QString( "font-stretch:ultra-expanded;" );
|
|
else
|
|
newSpan += QString( "font-stretch:" ) + str + ";";
|
|
}
|
|
else if( style.compare( "foreground", Qt::CaseInsensitive ) == 0
|
|
|| style.compare( "fgcolor", Qt::CaseInsensitive ) == 0
|
|
|| style.compare( "color", Qt::CaseInsensitive ) == 0 )
|
|
newSpan += QString( "color:" ) + styleRegex.cap( 2 ) + ";";
|
|
else if( style.compare( "background", Qt::CaseInsensitive ) == 0
|
|
|| style.compare( "bgcolor", Qt::CaseInsensitive ) == 0 )
|
|
newSpan += QString( "background-color:" ) + styleRegex.cap( 2 ) + ";";
|
|
else if( style.compare( "underline_color", Qt::CaseInsensitive ) == 0
|
|
|| style.compare( "strikethrough_color", Qt::CaseInsensitive ) == 0 )
|
|
newSpan += QString( "text-decoration-color:" ) + styleRegex.cap( 2 ) + ";";
|
|
else if( style.compare( "underline", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
if( styleRegex.cap( 2 ).compare( "none", Qt::CaseInsensitive ) )
|
|
newSpan += QString( "text-decoration-line:none;" );
|
|
else
|
|
{
|
|
newSpan += QString( "text-decoration-line:underline; " );
|
|
if( styleRegex.cap( 2 ).compare( "low", Qt::CaseInsensitive ) )
|
|
newSpan += QString( "text-decoration-style:dotted;" );
|
|
else if( styleRegex.cap( 2 ).compare( "single", Qt::CaseInsensitive ) )
|
|
newSpan += QString( "text-decoration-style:solid;" );
|
|
else if( styleRegex.cap( 2 ).compare( "error", Qt::CaseInsensitive ) )
|
|
newSpan += QString( "text-decoration-style:wavy;" );
|
|
else
|
|
newSpan += QString( "text-decoration-style:" ) + styleRegex.cap( 2 ) + ";";
|
|
}
|
|
}
|
|
else if( style.compare( "strikethrough", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
if( styleRegex.cap( 2 ).compare( "true", Qt::CaseInsensitive ) )
|
|
newSpan += QString( "text-decoration-line:line-through;" );
|
|
else
|
|
newSpan += QString( "text-decoration-line:none;" );
|
|
}
|
|
else if( style.compare( "rise", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
if( styleRegex.cap( 2 ).endsWith( "px", Qt::CaseInsensitive )
|
|
|| styleRegex.cap( 2 ).endsWith( "pt", Qt::CaseInsensitive )
|
|
|| styleRegex.cap( 2 ).endsWith( "em", Qt::CaseInsensitive )
|
|
|| styleRegex.cap( 2 ).endsWith( "%" ) )
|
|
newSpan += QString( "vertical-align:" ) + styleRegex.cap( 2 ) +";";
|
|
else
|
|
{
|
|
int riseValue = styleRegex.cap( 2 ).toInt();
|
|
if( riseValue )
|
|
newSpan += QString( "vertical-align:%1pt;" ).arg( riseValue / 1024.0, 0, 'f', 3 );
|
|
}
|
|
}
|
|
else if( style.compare( "letter_spacing", Qt::CaseInsensitive ) == 0 )
|
|
{
|
|
if( styleRegex.cap( 2 ).endsWith( "px", Qt::CaseInsensitive )
|
|
|| styleRegex.cap( 2 ).endsWith( "pt", Qt::CaseInsensitive )
|
|
|| styleRegex.cap( 2 ).endsWith( "em", Qt::CaseInsensitive )
|
|
|| styleRegex.cap( 2 ).endsWith( "%" ) )
|
|
newSpan += QString( "letter-spacing:" ) + styleRegex.cap( 2 ) +";";
|
|
else
|
|
{
|
|
int spacing = styleRegex.cap( 2 ).toInt();
|
|
if( spacing )
|
|
newSpan += QString( "letter-spacing:%1pt;" ).arg( spacing / 1024.0, 0, 'f', 3 );
|
|
}
|
|
}
|
|
|
|
stylePos += styleRegex.matchedLength();
|
|
}
|
|
}
|
|
while( stylePos >= 0 );
|
|
|
|
newSpan += "\">";
|
|
text.replace( pos, spanRegex.matchedLength(), newSpan );
|
|
pos += newSpan.size();
|
|
}
|
|
}
|
|
while( pos >= 0 );
|
|
|
|
text.replace( " ", " " );
|
|
}
|
|
|
|
void StardictDictionary::loadArticle( uint32_t address,
|
|
string & headword,
|
|
string & articleText )
|
|
{
|
|
uint32_t offset, size;
|
|
|
|
getArticleProps( address, headword, offset, size );
|
|
|
|
char * articleBody;
|
|
|
|
{
|
|
Mutex::Lock _( dzMutex );
|
|
|
|
// Note that the function always zero-pads the result.
|
|
articleBody = dict_data_read_( dz, offset, size, 0, 0 );
|
|
}
|
|
|
|
if ( !articleBody )
|
|
{
|
|
// throw exCantReadFile( getDictionaryFilenames()[ 2 ] );
|
|
articleText = string( "<div class=\"sdict_m\">DICTZIP error: " ) + dict_error_str( dz ) + "</div>";
|
|
return;
|
|
}
|
|
|
|
articleText.clear();
|
|
|
|
char * ptr = articleBody;
|
|
|
|
if ( sameTypeSequence.size() )
|
|
{
|
|
/// The sequence is known, it's not stored in the article itself
|
|
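// For example, sametypesequence=tm means every article consists of a 't'
// (transcription) entry followed by an 'm' (plain text) entry, so per-entry
// type bytes are omitted from the .dict data.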
for( unsigned seq = 0; seq < sameTypeSequence.size(); ++seq )
|
|
{
|
|
// Last entry doesn't have size info -- it is inferred from
|
|
// the bytes left
|
|
bool entrySizeKnown = ( seq == sameTypeSequence.size() - 1 );
|
|
|
|
uint32_t entrySize = 0;
|
|
|
|
if ( entrySizeKnown )
|
|
entrySize = size;
|
|
else
|
|
if ( !size )
|
|
{
|
|
gdWarning( "Stardict: short entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
|
|
break;
|
|
}
|
|
|
|
char type = sameTypeSequence[ seq ];
|
|
|
|
if ( islower( type ) )
|
|
{
|
|
// Zero-terminated entry, unless it's the last one
|
|
if ( !entrySizeKnown )
|
|
entrySize = strlen( ptr );
|
|
|
|
if ( size < entrySize )
|
|
{
|
|
gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
|
|
break;
|
|
}
|
|
|
|
articleText += handleResource( type, ptr, entrySize );
|
|
|
|
if ( !entrySizeKnown )
|
|
++entrySize; // Need to skip the zero byte
|
|
|
|
ptr += entrySize;
|
|
size -= entrySize;
|
|
}
|
|
else
|
|
if ( isupper( *ptr ) )
|
|
{
|
|
// An entry which has its size before contents, unless it's the last one
|
|
|
|
if ( !entrySizeKnown )
|
|
{
|
|
if ( size < sizeof( uint32_t ) )
|
|
{
|
|
gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
|
|
break;
|
|
}
|
|
|
|
memcpy( &entrySize, ptr, sizeof( uint32_t ) );
|
|
|
|
entrySize = ntohl( entrySize );
|
|
|
|
ptr += sizeof( uint32_t );
|
|
size -= sizeof( uint32_t );
|
|
}
|
|
|
|
if ( size < entrySize )
|
|
{
|
|
gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
|
|
break;
|
|
}
|
|
|
|
articleText += handleResource( type, ptr, entrySize );
|
|
|
|
ptr += entrySize;
|
|
size -= entrySize;
|
|
}
|
|
else
|
|
{
|
|
gdWarning( "Stardict: non-alpha entry type 0x%x for the word %s encountered in \"%s\".\n",
|
|
type, headword.c_str(), getName().c_str() );
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// The sequence is stored in each article separately
|
|
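// Here each entry starts with a one-byte type character: lowercase types are
// zero-terminated strings, while uppercase types are prefixed with a 32-bit
// size in network byte order.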
while( size )
|
|
{
|
|
if ( islower( *ptr ) )
|
|
{
|
|
// Zero-terminated entry
|
|
size_t len = strlen( ptr + 1 );
|
|
|
|
if ( size < len + 2 )
|
|
{
|
|
gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
|
|
break;
|
|
}
|
|
|
|
articleText += handleResource( *ptr, ptr + 1, len );
|
|
|
|
ptr += len + 2;
|
|
size -= len + 2;
|
|
}
|
|
else
|
|
if ( isupper( *ptr ) )
|
|
{
|
|
// An entry which has its size before contents
|
|
if ( size < sizeof( uint32_t ) + 1 )
|
|
{
|
|
gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
|
|
break;
|
|
}
|
|
|
|
uint32_t entrySize;
|
|
|
|
memcpy( &entrySize, ptr + 1, sizeof( uint32_t ) );
|
|
|
|
entrySize = ntohl( entrySize );
|
|
|
|
if ( size < sizeof( uint32_t ) + 1 + entrySize )
|
|
{
|
|
gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
|
|
break;
|
|
}
|
|
|
|
articleText += handleResource( *ptr, ptr + 1 + sizeof( uint32_t ), entrySize );
|
|
|
|
ptr += sizeof( uint32_t ) + 1 + entrySize;
|
|
size -= sizeof( uint32_t ) + 1 + entrySize;
|
|
}
|
|
else
|
|
{
|
|
gdWarning( "Stardict: non-alpha entry type 0x%x for the word %s encountered in \"%s\".\n",
|
|
(unsigned)*ptr, headword.c_str(), getName().c_str() );
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
free( articleBody );
|
|
}
|
|
|
|
QString const& StardictDictionary::getDescription()
|
|
{
|
|
if( !dictionaryDescription.isEmpty() )
|
|
return dictionaryDescription;
|
|
|
|
File::Class ifoFile( getDictionaryFilenames()[ 0 ], "r" );
|
|
Ifo ifo( ifoFile );
|
|
|
|
if( !ifo.copyright.empty() )
|
|
{
|
|
QString copyright = QString::fromUtf8( ifo.copyright.c_str() )
|
|
.replace( "<br>", "\n", Qt::CaseInsensitive );
|
|
dictionaryDescription += QString( QObject::tr( "Copyright: %1%2" ) )
|
|
.arg( copyright )
|
|
.arg( "\n\n" );
|
|
}
|
|
|
|
if( !ifo.author.empty() )
|
|
{
|
|
QString author = QString::fromUtf8( ifo.author.c_str() );
|
|
dictionaryDescription += QString( QObject::tr( "Author: %1%2" ) )
|
|
.arg( author )
|
|
.arg( "\n\n" );
|
|
}
|
|
|
|
if( !ifo.email.empty() )
|
|
{
|
|
QString email = QString::fromUtf8( ifo.email.c_str() );
|
|
dictionaryDescription += QString( QObject::tr( "E-mail: %1%2" ) )
|
|
.arg( email )
|
|
.arg( "\n\n" );
|
|
}
|
|
|
|
if( !ifo.website.empty() )
|
|
{
|
|
QString website = QString::fromUtf8( ifo.website.c_str() );
|
|
dictionaryDescription += QString( QObject::tr( "Website: %1%2" ) )
|
|
.arg( website )
|
|
.arg( "\n\n" );
|
|
}
|
|
|
|
if( !ifo.date.empty() )
|
|
{
|
|
QString date = QString::fromUtf8( ifo.date.c_str() );
|
|
dictionaryDescription += QString( QObject::tr( "Date: %1%2" ) )
|
|
.arg( date )
|
|
.arg( "\n\n" );
|
|
}
|
|
|
|
if( !ifo.description.empty() )
|
|
{
|
|
QString desc = QString::fromUtf8( ifo.description.c_str() );
|
|
desc.replace( "\t", "<br/>" );
|
|
desc.replace( "\\n", "<br/>" );
|
|
desc.replace( "<br>", "<br/>", Qt::CaseInsensitive );
|
|
dictionaryDescription += Html::unescape( desc, true );
|
|
}
|
|
|
|
if( dictionaryDescription.isEmpty() )
|
|
dictionaryDescription = "NONE";
|
|
|
|
return dictionaryDescription;
|
|
}
|
|
|
|
QString StardictDictionary::getMainFilename()
|
|
{
|
|
return FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() );
|
|
}
|
|
|
|
void StardictDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
|
|
{
|
|
if( !( Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
|
|
|| FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) )
|
|
FTS_index_completed.ref();
|
|
|
|
if( haveFTSIndex() )
|
|
return;
|
|
|
|
if( ensureInitDone().size() )
|
|
return;
|
|
|
|
if( firstIteration && getArticleCount() > FTS::MaxDictionarySizeForFastSearch )
|
|
return;
|
|
|
|
gdDebug( "Stardict: Building the full-text index for dictionary: %s\n",
|
|
getName().c_str() );
|
|
|
|
try
|
|
{
|
|
FtsHelpers::makeFTSIndex( this, isCancelled );
|
|
FTS_index_completed.ref();
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdWarning( "Stardict: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );
|
|
QFile::remove( FsEncoding::decode( ftsIdxName.c_str() ) );
|
|
}
|
|
}
|
|
|
|
void StardictDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
|
|
{
|
|
try
|
|
{
|
|
string headwordStr, articleStr;
|
|
loadArticle( articleAddress, headwordStr, articleStr );
|
|
|
|
headword = QString::fromUtf8( headwordStr.data(), headwordStr.size() );
|
|
|
|
wstring wstr = Utf8::decode( articleStr );
|
|
|
|
text = Html::unescape( gd::toQString( wstr ) );
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdWarning( "Stardict: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
|
|
}
|
|
}
|
|
|
|
sptr< Dictionary::DataRequest > StardictDictionary::getSearchResults( QString const & searchString,
|
|
int searchMode, bool matchCase,
|
|
int distanceBetweenWords,
|
|
int maxResults,
|
|
bool ignoreWordsOrder,
|
|
bool ignoreDiacritics )
|
|
{
|
|
return std::make_shared<FtsHelpers::FTSResultsRequest>( *this, searchString,searchMode, matchCase, distanceBetweenWords, maxResults, ignoreWordsOrder, ignoreDiacritics );
|
|
}
|
|
|
|
/// StardictDictionary::findHeadwordsForSynonym()
|
|
|
|
class StardictHeadwordsRequest;
|
|
|
|
class StardictHeadwordsRequestRunnable: public QRunnable
|
|
{
|
|
StardictHeadwordsRequest & r;
|
|
QSemaphore & hasExited;
|
|
|
|
public:
|
|
|
|
StardictHeadwordsRequestRunnable( StardictHeadwordsRequest & r_,
|
|
QSemaphore & hasExited_ ): r( r_ ),
|
|
hasExited( hasExited_ )
|
|
{}
|
|
|
|
~StardictHeadwordsRequestRunnable()
|
|
{
|
|
hasExited.release();
|
|
}
|
|
|
|
void run() override;
|
|
};
|
|
|
|
class StardictHeadwordsRequest: public Dictionary::WordSearchRequest
|
|
{
|
|
friend class StardictHeadwordsRequestRunnable;
|
|
|
|
wstring word;
|
|
StardictDictionary & dict;
|
|
|
|
QAtomicInt isCancelled;
|
|
QSemaphore hasExited;
|
|
|
|
public:
|
|
|
|
StardictHeadwordsRequest( wstring const & word_,
|
|
StardictDictionary & dict_ ):
|
|
word( word_ ), dict( dict_ )
|
|
{
|
|
QThreadPool::globalInstance()->start(
|
|
new StardictHeadwordsRequestRunnable( *this, hasExited ) );
|
|
}
|
|
|
|
void run(); // Run from another thread by StardictHeadwordsRequestRunnable
|
|
|
|
void cancel() override
|
|
{
|
|
isCancelled.ref();
|
|
}
|
|
|
|
~StardictHeadwordsRequest()
|
|
{
|
|
isCancelled.ref();
|
|
hasExited.acquire();
|
|
}
|
|
};
|
|
|
|
void StardictHeadwordsRequestRunnable::run()
|
|
{
|
|
r.run();
|
|
}
|
|
|
|
void StardictHeadwordsRequest::run()
|
|
{
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
try
|
|
{
|
|
vector< WordArticleLink > chain = dict.findArticles( word );
|
|
|
|
wstring caseFolded = Folding::applySimpleCaseOnly( word );
|
|
|
|
for( unsigned x = 0; x < chain.size(); ++x )
|
|
{
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
string headword, articleText;
|
|
|
|
dict.loadArticle( chain[ x ].articleOffset,
|
|
headword, articleText );
|
|
|
|
wstring headwordDecoded = Utf8::decode( headword );
|
|
|
|
if ( caseFolded != Folding::applySimpleCaseOnly( headwordDecoded ) )
|
|
{
|
|
// The headword seems to differ from the input word, which makes the
|
|
// input word its synonym.
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
matches.push_back( headwordDecoded );
|
|
}
|
|
}
|
|
}
|
|
catch( std::exception & e )
|
|
{
|
|
setErrorString( QString::fromUtf8( e.what() ) );
|
|
}
|
|
|
|
finish();
|
|
}
|
|
|
|
sptr< Dictionary::WordSearchRequest >
|
|
StardictDictionary::findHeadwordsForSynonym( wstring const & word )
|
|
|
|
{
|
|
return synonymSearchEnabled ? std::make_shared<StardictHeadwordsRequest>( word, *this ) :
|
|
Class::findHeadwordsForSynonym( word );
|
|
}
|
|
|
|
|
|
/// StardictDictionary::getArticle()
|
|
|
|
class StardictArticleRequest;
|
|
|
|
class StardictArticleRequestRunnable: public QRunnable
|
|
{
|
|
StardictArticleRequest & r;
|
|
QSemaphore & hasExited;
|
|
|
|
public:
|
|
|
|
StardictArticleRequestRunnable( StardictArticleRequest & r_,
|
|
QSemaphore & hasExited_ ): r( r_ ),
|
|
hasExited( hasExited_ )
|
|
{}
|
|
|
|
~StardictArticleRequestRunnable()
|
|
{
|
|
hasExited.release();
|
|
}
|
|
|
|
void run() override;
|
|
};
|
|
|
|
class StardictArticleRequest: public Dictionary::DataRequest
|
|
{
|
|
friend class StardictArticleRequestRunnable;
|
|
|
|
wstring word;
|
|
vector< wstring > alts;
|
|
StardictDictionary & dict;
|
|
bool ignoreDiacritics;
|
|
|
|
QAtomicInt isCancelled;
|
|
QSemaphore hasExited;
|
|
|
|
public:
|
|
|
|
StardictArticleRequest( wstring const & word_,
|
|
vector< wstring > const & alts_,
|
|
StardictDictionary & dict_,
|
|
bool ignoreDiacritics_ ):
|
|
word( word_ ), alts( alts_ ), dict( dict_ ), ignoreDiacritics( ignoreDiacritics_ )
|
|
{
|
|
QThreadPool::globalInstance()->start(
|
|
new StardictArticleRequestRunnable( *this, hasExited ) );
|
|
}
|
|
|
|
void run(); // Run from another thread by StardictArticleRequestRunnable
|
|
|
|
void cancel() override
|
|
{
|
|
isCancelled.ref();
|
|
}
|
|
|
|
~StardictArticleRequest()
|
|
{
|
|
isCancelled.ref();
|
|
hasExited.acquire();
|
|
}
|
|
};
|
|
|
|
void StardictArticleRequestRunnable::run()
|
|
{
|
|
r.run();
|
|
}
|
|
|
|
void StardictArticleRequest::run()
|
|
{
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
try
|
|
{
|
|
vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
|
|
|
|
for( unsigned x = 0; x < alts.size(); ++x )
|
|
{
|
|
/// Make an additional query for each alt
|
|
|
|
vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
|
|
|
|
chain.insert( chain.end(), altChain.begin(), altChain.end() );
|
|
}
|
|
|
|
multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
|
|
|
|
set< uint32_t > articlesIncluded; // Some synonyms cause the articles to
|
|
// appear several times. We combat this
|
|
// by only allowing them to appear once.
|
|
|
|
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
|
|
if( ignoreDiacritics )
|
|
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
|
|
|
|
for( unsigned x = 0; x < chain.size(); ++x )
|
|
{
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
|
|
continue; // We already have this article in the body.
|
|
|
|
// Now grab that article
|
|
|
|
string headword, articleText;
|
|
|
|
dict.loadArticle( chain[ x ].articleOffset, headword, articleText );
|
|
|
|
// Ok. Now, does it go to main articles, or to alternate ones? We list
|
|
// main ones first, and alternates after.
|
|
|
|
// We do the case-folded comparison here.
|
|
|
|
wstring headwordStripped =
|
|
Folding::applySimpleCaseOnly( Utf8::decode( headword ) );
|
|
if( ignoreDiacritics )
|
|
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
|
|
|
|
multimap< wstring, pair< string, string > > & mapToUse =
|
|
( wordCaseFolded == headwordStripped ) ?
|
|
mainArticles : alternateArticles;
|
|
|
|
mapToUse.insert( pair< wstring, pair< string, string > >(
|
|
Folding::applySimpleCaseOnly( Utf8::decode( headword ) ),
|
|
pair< string, string >( headword, articleText ) ) );
|
|
|
|
articlesIncluded.insert( chain[ x ].articleOffset );
|
|
}
|
|
|
|
if ( mainArticles.empty() && alternateArticles.empty() )
|
|
{
|
|
// No such word
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
string result;
|
|
|
|
multimap< wstring, pair< string, string > >::const_iterator i;
|
|
|
|
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
|
|
"</font>""</font>""</font>""</font>""</font>""</font>"
|
|
"</b></b></b></b></b></b></b></b>"
|
|
"</i></i></i></i></i></i></i></i>";
|
|
|
|
for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
|
|
{
|
|
result += dict.isFromLanguageRTL() ? R"(<h3 class="sdct_headwords" dir="rtl">)" : "<h3 class=\"sdct_headwords\">";
|
|
result += i->second.first;
|
|
result += "</h3>";
|
|
if( dict.isToLanguageRTL() )
|
|
result += R"(<div style="display:inline;" dir="rtl">)";
|
|
result += i->second.second;
|
|
result += cleaner;
|
|
if( dict.isToLanguageRTL() )
|
|
result += "</div>";
|
|
}
|
|
|
|
for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
|
|
{
|
|
result += dict.isFromLanguageRTL() ? R"(<h3 class="sdct_headwords" dir="rtl">)" : "<h3 class=\"sdct_headwords\">";
|
|
result += i->second.first;
|
|
result += "</h3>";
|
|
if( dict.isToLanguageRTL() )
|
|
result += R"(<div style="display:inline;" dir="rtl">)";
|
|
result += i->second.second;
|
|
result += cleaner;
|
|
if( dict.isToLanguageRTL() )
|
|
result += "</div>";
|
|
}
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
data.resize( result.size() );
|
|
|
|
memcpy( &data.front(), result.data(), result.size() );
|
|
|
|
hasAnyData = true;
|
|
}
|
|
catch( std::exception & e )
|
|
{
|
|
setErrorString( QString::fromUtf8( e.what() ) );
|
|
}
|
|
|
|
finish();
|
|
}
|
|
|
|
sptr< Dictionary::DataRequest > StardictDictionary::getArticle( wstring const & word,
|
|
vector< wstring > const & alts,
|
|
wstring const &,
|
|
bool ignoreDiacritics )
|
|
|
|
{
|
|
return std::make_shared<StardictArticleRequest>( word, alts, *this, ignoreDiacritics );
|
|
}
|
|
|
|
|
|
static char const * beginsWith( char const * substr, char const * str )
|
|
{
|
|
size_t len = strlen( substr );
|
|
|
|
return strncmp( str, substr, len ) == 0 ? str + len : 0;
|
|
}
|
|
|
|
Ifo::Ifo( File::Class & f ):
|
|
wordcount( 0 ), synwordcount( 0 ), idxfilesize( 0 ), idxoffsetbits( 32 )
|
|
{
|
|
static string const versionEq( "version=" );
|
|
|
|
static string const booknameEq( "bookname=" );
|
|
|
|
//GD_DPRINTF( "%s<\n", f.gets().c_str() );
|
|
//GD_DPRINTF( "%s<\n", f.gets().c_str() );
|
|
|
|
if ( QString::fromUtf8(f.gets().c_str()) != "StarDict's dict ifo file" ||
|
|
f.gets().compare( 0, versionEq.size(), versionEq ) )
|
|
throw exNotAnIfoFile();
|
|
|
|
/// Now go through the file and parse options
|
|
|
|
try
|
|
{
|
|
char option[ 16384 ];
|
|
|
|
for( ; ; )
|
|
{
|
|
if ( !f.gets( option, sizeof( option ), true ) )
|
|
break;
|
|
|
|
if ( char const * val = beginsWith( "bookname=", option ) )
|
|
bookname = val;
|
|
else
|
|
if ( char const * val = beginsWith( "wordcount=", option ) )
|
|
{
|
|
if ( sscanf( val, "%u", & wordcount ) != 1 )
|
|
throw exBadFieldInIfo( option );
|
|
}
|
|
else
|
|
if ( char const * val = beginsWith( "synwordcount=", option ) )
|
|
{
|
|
if ( sscanf( val, "%u", & synwordcount ) != 1 )
|
|
throw exBadFieldInIfo( option );
|
|
}
|
|
else
|
|
if ( char const * val = beginsWith( "idxfilesize=", option ) )
|
|
{
|
|
if ( sscanf( val, "%u", & idxfilesize ) != 1 )
|
|
throw exBadFieldInIfo( option );
|
|
}
|
|
else
|
|
if ( char const * val = beginsWith( "idxoffsetbits=", option ) )
|
|
{
|
|
if ( sscanf( val, "%u", & idxoffsetbits ) != 1 || ( idxoffsetbits != 32
|
|
&& idxoffsetbits != 64 ) )
|
|
throw exBadFieldInIfo( option );
|
|
}
|
|
else
|
|
if ( char const * val = beginsWith( "sametypesequence=", option ) )
|
|
sametypesequence = val;
|
|
else
|
|
if ( char const * val = beginsWith( "dicttype=", option ) )
|
|
dicttype = val;
|
|
else
|
|
if ( char const * val = beginsWith( "description=", option ) )
|
|
description = val;
|
|
else
|
|
if ( char const * val = beginsWith( "copyright=", option ) )
|
|
copyright = val;
|
|
else
|
|
if ( char const * val = beginsWith( "author=", option ) )
|
|
author = val;
|
|
else
|
|
if ( char const * val = beginsWith( "email=", option ) )
|
|
email = val;
|
|
else
|
|
if ( char const * val = beginsWith( "website=", option ) )
|
|
website = val;
|
|
else
|
|
if ( char const * val = beginsWith( "date=", option ) )
|
|
date = val;
|
|
}
|
|
}
|
|
catch( File::exReadError & )
|
|
{
|
|
}
|
|
}
|
|
|
|
//// StardictDictionary::getResource()
|
|
|
|
class StardictResourceRequest;
|
|
|
|
class StardictResourceRequestRunnable: public QRunnable
|
|
{
|
|
StardictResourceRequest & r;
|
|
QSemaphore & hasExited;
|
|
|
|
public:
|
|
|
|
StardictResourceRequestRunnable( StardictResourceRequest & r_,
|
|
QSemaphore & hasExited_ ): r( r_ ),
|
|
hasExited( hasExited_ )
|
|
{}
|
|
|
|
~StardictResourceRequestRunnable()
|
|
{
|
|
hasExited.release();
|
|
}
|
|
|
|
void run() override;
|
|
};
|
|
|
|
class StardictResourceRequest: public Dictionary::DataRequest
|
|
{
|
|
friend class StardictResourceRequestRunnable;
|
|
|
|
StardictDictionary & dict;
|
|
|
|
string resourceName;
|
|
|
|
QAtomicInt isCancelled;
|
|
QSemaphore hasExited;
|
|
|
|
public:
|
|
|
|
StardictResourceRequest( StardictDictionary & dict_,
|
|
string const & resourceName_ ):
|
|
dict( dict_ ),
|
|
resourceName( resourceName_ )
|
|
{
|
|
QThreadPool::globalInstance()->start(
|
|
new StardictResourceRequestRunnable( *this, hasExited ) );
|
|
}
|
|
|
|
void run(); // Run from another thread by StardictResourceRequestRunnable
|
|
|
|
void cancel() override
|
|
{
|
|
isCancelled.ref();
|
|
}
|
|
|
|
~StardictResourceRequest()
|
|
{
|
|
isCancelled.ref();
|
|
hasExited.acquire();
|
|
}
|
|
};
|
|
|
|
void StardictResourceRequestRunnable::run()
|
|
{
|
|
r.run();
|
|
}
|
|
|
|
void StardictResourceRequest::run()
|
|
{
|
|
// Some runnables linger enough that they are cancelled before they start
|
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
|
{
|
|
finish();
|
|
return;
|
|
}
|
|
|
|
try
|
|
{
|
|
if( resourceName.at( 0 ) == '\x1E' )
|
|
resourceName = resourceName.erase( 0, 1 );
|
|
if( resourceName.at( resourceName.length() - 1 ) == '\x1F' )
|
|
resourceName.erase( resourceName.length() - 1, 1 );
|
|
|
|
string n =
|
|
FsEncoding::dirname( dict.getDictionaryFilenames()[ 0 ] ) +
|
|
FsEncoding::separator() +
|
|
"res" +
|
|
FsEncoding::separator() +
|
|
FsEncoding::encode( resourceName );
|
|
|
|
GD_DPRINTF( "n is %s\n", n.c_str() );
|
|
|
|
try
|
|
{
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
File::loadFromFile( n, data );
|
|
}
|
|
catch( File::exCantOpen & )
|
|
{
|
|
// Try reading from zip file
|
|
|
|
if ( dict.resourceZip.isOpen() )
|
|
{
|
|
Mutex::Lock _( dict.resourceZipMutex );
|
|
|
|
Mutex::Lock __( dataMutex );
|
|
|
|
if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) )
|
|
throw; // Make it fail since we couldn't read the archive
|
|
}
|
|
else
|
|
throw;
|
|
}
|
|
|
|
if ( Filetype::isNameOfTiff( resourceName ) )
|
|
{
|
|
// Convert it
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
GdTiff::tiff2img( data );
|
|
}
|
|
|
|
if( Filetype::isNameOfCSS( resourceName ) )
|
|
{
|
|
Mutex::Lock _( dataMutex );
|
|
|
|
QString css = QString::fromUtf8( data.data(), data.size() );
|
|
|
|
// Correct some URLs
|
|
|
|
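// e.g. url("image.png") inside the dictionary's CSS is rewritten to
// url("bres://<dictionary id>/image.png") so the article view can resolve it
// through this dictionary's resource handler.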
QString id = QString::fromUtf8( dict.getId().c_str() );
|
|
int pos = 0;
|
|
|
|
QRegularExpression links( R"(url\(\s*(['"]?)([^'"]*)(['"]?)\s*\))",
|
|
QRegularExpression::CaseInsensitiveOption );
|
|
|
|
QString newCSS;
|
|
QRegularExpressionMatchIterator it = links.globalMatch( css );
|
|
while( it.hasNext() )
|
|
{
|
|
QRegularExpressionMatch match = it.next();
|
|
newCSS += css.mid( pos, match.capturedStart() - pos );
|
|
pos = match.capturedEnd();
|
|
|
|
QString url = match.captured( 2 );
|
|
|
|
if( url.indexOf( ":/" ) >= 0 || url.indexOf( "data:" ) >= 0)
|
|
{
|
|
// External link
|
|
newCSS += match.captured();
|
|
continue;
|
|
}
|
|
|
|
QString newUrl = QString( "url(" ) + match.captured( 1 ) + "bres://"
|
|
+ id + "/" + url + match.captured( 3 ) + ")";
|
|
newCSS += newUrl;
|
|
}
|
|
if( pos )
|
|
{
|
|
newCSS += css.mid( pos );
|
|
css = newCSS;
|
|
newCSS.clear();
|
|
}
|
|
|
|
dict.isolateCSS( css );
|
|
QByteArray bytes = css.toUtf8();
|
|
data.resize( bytes.size() );
|
|
memcpy( &data.front(), bytes.constData(), bytes.size() );
|
|
}
|
|
|
|
Mutex::Lock _( dataMutex );
|
|
hasAnyData = true;
|
|
}
|
|
catch( std::exception &ex )
|
|
{
|
|
gdWarning( "Stardict: Failed loading resource \"%s\" for \"%s\", reason: %s\n",
|
|
resourceName.c_str(), dict.getName().c_str(), ex.what() );
|
|
// Resource not loaded -- we don't set the hasAnyData flag then
|
|
}
|
|
catch( ... )
|
|
{
|
|
}
|
|
|
|
finish();
|
|
}
|
|
|
|
sptr< Dictionary::DataRequest > StardictDictionary::getResource( string const & name )
|
|
|
|
{
|
|
return std::make_shared<StardictResourceRequest>( *this, name );
|
|
}
|
|
|
|
} // anonymous namespace
|
|
|
|
static void findCorrespondingFiles( string const & ifo,
|
|
string & idx, string & dict, string & syn )
|
|
{
|
|
string base( ifo, 0, ifo.size() - 3 );
|
|
|
|
if ( !(
|
|
File::tryPossibleName( base + "idx", idx ) ||
|
|
File::tryPossibleName( base + "idx.gz", idx ) ||
|
|
File::tryPossibleName( base + "idx.dz", idx ) ||
|
|
File::tryPossibleName( base + "IDX", idx ) ||
|
|
File::tryPossibleName( base + "IDX.GZ", idx ) ||
|
|
File::tryPossibleName( base + "IDX.DZ", idx )
|
|
) )
|
|
throw exNoIdxFile( ifo );
|
|
|
|
if ( !(
|
|
File::tryPossibleName( base + "dict", dict ) ||
|
|
File::tryPossibleName( base + "dict.dz", dict ) ||
|
|
File::tryPossibleName( base + "DICT", dict ) ||
|
|
File::tryPossibleName( base + "dict.DZ", dict )
|
|
) )
|
|
throw exNoDictFile( ifo );
|
|
|
|
if ( !(
|
|
File::tryPossibleName( base + "syn", syn ) ||
|
|
File::tryPossibleName( base + "syn.gz", syn ) ||
|
|
File::tryPossibleName( base + "syn.dz", syn ) ||
|
|
File::tryPossibleName( base + "SYN", syn ) ||
|
|
File::tryPossibleName( base + "SYN.GZ", syn ) ||
|
|
File::tryPossibleName( base + "SYN.DZ", syn )
|
|
) )
|
|
syn.clear();
|
|
}
|
|
|
|
static void handleIdxSynFile( string const & fileName,
|
|
IndexedWords & indexedWords,
|
|
ChunkedStorage::Writer & chunks,
|
|
vector< uint32_t > * articleOffsets,
|
|
bool isSynFile, bool parseHeadwords )
|
|
{
|
|
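// Record layout, as read below: an .idx record is a zero-terminated UTF-8
// word followed by a 32-bit article offset and a 32-bit article size, both in
// network byte order; a .syn record is a zero-terminated word followed by a
// 32-bit index into the .idx word list, also in network byte order.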
gzFile stardictIdx = gd_gzopen( fileName.c_str() );
|
|
if ( !stardictIdx )
|
|
throw exCantReadFile( fileName );
|
|
|
|
vector< char > image;
|
|
|
|
for( ; ; )
|
|
{
|
|
size_t oldSize = image.size();
|
|
|
|
image.resize( oldSize + 65536 );
|
|
|
|
int rd = gzread( stardictIdx, &image.front() + oldSize, 65536 );
|
|
|
|
if ( rd < 0 )
|
|
{
|
|
gzclose( stardictIdx );
|
|
throw exCantReadFile( fileName );
|
|
}
|
|
|
|
if ( rd != 65536 )
|
|
{
|
|
image.resize( oldSize + rd + 1 );
|
|
break;
|
|
}
|
|
}
|
|
gzclose( stardictIdx );
|
|
|
|
// We append one zero byte to catch runaway string at the end, if any
|
|
|
|
image.back() = 0;
|
|
|
|
// Now parse it
|
|
|
|
for( char const * ptr = &image.front(); ptr != &image.back(); )
|
|
{
|
|
size_t wordLen = strlen( ptr );
|
|
|
|
if ( ptr + wordLen + 1 + ( isSynFile ? sizeof( uint32_t ) :
|
|
sizeof( uint32_t ) * 2 ) >
|
|
&image.back() )
|
|
{
|
|
GD_FDPRINTF( stderr, "Warning: sudden end of file %s\n", fileName.c_str() );
|
|
break;
|
|
}
|
|
|
|
char const * word = ptr;
|
|
|
|
ptr += wordLen + 1;
|
|
|
|
uint32_t offset;
|
|
|
|
if( strstr( word, "&#" ) )
|
|
{
|
|
// Decode some html-coded symbols in headword
|
|
string unescapedWord = Html::unescapeUtf8( word );
|
|
strncpy( (char *)word, unescapedWord.c_str(), wordLen );
|
|
wordLen = strlen( word );
|
|
}
|
|
|
|
if ( !isSynFile )
|
|
{
|
|
// We're processing the .idx file
|
|
      uint32_t articleOffset, articleSize;

      memcpy( &articleOffset, ptr, sizeof( uint32_t ) );
      ptr += sizeof( uint32_t );
      memcpy( &articleSize, ptr, sizeof( uint32_t ) );
      ptr += sizeof( uint32_t );

      articleOffset = ntohl( articleOffset );
      articleSize = ntohl( articleSize );

      // Create an entry for the article in the chunked storage

      offset = chunks.startNewBlock();

      if ( articleOffsets )
        articleOffsets->push_back( offset );

      chunks.addToBlock( &articleOffset, sizeof( uint32_t ) );
      chunks.addToBlock( &articleSize, sizeof( uint32_t ) );
      chunks.addToBlock( word, wordLen + 1 );
    }
    else
    {
      // We're processing the .syn file
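      // A .syn record's payload is a single 32-bit big-endian integer: the
      // ordinal of the .idx entry the synonym refers to. Look it up in
      // articleOffsets so the synonym maps to the same chunked-storage block,
      // and hence the same article, as the original headword.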
      uint32_t offsetInIndex;

      memcpy( &offsetInIndex, ptr, sizeof( uint32_t ) );
      ptr += sizeof( uint32_t );

      offsetInIndex = ntohl( offsetInIndex );

      if ( offsetInIndex >= articleOffsets->size() )
        throw exIncorrectOffset( fileName );

      offset = (*articleOffsets)[ offsetInIndex ];

      // Some StarDict dictionaries are in fact badly converted Babylon ones.
      // They contain a lot of superfluous slashed entries with dollar signs.
      // We try to filter them out here, since those entries become much more
      // apparent in GoldenDict than they were in StarDict because of
      // punctuation folding. Hopefully there aren't many valid synonyms which
      // really start with a slash and contain dollar signs, or end with a
      // dollar sign and contain slashes.
      if ( *word == '/' )
      {
        if ( strchr( word, '$' ) )
          continue; // Skip this entry
      }
      else
      if ( wordLen && word[ wordLen - 1 ] == '$' )
      {
        if ( strchr( word, '/' ) )
          continue; // Skip this entry
      }
    }

    // Insert new entry into an index

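    // With parseHeadwords set, addWord() also indexes the individual words of
    // a multi-word headword so they can be matched on their own; otherwise
    // addSingleWord() records only the headword itself, which keeps indexing
    // cheap when a dictionary exceeds maxHeadwordsToExpand.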
    if( parseHeadwords )
      indexedWords.addWord( Utf8::decode( word ), offset );
    else
      indexedWords.addSingleWord( Utf8::decode( word ), offset );
  }

  GD_DPRINTF( "%u entries made\n", (unsigned) indexedWords.size() );
}


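// Scans fileNames for .ifo files, builds (or reuses) a GoldenDict index for
// each dictionary found under indicesDir, and returns the resulting
// dictionary objects. Dictionaries that fail to load are skipped with a warning.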
vector< sptr< Dictionary::Class > > makeDictionaries(
                                      vector< string > const & fileNames,
                                      string const & indicesDir,
                                      Dictionary::Initializing & initializing,
                                      unsigned maxHeadwordsToExpand )
{
  vector< sptr< Dictionary::Class > > dictionaries;

  for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
       ++i )
  {
    if ( i->size() < 4 ||
         strcasecmp( i->c_str() + ( i->size() - 4 ), ".ifo" ) != 0 )
      continue;

    try
    {
      vector< string > dictFiles( 1, *i );

      string idxFileName, dictFileName, synFileName;

      findCorrespondingFiles( *i, idxFileName, dictFileName, synFileName );

      dictFiles.push_back( idxFileName );
      dictFiles.push_back( dictFileName );

      if ( synFileName.size() )
        dictFiles.push_back( synFileName );

      // See if there's a zip file with resources present. If so, include it.

      string zipFileName;
      string baseName = FsEncoding::dirname( idxFileName ) + FsEncoding::separator();

      if ( File::tryPossibleZipName( baseName + "res.zip", zipFileName ) ||
           File::tryPossibleZipName( baseName + "RES.ZIP", zipFileName ) ||
           File::tryPossibleZipName( baseName + "res" + FsEncoding::separator() + "res.zip", zipFileName ) )
        dictFiles.push_back( zipFileName );

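      // The dictionary id is derived from the full set of files above; the
      // cached GoldenDict index lives under that name in indicesDir and is
      // rebuilt whenever the source files are newer than it or its format is
      // out of date.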
      string dictId = Dictionary::makeDictionaryId( dictFiles );

      string indexFile = indicesDir + dictId;

      if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
           indexIsOldOrBad( indexFile ) )
      {
        // Building the index

        File::Class ifoFile( *i, "r" );

        Ifo ifo( ifoFile );

        gdDebug( "Stardict: Building the index for dictionary: %s\n", ifo.bookname.c_str() );

        if ( ifo.idxoffsetbits == 64 )
          throw ex64BitsNotSupported();

        if ( ifo.dicttype.size() )
          throw exDicttypeNotSupported();

        if( synFileName.empty() )
        {
          if ( ifo.synwordcount )
          {
            GD_DPRINTF( "Warning: dictionary has synwordcount specified, but no "
                        "corresponding .syn file was found\n" );
            ifo.synwordcount = 0; // Pretend it wasn't there
          }
        }
        else
        if ( !ifo.synwordcount )
        {
          GD_DPRINTF( "Warning: ignoring .syn file %s, since there's no synwordcount specified in the .ifo\n",
                      synFileName.c_str() );
        }

        GD_DPRINTF( "bookname = %s\n", ifo.bookname.c_str() );
        GD_DPRINTF( "wordcount = %u\n", ifo.wordcount );

        initializing.indexingDictionary( ifo.bookname );

        File::Class idx( indexFile, "wb" );

        IdxHeader idxHeader;

        memset( &idxHeader, 0, sizeof( idxHeader ) );

        // We write a dummy header first. At the end of the process the header
        // will be rewritten with the right values.

        idx.write( idxHeader );

        idx.write( ifo.bookname.data(), ifo.bookname.size() );
        idx.write( ifo.sametypesequence.data(), ifo.sametypesequence.size() );

        IndexedWords indexedWords;

        ChunkedStorage::Writer chunks( idx );

        // Load indices
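        // The .idx file goes first so that articleOffsets gets filled in; the
        // .syn entries are then resolved against it. Headword expansion is
        // turned off once the total entry count reaches maxHeadwordsToExpand
        // (a value of zero means no limit).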
        if ( !ifo.synwordcount )
          handleIdxSynFile( idxFileName, indexedWords, chunks, 0, false,
                            !maxHeadwordsToExpand || ifo.wordcount < maxHeadwordsToExpand );
        else
        {
          vector< uint32_t > articleOffsets;

          articleOffsets.reserve( ifo.wordcount );

          handleIdxSynFile( idxFileName, indexedWords, chunks, &articleOffsets,
                            false,
                            !maxHeadwordsToExpand || ( ifo.wordcount + ifo.synwordcount ) < maxHeadwordsToExpand );

          handleIdxSynFile( synFileName, indexedWords, chunks, &articleOffsets,
                            true,
                            !maxHeadwordsToExpand || ( ifo.wordcount + ifo.synwordcount ) < maxHeadwordsToExpand );
        }

        // Finish with the chunks

        idxHeader.chunksOffset = chunks.finish();

        // Build index

        IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );

        idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
        idxHeader.indexRootOffset = idxInfo.rootOffset;

        // Fill in the remaining header fields

        idxHeader.signature = Signature;
        idxHeader.formatVersion = CurrentFormatVersion;

        idxHeader.wordCount = ifo.wordcount;
        idxHeader.synWordCount = ifo.synwordcount;
        idxHeader.bookNameSize = ifo.bookname.size();
        idxHeader.sameTypeSequenceSize = ifo.sametypesequence.size();

        // read languages
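        // LangCoder tries to guess a source/target language pair from language
        // codes embedded in the file name; if nothing is found there, the
        // bookname from the .ifo is tried as a fallback.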
        QPair< quint32, quint32 > langs =
          LangCoder::findIdsForFilename( QString::fromStdString( dictFileName ) );

        // if no languages found, try dictionary's name
        if ( langs.first == 0 || langs.second == 0 )
        {
          langs =
            LangCoder::findIdsForFilename( QString::fromStdString( ifo.bookname ) );
        }

        idxHeader.langFrom = langs.first;
        idxHeader.langTo = langs.second;

        // If there was a zip file, index it too

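        // File names inside res.zip are indexed into a second, separate B-tree
        // stored in the same index file, so resources can later be looked up
        // without rescanning the archive.
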
        if ( zipFileName.size() )
        {
          GD_DPRINTF( "Indexing zip file\n" );

          idxHeader.hasZipFile = 1;

          IndexedWords zipFileNames;
          IndexedZip zipFile;
          if( zipFile.openZipFile( QDir::fromNativeSeparators(
                                     FsEncoding::decode( zipFileName.c_str() ) ) ) )
            zipFile.indexFile( zipFileNames );

          if( !zipFileNames.empty() )
          {
            // Build the resulting zip file index

            IndexInfo idxInfo = BtreeIndexing::buildIndex( zipFileNames, idx );

            idxHeader.zipIndexBtreeMaxElements = idxInfo.btreeMaxElements;
            idxHeader.zipIndexRootOffset = idxInfo.rootOffset;
          }
          else
          {
            // Bad zip file -- no index (though the mark that we have one
            // remains)
            idxHeader.zipIndexBtreeMaxElements = 0;
            idxHeader.zipIndexRootOffset = 0;
          }
        }
        else
          idxHeader.hasZipFile = 0;

        // That concludes it. Update the header.

        idx.rewind();

        idx.write( &idxHeader, sizeof( idxHeader ) );
      }

      dictionaries.push_back( std::make_shared<StardictDictionary>( dictId,
                                                                    indexFile,
                                                                    dictFiles ) );
    }
    catch( std::exception & e )
    {
      gdWarning( "Stardict dictionary initializing failed: %s, error: %s\n",
                 i->c_str(), e.what() );
    }
  }

  return dictionaries;
}

}