goldendict-ng/slob.cc

1315 lines
34 KiB
C++
Raw Normal View History

2015-01-22 15:17:05 +00:00
/* This file is (c) 2015 Abs62
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#ifdef MAKE_ZIM_SUPPORT
#include "slob.hh"
#include "btreeidx.hh"
#include "fsencoding.hh"
#include "folding.hh"
#include "gddebug.hh"
#include "utf8.hh"
#include "decompress.hh"
#include "langcoder.hh"
#include "wstring.hh"
#include "wstring_qt.hh"
#include "ftshelpers.hh"
#include "htmlescape.hh"
#include "filetype.hh"
#include "tiff.hh"
#ifdef _MSC_VER
#include <stub_msvc.h>
#endif
#include <QString>
#include <QFile>
#include <QTextCodec>
#include <QMap>
#include <QPair>
#include <QRegExp>
#include <string>
#include <vector>
#include <map>
#include <set>
namespace Slob {
using std::string;
using std::map;
using std::vector;
using std::multimap;
using std::pair;
using std::set;
using gd::wstring;
using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo;
DEF_EX_STR( exNotSlobFile, "Not an Slob file", Dictionary::Ex )
DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex )
DEF_EX_STR( exCantDecodeFile, "Can't decode file", Dictionary::Ex )
DEF_EX_STR( exNoCodecFound, "No text codec found", Dictionary::Ex )
DEF_EX( exUserAbort, "User abort", Dictionary::Ex )
#pragma pack( push, 1 )
// Constants written into, and checked against, the header of the index file.
enum
{
Signature = 0x58424C53, // SLBX on little-endian, XBLS on big-endian
// Depends on the btree and folding versions, so a change in either one makes
// indexIsOldOrBad() reject index files written with the previous value.
CurrentFormatVersion = 1 + BtreeIndexing::FormatVersion + Folding::Version
};
// Header placed at the very beginning of the index file. It is written and
// read back as a raw block of bytes (see makeDictionaries() and the
// SlobDictionary constructor), which is why the structure is packed and why
// the order and width of the fields must not change.
struct IdxHeader
{
quint32 signature; // First comes the signature, SLBX
quint32 formatVersion; // File format version (CurrentFormatVersion)
quint32 indexBtreeMaxElements; // Two fields from IndexInfo
quint32 indexRootOffset;
quint32 resourceIndexBtreeMaxElements; // Two fields from IndexInfo
quint32 resourceIndexRootOffset;
quint32 wordCount; // Number of references indexed as article headwords
quint32 articleCount; // Number of distinct (itemIndex, binIndex) pairs among them
quint32 langFrom; // Source language
quint32 langTo; // Target language
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
#pragma pack( pop )
// The first 8 bytes of a slob file: "!-1SLOB" followed by 0x1F.
// SlobFile::open() compares the start of the file against this array.
const char SLOB_MAGIC[ 8 ] = { 0x21, 0x2d, 0x31, 0x53, 0x4c, 0x4f, 0x42, 0x1f };
/// One decoded entry of the slob reference table (see SlobFile::getRefEntry()).
///
/// The field widths mirror what is read from the file: the first index is a
/// 32-bit value and the second one a 16-bit value. Declaring itemIndex as a
/// 16-bit field silently truncated every index above 65535, and it also
/// disagreed with code that treats itemIndex as a full 32-bit quantity
/// (the quint32 item cache key, and the "itemIndex << 32" position key built
/// while indexing).
struct RefEntry
{
  QString key;        // Text key of the reference (headword or resource name)
  quint32 itemIndex;  // 32-bit index stored right after the key
  quint16 binIndex;   // 16-bit index stored after itemIndex
  QString fragment;   // Tiny-text fragment stored last
};
/// Checks whether the index file has to be (re)built.
/// @param indexFile  Path of the index file to inspect.
/// @return true when a complete header cannot be read, or when its signature
///         or format version differ from the ones this code writes.
bool indexIsOldOrBad( string const & indexFile )
{
  File::Class idx( indexFile, "rb" );

  IdxHeader header;

  // A file too short to contain a header is unusable
  if( idx.readRecords( &header, sizeof( header ), 1 ) != 1 )
    return true;

  if( header.signature != Signature )
    return true;

  return header.formatVersion != CurrentFormatVersion;
}
/// Reader for a single slob container file.
///
/// open() parses the file header; afterwards getRefEntry() decodes entries of
/// the reference table and getItem() extracts the content they point to.
/// The object keeps the most recently decompressed item cached
/// (currentItem / currentItemData). The class does no locking of its own.
class SlobFile
{
  enum Compressions
  { UNKNOWN = 0, ZLIB, BZ2, LZMA2 };

  QFile file;
  QString fileName, dictionaryName;
  Compressions compression;
  QString encoding;
  unsigned char uuid[ 16 ];
  QTextCodec *codec;                  // Codec matching 'encoding'; 0 until open() finds one
  QMap< QString, QString > tags;
  QVector< QString > contentTypes;    // Indexed by the content id returned from getItem()
  quint32 blobCount;
  quint64 storeOffset, fileSize, refsOffset;
  quint32 refsCount, itemsCount;
  quint64 itemsOffset, itemsDataOffset;

  quint32 currentItem;                // Index of the cached item, 0xFFFFFFFF if none
  string currentItemData;             // Decompressed data of the cached item

  /// Read a string prefixed with an 8-bit length.
  QString readTinyText();
  /// Read a string prefixed with a 16-bit big-endian length.
  QString readText();
  /// Read a string prefixed with a 32-bit big-endian length.
  QString readLargeText();
  /// Read 'length' bytes at the current file position and decode them.
  QString readString( unsigned length );

public:
  SlobFile() :
    compression( UNKNOWN )
  , codec( 0 )
  , blobCount( 0 )
  , storeOffset( 0 )
  , fileSize( 0 )
  , refsOffset( 0 )
  , refsCount( 0 )
  , itemsCount( 0 )
  , itemsOffset( 0 )
  , itemsDataOffset( 0 )
  , currentItem( 0xFFFFFFFF )
  {}

  ~SlobFile();

  Compressions getCompression() const
  { return compression; }

  QString const & getEncoding() const
  { return encoding; }

  QString const & getDictionaryName() const
  { return dictionaryName; }

  quint32 blobsCount() const
  { return blobCount; }

  quint64 dataOffset() const
  { return storeOffset; }

  quint32 getRefsCount() const
  { return refsCount; }

  QTextCodec * getCodec() const
  { return codec; }

  /// Content type for the given id, or an empty string for an unknown id.
  QString getContentType( quint8 content_id ) const
  { return content_id < contentTypes.size() ? contentTypes[ content_id ] : QString(); }

  QMap< QString, QString > const & getTags() const
  { return tags; }

  /// Open the file and parse its header; throws on failure.
  void open( const QString & name );

  /// Decode reference number ref_nom into entry; throws on read failure.
  void getRefEntry( quint32 ref_nom, RefEntry & entry );

  /// Return the content id of the data 'entry' points to (0xFF on failure)
  /// and, when data is not null, store the content itself there.
  quint8 getItem( RefEntry const & entry, string * data );
};
/// Closes the underlying file if it is still open.
SlobFile::~SlobFile()
{
  if( file.isOpen() )
    file.close();
}
/// Reads up to @p length bytes from the current file position and converts
/// them to a QString, using the file's codec when one is set.
/// The result is cut at the first NUL character, if there is one.
QString SlobFile::readString( unsigned length )
{
  QByteArray data = file.read( length );

  QString str;
  if( codec != 0 && !data.isEmpty() )
    str = codec->toUnicode( data );
  else
    str = QString( data );

  // Cut at the terminator. The check is ">= 0" so that a string whose very
  // first character is NUL becomes empty instead of being returned with the
  // terminator and everything after it still attached.
  char term = 0;
  int n = str.indexOf( term );
  if( n >= 0 )
    str.resize( n );

  return str;
}
/// Reads a string whose length is stored in the single byte preceding it.
/// Throws exCantReadFile when the length byte cannot be read.
QString SlobFile::readTinyText()
{
  char rawLength;

  if( file.getChar( &rawLength ) )
    return readString( static_cast< unsigned char >( rawLength ) );

  QString error = fileName + ": " + file.errorString();
  throw exCantReadFile( string( error.toUtf8().data() ) );
}
/// Reads a string whose length is stored as a 16-bit big-endian value
/// preceding it. Throws exCantReadFile when the length cannot be read.
QString SlobFile::readText()
{
  quint16 lengthBE;
  qint64 const got = file.read( ( char * )&lengthBE, sizeof( lengthBE ) );

  if( got == sizeof( lengthBE ) )
    return readString( qFromBigEndian( lengthBE ) );

  QString error = fileName + ": " + file.errorString();
  throw exCantReadFile( string( error.toUtf8().data() ) );
}
/// Reads a string whose length is stored as a 32-bit big-endian value
/// preceding it. Throws exCantReadFile when the length cannot be read.
QString SlobFile::readLargeText()
{
  quint32 lengthBE;
  qint64 const got = file.read( ( char * )&lengthBE, sizeof( lengthBE ) );

  if( got == sizeof( lengthBE ) )
    return readString( qFromBigEndian( lengthBE ) );

  QString error = fileName + ": " + file.errorString();
  throw exCantReadFile( string( error.toUtf8().data() ) );
}
void SlobFile::open( const QString & name )
{
QString error( name + ": " );
if( file.isOpen() )
file.close();
fileName = name;
file.setFileName( name );
{
QFileInfo fi( name );
dictionaryName = fi.fileName();
}
for( ; ; )
{
if( !file.open( QFile::ReadOnly ) )
break;
char magic[ 8 ];
if( file.read( magic, sizeof( magic ) ) != sizeof( magic ) )
break;
if( memcmp( magic, SLOB_MAGIC, sizeof( magic ) ) != 0 )
throw exNotSlobFile( string( name.toUtf8().data() ) );
if( file.read( ( char * )uuid, sizeof( uuid ) ) != sizeof( uuid ) )
break;
// Read encoding
encoding = readTinyText();
codec = QTextCodec::codecForName( encoding.toLatin1() );
if( codec == 0 )
{
error = QString( "for encoding \"") + encoding + "\"";
throw exNoCodecFound( string( error.toUtf8().data() ) );
}
// Read compression type
QString compr = readTinyText();
if( compr.compare( "zlib", Qt::CaseInsensitive ) == 0 )
compression = ZLIB;
else
if( compr.compare( "bz2", Qt::CaseInsensitive ) == 0 )
compression = BZ2;
else
if( compr.compare( "lzma2", Qt::CaseInsensitive ) == 0 )
compression = LZMA2;
// Read tags
unsigned char count;
if( !file.getChar( ( char * )&count ) )
break;
for( unsigned i = 0; i < count; i++ )
{
QString key = readTinyText();
QString value = readTinyText();
tags[ key ] = value;
if( key.compare( "label", Qt::CaseInsensitive ) == 0
|| key.compare( "name", Qt::CaseInsensitive ) == 0)
dictionaryName = value;
}
// Read content types
if( !file.getChar( ( char * )&count ) )
break;
for( unsigned i = 0; i < count; i++ )
{
QString type = readText();
contentTypes.append( type );
}
// Read data parameters
quint32 cnt;
if( file.read( ( char * )&cnt, sizeof( cnt ) ) != sizeof( cnt ) )
break;
blobCount = qFromBigEndian( cnt );
quint64 tmp;
if( file.read( ( char * )&tmp, sizeof( tmp ) ) != sizeof( tmp ) )
break;
storeOffset = qFromBigEndian( tmp );
if( file.read( ( char * )&tmp, sizeof( tmp ) ) != sizeof( tmp ) )
break;
fileSize = qFromBigEndian( tmp );
if( file.read( ( char * )&cnt, sizeof( cnt ) ) != sizeof( cnt ) )
break;
refsCount = qFromBigEndian( cnt );
refsOffset = file.pos();
if( !file.seek( storeOffset ) )
break;
if( file.read( ( char * )&cnt, sizeof( cnt ) ) != sizeof( cnt ) )
break;
itemsCount = qFromBigEndian( cnt );
itemsOffset = storeOffset + sizeof( itemsCount );
itemsDataOffset = itemsOffset + itemsCount * sizeof( quint64 );
return;
}
error += file.errorString();
throw exCantReadFile( string( error.toUtf8().data() ) );
}
/// Decodes reference number @p ref_nom into @p entry.
///
/// The 64-bit big-endian slot for the reference is read from the offsets
/// table at refsOffset; it holds the position of the entry relative to the
/// end of that table. The entry itself is a text key, a 32-bit index, a
/// 16-bit index and a tiny-text fragment.
/// Throws exCantReadFile when any seek or read fails.
void SlobFile::getRefEntry( quint32 ref_nom, RefEntry & entry )
{
  quint64 const slotPos = refsOffset + ref_nom * sizeof( quint64 );

  // Single-pass block: any 'break' leads to the read error below
  do
  {
    if( !file.seek( slotPos ) )
      break;

    quint64 relativePosBE;
    if( file.read( ( char * )&relativePosBE, sizeof( relativePosBE ) ) != sizeof( relativePosBE ) )
      break;

    quint64 const entryPos = qFromBigEndian( relativePosBE ) + refsOffset + refsCount * sizeof( quint64 );
    if( !file.seek( entryPos ) )
      break;

    entry.key = readText();

    quint32 itemIndexBE;
    if( file.read( ( char * )&itemIndexBE, sizeof( itemIndexBE ) ) != sizeof( itemIndexBE ) )
      break;
    entry.itemIndex = qFromBigEndian( itemIndexBE );

    quint16 binIndexBE;
    if( file.read( ( char * )&binIndexBE, sizeof( binIndexBE ) ) != sizeof( binIndexBE ) )
      break;
    entry.binIndex = qFromBigEndian( binIndexBE );

    entry.fragment = readTinyText();

    return;
  }
  while( false );

  QString error = fileName + ": " + file.errorString();
  throw exCantReadFile( string( error.toUtf8().data() ) );
}
/// Looks up the content @p entry points to.
///
/// @param entry  Reference previously decoded by getRefEntry().
/// @param data   When not null, receives the raw content bytes. When null,
///               only the content type is determined and nothing is
///               decompressed.
/// @return The content type id (an index for getContentType()), or 0xFF when
///         the reference is out of range or the data cannot be decoded.
/// Throws exCantReadFile when a seek or read on the file fails.
///
/// The most recently decompressed item is cached in currentItemData, keyed by
/// currentItem, so consecutive requests into the same item decompress once.
quint8 SlobFile::getItem( RefEntry const & entry, string * data )
{
  quint64 pos = itemsOffset + entry.itemIndex * sizeof( quint64 );
  quint64 offset, tmp;

  for( ; ; )
  {
    // Read item data types
    if( !file.seek( pos ) || file.read( ( char * )&tmp, sizeof( tmp ) ) != sizeof( tmp ) )
      break;

    offset = qFromBigEndian( tmp ) + itemsDataOffset;
    if( !file.seek( offset ) )
      break;

    quint32 bins, bins_be;
    if( file.read( ( char * )&bins_be, sizeof( bins_be ) ) != sizeof( bins_be ) )
      break;
    bins = qFromBigEndian( bins_be );

    if( entry.binIndex >= bins )
      return 0xFF;

    QVector< quint8 > ids;
    ids.resize( bins );
    if( file.read( ( char * )ids.data(), bins ) != bins )
      break;

    quint8 id = ids[ entry.binIndex ];
    if( id >= (unsigned)contentTypes.size() )
      return 0xFF;

    if( data != 0 )
    {
      // Read item data
      if( currentItem != entry.itemIndex )
      {
        // Drop the cache key together with the cached data, so that a failure
        // below can never leave the old key attached to empty data
        currentItemData.clear();
        currentItem = 0xFFFFFFFF;

        quint32 length, length_be;
        if( file.read( ( char * )&length_be, sizeof( length_be ) ) != sizeof( length_be ) )
          break;
        length = qFromBigEndian( length_be );

        // A short read must not reach the decompressor with the full length
        QByteArray compressedData = file.read( length );
        if( (quint32)compressedData.size() != length )
          break;

        if( compression == ZLIB )
          currentItemData = decompressZlib( compressedData.data(), length );
        else
        if( compression == BZ2 )
          currentItemData = decompressBzip2( compressedData.data(), length );
        else
          currentItemData = decompressLzma2( compressedData.data(), length, true );

        if( currentItemData.empty() )
          return 0xFF;

        currentItem = entry.itemIndex;
      }

      // Find bin data inside item. The item starts with a table of 32-bit
      // big-endian offsets, one per bin; each bin is a 32-bit big-endian
      // length followed by that many bytes.
      string::size_type const itemSize = currentItemData.length();
      if( itemSize < sizeof( quint32 ) )
        return 0xFF; // Too short to hold even one field; avoids unsigned underflow below

      string::size_type const fieldLimit = itemSize - sizeof( quint32 );
      const char * ptr = currentItemData.c_str();

      quint64 const tablePos = (quint64)entry.binIndex * sizeof( quint32 );
      if( tablePos >= fieldLimit )
        return 0xFF;

      quint32 binOffset_be;
      memcpy( &binOffset_be, ptr + tablePos, sizeof( binOffset_be ) );

      // 64-bit arithmetic: a large stored offset must be rejected, not wrapped
      quint64 const binPos = (quint64)bins * sizeof( quint32 ) + qFromBigEndian( binOffset_be );
      if( binPos >= fieldLimit )
        return 0xFF;

      quint32 len_be;
      memcpy( &len_be, ptr + binPos, sizeof( len_be ) );
      quint32 const binLength = qFromBigEndian( len_be );

      // substr() clamps the length to what is actually available
      *data = currentItemData.substr( (string::size_type)binPos + sizeof( len_be ), binLength );
    }

    return id;
  }

  QString error = fileName + ": " + file.errorString();
  throw exCantReadFile( string( error.toUtf8().data() ) );
}
// SlobDictionary
// Dictionary backed by a slob file plus a btree index file built by
// makeDictionaries(). Headwords are looked up through the inherited btree
// index, resources through a second btree (resourceIndex); both map a key to
// a reference number inside the slob file.
class SlobDictionary: public BtreeIndexing::BtreeDictionary
{
Mutex idxMutex;
// slobMutex guards access to 'sf'; idxResourceMutex is handed to resourceIndex
Mutex slobMutex, idxResourceMutex;
File::Class idx;
BtreeIndex resourceIndex;
IdxHeader idxHeader;
string dictionaryName;
SlobFile sf;
public:
SlobDictionary( string const & id, string const & indexFile,
vector< string > const & dictionaryFiles );
~SlobDictionary();
virtual string getName() throw()
{ return dictionaryName; }
virtual map< Dictionary::Property, string > getProperties() throw()
{ return map< Dictionary::Property, string >(); }
virtual unsigned long getArticleCount() throw()
{ return idxHeader.articleCount; }
virtual unsigned long getWordCount() throw()
{ return idxHeader.wordCount; }
inline virtual quint32 getLangFrom() const
{ return idxHeader.langFrom; }
inline virtual quint32 getLangTo() const
{ return idxHeader.langTo; }
// Starts an asynchronous article lookup for the word and its alternates
virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
vector< wstring > const & alts,
wstring const & )
throw( std::exception );
// Starts an asynchronous load of the named resource
virtual sptr< Dictionary::DataRequest > getResource( string const & name )
throw( std::exception );
// Returns the tags of the slob file, one "key: value" paragraph per tag
virtual QString const& getDescription();
/// Loads the resource.
void loadResource( std::string &resourceName, string & data );
virtual sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString,
int searchMode, bool matchCase,
int distanceBetweenWords,
int maxResults );
virtual void getArticleText( uint32_t articleAddress, QString & headword, QString & text );
virtual void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration );
// Full-text search is allowed when it is enabled globally, the "SLOB" type is
// not disabled, and the article count does not exceed the configured limit
// (a limit of 0 means no limit).
virtual void setFTSParameters( Config::FullTextSearch const & fts )
{
can_FTS = fts.enabled
&& !fts.disabledTypes.contains( "SLOB", Qt::CaseInsensitive )
&& ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
}
protected:
virtual void loadIcon() throw();
private:
/// Loads the article.
void loadArticle( quint32 address,
string & articleText,
bool rawText = false );
// Reads the content behind reference number 'address'; returns its content
// type id, or 0xFFFFFFFF when the content could not be obtained
quint32 readArticle( quint32 address,
string & articleText );
// Rewrites resource references and internal links in an HTML article
string convert( string const & in_data );
friend class SlobArticleRequest;
friend class SlobResourceRequest;
};
/// Opens the index file and the slob file and prepares both btree indexes.
///
/// @param id               Dictionary id.
/// @param indexFile        Path of the index file built by makeDictionaries().
/// @param dictionaryFiles  Dictionary files; the first one is the slob file.
///
/// A failure to open the slob file is only logged; construction continues.
SlobDictionary::SlobDictionary( string const & id,
                                string const & indexFile,
                                vector< string > const & dictionaryFiles ):
  BtreeDictionary( id, dictionaryFiles ),
  idx( indexFile, "rb" ),
  idxHeader( idx.read< IdxHeader >() )
{
  string const & slobPath = dictionaryFiles[ 0 ];

  // Open data file
  try
  {
    sf.open( FsEncoding::decode( slobPath.c_str() ) );
  }
  catch( std::exception & e )
  {
    gdWarning( "Slob dictionary initializing failed: %s, error: %s\n",
               slobPath.c_str(), e.what() );
  }

  // Headword index
  IndexInfo const wordsInfo( idxHeader.indexBtreeMaxElements,
                             idxHeader.indexRootOffset );
  openIndex( wordsInfo, idx, idxMutex );

  // Resource index
  IndexInfo const resourcesInfo( idxHeader.resourceIndexBtreeMaxElements,
                                 idxHeader.resourceIndexRootOffset );
  resourceIndex.openIndex( resourcesInfo, idx, idxResourceMutex );

  // Dictionary name: taken from the slob file, else from the file name
  dictionaryName = string( sf.getDictionaryName().toUtf8().constData() );
  if( dictionaryName.empty() )
  {
    QString path = QDir::fromNativeSeparators( FsEncoding::decode( slobPath.c_str() ) );
    int const lastSlash = path.lastIndexOf( '/' );
    dictionaryName = string( path.mid( lastSlash + 1 ).toUtf8().constData() );
  }

  // Full-text search parameters
  can_FTS = true;
  ftsIdxName = indexFile + "_FTS";

  bool const ftsIndexUnusable =
    Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName )
    || FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this );

  if( !ftsIndexUnusable )
    FTS_index_completed.ref();
}
// Nothing to release explicitly; the members clean up after themselves.
SlobDictionary::~SlobDictionary()
{
}
void SlobDictionary::loadIcon() throw()
{
if ( dictionaryIconLoaded )
return;
QString fileName =
QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
// Remove the extension
fileName.chop( 4 );
if( !loadIconFromFile( fileName ) )
{
// Load failed -- use default icons
dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/icon32_slob.png");
}
dictionaryIconLoaded = true;
}
/// Returns the dictionary description, built on first use from the tags of
/// the slob file: one "key: value" entry per tag, separated by blank lines.
QString const& SlobDictionary::getDescription()
{
  if( dictionaryDescription.isEmpty() )
  {
    QMap< QString, QString > const & tags = sf.getTags();

    bool needSeparator = false;
    for( QMap< QString, QString >::const_iterator tag = tags.begin();
         tag != tags.end(); ++tag )
    {
      if( needSeparator )
        dictionaryDescription += "\n\n";
      needSeparator = true;

      dictionaryDescription += tag.key() + ": " + tag.value();
    }
  }

  return dictionaryDescription;
}
/// Loads the article behind reference number @p address into @p articleText.
///
/// With @p rawText set, a non-empty article is returned exactly as read.
/// Otherwise the article is passed through convert() and wrapped in a
/// <div class="slobdict"> element. An empty article is replaced by an error
/// message, which is wrapped the same way regardless of @p rawText.
void SlobDictionary::loadArticle( quint32 address,
                                  string & articleText,
                                  bool rawText )
{
  articleText.clear();
  readArticle( address, articleText );

  if( articleText.empty() )
    articleText = string( QObject::tr( "Article decoding error" ).toUtf8().constData() );
  else
  {
    if( rawText )
      return;

    articleText = convert( articleText );
  }

  // Opening tag of the wrapper, with text direction when needed
  string opening( "<div class=\"slobdict\"" );
  if( isToLanguageRTL() )
    opening += " dir=\"rtl\"";
  opening += ">";

  // See Issue #271: A mechanism to clean-up invalid HTML cards.
  string closers = "</font>""</font>""</font>""</font>""</font>""</font>"
                   "</font>""</font>""</font>""</font>""</font>""</font>"
                   "</b></b></b></b></b></b></b></b>"
                   "</i></i></i></i></i></i></i></i>"
                   "</a></a></a></a></a></a></a></a>";

  articleText.insert( 0, opening );
  articleText += closers;
  articleText += "</div>";
}
/// Adapts an HTML article for display.
///
/// - "src" attributes of <img>/<script> and "href" attributes of <link> are
///   prefixed with "bres://<dictionary id>/";
/// - <a href="..."> links without a known protocol are turned into
///   "gdlookup://localhost/" links;
/// - a clearing <br> is appended.
///
/// @param in  Article text in UTF-8.
/// @return The converted article in UTF-8.
string SlobDictionary::convert( const string & in )
{
  QString html = QString::fromUtf8( in.c_str() );

  // pattern of img and script
  QRegExp const rxSrc( "<\\s*(img|script)\\s*([^>]*)src=(\"[^\"]*)" );
  QString const srcReplacement = QString( "<\\1 \\2src=\\3bres://%1/").arg( getId().c_str() );
  html.replace( rxSrc, srcReplacement );

  // pattern <link... href="..." ...>
  QRegExp const rxHref( "<\\s*link\\s*([^>]*)href=\"" );
  QString const hrefReplacement = QString( "<link \\1href=\"bres://%1/").arg( getId().c_str() );
  html.replace( rxHref, hrefReplacement );

  // pattern <a href="..." ...>, excluding any known protocols such as http://, mailto:, #(comment)
  // these links will be translated into local definitions
  QRegExp rxLink( "<\\s*a\\s+([^>]*)href=\"(?!(\\w+://|#|mailto:|tel:))(/|)([^\"]*)\"\\s*(title=\"[^\"]*\")?[^>]*>",
                  Qt::CaseSensitive,
                  QRegExp::RegExp2 );

  for( int at = rxLink.indexIn( html, 0 ); at >= 0; at = rxLink.indexIn( html, at ) )
  {
    QStringList captures = rxLink.capturedTexts();

    // Start from capture 3; when capture 4 is present, the text between its
    // first pair of double quotes is used instead
    QString anchor = captures[3];
    if ( !captures[4].isEmpty() )
      anchor = captures[4].split("\"")[1];

    // Reduce to the bare target, then build the replacement tag around it
    anchor.remove( QRegExp(".*/") );
    anchor.remove( QRegExp( "\\.(s|)htm(l|)$", Qt::CaseInsensitive ) );
    anchor.replace( "_", "%20" );
    anchor.prepend( "<a href=\"gdlookup://localhost/" );
    anchor.append( "\" " + captures[4] + ">" );

    html.replace( at, captures[0].length(), anchor );

    // Continue searching behind the tag just written
    at += anchor.length() + 1;
  }

  // Fix outstanding elements
  html += "<br style=\"clear:both;\" />";

  QByteArray utf8 = html.toUtf8();
  return utf8.data();
}
void SlobDictionary::loadResource( std::string & resourceName, string & data )
{
vector< WordArticleLink > link;
string resData;
link = resourceIndex.findArticles( Utf8::decode( resourceName ) );
if( link.empty() )
return;
readArticle( link[ 0 ].articleOffset, data );
}
/// Reads the content behind reference number @p articleNumber.
///
/// Content whose type mentions text/html, text/plain, css, javascript or json
/// is converted from the file's encoding to UTF-8; anything else is returned
/// byte for byte.
///
/// @param result  Receives the content; untouched on failure.
/// @return The content type id, or 0xFFFFFFFF when the content is unavailable.
quint32 SlobDictionary::readArticle( quint32 articleNumber, std::string & result )
{
  string raw;
  quint8 typeId;

  // The slob file is shared between threads, so both steps run under the lock
  {
    Mutex::Lock _( slobMutex );

    RefEntry ref;
    sf.getRefEntry( articleNumber, ref );
    typeId = sf.getItem( ref, &raw );
  }

  if( typeId == 0xFF )
    return 0xFFFFFFFF;

  QString const mimeType = sf.getContentType( typeId );

  bool const isTextual =
    mimeType.contains( "text/html", Qt::CaseInsensitive )
    || mimeType.contains( "text/plain", Qt::CaseInsensitive )
    || mimeType.contains( "/css", Qt::CaseInsensitive )
    || mimeType.contains( "/javascript", Qt::CaseInsensitive )
    || mimeType.contains( "/json", Qt::CaseInsensitive );

  if( !isTextual )
  {
    result = raw;
    return typeId;
  }

  // Recode textual content to UTF-8
  QTextCodec *codec = sf.getCodec();
  QString decoded = codec->toUnicode( raw.c_str(), raw.size() );
  result = string( decoded.toUtf8().data() );

  return typeId;
}
/// Builds the full-text search index unless a usable one already exists.
///
/// @param isCancelled     Cancellation flag handed on to the index builder.
/// @param firstIteration  When set, dictionaries with more articles than
///                        FTS::MaxDictionarySizeForFastSearch are skipped.
///
/// A failed build is logged and the partial index file is removed.
void SlobDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
{
  bool const indexUnusable =
    Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
    || FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this );

  if( !indexUnusable )
    FTS_index_completed.ref();

  if( haveFTSIndex() || ensureInitDone().size() )
    return;

  bool const tooLargeForNow =
    firstIteration && getArticleCount() > FTS::MaxDictionarySizeForFastSearch;
  if( tooLargeForNow )
    return;

  gdDebug( "Slob: Building the full-text index for dictionary: %s\n",
           getName().c_str() );

  try
  {
    FtsHelpers::makeFTSIndex( this, isCancelled );
    FTS_index_completed.ref();
  }
  catch( std::exception &ex )
  {
    gdWarning( "Slob: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );

    // Do not leave an incomplete index behind
    QFile::remove( FsEncoding::decode( ftsIdxName.c_str() ) );
  }
}
void SlobDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
{
try
{
headword.clear();
string articleText;
loadArticle( articleAddress, articleText, true );
text = Html::unescape( QString::fromUtf8( articleText.data(), articleText.size() ) );
}
catch( std::exception &ex )
{
gdWarning( "Slob: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
}
}
// Creates a full-text search request over this dictionary; all parameters are
// passed on unchanged to FtsHelpers::FTSResultsRequest.
sptr< Dictionary::DataRequest > SlobDictionary::getSearchResults( QString const & searchString,
int searchMode, bool matchCase,
int distanceBetweenWords,
int maxResults )
{
return new FtsHelpers::FTSResultsRequest( *this, searchString, searchMode, matchCase, distanceBetweenWords, maxResults );
}
/// SlobDictionary::getArticle()
class SlobArticleRequest;
// Thread-pool task that executes SlobArticleRequest::run().
// Its destructor releases the semaphore the request's destructor waits on.
class SlobArticleRequestRunnable: public QRunnable
{
SlobArticleRequest & r;
QSemaphore & hasExited;
public:
SlobArticleRequestRunnable( SlobArticleRequest & r_,
QSemaphore & hasExited_ ): r( r_ ),
hasExited( hasExited_ )
{}
~SlobArticleRequestRunnable()
{
// Signals the waiting request destructor that this task is gone
hasExited.release();
}
virtual void run();
};
// Asynchronous article request. The constructor queues a
// SlobArticleRequestRunnable on the global thread pool, which then calls run().
class SlobArticleRequest: public Dictionary::DataRequest
{
friend class SlobArticleRequestRunnable;
wstring word;
vector< wstring > alts;
SlobDictionary & dict;
QAtomicInt isCancelled;
QSemaphore hasExited;
public:
SlobArticleRequest( wstring const & word_,
vector< wstring > const & alts_,
SlobDictionary & dict_ ):
word( word_ ), alts( alts_ ), dict( dict_ )
{
QThreadPool::globalInstance()->start(
new SlobArticleRequestRunnable( *this, hasExited ) );
}
void run(); // Run from another thread by SlobArticleRequestRunnable
virtual void cancel()
{
isCancelled.ref();
}
~SlobArticleRequest()
{
// Ask the task to stop, then wait until its runnable has been destroyed
isCancelled.ref();
hasExited.acquire();
}
};
// Forwards to the request that owns this task.
void SlobArticleRequestRunnable::run()
{
r.run();
}
void SlobArticleRequest::run()
{
if ( isCancelled )
{
finish();
return;
}
vector< WordArticleLink > chain = dict.findArticles( word );
for( unsigned x = 0; x < alts.size(); ++x )
{
/// Make an additional query for each alt
vector< WordArticleLink > altChain = dict.findArticles( alts[ x ] );
chain.insert( chain.end(), altChain.begin(), altChain.end() );
}
multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
set< quint32 > articlesIncluded; // Some synonims make it that the articles
// appear several times. We combat this
// by only allowing them to appear once.
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
for( unsigned x = 0; x < chain.size(); ++x )
{
if ( isCancelled )
{
finish();
return;
}
if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
continue; // We already have this article in the body.
// Now grab that article
string headword, articleText;
headword = chain[ x ].word;
try
{
dict.loadArticle( chain[ x ].articleOffset, articleText );
}
catch(...)
{
}
// Ok. Now, does it go to main articles, or to alternate ones? We list
// main ones first, and alternates after.
// We do the case-folded comparison here.
wstring headwordStripped =
Folding::applySimpleCaseOnly( Utf8::decode( headword ) );
multimap< wstring, pair< string, string > > & mapToUse =
( wordCaseFolded == headwordStripped ) ?
mainArticles : alternateArticles;
mapToUse.insert( pair< wstring, pair< string, string > >(
Folding::applySimpleCaseOnly( Utf8::decode( headword ) ),
pair< string, string >( headword, articleText ) ) );
articlesIncluded.insert( chain[ x ].articleOffset );
}
if ( mainArticles.empty() && alternateArticles.empty() )
{
// No such word
finish();
return;
}
string result;
multimap< wstring, pair< string, string > >::const_iterator i;
for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
{
result += "<h3>";
result += i->second.first;
result += "</h3>";
result += i->second.second;
}
for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
{
result += "<h3>";
result += i->second.first;
result += "</h3>";
result += i->second.second;
}
Mutex::Lock _( dataMutex );
data.resize( result.size() );
memcpy( &data.front(), result.data(), result.size() );
hasAnyData = true;
finish();
}
// Creates an asynchronous article request for the word and its alternate
// forms. The third parameter is not used by this dictionary.
sptr< Dictionary::DataRequest > SlobDictionary::getArticle( wstring const & word,
vector< wstring > const & alts,
wstring const & )
throw( std::exception )
{
return new SlobArticleRequest( word, alts, *this );
}
//// SlobDictionary::getResource()
class SlobResourceRequest;
// Thread-pool task that executes SlobResourceRequest::run().
// Its destructor releases the semaphore the request's destructor waits on.
class SlobResourceRequestRunnable: public QRunnable
{
SlobResourceRequest & r;
QSemaphore & hasExited;
public:
SlobResourceRequestRunnable( SlobResourceRequest & r_,
QSemaphore & hasExited_ ): r( r_ ),
hasExited( hasExited_ )
{}
~SlobResourceRequestRunnable()
{
// Signals the waiting request destructor that this task is gone
hasExited.release();
}
virtual void run();
};
// Asynchronous resource request. The constructor queues a
// SlobResourceRequestRunnable on the global thread pool, which then calls run().
class SlobResourceRequest: public Dictionary::DataRequest
{
friend class SlobResourceRequestRunnable;
SlobDictionary & dict;
string resourceName;
QAtomicInt isCancelled;
QSemaphore hasExited;
public:
SlobResourceRequest( SlobDictionary & dict_,
string const & resourceName_ ):
dict( dict_ ),
resourceName( resourceName_ )
{
QThreadPool::globalInstance()->start(
new SlobResourceRequestRunnable( *this, hasExited ) );
}
void run(); // Run from another thread by SlobResourceRequestRunnable
virtual void cancel()
{
isCancelled.ref();
}
~SlobResourceRequest()
{
// Ask the task to stop, then wait until its runnable has been destroyed
isCancelled.ref();
hasExited.acquire();
}
};
// Forwards to the request that owns this task.
void SlobResourceRequestRunnable::run()
{
r.run();
}
void SlobResourceRequest::run()
{
// Some runnables linger enough that they are cancelled before they start
if ( isCancelled )
{
finish();
return;
}
try
{
string resource;
dict.loadResource( resourceName, resource );
if( resource.empty() )
throw File::Ex();
if( Filetype::isNameOfCSS( resourceName ) )
{
QString css = QString::fromUtf8( resource.data(), resource.size() );
dict.isolateCSS( css, ".slobdict" );
QByteArray bytes = css.toUtf8();
data.resize( bytes.size() );
memcpy( &data.front(), bytes.constData(), bytes.size() );
}
else
if ( Filetype::isNameOfTiff( resourceName ) )
{
// Convert it
dataMutex.lock();
QImage img = QImage::fromData( reinterpret_cast< const uchar * >( resource.data() ), resource.size() );
#ifdef MAKE_EXTRA_TIFF_HANDLER
if( img.isNull() )
GdTiff::tiffToQImage( &data.front(), data.size(), img );
#endif
dataMutex.unlock();
if ( !img.isNull() )
{
// Managed to load -- now store it back as BMP
QByteArray ba;
QBuffer buffer( &ba );
buffer.open( QIODevice::WriteOnly );
img.save( &buffer, "BMP" );
Mutex::Lock _( dataMutex );
data.resize( buffer.size() );
memcpy( &data.front(), buffer.data(), data.size() );
}
}
else
{
Mutex::Lock _( dataMutex );
data.resize( resource.size() );
memcpy( &data.front(), resource.data(), data.size() );
}
hasAnyData = true;
}
catch( std::exception &ex )
{
gdWarning( "SLOB: Failed loading resource \"%s\" from \"%s\", reason: %s\n",
resourceName.c_str(), dict.getName().c_str(), ex.what() );
// Resource not loaded -- we don't set the hasAnyData flag then
}
finish();
}
// Creates an asynchronous request that loads the resource with the given name.
sptr< Dictionary::DataRequest > SlobDictionary::getResource( string const & name )
throw( std::exception )
{
return new SlobResourceRequest( *this, name );
}
/// Creates a dictionary object for every ".slob" file in @p fileNames,
/// building its index file first when it is missing, outdated or unreadable.
///
/// @param fileNames     Candidate files; anything not ending in ".slob" is
///                      skipped.
/// @param indicesDir    Directory prefix for index files; the dictionary id
///                      is appended to it.
/// @param initializing  Notified with the dictionary name before indexing.
/// @return The dictionaries that could be initialized. A file that fails is
///         logged and skipped.
///
/// While indexing, references whose content type starts with text/html or
/// text/plain become headwords; all other references go to the resource
/// index. Both indexes store the reference number.
vector< sptr< Dictionary::Class > > makeDictionaries(
                                      vector< string > const & fileNames,
                                      string const & indicesDir,
                                      Dictionary::Initializing & initializing )
  throw( std::exception )
{
  vector< sptr< Dictionary::Class > > dictionaries;

  for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
       ++i )
  {
    // Skip files with the extensions different to .slob to speed up the
    // scanning
    QString firstName = QDir::fromNativeSeparators( FsEncoding::decode( i->c_str() ) );
    if( !firstName.endsWith( ".slob") )
      continue;

    // Got the file -- check if we need to rebuild the index
    vector< string > dictFiles( 1, *i );

    string dictId = Dictionary::makeDictionaryId( dictFiles );

    string indexFile = indicesDir + dictId;

    try
    {
      if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
           indexIsOldOrBad( indexFile ) )
      {
        SlobFile sf;

        gdDebug( "Slob: Building the index for dictionary: %s\n", i->c_str() );

        sf.open( firstName );

        initializing.indexingDictionary( sf.getDictionaryName().toUtf8().constData() );

        File::Class idx( indexFile, "wb" );

        IdxHeader idxHeader;
        memset( &idxHeader, 0, sizeof( idxHeader ) );

        // We write a dummy header first. At the end of the process the header
        // will be rewritten with the right values.
        idx.write( idxHeader );

        RefEntry refEntry;
        quint32 entries = sf.getRefsCount();

        IndexedWords indexedWords, indexedResources;

        // Distinct (itemIndex, binIndex) pairs seen among the headwords
        set< quint64 > articlesPos;
        quint32 articleCount = 0, wordCount = 0;

        // The counter has its own name so that it does not hide the file
        // iterator 'i' of the enclosing loop
        for( quint32 ref = 0; ref < entries; ref++ )
        {
          sf.getRefEntry( ref, refEntry );

          quint8 type = sf.getItem( refEntry, 0 );

          QString contentType = sf.getContentType( type );

          if( contentType.startsWith( "text/html", Qt::CaseInsensitive )
              || contentType.startsWith( "text/plain", Qt::CaseInsensitive ) )
          {
            //Article
            indexedWords.addWord( gd::toWString( refEntry.key ), ref );

            wordCount += 1;

            quint64 pos = ( ( (quint64)refEntry.itemIndex ) << 32 ) + refEntry.binIndex;
            if( articlesPos.find( pos ) == articlesPos.end() )
            {
              articleCount += 1;
              articlesPos.insert( pos );
            }
          }
          else
          {
            indexedResources.addSingleWord( gd::toWString( refEntry.key ), ref );
          }
        }

        // Build index
        {
          IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );

          idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
          idxHeader.indexRootOffset = idxInfo.rootOffset;

          indexedWords.clear(); // Release memory -- no need for this data
        }

        {
          IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedResources, idx );

          idxHeader.resourceIndexBtreeMaxElements = idxInfo.btreeMaxElements;
          idxHeader.resourceIndexRootOffset = idxInfo.rootOffset;

          indexedResources.clear(); // Release memory -- no need for this data
        }

        idxHeader.signature = Signature;
        idxHeader.formatVersion = CurrentFormatVersion;

        idxHeader.articleCount = articleCount;
        idxHeader.wordCount = wordCount;

        QPair<quint32,quint32> langs =
          LangCoder::findIdsForFilename( QString::fromStdString( dictFiles[ 0 ] ) );

        idxHeader.langFrom = langs.first;
        idxHeader.langTo = langs.second;

        // Replace the dummy header with the real one
        idx.rewind();

        idx.write( &idxHeader, sizeof( idxHeader ) );
      }

      dictionaries.push_back( new SlobDictionary( dictId,
                                                  indexFile,
                                                  dictFiles ) );
    }
    catch( std::exception & e )
    {
      gdWarning( "Slob dictionary initializing failed: %s, error: %s\n",
                 i->c_str(), e.what() );
      continue;
    }
    catch( ... )
    {
      qWarning( "Slob dictionary initializing failed\n" );
      continue;
    }
  }

  return dictionaries;
}
} // namespace Slob
#endif