goldendict-ng/website.cc

391 lines
11 KiB
C++
Raw Normal View History

2012-02-20 21:47:14 +00:00
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "website.hh"
#include "wstring_qt.hh"
#include "utf8.hh"
#include <QUrl>
#include <QTextCodec>
#include <QDir>
2012-12-03 12:48:31 +00:00
#include <QFileInfo>
#include <QRegExp>
namespace WebSite {
using namespace Dictionary;
namespace {
class WebSiteDictionary: public Dictionary::Class
{
string name;
QByteArray urlTemplate;
2012-12-03 12:48:31 +00:00
QString iconFilename;
bool inside_iframe;
QNetworkAccessManager & netMgr;
public:
WebSiteDictionary( string const & id, string const & name_,
2012-12-03 12:48:31 +00:00
QString const & urlTemplate_,
QString const & iconFilename_,
bool inside_iframe_,
QNetworkAccessManager & netMgr_ ):
Dictionary::Class( id, vector< string >() ),
name( name_ ),
2012-12-03 12:48:31 +00:00
urlTemplate( QUrl( urlTemplate_ ).toEncoded() ),
iconFilename( iconFilename_ ),
inside_iframe( inside_iframe_ ),
netMgr( netMgr_ )
{
dictionaryDescription = urlTemplate_;
}
virtual string getName() throw()
{ return name; }
virtual map< Property, string > getProperties() throw()
{ return map< Property, string >(); }
virtual unsigned long getArticleCount() throw()
{ return 0; }
virtual unsigned long getWordCount() throw()
{ return 0; }
virtual sptr< WordSearchRequest > prefixMatch( wstring const & word,
unsigned long ) throw( std::exception );
virtual sptr< DataRequest > getArticle( wstring const &,
vector< wstring > const & alts,
wstring const & context )
throw( std::exception );
protected:
2012-12-03 12:48:31 +00:00
virtual void loadIcon() throw();
};
sptr< WordSearchRequest > WebSiteDictionary::prefixMatch( wstring const & /*word*/,
unsigned long ) throw( std::exception )
{
sptr< WordSearchRequestInstant > sr = new WordSearchRequestInstant;
sr->setUncertain( true );
return sr;
}
class WebSiteArticleRequest: public WebSiteDataRequestSlots
{
typedef std::list< std::pair< QNetworkReply *, bool > > NetReplies;
QNetworkReply * netReply;
QString url;
Class * dictPtr;
QNetworkAccessManager & mgr;
public:
WebSiteArticleRequest( QString const & url, QNetworkAccessManager & _mgr,
Class * dictPtr_ );
~WebSiteArticleRequest()
{}
virtual void cancel();
private:
virtual void requestFinished( QNetworkReply * );
};
void WebSiteArticleRequest::cancel()
{
finish();
}
WebSiteArticleRequest::WebSiteArticleRequest( QString const & url_,
QNetworkAccessManager & _mgr,
Class * dictPtr_ ):
url( url_ ), dictPtr( dictPtr_ ), mgr( _mgr )
{
connect( &mgr, SIGNAL( finished( QNetworkReply * ) ),
this, SLOT( requestFinished( QNetworkReply * ) ),
Qt::QueuedConnection );
QUrl reqUrl( url );
netReply = mgr.get( QNetworkRequest( reqUrl ) );
#ifndef QT_NO_OPENSSL
connect( netReply, SIGNAL( sslErrors( QList< QSslError > ) ),
netReply, SLOT( ignoreSslErrors() ) );
#endif
}
void WebSiteArticleRequest::requestFinished( QNetworkReply * r )
{
if ( isFinished() ) // Was cancelled
return;
if ( r != netReply )
{
// Well, that's not our reply, don't do anything
return;
}
if ( netReply->error() == QNetworkReply::NoError )
{
// Check for redirect reply
QVariant possibleRedirectUrl = netReply->attribute( QNetworkRequest::RedirectionTargetAttribute );
QUrl redirectUrl = possibleRedirectUrl.toUrl();
if( !redirectUrl.isEmpty() )
{
netReply->deleteLater();
netReply = mgr.get( QNetworkRequest( redirectUrl ) );
#ifndef QT_NO_OPENSSL
connect( netReply, SIGNAL( sslErrors( QList< QSslError > ) ),
netReply, SLOT( ignoreSslErrors() ) );
#endif
return;
}
// Handle reply data
QByteArray replyData = netReply->readAll();
QString articleString;
QTextCodec * codec = QTextCodec::codecForHtml( replyData, 0 );
if( codec )
articleString = codec->toUnicode( replyData );
else
articleString = QString::fromUtf8( replyData );
QString divName = QString( "website_" ) + dictPtr->getId().c_str();
// Change links from relative to absolute
QString root = netReply->url().scheme() + "://" + netReply->url().host();
QString base = root + netReply->url().path();
while( !base.isEmpty() && !base.endsWith( "/" ) )
base.chop( 1 );
QRegExp tags( "<\\s*(a|link|img|script)\\s+[^>]*(src|href)\\s*=\\s*['\"][^>]+>",
Qt::CaseInsensitive, QRegExp::RegExp2 );
QRegExp links( "\\b(src|href)\\s*=\\s*(['\"])([^'\"]+['\"])",
Qt::CaseInsensitive, QRegExp::RegExp2 );
int pos = 0;
while( pos >= 0 )
{
pos = articleString.indexOf( tags, pos );
if( pos < 0 )
break;
QString tag = tags.cap();
int linkPos = tag.indexOf( links );
if( linkPos < 0 )
{
pos += tag.size();
continue;
}
QString url = links.cap( 3 );
if( url.indexOf( ":/" ) >= 0 || url.indexOf( "data:" ) >= 0
|| url.indexOf( "mailto:" ) >= 0 || url.startsWith( "#" ) )
{
// External link, anchor or base64-encoded data
pos += tag.size();
continue;
}
QString newUrl = links.cap( 1 ) + "=" + links.cap( 2 );
if( url.startsWith( "//" ) )
newUrl += netReply->url().scheme() + ":";
else
if( url.startsWith( "/" ) )
newUrl += root;
else
newUrl += base;
newUrl += links.cap( 3 );
tag.replace( linkPos, links.cap().size(), newUrl );
articleString.replace( pos, tags.cap().size(), tag );
pos += tag.size();
}
// Check for unclosed <span> and <div>
int openTags = articleString.count( QRegExp( "<\\s*span\\b", Qt::CaseInsensitive ) );
int closedTags = articleString.count( QRegExp( "<\\s*/span\\s*>", Qt::CaseInsensitive ) );
while( openTags > closedTags )
{
articleString += "</span>";
closedTags += 1;
}
openTags = articleString.count( QRegExp( "<\\s*div\\b", Qt::CaseInsensitive ) );
closedTags = articleString.count( QRegExp( "<\\s*/div\\s*>", Qt::CaseInsensitive ) );
while( openTags > closedTags )
{
articleString += "</div>";
closedTags += 1;
}
// See Issue #271: A mechanism to clean-up invalid HTML cards.
articleString += "</font>""</font>""</font>""</font>""</font>""</font>"
"</font>""</font>""</font>""</font>""</font>""</font>"
"</b></b></b></b></b></b></b></b>"
"</i></i></i></i></i></i></i></i>"
"</a></a></a></a></a></a></a></a>";
QByteArray articleBody = articleString.toUtf8();
QString divStr = QString( "<div class=\"" ) + divName + "\"";
divStr += dictPtr->isToLanguageRTL() ? " dir=\"rtl\">" : ">";
articleBody.prepend( divStr.toUtf8() );
articleBody.append( "</div>" );
Mutex::Lock _( dataMutex );
size_t prevSize = data.size();
data.resize( prevSize + articleBody.size() );
memcpy( &data.front() + prevSize, articleBody.data(), articleBody.size() );
hasAnyData = true;
}
else
setErrorString( netReply->errorString() );
netReply->deleteLater();
finish();
}
sptr< DataRequest > WebSiteDictionary::getArticle( wstring const & str,
vector< wstring > const &,
wstring const & context )
throw( std::exception )
{
sptr< DataRequestInstant > dr = new DataRequestInstant( true );
QByteArray urlString;
// Context contains the right url to go to
if ( context.size() )
urlString = Utf8::encode( context ).c_str();
else
{
urlString = urlTemplate;
QString inputWord = gd::toQString( str );
urlString.replace( "%25GDWORD%25", inputWord.toUtf8().toPercentEncoding() );
2011-07-28 13:32:34 +00:00
QTextCodec *codec = QTextCodec::codecForName( "Windows-1251" );
if( codec )
urlString.replace( "%25GD1251%25", codec->fromUnicode( inputWord ).toPercentEncoding() );
codec = QTextCodec::codecForName( "Big-5" );
if( codec )
urlString.replace( "%25GDBIG5%25", codec->fromUnicode( inputWord ).toPercentEncoding() );
codec = QTextCodec::codecForName( "Big5-HKSCS" );
if( codec )
urlString.replace( "%25GDBIG5HKSCS%25", codec->fromUnicode( inputWord ).toPercentEncoding() );
codec = QTextCodec::codecForName( "Shift-JIS" );
if( codec )
urlString.replace( "%25GDSHIFTJIS%25", codec->fromUnicode( inputWord ).toPercentEncoding() );
codec = QTextCodec::codecForName( "GB18030" );
if( codec )
urlString.replace( "%25GDGBK%25", codec->fromUnicode( inputWord ).toPercentEncoding() );
// Handle all ISO-8859 encodings
for( int x = 1; x <= 16; ++x )
{
codec = QTextCodec::codecForName( QString( "ISO 8859-%1" ).arg( x ).toLatin1() );
if( codec )
urlString.replace( QString( "%25GDISO%1%25" ).arg( x ), codec->fromUnicode( inputWord ).toPercentEncoding() );
if ( x == 10 )
x = 12; // Skip encodings 11..12, they don't exist
}
}
if( inside_iframe )
{
// Just insert link in <iframe> tag
string result = "<div class=\"website_padding\"></div>";
result += string( "<iframe id=\"gdexpandframe-" ) + getId() +
"\" src=\"" + urlString.data() +
"\" onmouseover=\"processIframeMouseOver('gdexpandframe-" + getId() + "');\" "
"onmouseout=\"processIframeMouseOut();\" "
"scrolling=\"no\" marginwidth=\"0\" marginheight=\"0\" "
"frameborder=\"0\" vspace=\"0\" hspace=\"0\" "
"style=\"overflow:visible; width:100%; display:none;\">"
"</iframe>";
dr->getData().resize( result.size() );
memcpy( &( dr->getData().front() ), result.data(), result.size() );
return dr;
}
// To load data from site
return new WebSiteArticleRequest( urlString, netMgr, this );
}
2012-12-03 12:48:31 +00:00
void WebSiteDictionary::loadIcon() throw()
{
if ( dictionaryIconLoaded )
return;
2012-12-03 12:48:31 +00:00
if( !iconFilename.isEmpty() )
{
QFileInfo fInfo( QDir( Config::getConfigDir() ), iconFilename );
if( fInfo.isFile() )
loadIconFromFile( fInfo.absoluteFilePath(), true );
}
if( dictionaryIcon.isNull() )
dictionaryIcon = dictionaryNativeIcon = QIcon(":/icons/internet.png");
dictionaryIconLoaded = true;
}
}
vector< sptr< Dictionary::Class > > makeDictionaries( Config::WebSites const & ws,
QNetworkAccessManager & mgr )
throw( std::exception )
{
vector< sptr< Dictionary::Class > > result;
2012-12-10 12:49:45 +00:00
for( int x = 0; x < ws.size(); ++x )
{
if ( ws[ x ].enabled )
result.push_back( new WebSiteDictionary( ws[ x ].id.toUtf8().data(),
ws[ x ].name.toUtf8().data(),
2012-12-03 12:48:31 +00:00
ws[ x ].url,
ws[ x ].iconFilename,
ws[ x ].inside_iframe,
mgr )
);
}
return result;
}
}