goldendict-ng/htmlescape.cc

/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */

#include <QString>
#include <QTextDocumentFragment>

#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
#include <QRegularExpression>
#else
#include <QRegExp>
#endif

#include "htmlescape.hh"

namespace Html {

// Escapes the characters that are special in HTML markup.
//
// '&', '<', '>' and '"' are replaced with "&amp;", "&lt;", "&gt;" and
// "&quot;" respectively; every other byte is copied through unchanged.
// Returns the escaped copy; str itself is not modified.
string escape( string const & str )
{
  string result;

  // The output is never shorter than the input.
  result.reserve( str.size() );

  // Build the output in a single forward pass. Appending keeps the work
  // linear in the input size, whereas replacing characters in place would
  // shift the remainder of the string on every special character.
  for( size_t x = 0; x < str.size(); ++x )
    switch ( str[ x ] )
    {
      case '&':
        result += "&amp;";
      break;

      case '<':
        result += "&lt;";
      break;

      case '>':
        result += "&gt;";
      break;

      case '"':
        result += "&quot;";
      break;

      default:
        result.push_back( str[ x ] );
      break;
    }

  return result;
}

static void storeLineInDiv( string & result, string const & line, bool baseRightToLeft )
{
  result += "<div";
  if( unescape( QString::fromUtf8( line.c_str(), line.size() ) ).isRightToLeft() != baseRightToLeft )
  {
    result += " dir=\"";
    result += baseRightToLeft ? "ltr\"" : "rtl\"";
  }
  result += ">";
  result += line + "</div>";
}

string preformat(string const & str , bool baseRightToLeft )
{
  string escaped = escape( str ), result, line;

  line.reserve( escaped.size() );
  result.reserve( escaped.size() );

  bool leading = true;

  for( char const * nextChar = escaped.c_str(); *nextChar; ++nextChar )
  {
    if ( leading )
    {
      if ( *nextChar == ' ' )
      {
        line += "&nbsp;";
        continue;
      }
      else
      if ( *nextChar == '\t' )
      {
        line += "&nbsp;&nbsp;&nbsp;&nbsp;";
        continue;
      }
    }

    if ( *nextChar == '\n' )
    {
      storeLineInDiv( result, line, baseRightToLeft );
      line.clear();
      leading = true;
      continue;
    }

    if ( *nextChar == '\r' )
      continue; // Just skip all \r

    line.push_back( *nextChar );

    leading = false;
  }

  if( !line.empty() )
    storeLineInDiv( result, line, baseRightToLeft );

  return result;
}

// Escapes a string with backslash sequences; going by its name, the result
// is meant to be embedded in a quoted JavaScript string literal.
//
// Backslash, double quote and single quote are prefixed with a backslash.
// Newline, carriage return and tab are replaced with the two-character
// sequences \n, \r and \t. Every other byte is copied through unchanged.
// Returns the escaped copy; str itself is not modified.
string escapeForJavaScript( string const & str )
{
  string result;

  // The output is never shorter than the input.
  result.reserve( str.size() );

  // Build the output in a single forward pass. Appending keeps the work
  // linear in the input size, whereas editing the string in place would
  // shift the remainder of the string on every escaped character.
  for( size_t x = 0; x < str.size(); ++x )
    switch ( str[ x ] )
    {
      case '\\':
      case '"':
      case '\'':
        result.push_back( '\\' );
        result.push_back( str[ x ] );
      break;

      case '\n':
        result += "\\n";
      break;

      case '\r':
        result += "\\r";
      break;

      case '\t':
        result += "\\t";
      break;

      default:
        result.push_back( str[ x ] );
      break;
    }

  return result;
}

// Converts HTML-ish text to plain text.
//
// A string containing neither '<' nor '&' is returned unchanged. Otherwise,
// unless saveFormat is set, tags from a fixed list of block-level elements
// are first replaced with a single space and all remaining tags are removed.
// The text is then trimmed, parsed with QTextDocumentFragment::fromHtml()
// and returned as plain text.
QString unescape( QString const & str, bool saveFormat )
{
  // Nothing that looks like markup or an entity: return the input as is.
  if ( !str.contains( '<' ) && !str.contains( '&' ) )
    return str;

  QString stripped( str );

  if( !saveFormat )
  {
#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
    QRegularExpression const blockTags( "<(?:\\s*/?(?:div|h[1-6r]|p(?![alr])|br|li(?![ns])|td|blockquote|[uo]l|pre|d[dl]|nav|address))[^>]{0,}>",
                                        QRegularExpression::CaseInsensitiveOption );
    QRegularExpression const anyTag( "<[^>]*>" );
#else
    QRegExp const blockTags( "<(?:\\s*/?(?:div|h[1-6r]|p(?![alr])|br|li(?![ns])|td|blockquote|[uo]l|pre|d[dl]|nav|address))[^>]{0,}>",
                             Qt::CaseInsensitive, QRegExp::RegExp2 );
    QRegExp const anyTag( "<[^>]*>", Qt::CaseSensitive, QRegExp::RegExp2 );
#endif

    // Block-level tags turn into a space so adjacent words stay separated;
    // whatever tags remain are dropped outright.
    stripped.replace( blockTags, " " );
    stripped.remove( anyTag );
  }

  QTextDocumentFragment const fragment = QTextDocumentFragment::fromHtml( stripped.trimmed() );
  return fragment.toPlainText();
}

string unescapeUtf8( const string &str, bool saveFormat )
{
  return string( unescape( QString::fromUtf8( str.c_str(), str.size() ) ).toUtf8().data(), saveFormat );
}

}
Update year in copyright notices. 2012-02-20 21:47:14 +00:00			`/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>`
Created /trunk/src and moved everything there. 2009-01-28 20:55:45 +00:00			`* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */`

Show decription for Stardict dictionaries 2012-09-07 13:58:45 +00:00			`#include <QString>`
			`#include <QTextDocumentFragment>`
Qt5: Use QRegularExpression instead of QRegExp in many cases 2018-02-21 14:43:35 +00:00
			`#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )`
			`#include <QRegularExpression>`
			`#else`
Implement full-text search 2014-04-16 16:18:28 +00:00			`#include <QRegExp>`
Qt5: Use QRegularExpression instead of QRegExp in many cases 2018-02-21 14:43:35 +00:00			`#endif`

Created /trunk/src and moved everything there. 2009-01-28 20:55:45 +00:00			`#include "htmlescape.hh"`

			`namespace Html {`

			`string escape( string const & str )`
			`{`
			`string result( str );`

			`for( size_t x = result.size(); x--; )`
			`switch ( result[ x ] )`
			`{`
			`case '&':`
			`result.erase( x, 1 );`
			`result.insert( x, "&" );`
			`break;`

			`case '<':`
			`result.erase( x, 1 );`
			`result.insert( x, "<" );`
			`break;`

			`case '>':`
			`result.erase( x, 1 );`
			`result.insert( x, ">" );`
			`break;`

			`case '"':`
			`result.erase( x, 1 );`
			`result.insert( x, """ );`
			`break;`

			`default:`
			`break;`
			`}`

			`return result;`
			`}`

Additional detection for text direction in plain text articles 2013-07-13 11:24:58 +00:00			`static void storeLineInDiv( string & result, string const & line, bool baseRightToLeft )`
+ Support for dictd files (.index/.dict(.dz)) added, among with other small accompanying changes. 2009-04-09 18:50:49 +00:00			`{`
Additional detection for text direction in plain text articles 2013-07-13 11:24:58 +00:00			`result += "<div";`
Some more support for RTL languages in articles 2013-07-14 15:37:38 +00:00			`if( unescape( QString::fromUtf8( line.c_str(), line.size() ) ).isRightToLeft() != baseRightToLeft )`
Additional detection for text direction in plain text articles 2013-07-13 11:24:58 +00:00			`{`
			`result += " dir=\"";`
			`result += baseRightToLeft ? "ltr\"" : "rtl\"";`
			`}`
			`result += ">";`
			`result += line + "</div>";`
			`}`
+ Support for dictd files (.index/.dict(.dz)) added, among with other small accompanying changes. 2009-04-09 18:50:49 +00:00
Additional detection for text direction in plain text articles 2013-07-13 11:24:58 +00:00			`string preformat(string const & str , bool baseRightToLeft )`
			`{`
			`string escaped = escape( str ), result, line;`

			`line.reserve( escaped.size() );`
+ Support for dictd files (.index/.dict(.dz)) added, among with other small accompanying changes. 2009-04-09 18:50:49 +00:00			`result.reserve( escaped.size() );`

			`bool leading = true;`

			`for( char const * nextChar = escaped.c_str(); *nextChar; ++nextChar )`
			`{`
			`if ( leading )`
			`{`
			`if ( *nextChar == ' ' )`
			`{`
Additional detection for text direction in plain text articles 2013-07-13 11:24:58 +00:00			`line += " ";`
+ Support for dictd files (.index/.dict(.dz)) added, among with other small accompanying changes. 2009-04-09 18:50:49 +00:00			`continue;`
			`}`
			`else`
			`if ( *nextChar == '\t' )`
			`{`
Additional detection for text direction in plain text articles 2013-07-13 11:24:58 +00:00			`line += "    ";`
+ Support for dictd files (.index/.dict(.dz)) added, among with other small accompanying changes. 2009-04-09 18:50:49 +00:00			`continue;`
			`}`
			`}`

			`if ( *nextChar == '\n' )`
			`{`
Additional detection for text direction in plain text articles 2013-07-13 11:24:58 +00:00			`storeLineInDiv( result, line, baseRightToLeft );`
			`line.clear();`
+ Support for dictd files (.index/.dict(.dz)) added, among with other small accompanying changes. 2009-04-09 18:50:49 +00:00			`leading = true;`
			`continue;`
			`}`

			`if ( *nextChar == '\r' )`
			`continue; // Just skip all \r`

Additional detection for text direction in plain text articles 2013-07-13 11:24:58 +00:00			`line.push_back( *nextChar );`
+ Support for dictd files (.index/.dict(.dz)) added, among with other small accompanying changes. 2009-04-09 18:50:49 +00:00
			`leading = false;`
			`}`

Additional detection for text direction in plain text articles 2013-07-13 11:24:58 +00:00			`if( !line.empty() )`
			`storeLineInDiv( result, line, baseRightToLeft );`

Enhanced support for RTL languages in articles 2013-07-10 13:48:09 +00:00			`return result;`
+ Support for dictd files (.index/.dict(.dz)) added, among with other small accompanying changes. 2009-04-09 18:50:49 +00:00			`}`

+ Add escapeForJavaScript() function, to be used later. 2009-04-12 11:28:56 +00:00			`string escapeForJavaScript( string const & str )`
			`{`
			`string result( str );`

			`for( size_t x = result.size(); x--; )`
			`switch ( result[ x ] )`
			`{`
			`case '\\':`
			`case '"':`
			`case '\'':`
			`result.insert( x, 1, '\\' );`
			`break;`

			`case '\n':`
			`result.erase( x, 1 );`
			`result.insert( x, "\\n" );`
			`break;`

			`case '\r':`
			`result.erase( x, 1 );`
			`result.insert( x, "\\r" );`
			`break;`

			`case '\t':`
			`result.erase( x, 1 );`
			`result.insert( x, "\\t" );`
			`break;`

			`default:`
			`break;`
			`}`

			`return result;`
			`}`

DictD: One more fix for dictionary description 2015-02-26 14:37:20 +00:00			`QString unescape( QString const & str, bool saveFormat )`
Show decription for Stardict dictionaries 2012-09-07 13:58:45 +00:00			`{`
			`// Does it contain HTML? If it does, we need to strip it`
			`if ( str.contains( '<' ) \|\| str.contains( '&' ) )`
Implement full-text search 2014-04-16 16:18:28 +00:00			`{`
			`QString tmp = str;`
DictD: One more fix for dictionary description 2015-02-26 14:37:20 +00:00			`if( !saveFormat )`
			`{`
Qt5: Use QRegularExpression instead of QRegExp in many cases 2018-02-21 14:43:35 +00:00			`#if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )`
Full-text search: handle more block tags 2018-08-14 21:58:19 +00:00			`tmp.replace( QRegularExpression( "<(?:\\s*/?(?:div\|h[1-6r]\|p(?![alr])\|br\|li(?![ns])\|td\|blockquote\|[uo]l\|pre\|d[dl]\|nav\|address))[^>]{0,}>",`
Qt5: Use QRegularExpression instead of QRegExp in many cases 2018-02-21 14:43:35 +00:00			`QRegularExpression::CaseInsensitiveOption ), " " );`
			`tmp.remove( QRegularExpression( "<[^>]*>" ) );`
			`#else`
Full-text search: handle more block tags 2018-08-14 21:58:19 +00:00			`tmp.replace( QRegExp( "<(?:\\s*/?(?:div\|h[1-6r]\|p(?![alr])\|br\|li(?![ns])\|td\|blockquote\|[uo]l\|pre\|d[dl]\|nav\|address))[^>]{0,}>",`
DictD: One more fix for dictionary description 2015-02-26 14:37:20 +00:00			`Qt::CaseInsensitive, QRegExp::RegExp2 ), " " );`
			`tmp.remove( QRegExp( "<[^>]*>", Qt::CaseSensitive, QRegExp::RegExp2 ) );`
Qt5: Use QRegularExpression instead of QRegExp in many cases 2018-02-21 14:43:35 +00:00			`#endif`
DictD: One more fix for dictionary description 2015-02-26 14:37:20 +00:00			`}`
Implement full-text search 2014-04-16 16:18:28 +00:00			`return QTextDocumentFragment::fromHtml( tmp.trimmed() ).toPlainText();`
			`}`
Show decription for Stardict dictionaries 2012-09-07 13:58:45 +00:00			`return str;`
			`}`

DictD: One more fix for dictionary description 2015-02-26 14:37:20 +00:00			`string unescapeUtf8( const string &str, bool saveFormat )`
Prevent some potential crashes on broken bgl 2012-11-14 12:54:31 +00:00			`{`
DictD: One more fix for dictionary description 2015-02-26 14:37:20 +00:00			`return string( unescape( QString::fromUtf8( str.c_str(), str.size() ) ).toUtf8().data(), saveFormat );`
Prevent some potential crashes on broken bgl 2012-11-14 12:54:31 +00:00			`}`

Created /trunk/src and moved everything there. 2009-01-28 20:55:45 +00:00			`}`