2012-02-20 21:47:14 +00:00
|
|
|
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
|
2009-01-28 20:55:45 +00:00
|
|
|
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
|
|
|
|
|
|
|
#include "dsl_details.hh"
|
2013-08-04 19:19:57 +00:00
|
|
|
|
2009-04-17 22:34:45 +00:00
|
|
|
#include "folding.hh"
|
2009-04-23 11:43:20 +00:00
|
|
|
#include "langcoder.hh"
|
2013-11-16 18:34:09 +00:00
|
|
|
#include "gddebug.hh"
|
2011-09-09 12:05:28 +00:00
|
|
|
#include "ufile.hh"
|
2012-10-31 13:58:35 +00:00
|
|
|
#include "wstring_qt.hh"
|
2014-04-18 12:34:44 +00:00
|
|
|
#include "utf8.hh"
|
2013-08-04 19:19:57 +00:00
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <wctype.h>
|
2012-10-31 13:58:35 +00:00
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
namespace Dsl {
|
|
|
|
namespace Details {
|
|
|
|
|
2009-04-18 17:20:12 +00:00
|
|
|
using gd::wstring;
|
2009-01-28 20:55:45 +00:00
|
|
|
using std::list;
|
|
|
|
|
2011-08-08 01:04:53 +00:00
|
|
|
#ifndef __linux__
|
2009-02-02 01:10:16 +00:00
|
|
|
|
|
|
|
// wcscasecmp() function is a GNU extension, we need to reimplement it
|
|
|
|
// for non-GNU systems.
|
|
|
|
|
2009-04-18 17:20:12 +00:00
|
|
|
int wcscasecmp( const wchar *s1, const wchar *s2 )
|
2009-02-02 01:10:16 +00:00
|
|
|
{
|
|
|
|
for( ; ; ++s1, ++s2 )
|
|
|
|
{
|
|
|
|
if ( towlower( *s1 ) != towlower( *s2 ) )
|
|
|
|
return towlower( *s1 ) > towlower( *s2 ) ? 1 : -1;
|
|
|
|
|
|
|
|
if ( !*s1 )
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
2016-12-22 19:58:01 +00:00
|
|
|
// Maps the numeric language ids found in DSL files to short language code
// strings. Several ids may share one code. The list must end with the
// { 0, "" } entry: findCodeForDslId() stops at the first zero code_id.
//
// NOTE(review): "ch" and "tz" are not the ISO 639-1 spellings one would
// expect if these ids stand for Chinese ("zh") and Uzbek ("uz") -- confirm
// that they are what the consumers of findCodeForDslId() expect.
static DSLLangCode LangCodes[] =
{
  { 1, "en" },   { 1033, "en" },
  { 2, "ru" },   { 1049, "ru" },
  { 1068, "az" },
  { 1025, "ar" },
  { 1067, "am" },
  { 15, "af" },  { 1078, "af" },
  { 9, "eu" },   { 1069, "eu" },
  { 1133, "ba" },
  { 21, "be" },  { 1059, "be" },
  { 22, "bg" },  { 1026, "bg" },
  { 19, "hu" },  { 1038, "hu" },
  { 10, "nl" },  { 1043, "nl" },
  { 1032, "el" },
  { 1079, "ka" },
  { 13, "da" },  { 1030, "da" },
  { 16, "id" },  { 1057, "id" },
  { 1039, "is" },
  { 6, "es" },   { 7, "es" },   { 3082, "es" }, { 1034, "es" },
  { 5, "it" },   { 1040, "it" },
  { 1087, "kk" },
  { 1595, "ky" },
  { 28, "ch" },  { 29, "ch" },  { 1028, "ch" }, { 2052, "ch" },
  { 30, "la" },  { 1540, "la" }, { 1142, "la" },
  { 1062, "lv" },
  { 1063, "lt" },
  { 1086, "ms" },
  { 3, "de" },   { 26, "de" },  { 1031, "de" }, { 32775, "de" },
  { 14, "nb" },  { 1044, "nb" },
  { 25, "nn" },  { 2068, "nn" },
  { 20, "pl" },  { 1045, "pl" },
  { 8, "pt" },   { 2070, "pt" },
  { 1048, "ro" },
  { 23, "sr" },  { 3098, "sr" },
  { 1051, "sk" },
  { 1060, "sl" },
  { 17, "sw" },  { 1089, "sw" },
  { 1064, "tg" },
  { 1092, "tt" },
  { 27, "tr" },  { 1055, "tr" },
  { 1090, "tk" },
  { 1091, "tz" },
  { 24, "uk" },  { 1058, "uk" },
  { 11, "fi" },  { 1035, "fi" },
  { 4, "fr" },   { 1036, "fr" },
  { 18, "cs" },  { 1029, "cs" },
  { 12, "sv" },  { 1053, "sv" },
  { 1061, "et" },
  { 0, "" },
};
|
|
|
|
|
|
|
|
// Returns the language code string listed in LangCodes for the given DSL
// language id, or an empty string if the id is not listed there.
string findCodeForDslId( int id )
{
  DSLLangCode const * entry = LangCodes;

  // Walk the table until either the id or the terminating zero entry is hit
  while( entry->code_id && entry->code_id != id )
    ++entry;

  return entry->code_id ? string( entry->code ) : string();
}
|
|
|
|
|
2017-07-04 14:41:38 +00:00
|
|
|
// Tells whether '@' is the first thing in str once leading spaces, tabs and
// bracketed dsl tags are disregarded.
bool isAtSignFirst( wstring const & str )
{
  // Optional blanks, then any number of "[...]" groups (each optionally
  // followed by blanks), then the '@' itself
  QRegExp leadingAtSign( "[ \\t]*(?:\\[[^\\]]+\\][ \\t]*)*@", Qt::CaseInsensitive, QRegExp::RegExp2 );

  int const matchPos = leadingAtSign.indexIn( gd::toQString( str ) );

  // Only a match which begins at the very start of the string counts
  return matchPos == 0;
}
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
/////////////// ArticleDom
|
|
|
|
|
2014-04-16 16:18:28 +00:00
|
|
|
// Returns the text held by this node. A text node yields its own text; a tag
// node yields the concatenation of what its children render, in order.
// When stripTrsTag is set, children whose tag name is "!trs" are left out,
// and the flag is passed on to every child which is rendered.
wstring ArticleDom::Node::renderAsText( bool stripTrsTag ) const
{
  if ( isTag )
  {
    wstring rendered;

    for( list< Node >::const_iterator child = begin(); child != end(); ++child )
    {
      if ( stripTrsTag && child->tagName == GD_NATIVE_TO_WS( L"!trs" ) )
        continue;

      rendered += child->renderAsText( stripTrsTag );
    }

    return rendered;
  }

  return text;
}
|
|
|
|
|
|
|
|
// Returns true if src == 'm' and dest is 'mX', where X is a digit
|
|
|
|
static inline bool checkM( wstring const & dest, wstring const & src )
|
|
|
|
{
|
2009-04-18 17:20:12 +00:00
|
|
|
return ( src == GD_NATIVE_TO_WS( L"m" ) && dest.size() == 2 &&
|
2009-01-28 20:55:45 +00:00
|
|
|
dest[ 0 ] == L'm' && iswdigit( dest[ 1 ] ) );
|
|
|
|
}
|
|
|
|
|
2014-04-18 12:34:44 +00:00
|
|
|
// Parses the DSL article text in str into a tree of nodes rooted at root.
//
//   str       - the article text. It is consumed through nextChar(), which
//               also resolves backslash and doubled-bracket escapes.
//   dictName  - dictionary name, stored in dictionaryName and quoted in
//               warnings; when empty the warnings omit it.
//   headword_ - stored in headword and handed on to the nested ArticleDom
//               objects which parse the bodies of links.
//
// The end of the text is reported by nextChar() throwing eot. That is caught
// here, so everything parsed up to that point is kept. Tags still open at
// that moment are reported on stderr.
ArticleDom::ArticleDom( wstring const & str, string const & dictName,
                        wstring const & headword_):
  root( Node::Tag(), wstring(), wstring() ), stringPos( str.c_str() ),
  lineStartPos( str.c_str() ),
  transcriptionCount( 0 ),
  dictionaryName( dictName ),
  headword( headword_ )
{
  list< Node * > stack; // Currently opened tags

  Node * textNode = 0; // A leaf node which currently accumulates text.
                       // While it is set, it is also the top of the stack.

  try
  {
    for( ;; )
    {
      nextChar();

      if ( ch == L'@' && !escaped )
      {
        if( !atSignFirstInLine() )
        {
          // Not an embedded card: the '@' is not the first thing in its
          // line. Warn, then let it fall through to be stored as text.
          if( dictName.empty() )
            gdWarning( "Unescaped '@' symbol found" );
          else
            gdWarning( "Unescaped '@' symbol found in \"%s\"", dictName.c_str() );
        }
        else
        {
          // An embedded card: the rest of the line names its entries.
          // NOTE: the loop below only stops at '\n'. If the text ends first,
          // nextChar() throws eot and the collected name is dropped.
          wstring linkTo;
          nextChar();
          for( ; ; nextChar() )
          {
            if( ch == L'\n' )
              break;
            if( ch != L'\r' )
              linkTo.push_back( ch );
          }
          linkTo = Folding::trimWhitespace( linkTo );

          if( !linkTo.empty() )
          {
            list< wstring > allLinkEntries;
            expandOptionalParts( linkTo, &allLinkEntries );

            for( list< wstring >::iterator entry = allLinkEntries.begin();
                 entry != allLinkEntries.end(); )
            {
              // Every entry is introduced by a "- " piece of text
              if ( !textNode )
              {
                Node text = Node( Node::Text(), wstring() );

                if ( stack.empty() )
                {
                  root.push_back( text );
                  stack.push_back( &root.back() );
                }
                else
                {
                  stack.back()->push_back( text );
                  stack.push_back( &stack.back()->back() );
                }

                textNode = stack.back();
              }
              textNode->text.push_back( L'-' );
              textNode->text.push_back( L' ' );

              // Close the currently opened text node
              stack.pop_back();
              textNode = 0;

              // The entry itself may contain markup, so it is parsed by a
              // nested ArticleDom whose top-level nodes go into an '@' tag
              wstring linkText = Folding::trimWhitespace( *entry );
              processUnsortedParts( linkText, true );
              ArticleDom nodeDom( linkText, dictName, headword_ );

              Node link( Node::Tag(), GD_NATIVE_TO_WS( L"@" ), wstring() );
              for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n )
                link.push_back( *n );

              ++entry;

              if ( stack.empty() )
              {
                root.push_back( link );
                if( entry != allLinkEntries.end() ) // Add line break before next entry
                  root.push_back( Node( Node::Tag(), GD_NATIVE_TO_WS( L"br" ), wstring() ) );
              }
              else
              {
                stack.back()->push_back( link );
                if( entry != allLinkEntries.end() ) // Add line break before next entry
                  stack.back()->push_back( Node( Node::Tag(), GD_NATIVE_TO_WS( L"br" ), wstring() ) );
              }
            }

            // Skip to next '@'

            while( !( ch == L'@' && !escaped && atSignFirstInLine() ) )
              nextChar();

            // Step back so that the '@' is read again by the next iteration,
            // and treat the current character as a line break meanwhile
            stringPos--;
            ch = L'\n';
            escaped = false;
          }
        }
      } // if ( ch == L'@' )

      if ( ch == L'[' && !escaped )
      {
        // Beginning of a tag.
        do
        {
          nextChar();
        } while( Folding::isWhitespace( ch ) );

        bool isClosing;

        if ( ch == L'/' && !escaped )
        {
          // A closing tag.
          isClosing = true;
          nextChar();
        }
        else
          isClosing = false;

        // Read tag's name
        wstring name;

        while( ( ch != L']' || escaped ) && !Folding::isWhitespace( ch ) )
        {
          name.push_back( ch );
          nextChar();
        }

        while( Folding::isWhitespace( ch ) )
          nextChar();

        // Read attrs

        wstring attrs;

        while( ch != L']' || escaped )
        {
          attrs.push_back( ch );
          nextChar();
        }

        // Add the tag, or close it

        if ( textNode )
        {
          // Close the currently opened text node
          stack.pop_back();
          textNode = 0;
        }

        // If the tag is [t], we update the transcriptionCount
        if ( name == GD_NATIVE_TO_WS( L"t" ) )
        {
          if ( isClosing )
          {
            if ( transcriptionCount )
              --transcriptionCount;
          }
          else
            ++transcriptionCount;
        }

        if ( !isClosing )
        {
          if ( name == GD_NATIVE_TO_WS( L"m" ) || checkM( name, GD_NATIVE_TO_WS( L"m" ) ) )
          {
            // Opening an 'mX' or 'm' tag closes any previous 'm' tag
            closeTag( GD_NATIVE_TO_WS( L"m" ), stack, false );
          }
          openTag( name, attrs, stack );
          if ( name == GD_NATIVE_TO_WS( L"br" ) )
          {
            // The [br] tag has no closing tag, so close it right away
            closeTag( name, stack );
          }
        }
        else
        {
          closeTag( name, stack );
        } // if ( isClosing )
        continue;
      } // if ( ch == '[' )

      if ( ch == L'<' && !escaped )
      {
        // Special case: the <<name>> link

        nextChar();

        if ( ch != L'<' || escaped )
        {
          // Ok, it's not it. Undo the lookahead (an escaped character took
          // two positions) and carry on with the '<' as ordinary text.
          --stringPos;

          if ( escaped )
          {
            --stringPos;
            escaped = false;
          }
          ch = L'<';
        }
        else
        {
          // Get the link's body
          do
          {
            nextChar();
          } while( Folding::isWhitespace( ch ) );

          // The body is collected with its backslash escapes put back,
          // since it gets parsed again by a nested ArticleDom below
          wstring linkText;

          for( ; ; nextChar() )
          {
            // Is it the end?
            if ( ch == L'>' && !escaped )
            {
              nextChar();

              if ( ch == L'>' && !escaped )
                break;
              else
              {
                // A lone '>' belongs to the body, along with what follows it
                linkText.push_back( L'>' );
                if( escaped )
                  linkText.push_back( L'\\' );
                linkText.push_back( ch );
              }
            }
            else
            {
              if( escaped )
                linkText.push_back( L'\\' );
              linkText.push_back( ch );
            }
          }

          // Add the corresponding node

          if ( textNode )
          {
            // Close the currently opened text node
            stack.pop_back();
            textNode = 0;
          }

          linkText = Folding::trimWhitespace( linkText );
          processUnsortedParts( linkText, true );
          ArticleDom nodeDom( linkText, dictName, headword_ );

          Node link( Node::Tag(), GD_NATIVE_TO_WS( L"ref" ), wstring() );
          for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n )
            link.push_back( *n );

          if ( stack.empty() )
            root.push_back( link );
          else
            stack.back()->push_back( link );

          continue;
        }
      } // if ( ch == '<' )

      if ( ch == L'{' && !escaped )
      {
        // Special case: {{comment}}

        nextChar();

        if ( ch != L'{' || escaped )
        {
          // Ok, it's not it. Undo the lookahead (an escaped character took
          // two positions) and carry on with the '{' as ordinary text.
          --stringPos;

          if ( escaped )
          {
            --stringPos;
            escaped = false;
          }
          ch = L'{';
        }
        else
        {
          // Skip the comment's body
          for( ; ; )
          {
            nextChar();

            // Is it the end?
            if ( ch == L'}' && !escaped )
            {
              nextChar();

              if ( ch == L'}' && !escaped )
                break;
            }
          }

          continue;
        }
      } // if ( ch == '{' )

      // If we're here, we've got a normal symbol, to be saved as text.

      // If there's currently no text node, open one
      if ( !textNode )
      {
        Node text = Node( Node::Text(), wstring() );

        if ( stack.empty() )
        {
          root.push_back( text );
          stack.push_back( &root.back() );
        }
        else
        {
          stack.back()->push_back( text );
          stack.push_back( &stack.back()->back() );
        }

        textNode = stack.back();
      }

      // If we're inside the transcription, do old-encoding conversion.
      // Some source characters expand to two: the first one is appended
      // right here and the second one replaces ch.
      if ( transcriptionCount )
      {
        switch ( ch )
        {
          case 0x2021: ch = 0xE6; break;
          case 0x407: ch = 0x72; break;
          case 0xB0: ch = 0x6B; break;
          case 0x20AC: ch = 0x254; break;
          case 0x404: ch = 0x7A; break;
          case 0x40F: ch = 0x283; break;
          case 0xAB: ch = 0x74; break;
          case 0xAC: ch = 0x64; break;
          case 0x2020: ch = 0x259; break;
          case 0x490: ch = 0x6D; break;
          case 0xA7: ch = 0x66; break;
          case 0xAE: ch = 0x6C; break;
          case 0xB1: ch = 0x67; break;
          case 0x45E: ch = 0x65; break;
          case 0xAD: ch = 0x6E; break;
          case 0xA9: ch = 0x73; break;
          case 0xA6: ch = 0x77; break;
          case 0x2026: ch = 0x28C; break;
          case 0x452: ch = 0x76; break;
          case 0x408: ch = 0x70; break;
          case 0x40C: ch = 0x75; break;
          case 0x406: ch = 0x68; break;
          case 0xB5: ch = 0x61; break;
          case 0x491: ch = 0x25B; break;
          case 0x40A: ch = 0x14B; break;
          case 0x2030: ch = 0xF0; break;
          case 0x456: ch = 0x6A; break;
          case 0xA4: ch = 0x62; break;
          case 0x409: ch = 0x292; break;
          case 0x40E: ch = 0x69; break;
          //case 0x44D: ch = 0x131; break;
          case 0x40B: ch = 0x4E8; break;
          case 0xB6: ch = 0x28A; break;
          case 0x2018: ch = 0x251; break;
          case 0x457: ch = 0x265; break;
          case 0x458: ch = 0x153; break;
          case 0x405: textNode->text.push_back( 0x153 ); ch = 0x303; break;
          case 0x441: ch = 0x272; break;
          case 0x442: textNode->text.push_back( 0x254 ); ch = 0x303; break;
          case 0x443: ch = 0xF8; break;
          case 0x445: textNode->text.push_back( 0x25B ); ch = 0x303; break;
          case 0x446: ch = 0xE7; break;
          case 0x44C: textNode->text.push_back( 0x251 ); ch = 0x303; break;
          case 0x44D: ch = 0x26A; break;
          case 0x44F: ch = 0x252; break;
          case 0x30: ch = 0x3B2; break;
          case 0x31: textNode->text.push_back( 0x65 ); ch = 0x303; break;
          case 0x32: ch = 0x25C; break;
          case 0x33: ch = 0x129; break;
          case 0x34: ch = 0xF5; break;
          case 0x36: ch = 0x28E; break;
          case 0x37: ch = 0x263; break;
          case 0x38: ch = 0x1DD; break;
          case 0x3A: ch = 0x2D0; break;
          case 0x27: ch = 0x2C8; break;
          case 0x455: ch = 0x1D0; break;
          case 0xB7: ch = 0xE3; break;

          case 0x00a0: ch = 0x02A7; break;
          //case 0x00b1: ch = 0x0261; break;
          case 0x0402: textNode->text.push_back( 0x0069 ); ch = L':'; break;
          case 0x0403: textNode->text.push_back( 0x0251 ); ch = L':'; break;
          //case 0x040b: ch = 0x03b8; break;
          //case 0x040e: ch = 0x026a; break;
          case 0x0428: ch = 0x0061; break;
          case 0x0453: textNode->text.push_back( 0x0075 ); ch = L':'; break;
          case 0x201a: ch = 0x0254; break;
          case 0x201e: ch = 0x0259; break;
          case 0x2039: textNode->text.push_back( 0x0064 ); ch = 0x0292; break;
        }
      }

      if ( escaped && ch == L' ' )
        ch = 0xA0; // Escaped spaces turn into non-breakable ones in Lingvo

      textNode->text.push_back( ch );
    } // for( ; ; )
  }
  catch( eot const & )
  {
    // End of text reached; this is how the loop above normally finishes
  }

  // An open text node is always the top of the stack; it needs no closing tag
  if ( textNode )
    stack.pop_back();

  if ( !stack.empty() )
  {
    GD_FDPRINTF( stderr, "Warning: %u tags were unclosed.\n", (unsigned) stack.size() );
  }
}
|
|
|
|
|
2013-11-08 12:53:22 +00:00
|
|
|
// Opens a new tag node with the given name and attributes and pushes it onto
// the stack of open tags. For an 'm' or 'mX' tag, every tag currently open is
// closed first and reopened inside the new tag afterwards, so that closing
// one of those tags later does not break the 'm' tag.
void ArticleDom::openTag( wstring const & name,
                          wstring const & attrs,
                          list<Node *> &stack )
{
  // Copies (name and attributes only) of the tags closed below, outermost last
  list< Node > nodesToReopen;

  bool const isParagraphTag =
    name == GD_NATIVE_TO_WS( L"m" ) || checkM( name, GD_NATIVE_TO_WS( L"m" ) );

  if ( isParagraphTag )
  {
    while( !stack.empty() )
    {
      Node * top = stack.back();

      nodesToReopen.push_back( Node( Node::Tag(), top->tagName, top->tagAttrs ) );

      bool const discard = top->empty();

      stack.pop_back();

      // Empty nodes are of no use, so they are removed from their parent
      if ( discard )
        ( stack.empty() ? root : *stack.back() ).pop_back();
    }
  }

  // Add the tag itself under the innermost open tag, or under the root
  {
    Node & parent = stack.empty() ? root : *stack.back();

    parent.push_back( Node( Node::Tag(), name, attrs ) );
    stack.push_back( &parent.back() );
  }

  // Reopen the closed tags, nesting each one inside the previous
  for( ; !nodesToReopen.empty(); nodesToReopen.pop_back() )
  {
    Node & parent = stack.empty() ? root : *stack.back();

    parent.push_back( nodesToReopen.back() );
    stack.push_back( &parent.back() );
  }
}
|
|
|
|
|
2009-05-07 14:54:26 +00:00
|
|
|
// Closes the innermost open tag matching name (a plain 'm' also matches an
// open 'mX'). Tags opened after it are closed along with it and then
// reopened, so that they stay in effect. If no open tag matches, nothing is
// changed and, when warn is set, a warning is logged.
void ArticleDom::closeTag( wstring const & name,
                           list< Node * > & stack,
                           bool warn )
{
  // See whether there is anything to close at all
  bool hasOpening = false;

  for( list< Node * >::reverse_iterator n = stack.rbegin();
       n != stack.rend() && !hasOpening; ++n )
    hasOpening = (*n)->tagName == name || checkM( (*n)->tagName, name );

  if ( !hasOpening )
  {
    if ( !warn )
      return;

    if( dictionaryName.empty() )
      gdWarning( "No corresponding opening tag for closing tag \"%s\" found.",
                 gd::toQString( name ).toUtf8().data() );
    else
      gdWarning( "No corresponding opening tag for closing tag \"%s\" found in \"%s\", article \"%s\".",
                 gd::toQString( name ).toUtf8().data(), dictionaryName.c_str(),
                 gd::toQString( headword ).toUtf8().data() );

    return;
  }

  // Unwind the stack down to and including the matching tag, remembering
  // name and attributes of each other tag passed on the way
  list< Node > nodesToReopen;

  while( !stack.empty() )
  {
    Node * top = stack.back();

    bool const found = top->tagName == name || checkM( top->tagName, name );

    if ( !found )
      nodesToReopen.push_back( Node( Node::Tag(), top->tagName, top->tagAttrs ) );

    // Empty nodes except the [br] tag are of no use and get removed
    bool const discard = top->empty() && top->tagName != GD_NATIVE_TO_WS( L"br" );

    stack.pop_back();

    if ( discard )
      ( stack.empty() ? root : *stack.back() ).pop_back();

    if ( found )
      break;
  }

  // Reopen what was closed on the way, nesting each tag inside the previous
  for( ; !nodesToReopen.empty(); nodesToReopen.pop_back() )
  {
    Node & parent = stack.empty() ? root : *stack.back();

    parent.push_back( nodesToReopen.back() );
    stack.push_back( &parent.back() );
  }
}
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
void ArticleDom::nextChar() throw( eot )
|
|
|
|
{
|
|
|
|
if ( !*stringPos )
|
|
|
|
throw eot();
|
|
|
|
|
|
|
|
ch = *stringPos++;
|
|
|
|
|
|
|
|
if ( ch == L'\\' )
|
|
|
|
{
|
|
|
|
if ( !*stringPos )
|
|
|
|
throw eot();
|
|
|
|
|
|
|
|
ch = *stringPos++;
|
|
|
|
|
|
|
|
escaped = true;
|
|
|
|
}
|
2009-06-06 12:01:59 +00:00
|
|
|
else
|
|
|
|
if ( ch == L'[' && *stringPos == L'[' )
|
|
|
|
{
|
|
|
|
++stringPos;
|
|
|
|
escaped = true;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
if ( ch == L']' && *stringPos == L']' )
|
|
|
|
{
|
|
|
|
++stringPos;
|
|
|
|
escaped = true;
|
|
|
|
}
|
2009-01-28 20:55:45 +00:00
|
|
|
else
|
|
|
|
escaped = false;
|
2017-07-03 15:12:22 +00:00
|
|
|
|
|
|
|
if( ch == '\n' || ch == '\r' )
|
|
|
|
lineStartPos = stringPos;
|
2009-01-28 20:55:45 +00:00
|
|
|
}
|
|
|
|
|
2017-07-04 14:41:38 +00:00
|
|
|
bool ArticleDom::atSignFirstInLine()
|
2017-07-03 15:12:22 +00:00
|
|
|
{
|
2017-07-04 14:41:38 +00:00
|
|
|
// Check if '@' sign is first after '\n', leading spaces and dsl tags
|
2017-07-03 15:12:22 +00:00
|
|
|
if( stringPos <= lineStartPos )
|
|
|
|
return true;
|
2017-07-04 14:41:38 +00:00
|
|
|
|
|
|
|
return isAtSignFirst( wstring( lineStartPos ) );
|
2017-07-03 15:12:22 +00:00
|
|
|
}
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
/////////////// DslScanner
|
|
|
|
|
|
|
|
// Opens the DSL file fileName, works out its encoding and reads its header.
//
// The encoding is taken from the byte order mark if there is one, else
// guessed from the first two bytes. For 8-bit files Windows-1251 is assumed
// until a #SOURCE_CODE_PAGE directive says otherwise. The #NAME,
// #INDEX_LANGUAGE, #CONTENTS_LANGUAGE and #SOUND_DICTIONARY directives are
// stored in dictionaryName, langFrom, langTo and soundDictionary. On return
// the file is positioned at the first line which is not a #-directive.
//
// Throws exCantOpen if the file cannot be opened or rewound,
// exMalformedDslFile if it is too short, has a broken UTF-8 mark, ends within
// the header or has a directive without a quoted argument, and
// exUnknownCodePage for an unrecognized #SOURCE_CODE_PAGE value. The file is
// closed again before any exception leaves the constructor.
DslScanner::DslScanner( string const & fileName ) throw( Ex, Iconv::Ex ):
  encoding( Windows1252 ), iconv( encoding ), readBufferPtr( readBuffer ),
  readBufferLeft( 0 ), wcharBuffer( 64 ), linesRead( 0 )
{
  // Since .dz is backwards-compatible with .gz, we use gz- functions to
  // read it -- they are much nicer than the dict_data- ones.

  f = gd_gzopen( fileName.c_str() );

  if ( !f )
    throw exCantOpen( fileName );

  // The destructor does not run if the constructor throws, so everything
  // from here on is guarded to make sure the file gets closed on failure.
  try
  {
    // Now try guessing the encoding by reading the first two bytes

    unsigned char firstBytes[ 2 ];

    if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) )
    {
      // Apparently the file's too short
      throw exMalformedDslFile( fileName );
    }

    bool needExactEncoding = false;

    // If the file begins with the dedicated Unicode marker, we just consume
    // it. If, on the other hand, it's not, we return the bytes back
    if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
      encoding = Utf16LE;
    else
    if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
      encoding = Utf16BE;
    else
    if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
    {
      // Looks like Utf8, read one more byte
      if ( gzread( f, firstBytes, 1 ) != 1 || firstBytes[ 0 ] != 0xBF )
      {
        // Either the file's too short, or the BOM is weird
        throw exMalformedDslFile( fileName );
      }

      encoding = Utf8;
    }
    else
    {
      if ( firstBytes[ 0 ] && !firstBytes[ 1 ] )
        encoding = Utf16LE;
      else
      if ( !firstBytes[ 0 ] && firstBytes[ 1 ] )
        encoding = Utf16BE;
      else
      {
        // Ok, this doesn't look like 16-bit Unicode. We will start with a
        // 8-bit encoding with an intent to find out the exact one from
        // the header.
        needExactEncoding = true;
        encoding = Windows1251;
      }

      // The two bytes read were not a marker, so they are part of the text
      if ( gzrewind( f ) )
        throw exCantOpen( fileName );
    }

    iconv.reinit( encoding );

    // We now can use our own readNextLine() function

    wstring str;
    size_t offset;

    for( ; ; )
    {
      if ( !readNextLine( str, offset ) )
        throw exMalformedDslFile( fileName );

      if ( str.empty() || str[ 0 ] != L'#' )
        break;

      bool isName = false;
      bool isLangFrom = false;
      bool isLangTo = false;
      bool isSoundDict = false;

      if ( !str.compare( 0, 5, GD_NATIVE_TO_WS( L"#NAME" ), 5 ) )
        isName = true;
      else
      if ( !str.compare( 0, 15, GD_NATIVE_TO_WS( L"#INDEX_LANGUAGE" ), 15 ) )
        isLangFrom = true;
      else
      if ( !str.compare( 0, 18, GD_NATIVE_TO_WS( L"#CONTENTS_LANGUAGE" ), 18 ) )
        isLangTo = true;
      else
      if ( !str.compare( 0, 17, GD_NATIVE_TO_WS( L"#SOUND_DICTIONARY" ), 17 ) )
        isSoundDict = true;
      else
      if ( str.compare( 0, 17, GD_NATIVE_TO_WS( L"#SOURCE_CODE_PAGE" ), 17 ) )
        continue; // Some other directive, of no interest here

      // Locate the argument: the text between the first and the last quote

      size_t beg = str.find_first_of( L'"' );

      if ( beg == wstring::npos )
        throw exMalformedDslFile( fileName );

      size_t end = str.find_last_of( L'"' );

      if ( end == beg )
        throw exMalformedDslFile( fileName );

      wstring arg( str, beg + 1, end - beg - 1 );

      if ( isName )
        dictionaryName = arg;
      else if ( isLangFrom )
        langFrom = arg;
      else if ( isLangTo )
        langTo = arg;
      else if ( isSoundDict )
        soundDictionary = arg;
      else
      {
        // The encoding
        if ( !needExactEncoding )
        {
          // We don't need that!
          GD_FDPRINTF( stderr, "Warning: encoding was specified in a Unicode file, ignoring.\n" );
        }
        else
        if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Latin" ) ) )
          encoding = Windows1252;
        else
        if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Cyrillic" ) ) )
          encoding = Windows1251;
        else
        if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"EasternEuropean" ) ) )
          encoding = Windows1250;
        else
          throw exUnknownCodePage();
      }
    }

    // The loop will always end up reading a line which was not a #-directive.
    // We need to rewind to that line so readNextLine() would return it again
    // next time it's called. To do that, we just use the slow gzseek() and
    // empty the read buffer.
    if( gzdirect( f ) ) // Without this ZLib 1.2.7 gzread() return 0
      gzrewind( f );    // after gzseek() call on uncompressed files
    gzseek( f, offset, SEEK_SET );
    readBufferPtr = readBuffer;
    readBufferLeft = 0;

    // The header may have named a different 8-bit encoding for what follows
    if ( needExactEncoding )
      iconv.reinit( encoding );
  }
  catch( ... )
  {
    gzclose( f );
    throw;
  }
}
|
|
|
|
|
|
|
|
// Closes the file which the constructor opened.
DslScanner::~DslScanner() throw()
{
  gzclose( f );
}
|
|
|
|
|
|
|
|
/// Reads the next line from the file and converts it to a wide string.
///
/// @param out     Receives the line, without its trailing "\n" or "\r\n".
///                Left untouched when the function returns false.
/// @param offset  Receives the position of the line's first byte, computed
///                as the current gztell() position minus the bytes still
///                waiting in the read buffer.
/// @return true if a line was produced (including a last line that has no
///         terminating newline), false if no more data was available.
/// @throws exCantReadDslFile if gzread() reports an error.
/// @throws exEncodingError   if a character could not be converted.
bool DslScanner::readNextLine( wstring & out, size_t & offset ) throw( Ex,
                                                                       Iconv::Ex )
{
  offset = (size_t)( gztell( f ) - readBufferLeft );

  // For now we just read one char at a time
  size_t const bytesPerChar = distanceToBytes( 1 );

  // Amount by which the output buffer is enlarged when it runs out of room
  size_t const growStep = 64;

  size_t roomLeft = wcharBuffer.size();
  wchar * writePos = &wcharBuffer.front();

  for( ; ; )
  {
    // Make sure there are bytes to convert. To convert one char, we need at
    // most 4 bytes, so refill whenever fewer than that remain.
    if ( readBufferLeft < 4 && !gzeof( f ) )
    {
      // To avoid having to deal with ring logic, we move the remaining bytes
      // to the beginning
      memmove( readBuffer, readBufferPtr, readBufferLeft );

      // Read some more bytes to readBuffer
      int const bytesRead = gzread( f, readBuffer + readBufferLeft,
                                    sizeof( readBuffer ) - readBufferLeft );

      if ( bytesRead == -1 )
        throw exCantReadDslFile();

      readBufferPtr = readBuffer;
      readBufferLeft += (size_t) bytesRead;
    }

    if ( readBufferLeft < bytesPerChar )
    {
      // No more data. Return what we've got so far, forget the last byte if
      // it was a 16-bit Unicode and a file had an odd number of bytes.
      readBufferLeft = 0;

      wchar * const lineStart = &wcharBuffer.front();

      if ( writePos == lineStart )
        return false; // Nothing was accumulated at all

      // If there was a stray \r, remove it
      if ( writePos[ -1 ] == L'\r' )
        --writePos;

      out = wstring( lineStart, writePos - lineStart );

      ++linesRead;

      return true;
    }

    // Make sure there is room to write. With 16-bit wchars, 2 is needed for
    // a surrogate pair.
    if ( roomLeft < 2 )
    {
      // resize() may reallocate, so remember how far we have written and
      // recompute the write pointer from the new storage afterwards.
      size_t const used = wcharBuffer.size() - roomLeft;

      wcharBuffer.resize( wcharBuffer.size() + growStep );
      writePos = &wcharBuffer.front() + used;
      roomLeft += growStep;
    }

    // Ok, now convert one char
    size_t outBytesLeft = sizeof( wchar );

    Iconv::Result status =
      iconv.convert( (void const *&)readBufferPtr, readBufferLeft,
                     (void *&)writePos, outBytesLeft );

    if ( status == Iconv::NeedMoreOut && outBytesLeft == sizeof( wchar ) )
    {
      // Seems to be a surrogate pair with a 16-bit target wchar:
      // retry with room for two output units.
      outBytesLeft *= 2;
      status = iconv.convert( (void const *&)readBufferPtr, readBufferLeft,
                              (void *&)writePos, outBytesLeft );
      --roomLeft; // Complements the decrement below
    }

    // Only the remaining output space is checked here, not the conversion
    // status. When the input holds one complete character followed by the
    // beginning of another (e.g. a 3-byte UTF-8 em-dash plus the first byte
    // of a second one), iconv() on Linux reports NeedMoreOut (E2BIG) while
    // iconv() on Windows reports NeedMoreIn (EINVAL) -- both make sense, the
    // difference is only whether the input or the output state is checked
    // first. In either case the first character has been converted. The only
    // condition that requires throwing an encoding error is a non-empty
    // outBytesLeft, which means that iconv didn't convert anything.
    if ( outBytesLeft )
      throw exEncodingError();

    --roomLeft;

    // Have we got \n?
    if ( writePos[ -1 ] != L'\n' )
      continue;

    --writePos;

    wchar * const lineStart = &wcharBuffer.front();

    // Now kill a \r if there is one, and return the result.
    if ( writePos != lineStart && writePos[ -1 ] == L'\r' )
      --writePos;

    out = wstring( lineStart, writePos - lineStart );

    ++linesRead;

    return true;
  }
}
|
|
|
|
|
2013-07-07 10:48:37 +00:00
|
|
|
/// Reads the next logical line with "{{ ... }}" comments stripped.
///
/// Physical lines are read with readNextLine() and passed through
/// stripComments(). While a comment is still open at the end of a physical
/// line, further lines are read and their uncommented parts appended, so one
/// call may consume several physical lines.
///
/// @param out     Receives the concatenated, comment-free text. It is cleared
///                on entry and may hold partial text when false is returned.
/// @param offset  Receives the offset reported by readNextLine() for the
///                first physical line read by this call.
/// @return false as soon as readNextLine() returns false, true otherwise.
bool DslScanner::readNextLineWithoutComments( wstring & out, size_t & offset )
  throw( Ex, Iconv::Ex )
{
  wstring str;
  bool commentToNextLine = false;
  // An explicit flag is used instead of testing "offset == 0": zero is a
  // valid offset for the first line of a file, and using it as the "not set
  // yet" marker let a continuation line overwrite the recorded offset.
  bool offsetRecorded = false;
  size_t currentOffset;

  out.erase();
  offset = 0;

  do
  {
    bool b = readNextLine( str, currentOffset );

    if( !offsetRecorded )
    {
      offset = currentOffset;
      offsetRecorded = true;
    }

    if( !b )
      return false;

    stripComments( str, commentToNextLine );

    out += str;
  }
  while( commentToNextLine );

  return true;
}
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
/////////////// DslIconv
|
|
|
|
|
|
|
|
/// Constructs the base Iconv with Iconv::GdWchar and the iconv name of
/// the given DSL encoding.
DslIconv::DslIconv( DslEncoding e ) throw( Iconv::Ex ):
  Iconv( Iconv::GdWchar, getEncodingNameFor( e ) )
{
}
|
|
|
|
|
|
|
|
/// Re-initializes the base Iconv with Iconv::GdWchar and the iconv name of
/// the given DSL encoding.
void DslIconv::reinit( DslEncoding e ) throw( Iconv::Ex )
{
  char const * const encodingName = getEncodingNameFor( e );

  Iconv::reinit( Iconv::GdWchar, encodingName );
}
|
|
|
|
|
|
|
|
/// Maps a DslEncoding value to the encoding name passed to iconv.
/// Windows1250 and any value not listed explicitly yield "WINDOWS-1250".
char const * DslIconv::getEncodingNameFor( DslEncoding e )
{
  if ( e == Utf16LE )
    return "UTF-16LE";

  if ( e == Utf16BE )
    return "UTF-16BE";

  if ( e == Windows1252 )
    return "WINDOWS-1252";

  if ( e == Windows1251 )
    return "WINDOWS-1251";

  if ( e == Details::Utf8 )
    return "UTF-8";

  // Windows1250 and everything else
  return "WINDOWS-1250";
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Processes the brace-delimited "{...}" parts of a string in place.
///
/// When `strip` is false only the brace characters themselves are removed
/// and the text between them is kept. When `strip` is true each outermost
/// brace pair is removed together with everything inside it; if braces are
/// still open at the end of the string, everything from the outermost open
/// brace onwards is removed and a warning is printed.
///
/// A backslash escapes the character after it: both are skipped untouched.
/// A closing brace with no matching opening brace is removed and a warning
/// is printed, regardless of `strip`.
void processUnsortedParts( wstring & str, bool strip )
{
  int depth = 0;         // Current brace nesting level
  size_t blockStart = 0; // Position of the outermost opening brace
  size_t pos = 0;

  while( pos < str.size() )
  {
    wchar const current = str[ pos ];

    if ( current == L'\\' )
    {
      // Escape code
      pos += 2;
      continue;
    }

    if ( current != '{' && current != '}' )
    {
      ++pos;
      continue;
    }

    if ( current == '{' )
    {
      ++depth;

      if ( !strip )
      {
        // Just remove it; pos now addresses the following character
        str.erase( pos, 1 );
      }
      else
      {
        // First opening brace. Save this position, we will be erasing the
        // whole range when we encounter the last closing brace.
        if ( depth == 1 )
          blockStart = pos;

        ++pos;
      }

      continue;
    }

    // From here on we are looking at a closing brace
    --depth;

    if ( depth < 0 )
    {
      GD_FDPRINTF( stderr, "Warning: an unmatched closing brace was encountered.\n" );
      depth = 0;
      // But we remove that thing either way
      str.erase( pos, 1 );
    }
    else
    if ( !strip )
    {
      // Just remove it
      str.erase( pos, 1 );
    }
    else
    if ( depth == 0 )
    {
      // The final closing brace -- we can erase the whole range now.
      str.erase( blockStart, pos - blockStart + 1 );
      pos = blockStart;
    }
    else
      ++pos;
  }

  if ( strip && depth )
  {
    GD_FDPRINTF( stderr, "Warning: unclosed brace(s) encountered.\n" );
    str.erase( blockStart );
  }
}
|
|
|
|
|
2012-09-07 21:32:49 +00:00
|
|
|
/// Expands the optional "(...)" parts of a headword into its variants.
///
/// For each opening parenthesis two variants are followed: one where the
/// whole parenthesized part is cut out (handled by a recursive call) and one
/// where only the parentheses themselves are removed (handled by continuing
/// the scan of `str`). A backslash escapes the character after it. At most
/// 32 variants are collected per top-level call.
///
/// @param str             The headword; modified in place. After a full scan
///                        it holds the variant with all optional parts kept.
/// @param result          List receiving the variants. A top-level call
///                        collects them locally and merge()s them in at the
///                        end; a recursive call appends to it directly.
/// @param x               Position in `str` at which to start scanning.
/// @param inside_recurse  true for calls made by this function itself.
void expandOptionalParts( wstring & str, list< wstring > * result,
                          size_t x, bool inside_recurse )
{
  // Limit the amount of results to avoid excessive resource consumption
  size_t const maxVariants = 32;

  list< wstring > collected;
  list< wstring > * const sink = inside_recurse ? result : &collected;

  while( x < str.size() )
  {
    wstring::value_type const current = str[ x ];

    if ( current == L'\\' )
    {
      // Escape code
      x += 2;
      continue;
    }

    if ( current == L')' )
    {
      // Closing paren doesn't mean much -- just erase it
      str.erase( x, 1 );
      continue;
    }

    if ( current != L'(' )
    {
      ++x;
      continue;
    }

    // An opening paren. First, handle the case where this block is removed:
    // look for the matching closing paren.
    int depth = 1;
    size_t scan = x + 1;

    while( scan < str.size() )
    {
      wstring::value_type const inner = str[ scan ];

      if ( inner == L'\\' )
        ++scan; // Escape code -- skip the escaped char as well
      else
      if ( inner == L'(' )
        ++depth;
      else
      if ( inner == L')' && --depth == 0 )
        break;

      ++scan;
    }

    if ( depth == 0 )
    {
      // Now that the closing parenthesis is found, cut the whole thing out
      // and expand the rest. Only do so for non-empty cases.
      if ( scan != x + 1 )
      {
        wstring removed( str, 0, x );
        removed.append( str, scan + 1, str.size() - scan - 1 );

        expandOptionalParts( removed, sink, x, true );
      }
    }
    else
    if ( x != str.size() - 1 )
    {
      // Closing paren not found? Chop it.
      if ( sink->size() >= maxVariants )
      {
        if( !inside_recurse )
          result->merge( collected );
        return;
      }

      sink->push_back( wstring( str, 0, x ) );
    }

    // Now, handling the case where it is kept -- we just erase
    // the paren and go on
    str.erase( x, 1 );
  }

  if ( sink->size() < maxVariants )
    sink->push_back( str );

  if( !inside_recurse )
    result->merge( collected );
}
|
|
|
|
|
2013-07-07 10:48:37 +00:00
|
|
|
static const wstring openBraces( GD_NATIVE_TO_WS( L"{{" ) );
|
|
|
|
static const wstring closeBraces( GD_NATIVE_TO_WS( L"}}" ) );
|
|
|
|
|
|
|
|
void stripComments( wstring & str, bool & nextLine )
|
|
|
|
{
|
|
|
|
string::size_type n = 0, n2 = 0;
|
|
|
|
|
|
|
|
for( ; ; )
|
|
|
|
{
|
|
|
|
if( nextLine )
|
|
|
|
{
|
|
|
|
n = str.find( closeBraces, n2 );
|
|
|
|
if( n == string::npos )
|
|
|
|
{
|
|
|
|
str.erase( n2, n );
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
str.erase( n2, n - n2 + 2 );
|
|
|
|
nextLine = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
n = str.find( openBraces, n2 );
|
|
|
|
if( n == string::npos )
|
|
|
|
return;
|
|
|
|
nextLine = true;
|
|
|
|
n2 = n;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
/// Replaces every unescaped '~' in `str` with the tilde replacement.
///
/// The replacement is `tildeReplacement` passed through
/// Folding::trimWhitespace(). A backslash escapes the character after it.
/// The sequence "^~" (with the caret itself not preceded by a backslash) is
/// replaced as a whole, and the first character of the inserted text has its
/// case toggled.
///
/// @param str               The string to expand, modified in place.
/// @param tildeReplacement  The text to substitute for each tilde.
void expandTildes( wstring & str, wstring const & tildeReplacement )
{
  wstring tildeValue = Folding::trimWhitespace( tildeReplacement );

  for( size_t x = 0; x < str.size(); )
    if ( str[ x ] == L'\\' )
      x += 2; // Escape code -- skip the escaped char as well
    else
    if ( str[ x ] == L'~' )
    {
      if( x > 0 && str[ x - 1 ] == '^' && ( x < 2 || str[ x - 2 ] != '\\' ) )
      {
        str.replace( x - 1, 2, tildeValue );

        // Toggle the case of the first inserted character. With an empty
        // replacement nothing was inserted: str[ x - 1 ] would then be
        // whatever followed "^~" (or the position one past the end), which
        // must not be modified.
        if( !tildeValue.empty() )
          str[ x - 1 ] = QChar( str[ x - 1 ] ).isUpper() ? QChar::toLower( (uint)str[ x - 1 ] )
                                                         : QChar::toUpper( (uint)str[ x - 1 ] );

        x = x - 1 + tildeValue.size();
      }
      else
      {
        str.replace( x, 1, tildeValue );
        x += tildeValue.size();
      }
    }
    else
      ++x;
}
|
|
|
|
|
|
|
|
/// Removes DSL escaping from a string in place: every backslash is dropped
/// and the character following it is kept verbatim (so an escaped backslash
/// yields a single backslash). A trailing lone backslash is dropped.
void unescapeDsl( wstring & str )
{
  wstring unescaped;
  unescaped.reserve( str.size() );

  for( size_t pos = 0; pos < str.size(); ++pos )
  {
    if ( str[ pos ] == L'\\' )
    {
      // Step over the backslash; the next char is copied without being
      // examined. Nothing to copy if the backslash was the last char.
      if ( ++pos == str.size() )
        break;
    }

    unescaped.push_back( str[ pos ] );
  }

  str.swap( unescaped );
}
|
|
|
|
|
2009-06-06 16:02:52 +00:00
|
|
|
/// Normalizes the spaces of a headword in place: every run of consecutive
/// space characters is collapsed to a single space, then one trailing and
/// one leading space (if present) are removed. Only L' ' is treated as a
/// space; other whitespace is left alone.
void normalizeHeadword( wstring & str )
{
  wstring normalized;
  normalized.reserve( str.size() );

  // Copy the string, dropping any space that directly follows another one
  for( size_t pos = 0; pos < str.size(); ++pos )
  {
    bool const repeatedSpace = str[ pos ] == L' ' &&
                               !normalized.empty() &&
                               normalized[ normalized.size() - 1 ] == L' ';

    if ( !repeatedSpace )
      normalized.push_back( str[ pos ] );
  }

  // Runs are single spaces by now, so trimming one char per side suffices
  if( !normalized.empty() && normalized[ normalized.size() - 1 ] == L' ' )
    normalized.erase( normalized.size() - 1, 1 );

  if( !normalized.empty() && normalized[ 0 ] == L' ' )
    normalized.erase( 0, 1 );

  str.swap( normalized );
}
|
|
|
|
|
2009-04-23 11:43:20 +00:00
|
|
|
namespace
{
  /// Removes `ending` from the end of `where` if `where` ends with it and is
  /// strictly longer than it; otherwise leaves `where` untouched.
  void cutEnding( wstring & where, wstring const & ending )
  {
    // A string equal to (or shorter than) the ending is never cut
    if ( where.size() <= ending.size() )
      return;

    size_t const tailStart = where.size() - ending.size();

    if ( where.compare( tailStart, ending.size(), ending ) != 0 )
      return;

    where.erase( tailStart );
  }
}
|
|
|
|
|
|
|
|
/// Converts a DSL language name to a language id.
///
/// The name is passed through Folding::apply(), a fixed set of endings is
/// cut off one after another in a fixed order, and the result is looked up
/// with LangCoder::findIdForLanguage().
quint32 dslLanguageToId( wstring const & name )
{
  // Any of those endings are to be removed; they are tried in this order
  static wstring const endings[] =
  {
    wstring( GD_NATIVE_TO_WS( L"newspelling" ) ),
    wstring( GD_NATIVE_TO_WS( L"standard" ) ),
    wstring( GD_NATIVE_TO_WS( L"modernsort" ) ),
    wstring( GD_NATIVE_TO_WS( L"traditionalsort" ) ),
    wstring( GD_NATIVE_TO_WS( L"prc" ) )
  };

  wstring nameStripped = Folding::apply( name );

  for( size_t i = 0; i < sizeof( endings ) / sizeof( endings[ 0 ] ); ++i )
    cutEnding( nameStripped, endings[ i ] );

  return LangCoder::findIdForLanguage( nameStripped );
}
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
}
|
|
|
|
}
|