2012-02-20 21:47:14 +00:00
|
|
|
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
|
2009-01-28 20:55:45 +00:00
|
|
|
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
|
|
|
|
|
|
|
#ifndef __DSL_DETAILS_HH_INCLUDED__
|
|
|
|
#define __DSL_DETAILS_HH_INCLUDED__
|
|
|
|
|
|
|
|
#include <string>
|
|
|
|
#include <list>
|
|
|
|
#include <vector>
|
|
|
|
#include <zlib.h>
|
|
|
|
#include "dictionary.hh"
|
|
|
|
#include "iconv.hh"
|
|
|
|
|
|
|
|
// Implementation details for Dsl, not part of its interface
|
|
|
|
namespace Dsl {
|
|
|
|
namespace Details {
|
|
|
|
|
|
|
|
using std::string;
|
2009-04-18 17:20:12 +00:00
|
|
|
using gd::wstring;
|
|
|
|
using gd::wchar;
|
2009-01-28 20:55:45 +00:00
|
|
|
using std::list;
|
|
|
|
using std::vector;
|
|
|
|
|
|
|
|
/// The set of text encodings a .dsl file may be stored in.
enum DslEncoding
{
  Utf16LE     = 0,
  Utf16BE     = 1,
  Windows1252 = 2,
  Windows1251 = 3,
  Windows1250 = 4,
  /// Not a standard .dsl encoding, but an extension: it is recognized only
  /// when the file starts with the UTF8 BOM.
  Utf8        = 5
};
|
|
|
|
|
2016-12-22 19:58:01 +00:00
|
|
|
/// Associates a numeric DSL language id with a two-letter language code.
struct DSLLangCode
{
  /// Numeric language identifier
  int code_id;

  /// ISO 639-1 code: two characters plus room for the terminating zero
  char code[ 3 ];
};
|
|
|
|
|
|
|
|
/// Looks up the language code string which corresponds to the given numeric
/// DSL language id.
/// NOTE(review): presumably the result is the ISO 639-1 code stored in a
/// matching DSLLangCode entry; what is returned for an unknown id is not
/// visible from this header -- confirm against the definition.
string findCodeForDslId( int id );
|
2009-01-28 20:55:45 +00:00
|
|
|
|
2017-07-04 14:41:38 +00:00
|
|
|
/// NOTE(review): judging by the name, this tells whether an '@' sign is the
/// first meaningful character of str; whether leading whitespace is skipped
/// is not visible from this header -- confirm against the definition.
bool isAtSignFirst( wstring const & str );
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
/// Parses the DSL language, representing it in its structural DOM form.
|
|
|
|
struct ArticleDom
|
|
|
|
{
|
|
|
|
struct Node: public list< Node >
|
|
|
|
{
|
|
|
|
bool isTag; // true if it is a tag with subnodes, false if it's a leaf text
|
|
|
|
// data.
|
|
|
|
// Those are only used if isTag is true
|
|
|
|
wstring tagName;
|
|
|
|
wstring tagAttrs;
|
|
|
|
wstring text; // This is only used if isTag is false
|
|
|
|
|
|
|
|
class Text {};
|
|
|
|
class Tag {};
|
|
|
|
|
|
|
|
Node( Tag, wstring const & name, wstring const & attrs ): isTag( true ),
|
|
|
|
tagName( name ), tagAttrs( attrs )
|
|
|
|
{}
|
|
|
|
|
|
|
|
Node( Text, wstring const & text_ ): isTag( false ), text( text_ )
|
|
|
|
{}
|
|
|
|
|
|
|
|
/// Concatenates all childen text nodes recursively to form all text
|
|
|
|
/// the node contains stripped of any markup.
|
2014-04-16 16:18:28 +00:00
|
|
|
wstring renderAsText( bool stripTrsTag = false ) const;
|
2009-01-28 20:55:45 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/// Does the parse at construction. Refer to the 'root' member variable
|
|
|
|
/// afterwards.
|
2014-04-18 12:34:44 +00:00
|
|
|
ArticleDom( wstring const &, string const & dictName = string(),
|
|
|
|
wstring const & headword_ = wstring() );
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
/// Root of DOM's tree
|
|
|
|
Node root;
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
2013-11-08 12:53:22 +00:00
|
|
|
void openTag( wstring const & name, wstring const & attr, list< Node * > & stack );
|
|
|
|
|
2009-05-07 14:54:26 +00:00
|
|
|
void closeTag( wstring const & name, list< Node * > & stack,
|
|
|
|
bool warn = true );
|
|
|
|
|
2017-07-04 14:41:38 +00:00
|
|
|
bool atSignFirstInLine();
|
2017-07-03 15:12:22 +00:00
|
|
|
|
|
|
|
wchar const * stringPos, * lineStartPos;
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
class eot {};
|
|
|
|
|
2009-04-18 17:20:12 +00:00
|
|
|
wchar ch;
|
2009-01-28 20:55:45 +00:00
|
|
|
bool escaped;
|
2009-05-07 12:30:36 +00:00
|
|
|
unsigned transcriptionCount; // >0 = inside a [t] tag
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
void nextChar() throw( eot );
|
2014-04-18 12:34:44 +00:00
|
|
|
|
|
|
|
/// Infomation for diagnostic purposes
|
|
|
|
string dictionaryName;
|
|
|
|
wstring headword;
|
2009-01-28 20:55:45 +00:00
|
|
|
};
|
|
|
|
|
2009-04-18 17:20:12 +00:00
|
|
|
/// A adapted version of Iconv which takes Dsl encoding and decodes to wchar.
|
2009-01-28 20:55:45 +00:00
|
|
|
class DslIconv: public Iconv
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
DslIconv( DslEncoding ) throw( Iconv::Ex );
|
|
|
|
void reinit( DslEncoding ) throw( Iconv::Ex );
|
|
|
|
|
|
|
|
/// Returns a name to be passed to iconv for the given dsl encoding.
|
|
|
|
static char const * getEncodingNameFor( DslEncoding );
|
|
|
|
};
|
|
|
|
|
|
|
|
/// Opens the .dsl or .dsl.dz file and allows line-by-line reading. Auto-detects
|
|
|
|
/// the encoding, and reads all headers by itself.
|
|
|
|
class DslScanner
|
|
|
|
{
|
|
|
|
gzFile f;
|
|
|
|
DslEncoding encoding;
|
|
|
|
DslIconv iconv;
|
|
|
|
wstring dictionaryName;
|
2009-04-23 11:43:20 +00:00
|
|
|
wstring langFrom, langTo;
|
2017-07-09 17:15:35 +00:00
|
|
|
wstring soundDictionary;
|
2009-01-28 20:55:45 +00:00
|
|
|
char readBuffer[ 65536 ];
|
|
|
|
char * readBufferPtr;
|
|
|
|
size_t readBufferLeft;
|
2009-04-18 17:20:12 +00:00
|
|
|
vector< wchar > wcharBuffer;
|
2009-08-01 10:05:24 +00:00
|
|
|
unsigned linesRead;
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
DEF_EX( Ex, "Dsl scanner exception", Dictionary::Ex )
|
|
|
|
DEF_EX_STR( exCantOpen, "Can't open .dsl file", Ex )
|
|
|
|
DEF_EX( exCantReadDslFile, "Can't read .dsl file", Ex )
|
|
|
|
DEF_EX_STR( exMalformedDslFile, "The .dsl file is malformed:", Ex )
|
|
|
|
DEF_EX( exUnknownCodePage, "The .dsl file specified an unknown code page", Ex )
|
|
|
|
DEF_EX( exEncodingError, "Encoding error", Ex ) // Should never happen really
|
|
|
|
|
|
|
|
DslScanner( string const & fileName ) throw( Ex, Iconv::Ex );
|
|
|
|
~DslScanner() throw();
|
|
|
|
|
|
|
|
/// Returns the detected encoding of this file.
|
|
|
|
DslEncoding getEncoding() const
|
|
|
|
{ return encoding; }
|
|
|
|
|
|
|
|
/// Returns the dictionary's name, as was read from file's headers.
|
|
|
|
wstring const & getDictionaryName() const
|
|
|
|
{ return dictionaryName; }
|
|
|
|
|
2009-04-22 21:37:32 +00:00
|
|
|
/// Returns the dictionary's source language, as was read from file's headers.
|
2009-04-23 11:43:20 +00:00
|
|
|
wstring const & getLangFrom() const
|
2009-04-22 21:37:32 +00:00
|
|
|
{ return langFrom; }
|
|
|
|
|
|
|
|
/// Returns the dictionary's target language, as was read from file's headers.
|
2009-04-23 11:43:20 +00:00
|
|
|
wstring const & getLangTo() const
|
2009-04-22 21:37:32 +00:00
|
|
|
{ return langTo; }
|
|
|
|
|
2017-07-09 17:15:35 +00:00
|
|
|
/// Returns the preferred external dictionary with sounds, as was read from file's headers.
|
|
|
|
wstring const & getSoundDictionaryName() const
|
|
|
|
{ return soundDictionary; }
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
/// Reads next line from the file. Returns true if reading succeeded --
|
|
|
|
/// the string gets stored in the one passed, along with its physical
|
|
|
|
/// file offset in the file (the uncompressed one if the file is compressed).
|
|
|
|
/// If end of file is reached, false is returned.
|
|
|
|
/// Reading begins from the first line after the headers (ones which start
|
|
|
|
/// with #).
|
|
|
|
bool readNextLine( wstring &, size_t & offset ) throw( Ex, Iconv::Ex );
|
|
|
|
|
2013-07-07 10:48:37 +00:00
|
|
|
/// Similar readNextLine but strip all DSL comments {{...}}
|
|
|
|
bool readNextLineWithoutComments( wstring &, size_t & offset ) throw( Ex, Iconv::Ex );
|
|
|
|
|
2009-08-01 10:05:24 +00:00
|
|
|
/// Returns the number of lines read so far from the file.
|
|
|
|
unsigned getLinesRead() const
|
|
|
|
{ return linesRead; }
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
/// Converts the given number of characters to the number of bytes they
|
|
|
|
/// would occupy in the file, knowing its encoding. It's possible to know
|
|
|
|
/// that because no multibyte encodings are supported in .dsls.
|
|
|
|
inline size_t distanceToBytes( size_t ) const;
|
|
|
|
};
|
|
|
|
|
|
|
|
/// This function either removes parts of string enclosed in braces, or leaves
|
|
|
|
/// them intact. The braces themselves are removed always, though.
|
|
|
|
void processUnsortedParts( wstring & str, bool strip );
|
|
|
|
|
|
|
|
/// Expands optional parts of a headword (ones marked with parentheses),
|
|
|
|
/// producing all possible combinations where they are present or absent.
|
2012-09-07 21:32:49 +00:00
|
|
|
void expandOptionalParts( wstring & str, list< wstring > * result,
|
|
|
|
size_t x = 0, bool inside_recurse = false );
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
/// Expands all unescaped tildes, inserting tildeReplacement text instead of
|
|
|
|
/// them.
|
|
|
|
void expandTildes( wstring & str, wstring const & tildeReplacement );
|
|
|
|
|
2013-08-04 19:19:57 +00:00
|
|
|
/// Unescapes any escaped chars. Be sure to handle all their special meanings
|
|
|
|
/// before unescaping them.
|
2009-01-28 20:55:45 +00:00
|
|
|
void unescapeDsl( wstring & str );
|
|
|
|
|
2013-08-04 19:19:57 +00:00
|
|
|
/// Normalizes the headword. Currently turns any sequences of consecutive spaces
|
|
|
|
/// into a single space.
|
2009-06-06 16:02:52 +00:00
|
|
|
void normalizeHeadword( wstring & );
|
2009-01-28 20:55:45 +00:00
|
|
|
|
2013-07-07 10:48:37 +00:00
|
|
|
/// Strip DSL {{...}} comments
|
|
|
|
void stripComments( wstring &, bool & );
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
inline size_t DslScanner::distanceToBytes( size_t x ) const
|
|
|
|
{
|
|
|
|
switch( encoding )
|
|
|
|
{
|
|
|
|
case Utf16LE:
|
|
|
|
case Utf16BE:
|
|
|
|
return x*2;
|
|
|
|
default:
|
|
|
|
return x;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-04-23 11:43:20 +00:00
|
|
|
/// Converts the given language name taken from Dsl header (i.e. getLangFrom(),
|
|
|
|
/// getLangTo()) to its proper language id.
|
|
|
|
quint32 dslLanguageToId( wstring const & name );
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|