2012-02-20 21:47:14 +00:00
|
|
|
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
|
2009-01-28 20:55:45 +00:00
|
|
|
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
|
|
|
|
|
|
|
#ifndef __DSL_DETAILS_HH_INCLUDED__
|
|
|
|
#define __DSL_DETAILS_HH_INCLUDED__
|
|
|
|
|
|
|
|
#include <string>
|
|
|
|
#include <list>
|
|
|
|
#include <vector>
|
|
|
|
#include <zlib.h>
|
|
|
|
#include "dictionary.hh"
|
|
|
|
#include "iconv.hh"
|
2022-02-27 14:42:40 +00:00
|
|
|
#if ( QT_VERSION >= QT_VERSION_CHECK( 6, 0, 0 ) )
|
|
|
|
#include <QtCore5Compat/QTextCodec>
|
|
|
|
#else
|
2021-10-18 16:19:25 +00:00
|
|
|
#include <QTextCodec>
|
2022-02-27 14:42:40 +00:00
|
|
|
#endif
|
2021-10-18 16:19:25 +00:00
|
|
|
#include <QByteArray>
|
2021-11-06 08:26:30 +00:00
|
|
|
#include "utf8.hh"
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
// Implementation details for Dsl, not part of its interface
|
|
|
|
namespace Dsl {
|
|
|
|
namespace Details {
|
|
|
|
|
|
|
|
using std::string;
|
2009-04-18 17:20:12 +00:00
|
|
|
using gd::wstring;
|
|
|
|
using gd::wchar;
|
2009-01-28 20:55:45 +00:00
|
|
|
using std::list;
|
|
|
|
using std::vector;
|
2021-11-06 08:26:30 +00:00
|
|
|
using Utf8::Encoding;
|
2021-11-06 08:55:51 +00:00
|
|
|
using Utf8::LineFeed;
|
2021-11-06 08:26:30 +00:00
|
|
|
|
2016-12-22 19:58:01 +00:00
|
|
|
/// Returns, as a string, the code that corresponds to the given DSL id.
/// NOTE(review): the lookup lives in the implementation file and is not
/// visible from this header -- presumably id is a DSL language id and the
/// result is its language code; confirm, including what is returned for an
/// id that is not known.
string findCodeForDslId( int id );
|
2009-01-28 20:55:45 +00:00
|
|
|
|
2017-07-04 14:41:38 +00:00
|
|
|
/// NOTE(review): judging by the name only, reports whether an at sign ('@')
/// is the first significant character of str. The exact rule (for example
/// whether leading whitespace is skipped) is defined in the implementation
/// file -- confirm there before relying on it.
bool isAtSignFirst( wstring const & str );
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
/// Parses the DSL language, representing it in its structural DOM form.
|
|
|
|
struct ArticleDom
|
|
|
|
{
|
|
|
|
struct Node: public list< Node >
|
|
|
|
{
|
|
|
|
bool isTag; // true if it is a tag with subnodes, false if it's a leaf text
|
|
|
|
// data.
|
|
|
|
// Those are only used if isTag is true
|
|
|
|
wstring tagName;
|
|
|
|
wstring tagAttrs;
|
|
|
|
wstring text; // This is only used if isTag is false
|
|
|
|
|
|
|
|
class Text
|
|
|
|
{};
|
|
|
|
class Tag
|
|
|
|
{};
|
|
|
|
|
|
|
|
Node( Tag, wstring const & name, wstring const & attrs ):
|
|
|
|
isTag( true ),
|
|
|
|
tagName( name ),
|
|
|
|
tagAttrs( attrs )
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
Node( Text, wstring const & text_ ):
|
|
|
|
isTag( false ),
|
|
|
|
text( text_ )
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Concatenates all childen text nodes recursively to form all text
|
|
|
|
/// the node contains stripped of any markup.
|
2014-04-16 16:18:28 +00:00
|
|
|
wstring renderAsText( bool stripTrsTag = false ) const;
|
2009-01-28 20:55:45 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/// Does the parse at construction. Refer to the 'root' member variable
|
|
|
|
/// afterwards.
|
2022-12-01 11:55:00 +00:00
|
|
|
explicit ArticleDom( wstring const &, string const & dictName = string(), wstring const & headword_ = wstring() );
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
/// Root of DOM's tree
|
|
|
|
Node root;
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
2013-11-08 12:53:22 +00:00
|
|
|
void openTag( wstring const & name, wstring const & attr, list< Node * > & stack );
|
|
|
|
|
2009-05-07 14:54:26 +00:00
|
|
|
void closeTag( wstring const & name, list< Node * > & stack, bool warn = true );
|
|
|
|
|
2017-07-04 14:41:38 +00:00
|
|
|
bool atSignFirstInLine();
|
2017-07-03 15:12:22 +00:00
|
|
|
|
|
|
|
wchar const *stringPos, *lineStartPos;
|
2009-01-28 20:55:45 +00:00
|
|
|
|
2021-10-23 09:37:29 +00:00
|
|
|
class eot: std::exception
|
|
|
|
{};
|
2009-01-28 20:55:45 +00:00
|
|
|
|
2009-04-18 17:20:12 +00:00
|
|
|
wchar ch;
|
2009-01-28 20:55:45 +00:00
|
|
|
bool escaped;
|
2009-05-07 12:30:36 +00:00
|
|
|
unsigned transcriptionCount; // >0 = inside a [t] tag
|
2021-02-13 08:39:05 +00:00
|
|
|
unsigned mediaCount; // >0 = inside a [s] tag
|
2009-01-28 20:55:45 +00:00
|
|
|
|
2022-01-09 08:35:07 +00:00
|
|
|
void nextChar();
|
2014-04-18 12:34:44 +00:00
|
|
|
|
2018-07-07 09:33:15 +00:00
|
|
|
/// Information for diagnostic purposes
|
2014-04-18 12:34:44 +00:00
|
|
|
string dictionaryName;
|
|
|
|
wstring headword;
|
2009-01-28 20:55:45 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/// Opens the .dsl or .dsl.dz file and allows line-by-line reading. Auto-detects
|
|
|
|
/// the encoding, and reads all headers by itself.
|
|
|
|
class DslScanner
|
|
|
|
{
|
|
|
|
gzFile f;
|
2021-11-06 08:26:30 +00:00
|
|
|
Encoding encoding;
|
2021-10-18 16:19:25 +00:00
|
|
|
QTextCodec * codec;
|
2009-01-28 20:55:45 +00:00
|
|
|
wstring dictionaryName;
|
2009-04-23 11:43:20 +00:00
|
|
|
wstring langFrom, langTo;
|
2017-07-09 17:15:35 +00:00
|
|
|
wstring soundDictionary;
|
2021-11-06 02:10:37 +00:00
|
|
|
char readBuffer[ 65536 ];
|
2009-01-28 20:55:45 +00:00
|
|
|
char * readBufferPtr;
|
2021-11-06 08:55:51 +00:00
|
|
|
LineFeed lineFeed;
|
2009-01-28 20:55:45 +00:00
|
|
|
size_t readBufferLeft;
|
2021-11-06 02:10:37 +00:00
|
|
|
//qint64 pos;
|
2009-08-01 10:05:24 +00:00
|
|
|
unsigned linesRead;
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
DEF_EX( Ex, "Dsl scanner exception", Dictionary::Ex )
|
|
|
|
DEF_EX_STR( exCantOpen, "Can't open .dsl file", Ex )
|
|
|
|
DEF_EX( exCantReadDslFile, "Can't read .dsl file", Ex )
|
|
|
|
DEF_EX_STR( exMalformedDslFile, "The .dsl file is malformed:", Ex )
|
|
|
|
DEF_EX( exUnknownCodePage, "The .dsl file specified an unknown code page", Ex )
|
|
|
|
DEF_EX( exEncodingError, "Encoding error", Ex ) // Should never happen really
|
|
|
|
|
2022-12-01 11:55:00 +00:00
|
|
|
explicit DslScanner( string const & fileName );
|
2022-06-03 13:28:41 +00:00
|
|
|
~DslScanner() noexcept;
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
/// Returns the detected encoding of this file.
|
2021-11-06 08:26:30 +00:00
|
|
|
Encoding getEncoding() const
|
2009-01-28 20:55:45 +00:00
|
|
|
{
|
|
|
|
return encoding;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the dictionary's name, as was read from file's headers.
|
|
|
|
wstring const & getDictionaryName() const
|
|
|
|
{
|
|
|
|
return dictionaryName;
|
|
|
|
}
|
|
|
|
|
2009-04-22 21:37:32 +00:00
|
|
|
/// Returns the dictionary's source language, as was read from file's headers.
|
2009-04-23 11:43:20 +00:00
|
|
|
wstring const & getLangFrom() const
|
2009-04-22 21:37:32 +00:00
|
|
|
{
|
|
|
|
return langFrom;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the dictionary's target language, as was read from file's headers.
|
2009-04-23 11:43:20 +00:00
|
|
|
wstring const & getLangTo() const
|
2009-04-22 21:37:32 +00:00
|
|
|
{
|
|
|
|
return langTo;
|
|
|
|
}
|
|
|
|
|
2017-07-09 17:15:35 +00:00
|
|
|
/// Returns the preferred external dictionary with sounds, as was read from file's headers.
|
|
|
|
wstring const & getSoundDictionaryName() const
|
|
|
|
{
|
|
|
|
return soundDictionary;
|
|
|
|
}
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
/// Reads next line from the file. Returns true if reading succeeded --
|
|
|
|
/// the string gets stored in the one passed, along with its physical
|
|
|
|
/// file offset in the file (the uncompressed one if the file is compressed).
|
|
|
|
/// If end of file is reached, false is returned.
|
|
|
|
/// Reading begins from the first line after the headers (ones which start
|
|
|
|
/// with #).
|
2022-01-09 08:35:07 +00:00
|
|
|
bool readNextLine( wstring &, size_t & offset, bool only_head_word = false );
|
2009-01-28 20:55:45 +00:00
|
|
|
|
2013-07-07 10:48:37 +00:00
|
|
|
/// Similar readNextLine but strip all DSL comments {{...}}
|
2022-01-09 08:35:07 +00:00
|
|
|
bool readNextLineWithoutComments( wstring &, size_t & offset, bool only_headword = false );
|
2013-07-07 10:48:37 +00:00
|
|
|
|
2009-08-01 10:05:24 +00:00
|
|
|
/// Returns the number of lines read so far from the file.
|
|
|
|
unsigned getLinesRead() const
|
|
|
|
{
|
|
|
|
return linesRead;
|
|
|
|
}
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
/// Converts the given number of characters to the number of bytes they
|
|
|
|
/// would occupy in the file, knowing its encoding. It's possible to know
|
|
|
|
/// that because no multibyte encodings are supported in .dsls.
|
|
|
|
inline size_t distanceToBytes( size_t ) const;
|
|
|
|
};
|
|
|
|
|
|
|
|
/// This function either removes parts of string enclosed in braces, or leaves
|
|
|
|
/// them intact. The braces themselves are removed always, though.
|
|
|
|
/// Handles the brace-enclosed parts of str. The string is taken by non-const
/// reference and nothing is returned, so the result is written back to str.
/// NOTE(review): strip presumably selects between removing the enclosed
/// parts (true) and keeping them (false) -- confirm in the implementation.
void processUnsortedParts( wstring & str, bool strip );
|
|
|
|
|
|
|
|
/// Expands optional parts of a headword (ones marked with parentheses),
|
|
|
|
/// producing all possible combinations where they are present or absent.
|
2012-09-07 21:32:49 +00:00
|
|
|
/// Expands the optional parts of the headword str.
/// NOTE(review): the expansions are presumably stored into *result, and the
/// defaulted x / inside_recurse arguments look like internal recursion state
/// (a position within str and an "in nested call" flag) that outside callers
/// leave at their defaults -- confirm all of this in the implementation.
void expandOptionalParts( wstring & str, list< wstring > * result, size_t x = 0, bool inside_recurse = false );
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
/// Expands all unescaped tildes, inserting tildeReplacement text instead of
|
|
|
|
/// them.
|
|
|
|
/// Tilde expansion on str: str is taken by non-const reference and nothing is
/// returned, so the expanded text is written back to str; tildeReplacement is
/// the text used in place of the tildes and is only read.
void expandTildes( wstring & str, wstring const & tildeReplacement );
|
|
|
|
|
2013-08-04 19:19:57 +00:00
|
|
|
/// Unescapes any escaped chars. Be sure to handle all their special meanings
|
|
|
|
/// before unescaping them.
|
2009-01-28 20:55:45 +00:00
|
|
|
/// Unescapes DSL text: str is taken by non-const reference and nothing is
/// returned, so the unescaped text is written back to str.
void unescapeDsl( wstring & str );
|
|
|
|
|
2013-08-04 19:19:57 +00:00
|
|
|
/// Normalizes the headword. Currently turns any sequences of consecutive spaces
|
|
|
|
/// into a single space.
|
2009-06-06 16:02:52 +00:00
|
|
|
/// Headword normalization: the headword is taken by non-const reference and
/// nothing is returned, so the normalized form is written back to the
/// argument.
void normalizeHeadword( wstring & );
|
2009-01-28 20:55:45 +00:00
|
|
|
|
2013-07-07 10:48:37 +00:00
|
|
|
/// Strip DSL {{...}} comments
|
|
|
|
/// Comment stripping for a DSL string; both arguments are non-const
/// references and nothing is returned, so results come back through them.
/// NOTE(review): the parameters are unnamed here. The wstring is presumably
/// the text to strip, and the bool presumably carries "still inside an
/// unterminated comment" state between calls -- confirm in the
/// implementation.
void stripComments( wstring &, bool & );
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
inline size_t DslScanner::distanceToBytes( size_t x ) const
|
|
|
|
{
|
|
|
|
switch ( encoding ) {
|
2021-11-06 08:26:30 +00:00
|
|
|
case Utf8::Utf16LE:
|
|
|
|
case Utf8::Utf16BE:
|
2009-01-28 20:55:45 +00:00
|
|
|
return x * 2;
|
|
|
|
default:
|
|
|
|
return x;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-04-23 11:43:20 +00:00
|
|
|
/// Converts the given language name taken from Dsl header (i.e. getLangFrom(),
|
|
|
|
/// getLangTo()) to its proper language id.
|
|
|
|
/// Maps a language name to a language id, returned as a quint32.
/// NOTE(review): what is returned for a name that is not recognized is not
/// visible from this declaration -- confirm in the implementation.
quint32 dslLanguageToId( wstring const & name );
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
} // namespace Details
|
|
|
|
} // namespace Dsl
|
|
|
|
|
|
|
|
#endif
|