diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 3791c663..7ec2900f 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -21,3 +21,6 @@ c8af0450f1f7f8188004db96e3f53e7e33e2ccad # remove gddebug.hh and associated functions 76aaed116bdc3aeb53fd61553aedb877baf9b510 + +# wstring & wchar -> std::u32string & char32_t +f1e158578f62c96059bef1a616b75495adb6e2c6 diff --git a/src/article_maker.cc b/src/article_maker.cc index 9018802d..61feccbd 100644 --- a/src/article_maker.cc +++ b/src/article_maker.cc @@ -9,7 +9,6 @@ #include "htmlescape.hh" #include "langcoder.hh" #include "utils.hh" -#include "wstring_qt.hh" #include #include #include @@ -21,7 +20,6 @@ using std::vector; using std::string; -using gd::wstring; using std::set; using std::list; @@ -484,7 +482,7 @@ ArticleRequest::ArticleRequest( QString const & word, // Accumulate main forms for ( const auto & activeDict : activeDicts ) { - auto const s = activeDict->findHeadwordsForSynonym( gd::removeTrailingZero( word ) ); + auto const s = activeDict->findHeadwordsForSynonym( Text::removeTrailingZero( word ) ); connect( s.get(), &Dictionary::Request::finished, this, &ArticleRequest::altSearchFinished, Qt::QueuedConnection ); @@ -521,9 +519,9 @@ void ArticleRequest::altSearchFinished() altsDone = true; // So any pending signals in queued mode won't mess us up - vector< wstring > altsVector( alts.begin(), alts.end() ); + vector< std::u32string > altsVector( alts.begin(), alts.end() ); - wstring wordStd = word.toStdU32String(); + std::u32string wordStd = word.toStdU32String(); if ( activeDicts.size() <= 1 ) { articleSizeLimit = -1; // Don't collapse article if only one dictionary presented @@ -534,7 +532,7 @@ void ArticleRequest::altSearchFinished() sptr< Dictionary::DataRequest > r = activeDict->getArticle( wordStd, altsVector, - gd::removeTrailingZero( contexts.value( QString::fromStdString( activeDict->getId() ) ) ), + Text::removeTrailingZero( contexts.value( QString::fromStdString( activeDict->getId() ) ) ), ignoreDiacritics ); connect( r.get(), &Dictionary::Request::finished, this, &ArticleRequest::bodyFinished, Qt::QueuedConnection ); @@ -1008,7 +1006,7 @@ void ArticleRequest::individualWordFinished() WordFinder::SearchResults const & results = stemmedWordFinder->getResults(); if ( results.size() ) { - wstring source = Folding::applySimpleCaseOnly( currentSplittedWordCompound ); + std::u32string source = Folding::applySimpleCaseOnly( currentSplittedWordCompound ); bool hadSomething = false; @@ -1022,7 +1020,7 @@ void ArticleRequest::individualWordFinished() // Prefix match found. Check if the aliases are acceptable. - wstring result( Folding::applySimpleCaseOnly( results[ x ].first ) ); + std::u32string result( Folding::applySimpleCaseOnly( results[ x ].first ) ); if ( source.size() <= result.size() && result.compare( 0, source.size(), source ) == 0 ) { // The resulting string begins with the source one diff --git a/src/article_maker.hh b/src/article_maker.hh index a57065e7..e70e65e6 100644 --- a/src/article_maker.hh +++ b/src/article_maker.hh @@ -88,7 +88,7 @@ class ArticleRequest: public Dictionary::DataRequest QMap< QString, QString > contexts; std::vector< sptr< Dictionary::Class > > activeDicts; - std::set< gd::wstring, std::less<> > alts; // Accumulated main forms + std::set< std::u32string, std::less<> > alts; // Accumulated main forms std::list< sptr< Dictionary::WordSearchRequest > > altSearches; std::list< sptr< Dictionary::DataRequest > > bodyRequests; bool altsDone{ false }; diff --git a/src/common/filetype.cc b/src/common/filetype.cc index 87d2a79d..54ae1fc7 100644 --- a/src/common/filetype.cc +++ b/src/common/filetype.cc @@ -2,7 +2,7 @@ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ #include "filetype.hh" -#include "utf8.hh" +#include "text.hh" #include namespace Filetype { @@ -26,13 +26,13 @@ string simplifyString( string const & str, bool lowercase ) size_t beginPos = 0; - while ( beginPos < str.size() && Utf8::isspace( str[ beginPos ] ) ) { + while ( beginPos < str.size() && Text::isspace( str[ beginPos ] ) ) { ++beginPos; } size_t endPos = str.size(); - while ( endPos && Utf8::isspace( str[ endPos - 1 ] ) ) { + while ( endPos && Text::isspace( str[ endPos - 1 ] ) ) { --endPos; } diff --git a/src/common/folding.cc b/src/common/folding.cc index f737371a..a5bceffb 100644 --- a/src/common/folding.cc +++ b/src/common/folding.cc @@ -3,7 +3,7 @@ #include "folding.hh" -#include "utf8.hh" +#include "text.hh" #include "globalregex.hh" #include "inc_case_folding.hh" @@ -13,12 +13,12 @@ namespace Folding { /// caught by the diacritics folding table, but they are only handled there /// when they come with their main characters, not by themselves. The rest /// are caught here. -bool isCombiningMark( wchar ch ) +bool isCombiningMark( char32_t ch ) { return QChar::isMark( ch ); } -wstring apply( wstring const & in, bool preserveWildcards ) +std::u32string apply( std::u32string const & in, bool preserveWildcards ) { // remove diacritics (normalization), white space, punt, auto temp = QString::fromStdU32String( in ) @@ -32,7 +32,7 @@ wstring apply( wstring const & in, bool preserveWildcards ) // case folding std::u32string caseFolded; caseFolded.reserve( temp.size() ); - wchar buf[ foldCaseMaxOut ]; + char32_t buf[ foldCaseMaxOut ]; for ( const char32_t ch : temp ) { auto n = foldCase( ch, buf ); caseFolded.append( buf, n ); @@ -40,11 +40,11 @@ wstring apply( wstring const & in, bool preserveWildcards ) return caseFolded; } -wstring applySimpleCaseOnly( wstring const & in ) +std::u32string applySimpleCaseOnly( std::u32string const & in ) { - wchar const * nextChar = in.data(); + char32_t const * nextChar = in.data(); - wstring out; + std::u32string out; out.reserve( in.size() ); @@ -55,27 +55,27 @@ wstring applySimpleCaseOnly( wstring const & in ) return out; } -wstring applySimpleCaseOnly( QString const & in ) +std::u32string applySimpleCaseOnly( QString const & in ) { //qt only support simple case folding. return in.toCaseFolded().toStdU32String(); } -wstring applySimpleCaseOnly( std::string const & in ) +std::u32string applySimpleCaseOnly( std::string const & in ) { - return applySimpleCaseOnly( Utf8::decode( in ) ); + return applySimpleCaseOnly( Text::toUtf32( in ) ); // return QString::fromStdString( in ).toCaseFolded().toStdU32String(); } -wstring applyFullCaseOnly( wstring const & in ) +std::u32string applyFullCaseOnly( std::u32string const & in ) { - wstring caseFolded; + std::u32string caseFolded; caseFolded.reserve( in.size() * foldCaseMaxOut ); - wchar const * nextChar = in.data(); + char32_t const * nextChar = in.data(); - wchar buf[ foldCaseMaxOut ]; + char32_t buf[ foldCaseMaxOut ]; for ( size_t left = in.size(); left--; ) { caseFolded.append( buf, foldCase( *nextChar++, buf ) ); @@ -84,17 +84,17 @@ wstring applyFullCaseOnly( wstring const & in ) return caseFolded; } -wstring applyDiacriticsOnly( wstring const & in ) +std::u32string applyDiacriticsOnly( std::u32string const & in ) { auto noAccent = QString::fromStdU32String( in ).normalized( QString::NormalizationForm_KD ).remove( RX::accentMark ); return noAccent.toStdU32String(); } -wstring applyPunctOnly( wstring const & in ) +std::u32string applyPunctOnly( std::u32string const & in ) { - wchar const * nextChar = in.data(); + char32_t const * nextChar = in.data(); - wstring out; + std::u32string out; out.reserve( in.size() ); @@ -119,11 +119,11 @@ QString applyPunctOnly( QString const & in ) return out; } -wstring applyWhitespaceOnly( wstring const & in ) +std::u32string applyWhitespaceOnly( std::u32string const & in ) { - wchar const * nextChar = in.data(); + char32_t const * nextChar = in.data(); - wstring out; + std::u32string out; out.reserve( in.size() ); @@ -136,11 +136,11 @@ wstring applyWhitespaceOnly( wstring const & in ) return out; } -wstring applyWhitespaceAndPunctOnly( wstring const & in ) +std::u32string applyWhitespaceAndPunctOnly( std::u32string const & in ) { - wchar const * nextChar = in.data(); + char32_t const * nextChar = in.data(); - wstring out; + std::u32string out; out.reserve( in.size() ); @@ -153,26 +153,26 @@ wstring applyWhitespaceAndPunctOnly( wstring const & in ) return out; } -bool isWhitespace( wchar ch ) +bool isWhitespace( char32_t ch ) { //invisible character should be treated as whitespace as well. return QChar::isSpace( ch ) || !QChar::isPrint( ch ); } -bool isWhitespaceOrPunct( wchar ch ) +bool isWhitespaceOrPunct( char32_t ch ) { return isWhitespace( ch ) || QChar::isPunct( ch ); } -bool isPunct( wchar ch ) +bool isPunct( char32_t ch ) { return QChar::isPunct( ch ); } -wstring trimWhitespaceOrPunct( wstring const & in ) +std::u32string trimWhitespaceOrPunct( std::u32string const & in ) { - wchar const * wordBegin = in.c_str(); - wstring::size_type wordSize = in.size(); + char32_t const * wordBegin = in.c_str(); + std::u32string::size_type wordSize = in.size(); // Skip any leading whitespace while ( *wordBegin && Folding::isWhitespaceOrPunct( *wordBegin ) ) { @@ -185,7 +185,7 @@ wstring trimWhitespaceOrPunct( wstring const & in ) --wordSize; } - return wstring( wordBegin, wordSize ); + return std::u32string( wordBegin, wordSize ); } QString trimWhitespaceOrPunct( QString const & in ) @@ -209,13 +209,13 @@ QString trimWhitespaceOrPunct( QString const & in ) return in.mid( wordBegin, wordSize ); } -wstring trimWhitespace( wstring const & in ) +std::u32string trimWhitespace( std::u32string const & in ) { if ( in.empty() ) { return in; } - wchar const * wordBegin = in.c_str(); - wstring::size_type wordSize = in.size(); + char32_t const * wordBegin = in.c_str(); + std::u32string::size_type wordSize = in.size(); // Skip any leading whitespace while ( *wordBegin && Folding::isWhitespace( *wordBegin ) ) { @@ -228,7 +228,7 @@ wstring trimWhitespace( wstring const & in ) --wordSize; } - return wstring( wordBegin, wordSize ); + return std::u32string( wordBegin, wordSize ); } QString trimWhitespace( QString const & in ) diff --git a/src/common/folding.hh b/src/common/folding.hh index 9e9caaea..b1eb6018 100644 --- a/src/common/folding.hh +++ b/src/common/folding.hh @@ -3,7 +3,7 @@ #pragma once -#include "wstring.hh" +#include "text.hh" #include /// Folding provides means to translate several possible ways to write a @@ -17,8 +17,6 @@ namespace Folding { -using gd::wstring; -using gd::wchar; /// The algorithm's version. enum { @@ -27,48 +25,48 @@ enum { /// Applies the folding algorithm to each character in the given string, /// making another one as a result. -wstring apply( wstring const &, bool preserveWildcards = false ); +std::u32string apply( std::u32string const &, bool preserveWildcards = false ); /// Applies only simple case folding algorithm. Since many dictionaries have /// different case style, we interpret words differing only by case as synonyms. -wstring applySimpleCaseOnly( wstring const & ); -wstring applySimpleCaseOnly( QString const & in ); -wstring applySimpleCaseOnly( std::string const & in ); +std::u32string applySimpleCaseOnly( std::u32string const & ); +std::u32string applySimpleCaseOnly( QString const & in ); +std::u32string applySimpleCaseOnly( std::string const & in ); /// Applies only full case folding algorithm. This includes simple case, but also /// decomposing ligatures and complex letters. -wstring applyFullCaseOnly( wstring const & ); +std::u32string applyFullCaseOnly( std::u32string const & ); /// Applies only diacritics folding algorithm. -wstring applyDiacriticsOnly( wstring const & ); +std::u32string applyDiacriticsOnly( std::u32string const & ); /// Applies only punctuation folding algorithm. -wstring applyPunctOnly( wstring const & ); +std::u32string applyPunctOnly( std::u32string const & ); QString applyPunctOnly( QString const & in ); /// Applies only whitespace folding algorithm. -wstring applyWhitespaceOnly( wstring const & ); +std::u32string applyWhitespaceOnly( std::u32string const & ); /// Applies only whitespace&punctuation folding algorithm. -wstring applyWhitespaceAndPunctOnly( wstring const & ); +std::u32string applyWhitespaceAndPunctOnly( std::u32string const & ); /// Returns true if the given character is any form of whitespace, false /// otherwise. Whitespace corresponds to Zl/Zp/Zs Unicode classes, and also /// includes \n, \r and \t. -bool isWhitespace( wchar ch ); -bool isWhitespaceOrPunct( wchar ch ); +bool isWhitespace( char32_t ch ); +bool isWhitespaceOrPunct( char32_t ch ); /// Returns true if the given character is any form of punctuation, false /// otherwise. Punctuation corresponds to Pc/Pd/Pe/Pf/Pi/Po/Ps classes. -bool isPunct( wchar ch ); +bool isPunct( char32_t ch ); /// Removes any whitespace or punctuation from the beginning and the end of /// the word. -wstring trimWhitespaceOrPunct( wstring const & ); +std::u32string trimWhitespaceOrPunct( std::u32string const & ); QString trimWhitespaceOrPunct( QString const & in ); /// Removes any whitespace from the beginning and the end of /// the word. -wstring trimWhitespace( wstring const & ); +std::u32string trimWhitespace( std::u32string const & ); QString trimWhitespace( QString const & in ); /// Same as apply( wstring ), but without any heap operations, therefore @@ -86,6 +84,6 @@ QString unescapeWildcardSymbols( QString const & ); QString escapeWildcardSymbols( QString const & ); /// Tests if the given char is one of the Unicode combining marks. -bool isCombiningMark( wchar ch ); +bool isCombiningMark( char32_t ch ); } // namespace Folding diff --git a/src/common/iconv.cc b/src/common/iconv.cc index ded249ba..6c700e9c 100644 --- a/src/common/iconv.cc +++ b/src/common/iconv.cc @@ -5,7 +5,6 @@ #include #include #include -#include "wstring_qt.hh" char const * const Iconv::GdWchar = "UTF-32LE"; char const * const Iconv::Utf16Le = "UTF-16LE"; @@ -80,7 +79,7 @@ QString Iconv::convert( void const *& inBuf, size_t & inBytesLeft ) return QString::fromUtf8( &outBuf.front(), datasize ); } -gd::wstring Iconv::toWstring( char const * fromEncoding, void const * fromData, size_t dataSize ) +std::u32string Iconv::toWstring( char const * fromEncoding, void const * fromData, size_t dataSize ) { /// Special-case the dataSize == 0 to avoid any kind of iconv-specific diff --git a/src/common/iconv.hh b/src/common/iconv.hh index 3c72022b..f27aaecd 100644 --- a/src/common/iconv.hh +++ b/src/common/iconv.hh @@ -5,7 +5,7 @@ #include -#include "wstring.hh" +#include "text.hh" #include "ex.hh" #include @@ -35,7 +35,7 @@ public: QString convert( void const *& inBuf, size_t & inBytesLeft ); // Converts a given block of data from the given encoding to a wide string. - static gd::wstring toWstring( char const * fromEncoding, void const * fromData, size_t dataSize ); + static std::u32string toWstring( char const * fromEncoding, void const * fromData, size_t dataSize ); // Converts a given block of data from the given encoding to an utf8-encoded // string. diff --git a/src/common/utf8.cc b/src/common/text.cc similarity index 63% rename from src/common/utf8.cc rename to src/common/text.cc index c2fff2f1..71c8038d 100644 --- a/src/common/utf8.cc +++ b/src/common/text.cc @@ -1,15 +1,21 @@ /* This file is (c) 2008-2012 Konstantin Isakov * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ -#include "utf8.hh" +#include "text.hh" #include #include #include #include +#include -namespace Utf8 { +namespace Text { -size_t encode( wchar const * in, size_t inSize, char * out_ ) + +/// Encodes the given UTF-32 into UTF-8. The inSize specifies the number +/// of wide characters the 'in' pointer points to. The 'out' buffer must be +/// at least inSize * 4 bytes long. The function returns the number of chars +/// stored in the 'out' buffer. The result is not 0-terminated. +size_t encode( char32_t const * in, size_t inSize, char * out_ ) { unsigned char * out = (unsigned char *)out_; @@ -37,13 +43,18 @@ size_t encode( wchar const * in, size_t inSize, char * out_ ) return out - (unsigned char *)out_; } -long decode( char const * in_, size_t inSize, wchar * out_ ) +/// Decodes the given UTF-8 into UTF-32. The inSize specifies the number +/// of bytes the 'in' pointer points to. The 'out' buffer must be at least +/// inSize wide characters long. If the given UTF-8 is invalid, the decode +/// function returns -1, otherwise it returns the number of wide characters +/// stored in the 'out' buffer. The result is not 0-terminated. +long decode( char const * in_, size_t inSize, char32_t * out_ ) { unsigned char const * in = (unsigned char const *)in_; - wchar * out = out_; + char32_t * out = out_; while ( inSize-- ) { - wchar result; + char32_t result; if ( *in & 0x80 ) { if ( *in & 0x40 ) { @@ -61,22 +72,22 @@ long decode( char const * in_, size_t inSize, wchar * out_ ) inSize -= 3; - result = ( (wchar)*in++ & 7 ) << 18; + result = ( (char32_t)*in++ & 7 ) << 18; if ( ( *in & 0xC0 ) != 0x80 ) { return -1; } - result |= ( (wchar)*in++ & 0x3F ) << 12; + result |= ( (char32_t)*in++ & 0x3F ) << 12; if ( ( *in & 0xC0 ) != 0x80 ) { return -1; } - result |= ( (wchar)*in++ & 0x3F ) << 6; + result |= ( (char32_t)*in++ & 0x3F ) << 6; if ( ( *in & 0xC0 ) != 0x80 ) { return -1; } - result |= (wchar)*in++ & 0x3F; + result |= (char32_t)*in++ & 0x3F; } else { // Three-byte sequence @@ -87,17 +98,17 @@ long decode( char const * in_, size_t inSize, wchar * out_ ) inSize -= 2; - result = ( (wchar)*in++ & 0xF ) << 12; + result = ( (char32_t)*in++ & 0xF ) << 12; if ( ( *in & 0xC0 ) != 0x80 ) { return -1; } - result |= ( (wchar)*in++ & 0x3F ) << 6; + result |= ( (char32_t)*in++ & 0x3F ) << 6; if ( ( *in & 0xC0 ) != 0x80 ) { return -1; } - result |= (wchar)*in++ & 0x3F; + result |= (char32_t)*in++ & 0x3F; } } else { @@ -108,12 +119,12 @@ long decode( char const * in_, size_t inSize, wchar * out_ ) --inSize; - result = ( (wchar)*in++ & 0x1F ) << 6; + result = ( (char32_t)*in++ & 0x1F ) << 6; if ( ( *in & 0xC0 ) != 0x80 ) { return -1; } - result |= (wchar)*in++ & 0x3F; + result |= (char32_t)*in++ & 0x3F; } } else { @@ -132,7 +143,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ ) return out - out_; } -string encode( wstring const & in ) noexcept +std::string toUtf8( std::u32string const & in ) noexcept { if ( in.empty() ) { return {}; @@ -140,16 +151,16 @@ string encode( wstring const & in ) noexcept std::vector< char > buffer( in.size() * 4 ); - return string( &buffer.front(), encode( in.data(), in.size(), &buffer.front() ) ); + return { &buffer.front(), encode( in.data(), in.size(), &buffer.front() ) }; } -wstring decode( string const & in ) +std::u32string toUtf32( std::string const & in ) { if ( in.empty() ) { return {}; } - std::vector< wchar > buffer( in.size() ); + std::vector< char32_t > buffer( in.size() ); long result = decode( in.data(), in.size(), &buffer.front() ); @@ -157,7 +168,7 @@ wstring decode( string const & in ) throw exCantDecode( in ); } - return wstring( &buffer.front(), result ); + return std::u32string( &buffer.front(), result ); } bool isspace( int c ) @@ -247,29 +258,29 @@ LineFeed initLineFeed( const Encoding e ) { LineFeed lf{}; switch ( e ) { - case Utf8::Utf32LE: + case Utf32LE: lf.lineFeed = new char[ 4 ]{ 0x0A, 0, 0, 0 }; lf.length = 4; break; - case Utf8::Utf32BE: + case Utf32BE: lf.lineFeed = new char[ 4 ]{ 0, 0, 0, 0x0A }; lf.length = 4; break; - case Utf8::Utf16LE: + case Utf16LE: lf.lineFeed = new char[ 2 ]{ 0x0A, 0 }; lf.length = 2; break; - case Utf8::Utf16BE: + case Utf16BE: lf.lineFeed = new char[ 2 ]{ 0, 0x0A }; lf.length = 2; break; - case Utf8::Windows1252: + case Windows1252: - case Utf8::Windows1251: + case Windows1251: - case Utf8::Utf8: + case Utf8: - case Utf8::Windows1250: + case Windows1250: default: lf.length = 1; lf.lineFeed = new char[ 1 ]{ 0x0A }; @@ -277,4 +288,36 @@ LineFeed initLineFeed( const Encoding e ) return lf; } -} // namespace Utf8 +// When convert non-BMP characters to wstring,the ending char maybe \0 .This method remove the tailing \0 from the wstring +// as \0 is sensitive in the index. This method will be only used with index related operations like store/query. +std::u32string removeTrailingZero( std::u32string const & v ) +{ + int n = v.size(); + while ( n > 0 && v[ n - 1 ] == 0 ) { + n--; + } + return std::u32string( v.data(), n ); +} + +std::u32string removeTrailingZero( QString const & in ) +{ + QList< unsigned int > v = in.toUcs4(); + + int n = v.size(); + while ( n > 0 && v[ n - 1 ] == 0 ) { + n--; + } + if ( n != v.size() ) { + v.resize( n ); + } + + return std::u32string( (const char32_t *)v.constData(), v.size() ); +} + +std::u32string normalize( const std::u32string & str ) +{ + return QString::fromStdU32String( str ).normalized( QString::NormalizationForm_C ).toStdU32String(); +} + + +} // namespace Text diff --git a/src/common/text.hh b/src/common/text.hh new file mode 100644 index 00000000..f3c47d81 --- /dev/null +++ b/src/common/text.hh @@ -0,0 +1,50 @@ +/* This file is (c) 2008-2012 Konstantin Isakov + * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ +#pragma once + +#include +#include +#include +#include "ex.hh" + +/// Facilities to process Text, focusing on Unicode +namespace Text { +DEF_EX_STR( exCantDecode, "Can't decode the given string from Utf8:", std::exception ) + +// Those are possible encodings for .dsl files +enum Encoding { + Utf16LE, + Utf16BE, + Windows1252, + Windows1251, + Windows1250, + Utf8, + Utf32BE, + Utf32LE, +}; + +std::string toUtf8( std::u32string const & ) noexcept; +std::u32string toUtf32( std::string const & ); + +/// Since the standard isspace() is locale-specific, we need something +/// that would never mess up our utf8 input. The stock one worked fine under +/// Linux but was messing up strings under Windows. +bool isspace( int c ); + +//get the first line in string s1. -1 if not found +int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length ); +char const * getEncodingNameFor( Encoding e ); +Encoding getEncodingForName( const QByteArray & name ); + +struct LineFeed +{ + int length; + char * lineFeed; +}; + +LineFeed initLineFeed( Encoding e ); + +std::u32string removeTrailingZero( std::u32string const & v ); +std::u32string removeTrailingZero( QString const & in ); +std::u32string normalize( std::u32string const & ); +} // namespace Text diff --git a/src/common/utf8.hh b/src/common/utf8.hh deleted file mode 100644 index 4a0d2c88..00000000 --- a/src/common/utf8.hh +++ /dev/null @@ -1,68 +0,0 @@ -/* This file is (c) 2008-2012 Konstantin Isakov - * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ -#pragma once - -#include -#include -#include -#include "ex.hh" -#include "wstring.hh" - -/// A simple UTF-8 encoder/decoder. Some dictionary backends only require -/// utf8, so we have this separately, removing the iconv dependency for them. -/// Besides, utf8 is quite ubiquitous now, and its use is spreaded over many -/// places. -namespace Utf8 { - -// Those are possible encodings for .dsl files -enum Encoding { - Utf16LE, - Utf16BE, - Windows1252, - Windows1251, - Windows1250, - Utf8, // This is an extension. Detected solely by the UTF8 BOM. - Utf32BE, - Utf32LE, -}; - -using std::string; -using gd::wstring; -using gd::wchar; - -DEF_EX_STR( exCantDecode, "Can't decode the given string from Utf8:", std::exception ) - -/// Encodes the given UCS-4 into UTF-8. The inSize specifies the number -/// of wide characters the 'in' pointer points to. The 'out' buffer must be -/// at least inSize * 4 bytes long. The function returns the number of chars -/// stored in the 'out' buffer. The result is not 0-terminated. -size_t encode( wchar const * in, size_t inSize, char * out ); -/// Decodes the given UTF-8 into UCS-32. The inSize specifies the number -/// of bytes the 'in' pointer points to. The 'out' buffer must be at least -/// inSize wide characters long. If the given UTF-8 is invalid, the decode -/// function returns -1, otherwise it returns the number of wide characters -/// stored in the 'out' buffer. The result is not 0-terminated. -long decode( char const * in, size_t inSize, wchar * out ); - -/// Versions for non time-critical code. -string encode( wstring const & ) noexcept; -wstring decode( string const & ); - -/// Since the standard isspace() is locale-specific, we need something -/// that would never mess up our utf8 input. The stock one worked fine under -/// Linux but was messing up strings under Windows. -bool isspace( int c ); - -//get the first line in string s1. -1 if not found -int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length ); -char const * getEncodingNameFor( Encoding e ); -Encoding getEncodingForName( const QByteArray & name ); - -struct LineFeed -{ - int length; - char * lineFeed; -}; - -LineFeed initLineFeed( Encoding e ); -} // namespace Utf8 diff --git a/src/common/wstring.hh b/src/common/wstring.hh deleted file mode 100644 index 4ab689e1..00000000 --- a/src/common/wstring.hh +++ /dev/null @@ -1,17 +0,0 @@ -/* This file is (c) 2008-2012 Konstantin Isakov - * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ - -#pragma once - -#include - -/// -/// Aliases for legacy reasons. -/// -/// For new code, just use the standardized std::u32string for UTF-32 strings instead. -/// - -namespace gd { -using wchar = char32_t; -using wstring = std::u32string; -} // namespace gd diff --git a/src/common/wstring_qt.cc b/src/common/wstring_qt.cc deleted file mode 100644 index 17cc867f..00000000 --- a/src/common/wstring_qt.cc +++ /dev/null @@ -1,38 +0,0 @@ -#include "wstring_qt.hh" -#include - -namespace gd { - -// When convert non-BMP characters to wstring,the ending char maybe \0 .This method remove the tailing \0 from the wstring -// as \0 is sensitive in the index. This method will be only used with index related operations like store/query. -wstring removeTrailingZero( wstring const & v ) -{ - int n = v.size(); - while ( n > 0 && v[ n - 1 ] == 0 ) { - n--; - } - return wstring( v.data(), n ); -} - -wstring removeTrailingZero( QString const & in ) -{ - QList< unsigned int > v = in.toUcs4(); - - int n = v.size(); - while ( n > 0 && v[ n - 1 ] == 0 ) { - n--; - } - if ( n != v.size() ) { - v.resize( n ); - } - - return wstring( (const wchar *)v.constData(), v.size() ); -} - -wstring normalize( const wstring & str ) -{ - return QString::fromStdU32String( str ).normalized( QString::NormalizationForm_C ).toStdU32String(); -} - - -} // namespace gd diff --git a/src/common/wstring_qt.hh b/src/common/wstring_qt.hh deleted file mode 100644 index 982478d4..00000000 --- a/src/common/wstring_qt.hh +++ /dev/null @@ -1,16 +0,0 @@ -/* This file is (c) 2008-2012 Konstantin Isakov - * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ - -#pragma once - -/// This file adds conversions between gd::wstring and QString. See wstring.hh -/// for more details on gd::wstring. - -#include "wstring.hh" -#include - -namespace gd { -wstring removeTrailingZero( wstring const & v ); -wstring removeTrailingZero( QString const & in ); -wstring normalize( wstring const & ); -} // namespace gd diff --git a/src/dict/aard.cc b/src/dict/aard.cc index f988e659..50b70aa1 100644 --- a/src/dict/aard.cc +++ b/src/dict/aard.cc @@ -4,7 +4,7 @@ #include "aard.hh" #include "btreeidx.hh" #include "folding.hh" -#include "utf8.hh" +#include "text.hh" #include "chunkedstorage.hh" #include "langcoder.hh" #include "decompress.hh" @@ -29,7 +29,6 @@ using std::multimap; using std::pair; using std::set; using std::string; -using gd::wstring; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; @@ -236,8 +235,10 @@ public: return idxHeader.langTo; } - sptr< Dictionary::DataRequest > - getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; + sptr< Dictionary::DataRequest > getArticle( std::u32string const &, + vector< std::u32string > const & alts, + std::u32string const &, + bool ignoreDiacritics ) override; QString const & getDescription() override; @@ -601,8 +602,8 @@ AardDictionary::getSearchResults( QString const & searchString, int searchMode, class AardArticleRequest: public Dictionary::DataRequest { - wstring word; - vector< wstring > alts; + std::u32string word; + vector< std::u32string > alts; AardDictionary & dict; bool ignoreDiacritics; @@ -611,8 +612,8 @@ class AardArticleRequest: public Dictionary::DataRequest public: - AardArticleRequest( wstring const & word_, - vector< wstring > const & alts_, + AardArticleRequest( std::u32string const & word_, + vector< std::u32string > const & alts_, AardDictionary & dict_, bool ignoreDiacritics_ ): word( word_ ), @@ -656,13 +657,13 @@ void AardArticleRequest::run() chain.insert( chain.end(), altChain.begin(), altChain.end() ); } - multimap< wstring, pair< string, string > > mainArticles, alternateArticles; + multimap< std::u32string, pair< string, string > > mainArticles, alternateArticles; set< quint32 > articlesIncluded; // Some synonims make it that the articles // appear several times. We combat this // by only allowing them to appear once. - wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); + std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word ); if ( ignoreDiacritics ) { wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); } @@ -693,12 +694,12 @@ void AardArticleRequest::run() // We do the case-folded comparison here. - wstring headwordStripped = Folding::applySimpleCaseOnly( headword ); + std::u32string headwordStripped = Folding::applySimpleCaseOnly( headword ); if ( ignoreDiacritics ) { headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); } - multimap< wstring, pair< string, string > > & mapToUse = + multimap< std::u32string, pair< string, string > > & mapToUse = ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) ); @@ -714,7 +715,7 @@ void AardArticleRequest::run() string result; - multimap< wstring, pair< string, string > >::const_iterator i; + multimap< std::u32string, pair< string, string > >::const_iterator i; for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { result += "

"; @@ -737,9 +738,9 @@ void AardArticleRequest::run() finish(); } -sptr< Dictionary::DataRequest > AardDictionary::getArticle( wstring const & word, - vector< wstring > const & alts, - wstring const &, +sptr< Dictionary::DataRequest > AardDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, bool ignoreDiacritics ) { @@ -915,7 +916,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f } // Insert new entry - wstring word = Utf8::decode( string( data.data(), wordSize ) ); + std::u32string word = Text::toUtf32( string( data.data(), wordSize ) ); if ( maxHeadwordsToExpand && dictHeader.wordsCount >= maxHeadwordsToExpand ) { indexedWords.addSingleWord( word, articleOffset ); } diff --git a/src/dict/bgl.cc b/src/dict/bgl.cc index f11772aa..9ce25517 100644 --- a/src/dict/bgl.cc +++ b/src/dict/bgl.cc @@ -11,7 +11,7 @@ #include "htmlescape.hh" #include "langcoder.hh" #include "language.hh" -#include "utf8.hh" +#include "text.hh" #include "utils.hh" #include #include @@ -30,8 +30,6 @@ namespace Bgl { using std::map; using std::multimap; using std::set; -using gd::wstring; -using gd::wchar; using std::list; using std::pair; using std::string; @@ -111,7 +109,7 @@ void trimWs( string & word ) if ( word.size() ) { unsigned begin = 0; - while ( begin < word.size() && Utf8::isspace( word[ begin ] ) ) { + while ( begin < word.size() && Text::isspace( word[ begin ] ) ) { ++begin; } @@ -123,7 +121,7 @@ void trimWs( string & word ) // Doesn't consist of ws entirely, so must end with just isspace() // condition. - while ( Utf8::isspace( word[ end - 1 ] ) ) { + while ( Text::isspace( word[ end - 1 ] ) ) { --end; } @@ -137,7 +135,7 @@ void trimWs( string & word ) void addEntryToIndex( string & word, uint32_t articleOffset, IndexedWords & indexedWords, - vector< wchar > & wcharBuffer ) + vector< char32_t > & wcharBuffer ) { // Strip any leading or trailing whitespaces trimWs( word ); @@ -159,7 +157,7 @@ void addEntryToIndex( string & word, } // Convert the word from utf8 to wide chars - indexedWords.addWord( Utf8::decode( word ), articleOffset ); + indexedWords.addWord( Text::toUtf32( word ), articleOffset ); } class BglDictionary: public BtreeIndexing::BtreeDictionary @@ -193,10 +191,12 @@ public: return idxHeader.langTo; } - sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & ) override; + sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( std::u32string const & ) override; - sptr< Dictionary::DataRequest > - getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; + sptr< Dictionary::DataRequest > getArticle( std::u32string const &, + vector< std::u32string > const & alts, + std::u32string const &, + bool ignoreDiacritics ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override; @@ -387,7 +387,7 @@ void BglDictionary::getArticleText( uint32_t articleAddress, QString & headword, headword = QString::fromUtf8( headwordStr.data(), headwordStr.size() ); - wstring wstr = Utf8::decode( articleStr ); + std::u32string wstr = Text::toUtf32( articleStr ); if ( getLangTo() == LangCoder::code2toInt( "he" ) ) { for ( char32_t & i : wstr ) { @@ -436,7 +436,7 @@ void BglDictionary::makeFTSIndex( QAtomicInt & isCancelled ) class BglHeadwordsRequest: public Dictionary::WordSearchRequest { - wstring str; + std::u32string str; BglDictionary & dict; QAtomicInt isCancelled; @@ -444,7 +444,7 @@ class BglHeadwordsRequest: public Dictionary::WordSearchRequest public: - BglHeadwordsRequest( wstring const & word_, BglDictionary & dict_ ): + BglHeadwordsRequest( std::u32string const & word_, BglDictionary & dict_ ): str( word_ ), dict( dict_ ) { @@ -476,7 +476,7 @@ void BglHeadwordsRequest::run() vector< WordArticleLink > chain = dict.findArticles( str ); - wstring caseFolded = Folding::applySimpleCaseOnly( str ); + std::u32string caseFolded = Folding::applySimpleCaseOnly( str ); for ( auto & x : chain ) { if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) { @@ -488,11 +488,11 @@ void BglHeadwordsRequest::run() dict.loadArticle( x.articleOffset, headword, displayedHeadword, articleText ); - wstring headwordDecoded; + std::u32string headwordDecoded; try { - headwordDecoded = Utf8::decode( removePostfix( headword ) ); + headwordDecoded = Text::toUtf32( removePostfix( headword ) ); } - catch ( Utf8::exCantDecode & ) { + catch ( Text::exCantDecode & ) { } if ( caseFolded != Folding::applySimpleCaseOnly( headwordDecoded ) && !headwordDecoded.empty() ) { @@ -507,7 +507,7 @@ void BglHeadwordsRequest::run() finish(); } -sptr< Dictionary::WordSearchRequest > BglDictionary::findHeadwordsForSynonym( wstring const & word ) +sptr< Dictionary::WordSearchRequest > BglDictionary::findHeadwordsForSynonym( std::u32string const & word ) { return synonymSearchEnabled ? std::make_shared< BglHeadwordsRequest >( word, *this ) : @@ -547,8 +547,8 @@ string postfixToSuperscript( string const & in ) class BglArticleRequest: public Dictionary::DataRequest { - wstring word; - vector< wstring > alts; + std::u32string word; + vector< std::u32string > alts; BglDictionary & dict; QAtomicInt isCancelled; @@ -557,8 +557,8 @@ class BglArticleRequest: public Dictionary::DataRequest public: - BglArticleRequest( wstring const & word_, - vector< wstring > const & alts_, + BglArticleRequest( std::u32string const & word_, + vector< std::u32string > const & alts_, BglDictionary & dict_, bool ignoreDiacritics_ ): word( word_ ), @@ -590,11 +590,11 @@ public: void BglArticleRequest::fixHebString( string & hebStr ) // Hebrew support - convert non-unicode to unicode { - wstring hebWStr; + std::u32string hebWStr; try { - hebWStr = Utf8::decode( hebStr ); + hebWStr = Text::toUtf32( hebStr ); } - catch ( Utf8::exCantDecode & ) { + catch ( Text::exCantDecode & ) { hebStr = "Utf-8 decoding error"; return; } @@ -608,7 +608,7 @@ void BglArticleRequest::fixHebString( string & hebStr ) // Hebrew support - conv i += 1488 - 224; // Convert to Hebrew unicode } } - hebStr = Utf8::encode( hebWStr ); + hebStr = Text::toUtf8( hebWStr ); } void BglArticleRequest::fixHebArticle( string & hebArticle ) // Hebrew support - remove extra chars at the end @@ -644,7 +644,7 @@ void BglArticleRequest::run() chain.insert( chain.end(), altChain.begin(), altChain.end() ); } - multimap< wstring, pair< string, string > > mainArticles, alternateArticles; + multimap< std::u32string, pair< string, string > > mainArticles, alternateArticles; set< uint32_t > articlesIncluded; // Some synonims make it that the articles // appear several times. We combat this @@ -653,7 +653,7 @@ void BglArticleRequest::run() // the bodies to account for this. set< QByteArray > articleBodiesIncluded; - wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); + std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word ); if ( ignoreDiacritics ) { wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); } @@ -681,7 +681,7 @@ void BglArticleRequest::run() // We do the case-folded and postfix-less comparison here. - wstring headwordStripped = Folding::applySimpleCaseOnly( removePostfix( headword ) ); + std::u32string headwordStripped = Folding::applySimpleCaseOnly( removePostfix( headword ) ); if ( ignoreDiacritics ) { headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); } @@ -704,7 +704,7 @@ void BglArticleRequest::run() continue; // Already had this body } - multimap< wstring, pair< string, string > > & mapToUse = + multimap< std::u32string, pair< string, string > > & mapToUse = ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( targetHeadword, articleText ) ) ); @@ -725,7 +725,7 @@ void BglArticleRequest::run() string result; - multimap< wstring, pair< string, string > >::const_iterator i; + multimap< std::u32string, pair< string, string > >::const_iterator i; string cleaner = Utils::Html::getHtmlCleaner(); for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { @@ -802,9 +802,9 @@ void BglArticleRequest::run() finish(); } -sptr< Dictionary::DataRequest > BglDictionary::getArticle( wstring const & word, - vector< wstring > const & alts, - wstring const &, +sptr< Dictionary::DataRequest > BglDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, bool ignoreDiacritics ) { @@ -1085,7 +1085,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f IndexedWords indexedWords; // We use this buffer to decode utf8 into it. - vector< wchar > wcharBuffer; + vector< char32_t > wcharBuffer; ChunkedStorage::Writer chunks( idx ); diff --git a/src/dict/btreeidx.cc b/src/dict/btreeidx.cc index 40a55622..31daf5b9 100644 --- a/src/dict/btreeidx.cc +++ b/src/dict/btreeidx.cc @@ -3,11 +3,10 @@ #include "btreeidx.hh" #include "folding.hh" -#include "utf8.hh" +#include "text.hh" #include #include #include -#include "wstring_qt.hh" #include "utils.hh" #include @@ -19,8 +18,6 @@ namespace BtreeIndexing { -using gd::wstring; -using gd::wchar; using std::pair; enum { @@ -59,14 +56,14 @@ void BtreeIndex::openIndex( IndexInfo const & indexInfo, File::Index & file, QMu } vector< WordArticleLink > -BtreeIndex::findArticles( wstring const & search_word, bool ignoreDiacritics, uint32_t maxMatchCount ) +BtreeIndex::findArticles( std::u32string const & search_word, bool ignoreDiacritics, uint32_t maxMatchCount ) { //First trim ending zero - wstring word = gd::removeTrailingZero( search_word ); + std::u32string word = Text::removeTrailingZero( search_word ); vector< WordArticleLink > result; try { - wstring folded = Folding::apply( word ); + std::u32string folded = Folding::apply( word ); if ( folded.empty() ) { folded = Folding::applyWhitespaceOnly( word ); } @@ -100,7 +97,7 @@ BtreeIndex::findArticles( wstring const & search_word, bool ignoreDiacritics, ui BtreeWordSearchRequest::BtreeWordSearchRequest( BtreeDictionary & dict_, - wstring const & str_, + std::u32string const & str_, unsigned minLength_, int maxSuffixVariation_, bool allowMiddleMatches_, @@ -137,11 +134,11 @@ void BtreeWordSearchRequest::findMatches() bool useWildcards = false; if ( allowMiddleMatches ) { - useWildcards = ( str.find( '*' ) != wstring::npos || str.find( '?' ) != wstring::npos - || str.find( '[' ) != wstring::npos || str.find( ']' ) != wstring::npos ); + useWildcards = ( str.find( '*' ) != std::u32string::npos || str.find( '?' ) != std::u32string::npos + || str.find( '[' ) != std::u32string::npos || str.find( ']' ) != std::u32string::npos ); } - wstring folded = Folding::apply( str ); + std::u32string folded = Folding::apply( str ); int minMatchLength = 0; @@ -154,7 +151,7 @@ void BtreeWordSearchRequest::findMatches() regexp.setPatternOptions( QRegularExpression::CaseInsensitiveOption ); bool bNoLetters = folded.empty(); - wstring foldedWithWildcards; + std::u32string foldedWithWildcards; if ( bNoLetters ) { foldedWithWildcards = Folding::applyWhitespaceOnly( str ); @@ -268,9 +265,9 @@ void BtreeWordSearchRequest::findMatches() vector< WordArticleLink > chain = dict.readChain( chainOffset ); - wstring chainHead = Utf8::decode( chain[ 0 ].word ); + std::u32string chainHead = Text::toUtf32( chain[ 0 ].word ); - wstring resultFolded = Folding::apply( chainHead ); + std::u32string resultFolded = Folding::apply( chainHead ); if ( resultFolded.empty() ) { resultFolded = Folding::applyWhitespaceOnly( chainHead ); } @@ -286,9 +283,9 @@ void BtreeWordSearchRequest::findMatches() break; } if ( useWildcards ) { - wstring word = Utf8::decode( x.prefix + x.word ); - wstring result = Folding::applyDiacriticsOnly( word ); - if ( result.size() >= (wstring::size_type)minMatchLength ) { + std::u32string word = Text::toUtf32( x.prefix + x.word ); + std::u32string result = Folding::applyDiacriticsOnly( word ); + if ( result.size() >= (std::u32string::size_type)minMatchLength ) { QRegularExpressionMatch match = regexp.match( QString::fromStdU32String( result ) ); if ( match.hasMatch() && match.capturedStart() == 0 ) { addMatch( word ); @@ -298,10 +295,10 @@ void BtreeWordSearchRequest::findMatches() else { // Skip middle matches, if requested. If suffix variation is specified, // make sure the string isn't larger than requested. - if ( ( allowMiddleMatches || Folding::apply( Utf8::decode( x.prefix ) ).empty() ) + if ( ( allowMiddleMatches || Folding::apply( Text::toUtf32( x.prefix ) ).empty() ) && ( maxSuffixVariation < 0 || (int)resultFolded.size() - initialFoldedSize <= maxSuffixVariation ) ) { - addMatch( Utf8::decode( x.prefix + x.word ) ); + addMatch( Text::toUtf32( x.prefix + x.word ) ); } } if ( matches.size() >= maxResults ) { @@ -393,13 +390,14 @@ BtreeWordSearchRequest::~BtreeWordSearchRequest() f.waitForFinished(); } -sptr< Dictionary::WordSearchRequest > BtreeDictionary::prefixMatch( wstring const & str, unsigned long maxResults ) +sptr< Dictionary::WordSearchRequest > BtreeDictionary::prefixMatch( std::u32string const & str, + unsigned long maxResults ) { return std::make_shared< BtreeWordSearchRequest >( *this, str, 0, -1, true, maxResults ); } -sptr< Dictionary::WordSearchRequest > BtreeDictionary::stemmedMatch( wstring const & str, +sptr< Dictionary::WordSearchRequest > BtreeDictionary::stemmedMatch( std::u32string const & str, unsigned minLength, unsigned maxSuffixVariation, unsigned long maxResults ) @@ -437,8 +435,11 @@ void BtreeIndex::readNode( uint32_t offset, vector< char > & out ) } } -char const * BtreeIndex::findChainOffsetExactOrPrefix( - wstring const & target, bool & exactMatch, vector< char > & extLeaf, uint32_t & nextLeaf, char const *& leafEnd ) +char const * BtreeIndex::findChainOffsetExactOrPrefix( std::u32string const & target, + bool & exactMatch, + vector< char > & extLeaf, + uint32_t & nextLeaf, + char const *& leafEnd ) { if ( !idxFile ) { throw exIndexWasNotOpened(); @@ -449,7 +450,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( // Lookup the index by traversing the index btree // vector< wchar > wcharBuffer; - wstring w_word; + std::u32string w_word; exactMatch = false; // Read a node @@ -530,7 +531,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( size_t wordSize = strlen( closestString ); - w_word = Utf8::decode( string( closestString, wordSize ) ); + w_word = Text::toUtf32( string( closestString, wordSize ) ); compareResult = target.compare( w_word ); @@ -649,9 +650,9 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( size_t wordSize = strlen( ptr ); - w_word = Utf8::decode( string( ptr, wordSize ) ); + w_word = Text::toUtf32( string( ptr, wordSize ) ); - wstring foldedWord = Folding::apply( w_word ); + std::u32string foldedWord = Folding::apply( w_word ); if ( foldedWord.empty() ) { foldedWord = Folding::applyWhitespaceOnly( w_word ); } @@ -750,9 +751,9 @@ vector< WordArticleLink > BtreeIndex::readChain( char const *& ptr, uint32_t max return result; } -void BtreeIndex::antialias( wstring const & str, vector< WordArticleLink > & chain, bool ignoreDiacritics ) +void BtreeIndex::antialias( std::u32string const & str, vector< WordArticleLink > & chain, bool ignoreDiacritics ) { - wstring caseFolded = Folding::applySimpleCaseOnly( gd::normalize( str ) ); + std::u32string caseFolded = Folding::applySimpleCaseOnly( Text::normalize( str ) ); if ( ignoreDiacritics ) { caseFolded = Folding::applyDiacriticsOnly( caseFolded ); } @@ -764,8 +765,8 @@ void BtreeIndex::antialias( wstring const & str, vector< WordArticleLink > & cha for ( unsigned x = chain.size(); x--; ) { // If after applying case folding to each word they wouldn't match, we // drop the entry. - wstring entry = - Folding::applySimpleCaseOnly( gd::normalize( Utf8::decode( chain[ x ].prefix + chain[ x ].word ) ) ); + std::u32string entry = + Folding::applySimpleCaseOnly( Text::normalize( Text::toUtf32( chain[ x ].prefix + chain[ x ].word ) ) ); if ( ignoreDiacritics ) { entry = Folding::applyDiacriticsOnly( entry ); } @@ -923,9 +924,9 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex, return offset; } -void IndexedWords::addWord( wstring const & index_word, uint32_t articleOffset, unsigned int maxHeadwordSize ) +void IndexedWords::addWord( std::u32string const & index_word, uint32_t articleOffset, unsigned int maxHeadwordSize ) { - wstring word = gd::removeTrailingZero( index_word ); + std::u32string word = Text::removeTrailingZero( index_word ); string::size_type wordSize = word.size(); // Safeguard us against various bugs here. Don't attempt adding words @@ -945,7 +946,7 @@ void IndexedWords::addWord( wstring const & index_word, uint32_t articleOffset, wordSize = word.size(); } - wchar const * wordBegin = word.c_str(); + char32_t const * wordBegin = word.c_str(); // Skip any leading whitespace while ( *wordBegin && Folding::isWhitespace( *wordBegin ) ) { @@ -958,7 +959,7 @@ void IndexedWords::addWord( wstring const & index_word, uint32_t articleOffset, --wordSize; } - wchar const * nextChar = wordBegin; + char32_t const * nextChar = wordBegin; vector< char > utfBuffer( wordSize * 4 ); @@ -970,11 +971,11 @@ void IndexedWords::addWord( wstring const & index_word, uint32_t articleOffset, if ( !*nextChar ) // End of string ends everything { if ( wordsAdded == 0 ) { - wstring folded = Folding::applyWhitespaceOnly( wstring( wordBegin, wordSize ) ); + std::u32string folded = Folding::applyWhitespaceOnly( std::u32string( wordBegin, wordSize ) ); if ( !folded.empty() ) { - auto i = insert( { Utf8::encode( folded ), vector< WordArticleLink >() } ).first; + auto i = insert( { Text::toUtf8( folded ), vector< WordArticleLink >() } ).first; - string utfWord = Utf8::encode( wstring( wordBegin, wordSize ) ); + string utfWord = Text::toUtf8( std::u32string( wordBegin, wordSize ) ); string utfPrefix; i->second.emplace_back( utfWord, articleOffset, utfPrefix ); } @@ -988,15 +989,15 @@ void IndexedWords::addWord( wstring const & index_word, uint32_t articleOffset, } // Insert this word - wstring folded = Folding::apply( nextChar ); - auto name = Utf8::encode( folded ); + std::u32string folded = Folding::apply( nextChar ); + auto name = Text::toUtf8( folded ); auto i = insert( { std::move( name ), vector< WordArticleLink >() } ).first; if ( ( i->second.size() < 1024 ) || ( nextChar == wordBegin ) ) // Don't overpopulate chains with middle matches { - string utfWord = Utf8::encode( wstring( nextChar, wordSize - ( nextChar - wordBegin ) ) ); - string utfPrefix = Utf8::encode( wstring( wordBegin, nextChar - wordBegin ) ); + string utfWord = Text::toUtf8( std::u32string( nextChar, wordSize - ( nextChar - wordBegin ) ) ); + string utfPrefix = Text::toUtf8( std::u32string( wordBegin, nextChar - wordBegin ) ); i->second.emplace_back( std::move( utfWord ), articleOffset, std::move( utfPrefix ) ); // reduce the vector reallocation. @@ -1020,14 +1021,14 @@ void IndexedWords::addWord( wstring const & index_word, uint32_t articleOffset, } } -void IndexedWords::addSingleWord( wstring const & index_word, uint32_t articleOffset ) +void IndexedWords::addSingleWord( std::u32string const & index_word, uint32_t articleOffset ) { - wstring const & word = gd::removeTrailingZero( index_word ); - wstring folded = Folding::apply( word ); + std::u32string const & word = Text::removeTrailingZero( index_word ); + std::u32string folded = Folding::apply( word ); if ( folded.empty() ) { folded = Folding::applyWhitespaceOnly( word ); } - operator[]( Utf8::encode( folded ) ).emplace_back( Utf8::encode( word ), articleOffset ); + operator[]( Text::toUtf8( folded ) ).emplace_back( Text::toUtf8( word ), articleOffset ); } IndexInfo buildIndex( IndexedWords const & indexedWords, File::Index & file ) diff --git a/src/dict/btreeidx.hh b/src/dict/btreeidx.hh index ed0eb8fb..b8513e01 100644 --- a/src/dict/btreeidx.hh +++ b/src/dict/btreeidx.hh @@ -18,7 +18,6 @@ namespace BtreeIndexing { using std::string; -using gd::wstring; using std::vector; using std::map; @@ -80,7 +79,8 @@ public: /// Finds articles that match the given string. A case-insensitive search /// is performed. - vector< WordArticleLink > findArticles( wstring const &, bool ignoreDiacritics = false, uint32_t maxMatchCount = -1 ); + vector< WordArticleLink > + findArticles( std::u32string const &, bool ignoreDiacritics = false, uint32_t maxMatchCount = -1 ); /// Find all unique article links in the index void findAllArticleLinks( QList< WordArticleLink > & articleLinks ); @@ -116,8 +116,11 @@ protected: /// case, the returned pointer wouldn't belong to 'leaf' at all. To that end, /// the leafEnd pointer always holds the pointer to the first byte outside /// the node data. - char const * findChainOffsetExactOrPrefix( - wstring const & target, bool & exactMatch, vector< char > & leaf, uint32_t & nextLeaf, char const *& leafEnd ); + char const * findChainOffsetExactOrPrefix( std::u32string const & target, + bool & exactMatch, + vector< char > & leaf, + uint32_t & nextLeaf, + char const *& leafEnd ); /// Reads a node or leaf at the given offset. Just uncompresses its data /// to the given vector and does nothing more. @@ -129,7 +132,7 @@ protected: /// Drops any aliases which arose due to folding. Only case-folded aliases /// are left. - void antialias( wstring const &, vector< WordArticleLink > &, bool ignoreDiactitics ); + void antialias( std::u32string const &, vector< WordArticleLink > &, bool ignoreDiactitics ); protected: @@ -161,10 +164,10 @@ public: /// This function does the search using the btree index. Derivatives usually /// need not to implement this function. - virtual sptr< Dictionary::WordSearchRequest > prefixMatch( wstring const &, unsigned long ); + virtual sptr< Dictionary::WordSearchRequest > prefixMatch( std::u32string const &, unsigned long ); virtual sptr< Dictionary::WordSearchRequest > - stemmedMatch( wstring const &, unsigned minLength, unsigned maxSuffixVariation, unsigned long maxResults ); + stemmedMatch( std::u32string const &, unsigned minLength, unsigned maxSuffixVariation, unsigned long maxResults ); virtual bool isLocalDictionary() { @@ -210,7 +213,7 @@ class BtreeWordSearchRequest: public Dictionary::WordSearchRequest { protected: BtreeDictionary & dict; - wstring str; + std::u32string str; unsigned long maxResults; unsigned minLength; int maxSuffixVariation; @@ -221,7 +224,7 @@ protected: public: BtreeWordSearchRequest( BtreeDictionary & dict_, - wstring const & str_, + std::u32string const & str_, unsigned minLength_, int maxSuffixVariation_, bool allowMiddleMatches_, @@ -251,11 +254,11 @@ struct IndexedWords: public map< string, vector< WordArticleLink > > /// Instead of adding to the map directly, use this function. It does folding /// itself, and for phrases/sentences it adds additional entries beginning with /// each new word. - void addWord( wstring const & word, uint32_t articleOffset, unsigned int maxHeadwordSize = 100U ); + void addWord( std::u32string const & word, uint32_t articleOffset, unsigned int maxHeadwordSize = 100U ); /// Differs from addWord() in that it only adds a single entry. We use this /// for zip's file names. - void addSingleWord( wstring const & word, uint32_t articleOffset ); + void addSingleWord( std::u32string const & word, uint32_t articleOffset ); }; /// Builds the index, as a compressed btree. Returns IndexInfo. diff --git a/src/dict/dictdfiles.cc b/src/dict/dictdfiles.cc index 4e4690dd..d99262c1 100644 --- a/src/dict/dictdfiles.cc +++ b/src/dict/dictdfiles.cc @@ -4,7 +4,7 @@ #include "dictdfiles.hh" #include "btreeidx.hh" #include "folding.hh" -#include "utf8.hh" +#include "text.hh" #include "dictzip.hh" #include "htmlescape.hh" #include "langcoder.hh" @@ -29,7 +29,6 @@ using std::multimap; using std::pair; using std::set; using std::string; -using gd::wstring; using std::vector; using std::list; @@ -113,8 +112,10 @@ public: return idxHeader.langTo; } - sptr< Dictionary::DataRequest > - getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; + sptr< Dictionary::DataRequest > getArticle( std::u32string const &, + vector< std::u32string > const & alts, + std::u32string const &, + bool ignoreDiacritics ) override; QString const & getDescription() override; @@ -234,9 +235,9 @@ uint32_t decodeBase64( string const & str ) return number; } -sptr< Dictionary::DataRequest > DictdDictionary::getArticle( wstring const & word, - vector< wstring > const & alts, - wstring const &, +sptr< Dictionary::DataRequest > DictdDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, bool ignoreDiacritics ) { @@ -251,13 +252,13 @@ sptr< Dictionary::DataRequest > DictdDictionary::getArticle( wstring const & wor chain.insert( chain.end(), altChain.begin(), altChain.end() ); } - multimap< wstring, string > mainArticles, alternateArticles; + multimap< std::u32string, string > mainArticles, alternateArticles; set< uint32_t > articlesIncluded; // Some synonyms make it that the articles // appear several times. We combat this // by only allowing them to appear once. - wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); + std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word ); if ( ignoreDiacritics ) { wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); } @@ -377,12 +378,12 @@ sptr< Dictionary::DataRequest > DictdDictionary::getArticle( wstring const & wor // We do the case-folded comparison here. - wstring headwordStripped = Folding::applySimpleCaseOnly( x.word ); + std::u32string headwordStripped = Folding::applySimpleCaseOnly( x.word ); if ( ignoreDiacritics ) { headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); } - multimap< wstring, string > & mapToUse = + multimap< std::u32string, string > & mapToUse = ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; mapToUse.insert( pair( Folding::applySimpleCaseOnly( x.word ), articleText ) ); @@ -396,7 +397,7 @@ sptr< Dictionary::DataRequest > DictdDictionary::getArticle( wstring const & wor string result; - multimap< wstring, string >::const_iterator i; + multimap< std::u32string, string >::const_iterator i; for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { result += i->second; @@ -422,7 +423,8 @@ QString const & DictdDictionary::getDescription() return dictionaryDescription; } - sptr< Dictionary::DataRequest > req = getArticle( U"00databaseinfo", vector< wstring >(), wstring(), false ); + sptr< Dictionary::DataRequest > req = + getArticle( U"00databaseinfo", vector< std::u32string >(), std::u32string(), false ); if ( req->dataSize() > 0 ) { dictionaryDescription = QString::fromUtf8( req->getFullData().data(), req->getFullData().size() ); @@ -629,10 +631,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f // Handle the forth entry, if it exists. From dictfmt man: // When --index-keep-orig option is used fourth column is created // (if necessary) in .index file. - indexedWords.addWord( Utf8::decode( string( tab3 + 1, strlen( tab3 + 1 ) ) ), curOffset ); + indexedWords.addWord( Text::toUtf32( string( tab3 + 1, strlen( tab3 + 1 ) ) ), curOffset ); ++idxHeader.wordCount; } - indexedWords.addWord( Utf8::decode( string( buf, strchr( buf, '\t' ) - buf ) ), curOffset ); + indexedWords.addWord( Text::toUtf32( string( buf, strchr( buf, '\t' ) - buf ) ), curOffset ); ++idxHeader.wordCount; ++idxHeader.articleCount; @@ -657,7 +659,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f eol = articleBody; // No headword itself } if ( eol ) { - while ( *eol && Utf8::isspace( *eol ) ) { + while ( *eol && Text::isspace( *eol ) ) { ++eol; // skip spaces } diff --git a/src/dict/dictionary.cc b/src/dict/dictionary.cc index 3224e5f7..1181b7f7 100644 --- a/src/dict/dictionary.cc +++ b/src/dict/dictionary.cc @@ -177,7 +177,7 @@ void Class::deferredInit() //base method. } -sptr< WordSearchRequest > Class::stemmedMatch( wstring const & /*str*/, +sptr< WordSearchRequest > Class::stemmedMatch( std::u32string const & /*str*/, unsigned /*minLength*/, unsigned /*maxSuffixVariation*/, unsigned long /*maxResults*/ ) @@ -185,12 +185,12 @@ sptr< WordSearchRequest > Class::stemmedMatch( wstring const & /*str*/, return std::make_shared< WordSearchRequestInstant >(); } -sptr< WordSearchRequest > Class::findHeadwordsForSynonym( wstring const & ) +sptr< WordSearchRequest > Class::findHeadwordsForSynonym( std::u32string const & ) { return std::make_shared< WordSearchRequestInstant >(); } -vector< wstring > Class::getAlternateWritings( wstring const & ) noexcept +vector< std::u32string > Class::getAlternateWritings( std::u32string const & ) noexcept { return {}; } diff --git a/src/dict/dictionary.hh b/src/dict/dictionary.hh index 4e099d6e..e290162d 100644 --- a/src/dict/dictionary.hh +++ b/src/dict/dictionary.hh @@ -19,7 +19,7 @@ #include "langcoder.hh" #include "sptr.hh" #include "utils.hh" -#include "wstring.hh" +#include "text.hh" #include /// Abstract dictionary-related stuff @@ -27,7 +27,6 @@ namespace Dictionary { using std::vector; using std::string; -using gd::wstring; using std::map; DEF_EX( Ex, "Dictionary error", std::exception ) @@ -124,19 +123,19 @@ private: /// algorithms. Positive values are used by morphology matches. struct WordMatch { - wstring word; + std::u32string word; int weight; WordMatch(): weight( 0 ) { } - WordMatch( wstring const & word_ ): + WordMatch( std::u32string const & word_ ): word( word_ ), weight( 0 ) { } - WordMatch( wstring const & word_, int weight_ ): + WordMatch( std::u32string const & word_, int weight_ ): word( word_ ), weight( weight_ ) { @@ -431,7 +430,7 @@ public: /// prefix results should be added. Not more than maxResults results should /// be stored. The whole operation is supposed to be fast, though some /// dictionaries, the network ones particularly, may of course be slow. - virtual sptr< WordSearchRequest > prefixMatch( wstring const &, unsigned long maxResults ) = 0; + virtual sptr< WordSearchRequest > prefixMatch( std::u32string const &, unsigned long maxResults ) = 0; /// Looks up a given word in the dictionary, aiming to find different forms /// of the given word by allowing suffix variations. This means allowing words @@ -442,20 +441,20 @@ public: /// in the middle of a phrase got matched should be returned. /// The default implementation does nothing, returning an empty result. virtual sptr< WordSearchRequest > - stemmedMatch( wstring const &, unsigned minLength, unsigned maxSuffixVariation, unsigned long maxResults ); + stemmedMatch( std::u32string const &, unsigned minLength, unsigned maxSuffixVariation, unsigned long maxResults ); /// Finds known headwords for the given word, that is, the words for which /// the given word is a synonym. If a dictionary can't perform this operation, /// it should leave the default implementation which always returns an empty /// result. - virtual sptr< WordSearchRequest > findHeadwordsForSynonym( wstring const & ); + virtual sptr< WordSearchRequest > findHeadwordsForSynonym( std::u32string const & ); /// For a given word, provides alternate writings of it which are to be looked /// up alongside with it. Transliteration dictionaries implement this. The /// default implementation returns an empty list. Note that this function is /// supposed to be very fast and simple, and the results are thus returned /// synchronously. - virtual vector< wstring > getAlternateWritings( wstring const & ) noexcept; + virtual vector< std::u32string > getAlternateWritings( std::u32string const & ) noexcept; /// Returns a definition for the given word. The definition should /// be an html fragment (without html/head/body tags) in an utf8 encoding. @@ -464,10 +463,10 @@ public: /// synonyms for the main word. /// context is a dictionary-specific data, currently only used for the /// 'Websites' feature. - virtual sptr< DataRequest > getArticle( wstring const &, - vector< wstring > const & alts, - wstring const & context = wstring(), - bool ignoreDiacritics = false ) = 0; + virtual sptr< DataRequest > getArticle( std::u32string const &, + vector< std::u32string > const & alts, + std::u32string const & context = std::u32string(), + bool ignoreDiacritics = false ) = 0; /// Loads contents of a resource named 'name' into the 'data' vector. This is /// usually a picture file referenced in the article or something like that. diff --git a/src/dict/dictserver.cc b/src/dict/dictserver.cc index db798195..2dff23b2 100644 --- a/src/dict/dictserver.cc +++ b/src/dict/dictserver.cc @@ -2,7 +2,6 @@ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ #include "dictserver.hh" -#include "wstring_qt.hh" #include #include #include @@ -314,9 +313,10 @@ public: return 0; } - sptr< WordSearchRequest > prefixMatch( wstring const &, unsigned long maxResults ) override; + sptr< WordSearchRequest > prefixMatch( std::u32string const &, unsigned long maxResults ) override; - sptr< DataRequest > getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ) override; + sptr< DataRequest > + getArticle( std::u32string const &, vector< std::u32string > const & alts, std::u32string const &, bool ) override; quint32 getLangFrom() const override { @@ -387,7 +387,7 @@ class DictServerWordSearchRequest: public Dictionary::WordSearchRequest { Q_OBJECT QAtomicInt isCancelled; - wstring word; + std::u32string word; QString errorString; DictServerDictionary & dict; @@ -402,7 +402,7 @@ class DictServerWordSearchRequest: public Dictionary::WordSearchRequest public: - DictServerWordSearchRequest( wstring word_, DictServerDictionary & dict_ ): + DictServerWordSearchRequest( std::u32string word_, DictServerDictionary & dict_ ): word( std::move( word_ ) ), dict( dict_ ), dictImpl( new DictServerImpl( this, dict_.url, "GoldenDict-w" ) ) @@ -562,7 +562,7 @@ void DictServer::DictServerWordSearchRequest::addMatchedWord( const QString & st class DictServerArticleRequest: public Dictionary::DataRequest { QAtomicInt isCancelled; - wstring word; + std::u32string word; QString errorString; DictServerDictionary & dict; string articleData; @@ -578,7 +578,7 @@ class DictServerArticleRequest: public Dictionary::DataRequest public: DictServerImpl * dictImpl; - DictServerArticleRequest( wstring word_, DictServerDictionary & dict_ ): + DictServerArticleRequest( std::u32string word_, DictServerDictionary & dict_ ): word( std::move( word_ ) ), dict( dict_ ), dictImpl( new DictServerImpl( this, dict_.url, "GoldenDict-t" ) ) @@ -870,7 +870,7 @@ void DictServerArticleRequest::cancel() finish(); } -sptr< WordSearchRequest > DictServerDictionary::prefixMatch( wstring const & word, unsigned long maxResults ) +sptr< WordSearchRequest > DictServerDictionary::prefixMatch( std::u32string const & word, unsigned long maxResults ) { (void)maxResults; if ( word.size() > 80 ) { @@ -883,8 +883,10 @@ sptr< WordSearchRequest > DictServerDictionary::prefixMatch( wstring const & wor } } -sptr< DataRequest > -DictServerDictionary::getArticle( wstring const & word, vector< wstring > const &, wstring const &, bool ) +sptr< DataRequest > DictServerDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const &, + std::u32string const &, + bool ) { if ( word.size() > 80 ) { diff --git a/src/dict/dsl.cc b/src/dict/dsl.cc index 71a597f9..ed8dfbe0 100644 --- a/src/dict/dsl.cc +++ b/src/dict/dsl.cc @@ -5,7 +5,7 @@ #include "dsl_details.hh" #include "btreeidx.hh" #include "folding.hh" -#include "utf8.hh" +#include "text.hh" #include "chunkedstorage.hh" #include "dictzip.hh" #include "htmlescape.hh" @@ -13,7 +13,6 @@ #include "filetype.hh" #include "audiolink.hh" #include "langcoder.hh" -#include "wstring_qt.hh" #include "indexedzip.hh" #include "tiff.hh" #include "ftshelpers.hh" @@ -44,11 +43,9 @@ using std::multimap; using std::pair; using std::set; using std::string; -using gd::wstring; -using gd::wchar; using std::vector; using std::list; -using Utf8::Encoding; +using Text::Encoding; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; @@ -100,8 +97,8 @@ struct InsidedCard { uint32_t offset; uint32_t size; - QList< wstring > headwords; - InsidedCard( uint32_t _offset, uint32_t _size, QList< wstring > const & words ): + QList< std::u32string > headwords; + InsidedCard( uint32_t _offset, uint32_t _size, QList< std::u32string > const & words ): offset( _offset ), size( _size ), headwords( words ) @@ -144,7 +141,7 @@ class DslDictionary: public BtreeIndexing::BtreeDictionary int optionalPartNom; quint8 articleNom; - wstring currentHeadword; + std::u32string currentHeadword; string resourceDir1, resourceDir2; public: @@ -187,8 +184,10 @@ public: } - sptr< Dictionary::DataRequest > - getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; + sptr< Dictionary::DataRequest > getArticle( std::u32string const &, + vector< std::u32string > const & alts, + std::u32string const &, + bool ignoreDiacritics ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override; @@ -232,15 +231,15 @@ private: /// Loads the article. Does not process the DSL language. void loadArticle( uint32_t address, - wstring const & requestedHeadwordFolded, + std::u32string const & requestedHeadwordFolded, bool ignoreDiacritics, - wstring & tildeValue, - wstring & displayedHeadword, + std::u32string & tildeValue, + std::u32string & displayedHeadword, unsigned & headwordIndex, - wstring & articleText ); + std::u32string & articleText ); /// Converts DSL language to an Html. - string dslToHtml( wstring const &, wstring const & headword = wstring() ); + string dslToHtml( std::u32string const &, std::u32string const & headword = std::u32string() ); // Parts of dslToHtml() string nodeToHtml( ArticleDom::Node const & ); @@ -452,7 +451,7 @@ void DslDictionary::loadIcon() noexcept /// so nbsp is not a whitespace character for Dsl compiler. /// For now we have only space and tab, since those are most likely the only /// ones recognized as spaces by that compiler. -bool isDslWs( wchar ch ) +bool isDslWs( char32_t ch ) { switch ( ch ) { case ' ': @@ -464,14 +463,14 @@ bool isDslWs( wchar ch ) } void DslDictionary::loadArticle( uint32_t address, - wstring const & requestedHeadwordFolded, + std::u32string const & requestedHeadwordFolded, bool ignoreDiacritics, - wstring & tildeValue, - wstring & displayedHeadword, + std::u32string & tildeValue, + std::u32string & displayedHeadword, unsigned & headwordIndex, - wstring & articleText ) + std::u32string & articleText ) { - wstring articleData; + std::u32string articleData; { vector< char > chunk; @@ -507,7 +506,7 @@ void DslDictionary::loadArticle( uint32_t address, else { try { articleData = - Iconv::toWstring( Utf8::getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ), articleBody, articleSize ); + Iconv::toWstring( Text::getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ), articleBody, articleSize ); free( articleBody ); // Strip DSL comments @@ -528,27 +527,27 @@ void DslDictionary::loadArticle( uint32_t address, // Check is we retrieve insided card bool insidedCard = isDslWs( articleData.at( 0 ) ); - wstring tildeValueWithUnsorted; // This one has unsorted parts left + std::u32string tildeValueWithUnsorted; // This one has unsorted parts left for ( headwordIndex = 0;; ) { size_t begin = pos; pos = articleData.find_first_of( U"\n\r", begin ); - if ( pos == wstring::npos ) { + if ( pos == std::u32string::npos ) { pos = articleData.size(); } if ( !foundDisplayedHeadword ) { // Process the headword - wstring rawHeadword = wstring( articleData, begin, pos - begin ); + std::u32string rawHeadword = std::u32string( articleData, begin, pos - begin ); if ( insidedCard && !rawHeadword.empty() && isDslWs( rawHeadword[ 0 ] ) ) { // Headword of the insided card - wstring::size_type hpos = rawHeadword.find( L'@' ); + std::u32string::size_type hpos = rawHeadword.find( L'@' ); if ( hpos != string::npos ) { - wstring head = Folding::trimWhitespace( rawHeadword.substr( hpos + 1 ) ); - hpos = head.find( L'~' ); + std::u32string head = Folding::trimWhitespace( rawHeadword.substr( hpos + 1 ) ); + hpos = head.find( L'~' ); while ( hpos != string::npos ) { if ( hpos == 0 || head[ hpos ] != L'\\' ) { break; @@ -569,7 +568,7 @@ void DslDictionary::loadArticle( uint32_t address, // We need our tilde expansion value tildeValue = rawHeadword; - list< wstring > lst; + list< std::u32string > lst; expandOptionalParts( tildeValue, &lst ); @@ -581,7 +580,7 @@ void DslDictionary::loadArticle( uint32_t address, processUnsortedParts( tildeValue, false ); } - wstring str = rawHeadword; + std::u32string str = rawHeadword; if ( hadFirstHeadword ) { expandTildes( str, tildeValueWithUnsorted ); @@ -591,7 +590,7 @@ void DslDictionary::loadArticle( uint32_t address, str = Folding::applySimpleCaseOnly( str ); - list< wstring > lst; + list< std::u32string > lst; expandOptionalParts( str, &lst ); // Does one of the results match the requested word? If so, we'd choose @@ -657,15 +656,15 @@ void DslDictionary::loadArticle( uint32_t address, // Check for begin article text if ( insidedCard ) { // Check for next insided headword - wstring::size_type hpos = articleData.find_first_of( U"\n\r", pos ); - if ( hpos == wstring::npos ) { + std::u32string::size_type hpos = articleData.find_first_of( U"\n\r", pos ); + if ( hpos == std::u32string::npos ) { hpos = articleData.size(); } - wstring str = wstring( articleData, pos, hpos - pos ); + std::u32string str = std::u32string( articleData, pos, hpos - pos ); hpos = str.find( L'@' ); - if ( hpos == wstring::npos || str[ hpos - 1 ] == L'\\' || !isAtSignFirst( str ) ) { + if ( hpos == std::u32string::npos || str[ hpos - 1 ] == L'\\' || !isAtSignFirst( str ) ) { break; } } @@ -687,18 +686,18 @@ void DslDictionary::loadArticle( uint32_t address, } if ( pos != articleData.size() ) { - articleText = wstring( articleData, pos ); + articleText = std::u32string( articleData, pos ); } else { articleText.clear(); } } -string DslDictionary::dslToHtml( wstring const & str, wstring const & headword ) +string DslDictionary::dslToHtml( std::u32string const & str, std::u32string const & headword ) { // Normalize the string - wstring normalizedStr = gd::normalize( str ); - currentHeadword = headword; + std::u32string normalizedStr = Text::normalize( str ); + currentHeadword = headword; ArticleDom dom( normalizedStr, getName(), headword ); @@ -733,7 +732,7 @@ string DslDictionary::getNodeLink( ArticleDom::Node const & node ) } } if ( link.empty() ) { - link = Html::escape( Filetype::simplifyString( Utf8::encode( node.renderAsText() ), false ) ); + link = Html::escape( Filetype::simplifyString( Text::toUtf8( node.renderAsText() ), false ) ); } return link; @@ -744,7 +743,7 @@ string DslDictionary::nodeToHtml( ArticleDom::Node const & node ) string result; if ( !node.isTag ) { - result = Html::escape( Utf8::encode( node.text ) ); + result = Html::escape( Text::toUtf8( node.text ) ); // Handle all end-of-line @@ -784,7 +783,7 @@ string DslDictionary::nodeToHtml( ArticleDom::Node const & node ) result += "" + processNodeChildren( node ) + ""; } else { - result += "" + processNodeChildren( node ) + result += "" + processNodeChildren( node ) + ""; } } @@ -797,7 +796,7 @@ string DslDictionary::nodeToHtml( ArticleDom::Node const & node ) result += "
" + processNodeChildren( node ) + "
"; } else if ( node.tagName.size() == 2 && node.tagName[ 0 ] == L'm' && iswdigit( node.tagName[ 1 ] ) ) { - result += "
" + processNodeChildren( node ) + "
"; + result += "
" + processNodeChildren( node ) + "
"; } else if ( node.tagName == U"trn" ) { result += "" + processNodeChildren( node ) + ""; @@ -809,7 +808,7 @@ string DslDictionary::nodeToHtml( ArticleDom::Node const & node ) result += "" + processNodeChildren( node ) + ""; } else if ( node.tagName == U"s" || node.tagName == U"video" ) { - string filename = Filetype::simplifyString( Utf8::encode( node.renderAsText() ), false ); + string filename = Filetype::simplifyString( Text::toUtf8( node.renderAsText() ), false ); string n = resourceDir1 + filename; if ( Filetype::isNameOfSound( filename ) ) { @@ -888,7 +887,7 @@ string DslDictionary::nodeToHtml( ArticleDom::Node const & node ) else if ( node.tagName == U"p" ) { result += ")" + data + "" - + "" + data + Utf8::encode( wstring( 1, 0x301 ) ) + ""; + + "" + data + Text::toUtf8( std::u32string( 1, 0x301 ) ) + + ""; } else if ( node.tagName == U"lang" ) { result += " chunk; char * articleProps; - wstring articleData; + std::u32string articleData; { QMutexLocker _( &idxMutex ); @@ -1161,7 +1161,7 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword, // Skip headword size_t pos = 0; - wstring articleHeadword, tildeValue; + std::u32string articleHeadword, tildeValue; // Check if we retrieve insided card bool insidedCard = isDslWs( articleData.at( 0 ) ); @@ -1170,20 +1170,20 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword, size_t begin = pos; pos = articleData.find_first_of( U"\n\r", begin ); - if ( pos == wstring::npos ) { + if ( pos == std::u32string::npos ) { pos = articleData.size(); } if ( articleHeadword.empty() ) { // Process the headword - articleHeadword = wstring( articleData, begin, pos - begin ); + articleHeadword = std::u32string( articleData, begin, pos - begin ); if ( insidedCard && !articleHeadword.empty() && isDslWs( articleHeadword[ 0 ] ) ) { // Headword of the insided card - wstring::size_type hpos = articleHeadword.find( L'@' ); + std::u32string::size_type hpos = articleHeadword.find( L'@' ); if ( hpos != string::npos ) { - wstring head = Folding::trimWhitespace( articleHeadword.substr( hpos + 1 ) ); - hpos = head.find( L'~' ); + std::u32string head = Folding::trimWhitespace( articleHeadword.substr( hpos + 1 ) ); + hpos = head.find( L'~' ); while ( hpos != string::npos ) { if ( hpos == 0 || head[ hpos ] != L'\\' ) { break; @@ -1200,7 +1200,7 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword, } if ( !articleHeadword.empty() ) { - list< wstring > lst; + list< std::u32string > lst; tildeValue = articleHeadword; @@ -1237,15 +1237,15 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword, // Check for begin article text if ( insidedCard ) { // Check for next insided headword - wstring::size_type hpos = articleData.find_first_of( U"\n\r", pos ); - if ( hpos == wstring::npos ) { + std::u32string::size_type hpos = articleData.find_first_of( U"\n\r", pos ); + if ( hpos == std::u32string::npos ) { hpos = articleData.size(); } - wstring str = wstring( articleData, pos, hpos - pos ); + std::u32string str = std::u32string( articleData, pos, hpos - pos ); hpos = str.find( L'@' ); - if ( hpos == wstring::npos || str[ hpos - 1 ] == L'\\' || !isAtSignFirst( str ) ) { + if ( hpos == std::u32string::npos || str[ hpos - 1 ] == L'\\' || !isAtSignFirst( str ) ) { break; } } @@ -1261,17 +1261,17 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword, headword = QString::fromStdU32String( articleHeadword ); } - wstring articleText; + std::u32string articleText; if ( pos != articleData.size() ) { - articleText = wstring( articleData, pos ); + articleText = std::u32string( articleData, pos ); } else { articleText.clear(); } if ( !tildeValue.empty() ) { - list< wstring > lst; + list< std::u32string > lst; processUnsortedParts( tildeValue, false ); expandOptionalParts( tildeValue, &lst ); @@ -1377,8 +1377,8 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword, class DslArticleRequest: public Dictionary::DataRequest { - wstring word; - vector< wstring > alts; + std::u32string word; + vector< std::u32string > alts; DslDictionary & dict; bool ignoreDiacritics; @@ -1387,8 +1387,8 @@ class DslArticleRequest: public Dictionary::DataRequest public: - DslArticleRequest( wstring const & word_, - vector< wstring > const & alts_, + DslArticleRequest( std::u32string const & word_, + vector< std::u32string > const & alts_, DslDictionary & dict_, bool ignoreDiacritics_ ): word( word_ ), @@ -1444,7 +1444,7 @@ void DslArticleRequest::run() // index here. set< pair< uint32_t, unsigned > > articlesIncluded; - wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); + std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word ); for ( auto & x : chain ) { // Check if we're cancelled occasionally @@ -1455,9 +1455,9 @@ void DslArticleRequest::run() // Grab that article - wstring tildeValue; - wstring displayedHeadword; - wstring articleBody; + std::u32string tildeValue; + std::u32string displayedHeadword; + std::u32string articleBody; unsigned headwordIndex; string articleText, articleAfter; @@ -1541,9 +1541,9 @@ void DslArticleRequest::run() finish(); } -sptr< Dictionary::DataRequest > DslDictionary::getArticle( wstring const & word, - vector< wstring > const & alts, - wstring const &, +sptr< Dictionary::DataRequest > DslDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, bool ignoreDiacritics ) { @@ -1632,7 +1632,7 @@ void DslResourceRequest::run() if ( dict.resourceZip.isOpen() ) { QMutexLocker _( &dataMutex ); - if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) ) { + if ( !dict.resourceZip.loadFile( Text::toUtf32( resourceName ), data ) ) { throw; // Make it fail since we couldn't read the archive } } @@ -1761,7 +1761,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f } // Building the index - initializing.indexingDictionary( Utf8::encode( scanner.getDictionaryName() ) ); + initializing.indexingDictionary( Text::toUtf8( scanner.getDictionaryName() ) ); qDebug( "Dsl: Building the index for dictionary: %s", QString::fromStdU32String( scanner.getDictionaryName() ).toUtf8().data() ); @@ -1777,12 +1777,12 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f idx.write( idxHeader ); - string dictionaryName = Utf8::encode( scanner.getDictionaryName() ); + string dictionaryName = Text::toUtf8( scanner.getDictionaryName() ); idx.write( (uint32_t)dictionaryName.size() ); idx.write( dictionaryName.data(), dictionaryName.size() ); - string soundDictName = Utf8::encode( scanner.getSoundDictionaryName() ); + string soundDictName = Text::toUtf8( scanner.getSoundDictionaryName() ); if ( !soundDictName.empty() ) { idxHeader.hasSoundDictionaryName = 1; idx.write( (uint32_t)soundDictName.size() ); @@ -1803,7 +1803,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f map< string, string > abrv; - wstring curString; + std::u32string curString; size_t curOffset; for ( ;; ) { @@ -1815,7 +1815,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f continue; } - list< wstring > keys; + list< std::u32string > keys; bool eof = false; @@ -1851,13 +1851,13 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f } // If the string has any dsl markup, we strip it - string value = Utf8::encode( ArticleDom( curString ).root.renderAsText() ); + string value = Text::toUtf8( ArticleDom( curString ).root.renderAsText() ); for ( auto & key : keys ) { unescapeDsl( key ); normalizeHeadword( key ); - abrv[ Utf8::encode( Folding::trimWhitespace( key ) ) ] = value; + abrv[ Text::toUtf8( Folding::trimWhitespace( key ) ) ] = value; } } @@ -1885,7 +1885,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f } bool hasString = false; - wstring curString; + std::u32string curString; size_t curOffset; uint32_t articleCount = 0, wordCount = 0; @@ -1919,7 +1919,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f // Ok, got the headword - list< wstring > allEntryWords; + list< std::u32string > allEntryWords; processUnsortedParts( curString, true ); expandOptionalParts( curString, &allEntryWords ); @@ -1972,10 +1972,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f wordCount += allEntryWords.size(); int insideInsided = 0; - wstring headword; + std::u32string headword; QList< InsidedCard > insidedCards; uint32_t offset = curOffset; - QList< wstring > insidedHeadwords; + QList< std::u32string > insidedHeadwords; unsigned linesInsideCard = 0; int dogLine = 0; bool wasEmptyLine = false; @@ -2018,8 +2018,8 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f // Find embedded cards - wstring::size_type n = curString.find( L'@' ); - if ( n == wstring::npos || curString[ n - 1 ] == L'\\' ) { + std::u32string::size_type n = curString.find( L'@' ); + if ( n == std::u32string::npos || curString[ n - 1 ] == L'\\' ) { if ( insideInsided ) { linesInsideCard++; } diff --git a/src/dict/dsl_details.cc b/src/dict/dsl_details.cc index 55595563..c4afbb1d 100644 --- a/src/dict/dsl_details.cc +++ b/src/dict/dsl_details.cc @@ -6,7 +6,7 @@ #include "folding.hh" #include "langcoder.hh" #include "ufile.hh" -#include "utf8.hh" +#include "text.hh" #include #include @@ -17,9 +17,8 @@ namespace Dsl { namespace Details { -using gd::wstring; using std::list; -using Utf8::Encoding; +using Text::Encoding; static QMap< int, string > lang_codes = { { 1, "en" }, { 1033, "en" }, { 2, "ru" }, { 1049, "ru" }, { 1068, "az" }, { 1025, "ar" }, { 1067, "am" }, @@ -40,7 +39,7 @@ string findCodeForDslId( int id ) return lang_codes[ id ]; } -bool isAtSignFirst( wstring const & str ) +bool isAtSignFirst( std::u32string const & str ) { // Test if '@' is first in string except spaces and dsl tags QRegularExpression reg( R"([ \t]*(?:\[[^\]]+\][ \t]*)*@)", QRegularExpression::PatternOption::CaseInsensitiveOption ); @@ -49,13 +48,13 @@ bool isAtSignFirst( wstring const & str ) /////////////// ArticleDom -wstring ArticleDom::Node::renderAsText( bool stripTrsTag ) const +std::u32string ArticleDom::Node::renderAsText( bool stripTrsTag ) const { if ( !isTag ) { return text; } - wstring result; + std::u32string result; for ( const auto & i : *this ) { if ( !stripTrsTag || i.tagName != U"!trs" ) { @@ -69,17 +68,17 @@ wstring ArticleDom::Node::renderAsText( bool stripTrsTag ) const namespace { /// @return true if @p tagName equals "mN" where N is a digit -bool is_mN( wstring const & tagName ) +bool is_mN( std::u32string const & tagName ) { return tagName.size() == 2 && tagName[ 0 ] == U'm' && iswdigit( tagName[ 1 ] ); } -bool isAnyM( wstring const & tagName ) +bool isAnyM( std::u32string const & tagName ) { return tagName == U"m" || is_mN( tagName ); } -bool checkM( wstring const & dest, wstring const & src ) +bool checkM( std::u32string const & dest, std::u32string const & src ) { return src == U"m" && is_mN( dest ); } @@ -97,8 +96,8 @@ struct MustTagBeClosed } // unnamed namespace -ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring const & headword_ ): - root( Node::Tag(), wstring(), wstring() ), +ArticleDom::ArticleDom( std::u32string const & str, string const & dictName, std::u32string const & headword_ ): + root( Node::Tag(), std::u32string(), std::u32string() ), stringPos( str.c_str() ), lineStartPos( str.c_str() ), transcriptionCount( 0 ), @@ -126,7 +125,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co } else { // Insided card - wstring linkTo; + std::u32string linkTo; nextChar(); for ( ;; nextChar() ) { if ( ch == L'\n' ) { @@ -142,13 +141,13 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co linkTo = Folding::trimWhitespace( linkTo ); if ( !linkTo.empty() ) { - list< wstring > allLinkEntries; + list< std::u32string > allLinkEntries; processUnsortedParts( linkTo, true ); expandOptionalParts( linkTo, &allLinkEntries ); for ( auto entry = allLinkEntries.begin(); entry != allLinkEntries.end(); ) { if ( !textNode ) { - Node text = Node( Node::Text(), wstring() ); + Node text = Node( Node::Text(), std::u32string() ); if ( stack.empty() ) { root.push_back( text ); @@ -168,10 +167,10 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co stack.pop_back(); textNode = 0; - wstring linkText = Folding::trimWhitespace( *entry ); + std::u32string linkText = Folding::trimWhitespace( *entry ); ArticleDom nodeDom( linkText, dictName, headword_ ); - Node link( Node::Tag(), U"@", wstring() ); + Node link( Node::Tag(), U"@", std::u32string() ); for ( auto & n : nodeDom.root ) { link.push_back( n ); } @@ -181,13 +180,13 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co if ( stack.empty() ) { root.push_back( link ); if ( entry != allLinkEntries.end() ) { // Add line break before next entry - root.push_back( Node( Node::Tag(), U"br", wstring() ) ); + root.push_back( Node( Node::Tag(), U"br", std::u32string() ) ); } } else { stack.back()->push_back( link ); if ( entry != allLinkEntries.end() ) { - stack.back()->push_back( Node( Node::Tag(), U"br", wstring() ) ); + stack.back()->push_back( Node( Node::Tag(), U"br", std::u32string() ) ); } } } @@ -208,8 +207,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co if ( ch == L'[' && !escaped ) { // Beginning of a tag. bool isClosing; - wstring name; - wstring attrs; + std::u32string name; + std::u32string attrs; try { do { @@ -330,7 +329,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co nextChar(); } while ( Folding::isWhitespace( ch ) ); - wstring linkTo, linkText; + std::u32string linkTo, linkText; for ( ;; nextChar() ) { // Is it the end? @@ -373,7 +372,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co processUnsortedParts( linkText, true ); ArticleDom nodeDom( linkText, dictName, headword_ ); - Node link( Node::Tag(), U"ref", wstring() ); + Node link( Node::Tag(), U"ref", std::u32string() ); for ( auto & n : nodeDom.root ) { link.push_back( n ); } @@ -427,7 +426,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co // If there's currently no text node, open one if ( !textNode ) { - Node text = Node( Node::Text(), wstring() ); + Node text = Node( Node::Text(), std::u32string() ); if ( stack.empty() ) { root.push_back( text ); @@ -691,7 +690,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co } } -void ArticleDom::openTag( wstring const & name, wstring const & attrs, list< Node * > & stack ) +void ArticleDom::openTag( std::u32string const & name, std::u32string const & attrs, list< Node * > & stack ) { list< Node > nodesToReopen; @@ -746,7 +745,7 @@ void ArticleDom::openTag( wstring const & name, wstring const & attrs, list< Nod } } -void ArticleDom::closeTag( wstring const & name, list< Node * > & stack, bool warn ) +void ArticleDom::closeTag( std::u32string const & name, list< Node * > & stack, bool warn ) { // Find the tag which is to be closed @@ -839,13 +838,13 @@ bool ArticleDom::atSignFirstInLine() return true; } - return isAtSignFirst( wstring( lineStartPos ) ); + return isAtSignFirst( std::u32string( lineStartPos ) ); } /////////////// DslScanner DslScanner::DslScanner( string const & fileName ): - encoding( Utf8::Utf8 ), + encoding( Text::Utf8 ), readBufferPtr( readBuffer ), readBufferLeft( 0 ), linesRead( 0 ) @@ -876,19 +875,19 @@ DslScanner::DslScanner( string const & fileName ): guessedEncoding.has_value() ) { switch ( guessedEncoding.value() ) { case QStringConverter::Utf8: - encoding = Utf8::Utf8; + encoding = Text::Utf8; break; case QStringConverter::Utf16LE: - encoding = Utf8::Utf16LE; + encoding = Text::Utf16LE; break; case QStringConverter::Utf16BE: - encoding = Utf8::Utf16BE; + encoding = Text::Utf16BE; break; case QStringConverter::Utf32LE: - encoding = Utf8::Utf16LE; + encoding = Text::Utf16LE; break; case QStringConverter::Utf32BE: - encoding = Utf8::Utf32BE; + encoding = Text::Utf32BE; break; default: break; @@ -905,10 +904,10 @@ DslScanner::DslScanner( string const & fileName ): } //iconv.reinit( encoding ); - lineFeed = Utf8::initLineFeed( encoding ); + lineFeed = Text::initLineFeed( encoding ); // We now can use our own readNextLine() function - wstring str; + std::u32string str; size_t offset; for ( ;; ) { @@ -946,7 +945,7 @@ DslScanner::DslScanner( string const & fileName ): size_t beg = str.find_first_of( L'"' ); - if ( beg == wstring::npos ) { + if ( beg == std::u32string::npos ) { throw exMalformedDslFile( fileName ); } @@ -956,7 +955,7 @@ DslScanner::DslScanner( string const & fileName ): throw exMalformedDslFile( fileName ); } - wstring arg( str, beg + 1, end - beg - 1 ); + std::u32string arg( str, beg + 1, end - beg - 1 ); if ( isName ) { dictionaryName = arg; @@ -977,13 +976,13 @@ DslScanner::DslScanner( string const & fileName ): qWarning( "Warning: encoding was specified in a Unicode file, ignoring." ); } else if ( !arg.compare( U"Latin" ) ) { - encoding = Utf8::Windows1252; + encoding = Text::Windows1252; } else if ( !arg.compare( U"Cyrillic" ) ) { - encoding = Utf8::Windows1251; + encoding = Text::Windows1251; } else if ( !arg.compare( U"EasternEuropean" ) ) { - encoding = Utf8::Windows1250; + encoding = Text::Windows1250; } else { gzclose( f ); @@ -1009,7 +1008,7 @@ DslScanner::~DslScanner() noexcept gzclose( f ); } -bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_word ) +bool DslScanner::readNextLine( std::u32string & out, size_t & offset, bool only_head_word ) { offset = gztell( f ) - readBufferLeft /*+pos*/; @@ -1036,7 +1035,7 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo return false; } - int pos = Utf8::findFirstLinePosition( readBufferPtr, readBufferLeft, lineFeed.lineFeed, lineFeed.length ); + int pos = Text::findFirstLinePosition( readBufferPtr, readBufferLeft, lineFeed.lineFeed, lineFeed.length ); if ( pos == -1 ) { return false; } @@ -1057,9 +1056,9 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo } } -bool DslScanner::readNextLineWithoutComments( wstring & out, size_t & offset, bool only_headword ) +bool DslScanner::readNextLineWithoutComments( std::u32string & out, size_t & offset, bool only_headword ) { - wstring str; + std::u32string str; bool commentToNextLine = false; size_t currentOffset; @@ -1087,14 +1086,14 @@ bool DslScanner::readNextLineWithoutComments( wstring & out, size_t & offset, bo /////////////// DslScanner -void processUnsortedParts( wstring & str, bool strip ) +void processUnsortedParts( std::u32string & str, bool strip ) { int refCount = 0; size_t startPos = 0; for ( size_t x = 0; x < str.size(); ) { - wchar ch = str[ x ]; + char32_t ch = str[ x ]; if ( ch == L'\\' ) { // Escape code @@ -1150,18 +1149,18 @@ void processUnsortedParts( wstring & str, bool strip ) } } -void expandOptionalParts( wstring & str, list< wstring > * result, size_t x, bool inside_recurse ) +void expandOptionalParts( std::u32string & str, list< std::u32string > * result, size_t x, bool inside_recurse ) { // if str is too long ,it can never be headwords. if ( str.size() > 100 ) { return; } - list< wstring > expanded; - list< wstring > * headwords; + list< std::u32string > expanded; + list< std::u32string > * headwords; headwords = inside_recurse ? result : &expanded; for ( ; x < str.size(); ) { - wchar ch = str[ x ]; + char32_t ch = str[ x ]; if ( ch == L'\\' ) { // Escape code @@ -1174,7 +1173,7 @@ void expandOptionalParts( wstring & str, list< wstring > * result, size_t x, boo int refCount = 1; for ( size_t y = x + 1; y < str.size(); ++y ) { - wchar ch = str[ y ]; + char32_t ch = str[ y ]; if ( ch == L'\\' ) { // Escape code @@ -1190,7 +1189,7 @@ void expandOptionalParts( wstring & str, list< wstring > * result, size_t x, boo if ( y != x + 1 ) // Only do for non-empty cases { - wstring removed( str, 0, x ); + std::u32string removed( str, 0, x ); removed.append( str, y + 1, str.size() - y - 1 ); expandOptionalParts( removed, headwords, x, true ); @@ -1204,7 +1203,7 @@ void expandOptionalParts( wstring & str, list< wstring > * result, size_t x, boo if ( refCount && x != str.size() - 1 ) { // Closing paren not found? Chop it. - wstring removed( str, 0, x ); + std::u32string removed( str, 0, x ); // Limit the amount of results to avoid excessive resource consumption if ( headwords->size() < 32 ) { @@ -1242,10 +1241,10 @@ void expandOptionalParts( wstring & str, list< wstring > * result, size_t x, boo } } -static const wstring openBraces( U"{{" ); -static const wstring closeBraces( U"}}" ); +static const std::u32string openBraces( U"{{" ); +static const std::u32string closeBraces( U"}}" ); -void stripComments( wstring & str, bool & nextLine ) +void stripComments( std::u32string & str, bool & nextLine ) { string::size_type n = 0, n2 = 0; @@ -1269,9 +1268,9 @@ void stripComments( wstring & str, bool & nextLine ) } } -void expandTildes( wstring & str, wstring const & tildeReplacement ) +void expandTildes( std::u32string & str, std::u32string const & tildeReplacement ) { - wstring tildeValue = Folding::trimWhitespace( tildeReplacement ); + std::u32string tildeValue = Folding::trimWhitespace( tildeReplacement ); for ( size_t x = 0; x < str.size(); ) { if ( str[ x ] == L'\\' ) { x += 2; @@ -1294,7 +1293,7 @@ void expandTildes( wstring & str, wstring const & tildeReplacement ) } } -void unescapeDsl( wstring & str ) +void unescapeDsl( std::u32string & str ) { for ( size_t x = 0; x < str.size(); ++x ) { if ( str[ x ] == L'\\' ) { @@ -1303,7 +1302,7 @@ void unescapeDsl( wstring & str ) } } -void normalizeHeadword( wstring & str ) +void normalizeHeadword( std::u32string & str ) { for ( size_t x = str.size(); x-- > 1; ) // >1 -- Don't test the first char { @@ -1331,7 +1330,7 @@ void normalizeHeadword( wstring & str ) } namespace { -void cutEnding( wstring & where, wstring const & ending ) +void cutEnding( std::u32string & where, std::u32string const & ending ) { if ( where.size() > ending.size() && where.compare( where.size() - ending.size(), ending.size(), ending ) == 0 ) { where.erase( where.size() - ending.size() ); @@ -1339,17 +1338,17 @@ void cutEnding( wstring & where, wstring const & ending ) } } // namespace -quint32 dslLanguageToId( wstring const & name ) +quint32 dslLanguageToId( std::u32string const & name ) { - static wstring newSp( U"newspelling" ); - static wstring st( U"standard" ); - static wstring ms( U"modernsort" ); - static wstring ts( U"traditionalsort" ); - static wstring prc( U"prc" ); + static std::u32string newSp( U"newspelling" ); + static std::u32string st( U"standard" ); + static std::u32string ms( U"modernsort" ); + static std::u32string ts( U"traditionalsort" ); + static std::u32string prc( U"prc" ); // Any of those endings are to be removed - wstring nameStripped = Folding::apply( name ); + std::u32string nameStripped = Folding::apply( name ); cutEnding( nameStripped, newSp ); cutEnding( nameStripped, st ); diff --git a/src/dict/dsl_details.hh b/src/dict/dsl_details.hh index 8eaa7481..15159661 100644 --- a/src/dict/dsl_details.hh +++ b/src/dict/dsl_details.hh @@ -11,23 +11,21 @@ #include "iconv.hh" #include #include -#include "utf8.hh" +#include "text.hh" // Implementation details for Dsl, not part of its interface namespace Dsl { namespace Details { using std::string; -using gd::wstring; -using gd::wchar; using std::list; using std::vector; -using Utf8::Encoding; -using Utf8::LineFeed; +using Text::Encoding; +using Text::LineFeed; string findCodeForDslId( int id ); -bool isAtSignFirst( wstring const & str ); +bool isAtSignFirst( std::u32string const & str ); /// Parses the DSL language, representing it in its structural DOM form. struct ArticleDom @@ -37,23 +35,23 @@ struct ArticleDom bool isTag; // true if it is a tag with subnodes, false if it's a leaf text // data. // Those are only used if isTag is true - wstring tagName; - wstring tagAttrs; - wstring text; // This is only used if isTag is false + std::u32string tagName; + std::u32string tagAttrs; + std::u32string text; // This is only used if isTag is false class Text {}; class Tag {}; - Node( Tag, wstring const & name, wstring const & attrs ): + Node( Tag, std::u32string const & name, std::u32string const & attrs ): isTag( true ), tagName( name ), tagAttrs( attrs ) { } - Node( Text, wstring const & text_ ): + Node( Text, std::u32string const & text_ ): isTag( false ), text( text_ ) { @@ -61,30 +59,32 @@ struct ArticleDom /// Concatenates all childen text nodes recursively to form all text /// the node contains stripped of any markup. - wstring renderAsText( bool stripTrsTag = false ) const; + std::u32string renderAsText( bool stripTrsTag = false ) const; }; /// Does the parse at construction. Refer to the 'root' member variable /// afterwards. - explicit ArticleDom( wstring const &, string const & dictName = string(), wstring const & headword_ = wstring() ); + explicit ArticleDom( std::u32string const &, + string const & dictName = string(), + std::u32string const & headword_ = std::u32string() ); /// Root of DOM's tree Node root; private: - void openTag( wstring const & name, wstring const & attr, list< Node * > & stack ); + void openTag( std::u32string const & name, std::u32string const & attr, list< Node * > & stack ); - void closeTag( wstring const & name, list< Node * > & stack, bool warn = true ); + void closeTag( std::u32string const & name, list< Node * > & stack, bool warn = true ); bool atSignFirstInLine(); - wchar const *stringPos, *lineStartPos; + char32_t const *stringPos, *lineStartPos; class eot: std::exception {}; - wchar ch; + char32_t ch; bool escaped; unsigned transcriptionCount; // >0 = inside a [t] tag unsigned mediaCount; // >0 = inside a [s] tag @@ -93,7 +93,7 @@ private: /// Information for diagnostic purposes string dictionaryName; - wstring headword; + std::u32string headword; }; /// Opens the .dsl or .dsl.dz file and allows line-by-line reading. Auto-detects @@ -103,9 +103,9 @@ class DslScanner gzFile f; Encoding encoding; QTextCodec * codec; - wstring dictionaryName; - wstring langFrom, langTo; - wstring soundDictionary; + std::u32string dictionaryName; + std::u32string langFrom, langTo; + std::u32string soundDictionary; char readBuffer[ 65536 ]; char * readBufferPtr; LineFeed lineFeed; @@ -132,25 +132,25 @@ public: } /// Returns the dictionary's name, as was read from file's headers. - wstring const & getDictionaryName() const + std::u32string const & getDictionaryName() const { return dictionaryName; } /// Returns the dictionary's source language, as was read from file's headers. - wstring const & getLangFrom() const + std::u32string const & getLangFrom() const { return langFrom; } /// Returns the dictionary's target language, as was read from file's headers. - wstring const & getLangTo() const + std::u32string const & getLangTo() const { return langTo; } /// Returns the preferred external dictionary with sounds, as was read from file's headers. - wstring const & getSoundDictionaryName() const + std::u32string const & getSoundDictionaryName() const { return soundDictionary; } @@ -161,10 +161,10 @@ public: /// If end of file is reached, false is returned. /// Reading begins from the first line after the headers (ones which start /// with #). - bool readNextLine( wstring &, size_t & offset, bool only_head_word = false ); + bool readNextLine( std::u32string &, size_t & offset, bool only_head_word = false ); /// Similar readNextLine but strip all DSL comments {{...}} - bool readNextLineWithoutComments( wstring &, size_t & offset, bool only_headword = false ); + bool readNextLineWithoutComments( std::u32string &, size_t & offset, bool only_headword = false ); /// Returns the number of lines read so far from the file. unsigned getLinesRead() const @@ -180,32 +180,35 @@ public: /// This function either removes parts of string enclosed in braces, or leaves /// them intact. The braces themselves are removed always, though. -void processUnsortedParts( wstring & str, bool strip ); +void processUnsortedParts( std::u32string & str, bool strip ); /// Expands optional parts of a headword (ones marked with parentheses), /// producing all possible combinations where they are present or absent. -void expandOptionalParts( wstring & str, list< wstring > * result, size_t x = 0, bool inside_recurse = false ); +void expandOptionalParts( std::u32string & str, + list< std::u32string > * result, + size_t x = 0, + bool inside_recurse = false ); /// Expands all unescaped tildes, inserting tildeReplacement text instead of /// them. -void expandTildes( wstring & str, wstring const & tildeReplacement ); +void expandTildes( std::u32string & str, std::u32string const & tildeReplacement ); /// Unescapes any escaped chars. Be sure to handle all their special meanings /// before unescaping them. -void unescapeDsl( wstring & str ); +void unescapeDsl( std::u32string & str ); /// Normalizes the headword. Currently turns any sequences of consecutive spaces /// into a single space. -void normalizeHeadword( wstring & ); +void normalizeHeadword( std::u32string & ); /// Strip DSL {{...}} comments -void stripComments( wstring &, bool & ); +void stripComments( std::u32string &, bool & ); inline size_t DslScanner::distanceToBytes( size_t x ) const { switch ( encoding ) { - case Utf8::Utf16LE: - case Utf8::Utf16BE: + case Text::Utf16LE: + case Text::Utf16BE: return x * 2; default: return x; @@ -214,7 +217,7 @@ inline size_t DslScanner::distanceToBytes( size_t x ) const /// Converts the given language name taken from Dsl header (i.e. getLangFrom(), /// getLangTo()) to its proper language id. -quint32 dslLanguageToId( wstring const & name ); +quint32 dslLanguageToId( std::u32string const & name ); } // namespace Details } // namespace Dsl diff --git a/src/dict/epwing.cc b/src/dict/epwing.cc index c6f020b0..f70f8514 100644 --- a/src/dict/epwing.cc +++ b/src/dict/epwing.cc @@ -29,7 +29,7 @@ using std::multimap; using std::vector; using std::set; using std::pair; -using gd::wstring; +using std::u32string; namespace { @@ -109,10 +109,10 @@ public: QString const & getDescription() override; - void getHeadwordPos( wstring const & word_, QList< int > & pg, QList< int > & off ); + void getHeadwordPos( u32string const & word_, QList< int > & pg, QList< int > & off ); sptr< Dictionary::DataRequest > - getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; + getArticle( u32string const &, vector< u32string > const & alts, u32string const &, bool ignoreDiacritics ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override; @@ -134,16 +134,16 @@ public: && ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize ); } - static int japaneseWriting( gd::wchar ch ); + static int japaneseWriting( char32_t ch ); - static bool isSign( gd::wchar ch ); + static bool isSign( char32_t ch ); - static bool isJapanesePunctiation( gd::wchar ch ); + static bool isJapanesePunctiation( char32_t ch ); - sptr< Dictionary::WordSearchRequest > prefixMatch( wstring const &, unsigned long ) override; + sptr< Dictionary::WordSearchRequest > prefixMatch( u32string const &, unsigned long ) override; sptr< Dictionary::WordSearchRequest > - stemmedMatch( wstring const &, unsigned minLength, unsigned maxSuffixVariation, unsigned long maxResults ) override; + stemmedMatch( u32string const &, unsigned minLength, unsigned maxSuffixVariation, unsigned long maxResults ) override; protected: @@ -156,7 +156,7 @@ private: quint32 address, string & articleHeadword, string & articleText, int & articlePage, int & articleOffset ); - sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & word ) override; + sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( u32string const & word ) override; void loadArticleNextPage( string & articleHeadword, string & articleText, int & articlePage, int & articleOffset ); void @@ -449,7 +449,7 @@ void EpwingDictionary::getArticleText( uint32_t articleAddress, QString & headwo class EpwingHeadwordsRequest: public Dictionary::WordSearchRequest { - wstring str; + u32string str; EpwingDictionary & dict; QAtomicInt isCancelled; @@ -457,7 +457,7 @@ class EpwingHeadwordsRequest: public Dictionary::WordSearchRequest public: - EpwingHeadwordsRequest( wstring const & word_, EpwingDictionary & dict_ ): + EpwingHeadwordsRequest( u32string const & word_, EpwingDictionary & dict_ ): str( word_ ), dict( dict_ ) { @@ -533,7 +533,7 @@ void EpwingHeadwordsRequest::run() finish(); } -sptr< Dictionary::WordSearchRequest > EpwingDictionary::findHeadwordsForSynonym( wstring const & word ) +sptr< Dictionary::WordSearchRequest > EpwingDictionary::findHeadwordsForSynonym( u32string const & word ) { return synonymSearchEnabled ? std::make_shared< EpwingHeadwordsRequest >( word, *this ) : Class::findHeadwordsForSynonym( word ); @@ -542,8 +542,8 @@ sptr< Dictionary::WordSearchRequest > EpwingDictionary::findHeadwordsForSynonym( class EpwingArticleRequest: public Dictionary::DataRequest { - wstring word; - vector< wstring > alts; + u32string word; + vector< u32string > alts; EpwingDictionary & dict; bool ignoreDiacritics; @@ -552,8 +552,8 @@ class EpwingArticleRequest: public Dictionary::DataRequest public: - EpwingArticleRequest( wstring const & word_, - vector< wstring > const & alts_, + EpwingArticleRequest( u32string const & word_, + vector< u32string > const & alts_, EpwingDictionary & dict_, bool ignoreDiacritics_ ): word( word_ ), @@ -568,10 +568,10 @@ public: void run(); - void getBuiltInArticle( wstring const & word_, + void getBuiltInArticle( u32string const & word_, QList< int > & pages, QList< int > & offsets, - multimap< wstring, pair< string, string > > & mainArticles ); + multimap< u32string, pair< string, string > > & mainArticles ); void cancel() override { @@ -601,13 +601,13 @@ void EpwingArticleRequest::run() chain.insert( chain.end(), altChain.begin(), altChain.end() ); } - multimap< wstring, pair< string, string > > mainArticles, alternateArticles; + multimap< u32string, pair< string, string > > mainArticles, alternateArticles; set< quint32 > articlesIncluded; // Some synonims make it that the articles // appear several times. We combat this // by only allowing them to appear once. - wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); + u32string wordCaseFolded = Folding::applySimpleCaseOnly( word ); if ( ignoreDiacritics ) wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); @@ -641,11 +641,11 @@ void EpwingArticleRequest::run() // We do the case-folded comparison here. - wstring headwordStripped = Folding::applySimpleCaseOnly( headword ); + u32string headwordStripped = Folding::applySimpleCaseOnly( headword ); if ( ignoreDiacritics ) headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); - multimap< wstring, pair< string, string > > & mapToUse = + multimap< u32string, pair< string, string > > & mapToUse = ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) ); @@ -670,7 +670,7 @@ void EpwingArticleRequest::run() string result = "
"; - multimap< wstring, pair< string, string > >::const_iterator i; + multimap< u32string, pair< string, string > >::const_iterator i; for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { result += "

"; @@ -719,10 +719,10 @@ void EpwingArticleRequest::run() finish(); } -void EpwingArticleRequest::getBuiltInArticle( wstring const & word_, +void EpwingArticleRequest::getBuiltInArticle( u32string const & word_, QList< int > & pages, QList< int > & offsets, - multimap< wstring, pair< string, string > > & mainArticles ) + multimap< u32string, pair< string, string > > & mainArticles ) { try { string headword, articleText; @@ -756,7 +756,7 @@ void EpwingArticleRequest::getBuiltInArticle( wstring const & word_, } } -void EpwingDictionary::getHeadwordPos( wstring const & word_, QList< int > & pg, QList< int > & off ) +void EpwingDictionary::getHeadwordPos( u32string const & word_, QList< int > & pg, QList< int > & off ) { try { QMutexLocker _( &eBook.getLibMutex() ); @@ -767,9 +767,9 @@ void EpwingDictionary::getHeadwordPos( wstring const & word_, QList< int > & pg, } } -sptr< Dictionary::DataRequest > EpwingDictionary::getArticle( wstring const & word, - vector< wstring > const & alts, - wstring const &, +sptr< Dictionary::DataRequest > EpwingDictionary::getArticle( u32string const & word, + vector< u32string > const & alts, + u32string const &, bool ignoreDiacritics ) { @@ -882,7 +882,7 @@ sptr< Dictionary::DataRequest > EpwingDictionary::getSearchResults( QString cons ignoreDiacritics ); } -int EpwingDictionary::japaneseWriting( gd::wchar ch ) +int EpwingDictionary::japaneseWriting( char32_t ch ) { if ( ( ch >= 0x30A0 && ch <= 0x30FF ) || ( ch >= 0x31F0 && ch <= 0x31FF ) || ( ch >= 0x3200 && ch <= 0x32FF ) || ( ch >= 0xFF00 && ch <= 0xFFEF ) || ( ch == 0x1B000 ) ) @@ -895,7 +895,7 @@ int EpwingDictionary::japaneseWriting( gd::wchar ch ) return 0; } -bool EpwingDictionary::isSign( gd::wchar ch ) +bool EpwingDictionary::isSign( char32_t ch ) { switch ( ch ) { case 0x002B: // PLUS SIGN @@ -915,7 +915,7 @@ bool EpwingDictionary::isSign( gd::wchar ch ) } } -bool EpwingDictionary::isJapanesePunctiation( gd::wchar ch ) +bool EpwingDictionary::isJapanesePunctiation( char32_t ch ) { return ch >= 0x3000 && ch <= 0x303F; } @@ -929,7 +929,7 @@ class EpwingWordSearchRequest: public BtreeIndexing::BtreeWordSearchRequest public: EpwingWordSearchRequest( EpwingDictionary & dict_, - wstring const & str_, + u32string const & str_, unsigned minLength_, int maxSuffixVariation_, bool allowMiddleMatches_, @@ -976,13 +976,13 @@ void EpwingWordSearchRequest::findMatches() finish(); } -sptr< Dictionary::WordSearchRequest > EpwingDictionary::prefixMatch( wstring const & str, unsigned long maxResults ) +sptr< Dictionary::WordSearchRequest > EpwingDictionary::prefixMatch( u32string const & str, unsigned long maxResults ) { return std::make_shared< EpwingWordSearchRequest >( *this, str, 0, -1, true, maxResults ); } -sptr< Dictionary::WordSearchRequest > EpwingDictionary::stemmedMatch( wstring const & str, +sptr< Dictionary::WordSearchRequest > EpwingDictionary::stemmedMatch( u32string const & str, unsigned minLength, unsigned maxSuffixVariation, unsigned long maxResults ) @@ -1021,20 +1021,20 @@ void addWordToChunks( Epwing::Book::EpwingHeadword & head, chunks.addToBlock( &head.page, sizeof( head.page ) ); chunks.addToBlock( &head.offset, sizeof( head.offset ) ); - wstring hw = head.headword.toStdU32String(); + u32string hw = head.headword.toStdU32String(); indexedWords.addWord( hw, offset ); wordCount++; articleCount++; - vector< wstring > words; + vector< u32string > words; // Parse combined kanji/katakana/hiragana headwords int w_prev = 0; - wstring word; - for ( wstring::size_type n = 0; n < hw.size(); n++ ) { - gd::wchar ch = hw[ n ]; + u32string word; + for ( u32string::size_type n = 0; n < hw.size(); n++ ) { + char32_t ch = hw[ n ]; if ( Folding::isPunct( ch ) || Folding::isWhitespace( ch ) || EpwingDictionary::isSign( ch ) || EpwingDictionary::isJapanesePunctiation( ch ) ) @@ -1044,7 +1044,7 @@ void addWordToChunks( Epwing::Book::EpwingHeadword & head, if ( w > 0 ) { // Store only separated words - gd::wchar ch_prev = 0; + char32_t ch_prev = 0; if ( n ) ch_prev = hw[ n - 1 ]; bool needStore = ( n == 0 || Folding::isPunct( ch_prev ) || Folding::isWhitespace( ch_prev ) @@ -1052,7 +1052,7 @@ void addWordToChunks( Epwing::Book::EpwingHeadword & head, word.push_back( ch ); w_prev = w; - wstring::size_type i; + u32string::size_type i; for ( i = n + 1; i < hw.size(); i++ ) { ch = hw[ i ]; if ( Folding::isPunct( ch ) || Folding::isWhitespace( ch ) || EpwingDictionary::isJapanesePunctiation( ch ) ) diff --git a/src/dict/epwing_book.cc b/src/dict/epwing_book.cc index 159c42b7..87a10ffc 100644 --- a/src/dict/epwing_book.cc +++ b/src/dict/epwing_book.cc @@ -10,8 +10,7 @@ #include #include #include "audiolink.hh" - #include "wstring.hh" - #include "wstring_qt.hh" + #include "text.hh" #include "folding.hh" #include "epwing_charmap.hh" #include "htmlescape.hh" @@ -1135,7 +1134,7 @@ void EpwingBook::fixHeadword( QString & headword ) // return; //} - gd::wstring folded = Folding::applyPunctOnly( fixed.toStdU32String() ); + std::u32string folded = Folding::applyPunctOnly( fixed.toStdU32String() ); //fixed = QString::fromStdU32String( folded ); //if( isHeadwordCorrect( fixed ) ) @@ -1993,4 +1992,4 @@ QMutex EpwingBook::libMutex; } // namespace Epwing -#endif \ No newline at end of file +#endif diff --git a/src/dict/forvo.cc b/src/dict/forvo.cc index af5ee64d..6caa752e 100644 --- a/src/dict/forvo.cc +++ b/src/dict/forvo.cc @@ -2,14 +2,13 @@ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ #include "forvo.hh" -#include "wstring_qt.hh" #include #include #include #include #include "audiolink.hh" #include "htmlescape.hh" -#include "utf8.hh" +#include "text.hh" namespace Forvo { @@ -48,7 +47,7 @@ public: return 0; } - sptr< WordSearchRequest > prefixMatch( wstring const & /*word*/, unsigned long /*maxResults*/ ) override + sptr< WordSearchRequest > prefixMatch( std::u32string const & /*word*/, unsigned long /*maxResults*/ ) override { sptr< WordSearchRequestInstant > sr = std::make_shared< WordSearchRequestInstant >(); @@ -57,7 +56,8 @@ public: return sr; } - sptr< DataRequest > getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ) override; + sptr< DataRequest > + getArticle( std::u32string const &, vector< std::u32string > const & alts, std::u32string const &, bool ) override; protected: @@ -89,8 +89,8 @@ class ForvoArticleRequest: public Dictionary::DataRequest public: - ForvoArticleRequest( wstring const & word, - vector< wstring > const & alts, + ForvoArticleRequest( std::u32string const & word, + vector< std::u32string > const & alts, QString const & apiKey_, QString const & languageCode_, string const & dictionaryId_, @@ -100,14 +100,16 @@ public: private: - void addQuery( QNetworkAccessManager & mgr, wstring const & word ); + void addQuery( QNetworkAccessManager & mgr, std::u32string const & word ); private slots: virtual void requestFinished( QNetworkReply * ); }; -sptr< DataRequest > -ForvoDictionary::getArticle( wstring const & word, vector< wstring > const & alts, wstring const &, bool ) +sptr< DataRequest > ForvoDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, + bool ) { if ( word.size() > 80 || apiKey.isEmpty() ) { @@ -137,8 +139,8 @@ void ForvoArticleRequest::cancel() finish(); } -ForvoArticleRequest::ForvoArticleRequest( wstring const & str, - vector< wstring > const & alts, +ForvoArticleRequest::ForvoArticleRequest( std::u32string const & str, + vector< std::u32string > const & alts, QString const & apiKey_, QString const & languageCode_, string const & dictionaryId_, @@ -156,7 +158,7 @@ ForvoArticleRequest::ForvoArticleRequest( wstring const & str, } } -void ForvoArticleRequest::addQuery( QNetworkAccessManager & mgr, wstring const & str ) +void ForvoArticleRequest::addQuery( QNetworkAccessManager & mgr, std::u32string const & str ) { qDebug( "Forvo: requesting article %s", QString::fromStdU32String( str ).toUtf8().data() ); @@ -177,7 +179,7 @@ void ForvoArticleRequest::addQuery( QNetworkAccessManager & mgr, wstring const & sptr< QNetworkReply > netReply = std::shared_ptr< QNetworkReply >( mgr.get( QNetworkRequest( reqUrl ) ) ); - netReplies.push_back( NetReply( netReply, Utf8::encode( str ) ) ); + netReplies.push_back( NetReply( netReply, Text::toUtf8( str ) ) ); } void ForvoArticleRequest::requestFinished( QNetworkReply * r ) diff --git a/src/dict/gls.cc b/src/dict/gls.cc index 88516566..299ec99f 100644 --- a/src/dict/gls.cc +++ b/src/dict/gls.cc @@ -8,8 +8,7 @@ #include "ufile.hh" #include "btreeidx.hh" #include "folding.hh" -#include "utf8.hh" -#include "wstring_qt.hh" +#include "text.hh" #include "chunkedstorage.hh" #include "langcoder.hh" #include "dictzip.hh" @@ -39,14 +38,12 @@ using std::set; using std::multimap; using std::pair; -using gd::wstring; -using gd::wchar; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; using BtreeIndexing::IndexInfo; -using Utf8::Encoding; -using Utf8::LineFeed; +using Text::Encoding; +using Text::LineFeed; /////////////// GlsScanner @@ -55,9 +52,9 @@ class GlsScanner gzFile f; Encoding encoding; QTextCodec * codec; - wstring dictionaryName; - wstring dictionaryDecription, dictionaryAuthor; - wstring langFrom, langTo; + std::u32string dictionaryName; + std::u32string dictionaryDecription, dictionaryAuthor; + std::u32string langFrom, langTo; char readBuffer[ 10000 ]; char * readBufferPtr; size_t readBufferLeft; @@ -82,31 +79,31 @@ public: } /// Returns the dictionary's name, as was read from file's headers. - wstring const & getDictionaryName() const + std::u32string const & getDictionaryName() const { return dictionaryName; } /// Returns the dictionary's author, as was read from file's headers. - wstring const & getDictionaryAuthor() const + std::u32string const & getDictionaryAuthor() const { return dictionaryAuthor; } /// Returns the dictionary's description, as was read from file's headers. - wstring const & getDictionaryDescription() const + std::u32string const & getDictionaryDescription() const { return dictionaryDecription; } /// Returns the dictionary's source language, as was read from file's headers. - wstring const & getLangFrom() const + std::u32string const & getLangFrom() const { return langFrom; } /// Returns the dictionary's target language, as was read from file's headers. - wstring const & getLangTo() const + std::u32string const & getLangTo() const { return langTo; } @@ -117,7 +114,7 @@ public: /// If end of file is reached, false is returned. /// Reading begins from the first line after the headers (ones which end /// by the "### Glossary section:" line). - bool readNextLine( wstring &, size_t & offset ); + bool readNextLine( std::u32string &, size_t & offset ); /// Returns the number of lines read so far from the file. unsigned getLinesRead() const { @@ -126,7 +123,7 @@ public: }; GlsScanner::GlsScanner( string const & fileName ): - encoding( Utf8::Utf8 ), + encoding( Text::Utf8 ), readBufferPtr( readBuffer ), readBufferLeft( 0 ), linesRead( 0 ) @@ -152,10 +149,10 @@ GlsScanner::GlsScanner( string const & fileName ): // If the file begins with the dedicated Unicode marker, we just consume // it. If, on the other hand, it's not, we return the bytes back if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE ) { - encoding = Utf8::Utf16LE; + encoding = Text::Utf16LE; } else if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF ) { - encoding = Utf8::Utf16BE; + encoding = Text::Utf16BE; } else if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB ) { // Looks like Utf8, read one more byte @@ -164,29 +161,29 @@ GlsScanner::GlsScanner( string const & fileName ): gzclose( f ); throw exMalformedGlsFile( fileName ); } - encoding = Utf8::Utf8; + encoding = Text::Utf8; } else { if ( gzrewind( f ) ) { gzclose( f ); throw exCantOpen( fileName ); } - encoding = Utf8::Utf8; + encoding = Text::Utf8; } - codec = QTextCodec::codecForName( Utf8::getEncodingNameFor( encoding ) ); + codec = QTextCodec::codecForName( Text::getEncodingNameFor( encoding ) ); // We now can use our own readNextLine() function - lineFeed = Utf8::initLineFeed( encoding ); + lineFeed = Text::initLineFeed( encoding ); - wstring str; - wstring * currentField = 0; - wstring mark = U"###"; - wstring titleMark = U"### Glossary title:"; - wstring authorMark = U"### Author:"; - wstring descriptionMark = U"### Description:"; - wstring langFromMark = U"### Source language:"; - wstring langToMark = U"### Target language:"; - wstring endOfHeaderMark = U"### Glossary section:"; + std::u32string str; + std::u32string * currentField = 0; + std::u32string mark = U"###"; + std::u32string titleMark = U"### Glossary title:"; + std::u32string authorMark = U"### Author:"; + std::u32string descriptionMark = U"### Description:"; + std::u32string langFromMark = U"### Source language:"; + std::u32string langToMark = U"### Target language:"; + std::u32string endOfHeaderMark = U"### Glossary section:"; size_t offset; for ( ;; ) { @@ -199,22 +196,22 @@ GlsScanner::GlsScanner( string const & fileName ): currentField = 0; if ( str.compare( 0, titleMark.size(), titleMark ) == 0 ) { - dictionaryName = wstring( str, titleMark.size(), str.size() - titleMark.size() ); + dictionaryName = std::u32string( str, titleMark.size(), str.size() - titleMark.size() ); currentField = &dictionaryName; } else if ( str.compare( 0, authorMark.size(), authorMark ) == 0 ) { - dictionaryAuthor = wstring( str, authorMark.size(), str.size() - authorMark.size() ); + dictionaryAuthor = std::u32string( str, authorMark.size(), str.size() - authorMark.size() ); currentField = &dictionaryAuthor; } else if ( str.compare( 0, descriptionMark.size(), descriptionMark ) == 0 ) { - dictionaryDecription = wstring( str, descriptionMark.size(), str.size() - descriptionMark.size() ); + dictionaryDecription = std::u32string( str, descriptionMark.size(), str.size() - descriptionMark.size() ); currentField = &dictionaryDecription; } else if ( str.compare( 0, langFromMark.size(), langFromMark ) == 0 ) { - langFrom = wstring( str, langFromMark.size(), str.size() - langFromMark.size() ); + langFrom = std::u32string( str, langFromMark.size(), str.size() - langFromMark.size() ); } else if ( str.compare( 0, langToMark.size(), langToMark ) == 0 ) { - langTo = wstring( str, langToMark.size(), str.size() - langToMark.size() ); + langTo = std::u32string( str, langToMark.size(), str.size() - langToMark.size() ); } else if ( str.compare( 0, endOfHeaderMark.size(), endOfHeaderMark ) == 0 ) { break; @@ -229,7 +226,7 @@ GlsScanner::GlsScanner( string const & fileName ): } } -bool GlsScanner::readNextLine( wstring & out, size_t & offset ) +bool GlsScanner::readNextLine( std::u32string & out, size_t & offset ) { offset = (size_t)( gztell( f ) - readBufferLeft ); @@ -256,7 +253,7 @@ bool GlsScanner::readNextLine( wstring & out, size_t & offset ) return false; } - int pos = Utf8::findFirstLinePosition( readBufferPtr, readBufferLeft, lineFeed.lineFeed, lineFeed.length ); + int pos = Text::findFirstLinePosition( readBufferPtr, readBufferLeft, lineFeed.lineFeed, lineFeed.length ); if ( pos == -1 ) { return false; } @@ -369,10 +366,12 @@ public: return idxHeader.langTo; } - sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & ) override; + sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( std::u32string const & ) override; - sptr< Dictionary::DataRequest > - getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; + sptr< Dictionary::DataRequest > getArticle( std::u32string const &, + vector< std::u32string > const & alts, + std::u32string const &, + bool ignoreDiacritics ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override; @@ -503,11 +502,11 @@ QString const & GlsDictionary::getDescription() try { GlsScanner scanner( getDictionaryFilenames()[ 0 ] ); - string str = Utf8::encode( scanner.getDictionaryAuthor() ); + string str = Text::toUtf8( scanner.getDictionaryAuthor() ); if ( !str.empty() ) { dictionaryDescription = QObject::tr( "Author: %1%2" ).arg( QString::fromUtf8( str.c_str() ) ).arg( "\n\n" ); } - str = Utf8::encode( scanner.getDictionaryDescription() ); + str = Text::toUtf8( scanner.getDictionaryDescription() ); if ( !str.empty() ) { QString desc = QString::fromUtf8( str.c_str() ); desc.replace( "\t", "
" ); @@ -592,7 +591,7 @@ void GlsDictionary::loadArticleText( uint32_t address, vector< string > & headwo } else { string articleData = - Iconv::toUtf8( Utf8::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize ); + Iconv::toUtf8( Text::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize ); string::size_type start_pos = 0, end_pos = 0; for ( ;; ) { @@ -621,7 +620,7 @@ void GlsDictionary::loadArticleText( uint32_t address, vector< string > & headwo end_pos = 0; for ( ;; ) { end_pos = headword.find( '|', start_pos ); - if ( end_pos == wstring::npos ) { + if ( end_pos == std::u32string::npos ) { string hw = headword.substr( start_pos ); if ( !hw.empty() ) { headwords.push_back( hw ); @@ -804,7 +803,7 @@ void GlsDictionary::getArticleText( uint32_t articleAddress, QString & headword, class GlsHeadwordsRequest: public Dictionary::WordSearchRequest { - wstring word; + std::u32string word; GlsDictionary & dict; QAtomicInt isCancelled; @@ -812,7 +811,7 @@ class GlsHeadwordsRequest: public Dictionary::WordSearchRequest public: - GlsHeadwordsRequest( wstring const & word_, GlsDictionary & dict_ ): + GlsHeadwordsRequest( std::u32string const & word_, GlsDictionary & dict_ ): word( word_ ), dict( dict_ ) { @@ -845,7 +844,7 @@ void GlsHeadwordsRequest::run() try { vector< WordArticleLink > chain = dict.findArticles( word ); - wstring caseFolded = Folding::applySimpleCaseOnly( word ); + std::u32string caseFolded = Folding::applySimpleCaseOnly( word ); for ( auto & x : chain ) { if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) { @@ -858,7 +857,7 @@ void GlsHeadwordsRequest::run() dict.loadArticleText( x.articleOffset, headwords, articleText ); - wstring headwordDecoded = Utf8::decode( headwords.front() ); + std::u32string headwordDecoded = Text::toUtf32( headwords.front() ); if ( caseFolded != Folding::applySimpleCaseOnly( headwordDecoded ) ) { // The headword seems to differ from the input word, which makes the @@ -876,7 +875,7 @@ void GlsHeadwordsRequest::run() finish(); } -sptr< Dictionary::WordSearchRequest > GlsDictionary::findHeadwordsForSynonym( wstring const & word ) +sptr< Dictionary::WordSearchRequest > GlsDictionary::findHeadwordsForSynonym( std::u32string const & word ) { return synonymSearchEnabled ? std::make_shared< GlsHeadwordsRequest >( word, *this ) : @@ -889,8 +888,8 @@ sptr< Dictionary::WordSearchRequest > GlsDictionary::findHeadwordsForSynonym( ws class GlsArticleRequest: public Dictionary::DataRequest { - wstring word; - vector< wstring > alts; + std::u32string word; + vector< std::u32string > alts; GlsDictionary & dict; bool ignoreDiacritics; @@ -899,8 +898,8 @@ class GlsArticleRequest: public Dictionary::DataRequest public: - GlsArticleRequest( wstring const & word_, - vector< wstring > const & alts_, + GlsArticleRequest( std::u32string const & word_, + vector< std::u32string > const & alts_, GlsDictionary & dict_, bool ignoreDiacritics_ ): word( word_ ), @@ -944,13 +943,13 @@ void GlsArticleRequest::run() chain.insert( chain.end(), altChain.begin(), altChain.end() ); } - multimap< wstring, pair< string, string > > mainArticles, alternateArticles; + multimap< std::u32string, pair< string, string > > mainArticles, alternateArticles; set< uint32_t > articlesIncluded; // Some synonims make it that the articles // appear several times. We combat this // by only allowing them to appear once. - wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); + std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word ); if ( ignoreDiacritics ) { wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); } @@ -976,16 +975,16 @@ void GlsArticleRequest::run() // We do the case-folded comparison here. - wstring headwordStripped = Folding::applySimpleCaseOnly( Utf8::decode( headword ) ); + std::u32string headwordStripped = Folding::applySimpleCaseOnly( Text::toUtf32( headword ) ); if ( ignoreDiacritics ) { headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); } - multimap< wstring, pair< string, string > > & mapToUse = + multimap< std::u32string, pair< string, string > > & mapToUse = ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; mapToUse.insert( - pair( Folding::applySimpleCaseOnly( Utf8::decode( headword ) ), pair( headword, articleText ) ) ); + pair( Folding::applySimpleCaseOnly( Text::toUtf32( headword ) ), pair( headword, articleText ) ) ); articlesIncluded.insert( x.articleOffset ); } @@ -998,7 +997,7 @@ void GlsArticleRequest::run() string result; - multimap< wstring, pair< string, string > >::const_iterator i; + multimap< std::u32string, pair< string, string > >::const_iterator i; for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { result += i->second.second; @@ -1019,9 +1018,9 @@ void GlsArticleRequest::run() finish(); } -sptr< Dictionary::DataRequest > GlsDictionary::getArticle( wstring const & word, - vector< wstring > const & alts, - wstring const &, +sptr< Dictionary::DataRequest > GlsDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, bool ignoreDiacritics ) { @@ -1097,7 +1096,7 @@ void GlsResourceRequest::run() if ( dict.resourceZip.isOpen() ) { QMutexLocker _( &dataMutex ); - if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) ) { + if ( !dict.resourceZip.loadFile( Text::toUtf32( resourceName ), data ) ) { throw; // Make it fail since we couldn't read the archive } } @@ -1239,7 +1238,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f // which the incident happened. We need alive scanner for that. // Building the index - initializing.indexingDictionary( Utf8::encode( scanner.getDictionaryName() ) ); + initializing.indexingDictionary( Text::toUtf8( scanner.getDictionaryName() ) ); qDebug( "Gls: Building the index for dictionary: %s", QString::fromStdU32String( scanner.getDictionaryName() ).toUtf8().data() ); @@ -1255,7 +1254,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f idx.write( idxHeader ); - string dictionaryName = Utf8::encode( scanner.getDictionaryName() ); + string dictionaryName = Text::toUtf8( scanner.getDictionaryName() ); idx.write( (uint32_t)dictionaryName.size() ); idx.write( dictionaryName.data(), dictionaryName.size() ); @@ -1266,7 +1265,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f ChunkedStorage::Writer chunks( idx ); - wstring curString; + std::u32string curString; size_t curOffset; uint32_t articleCount = 0, wordCount = 0; @@ -1286,12 +1285,12 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f // Parse headwords - list< wstring > allEntryWords; - wstring::size_type start_pos = 0, end_pos = 0; + list< std::u32string > allEntryWords; + std::u32string::size_type start_pos = 0, end_pos = 0; for ( ;; ) { end_pos = curString.find( '|', start_pos ); - if ( end_pos == wstring::npos ) { - wstring headword = curString.substr( start_pos ); + if ( end_pos == std::u32string::npos ) { + std::u32string headword = curString.substr( start_pos ); if ( !headword.empty() ) { allEntryWords.push_back( headword ); } diff --git a/src/dict/hunspell.cc b/src/dict/hunspell.cc index 8478eff1..dfb93dcb 100644 --- a/src/dict/hunspell.cc +++ b/src/dict/hunspell.cc @@ -2,7 +2,7 @@ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ #include "hunspell.hh" -#include "utf8.hh" +#include "text.hh" #include "htmlescape.hh" #include "iconv.hh" #include "folding.hh" @@ -21,7 +21,6 @@ namespace HunspellMorpho { using namespace Dictionary; -using gd::wchar; namespace { @@ -60,18 +59,19 @@ public: return 0; } - sptr< WordSearchRequest > prefixMatch( wstring const &, unsigned long maxResults ) override; + sptr< WordSearchRequest > prefixMatch( std::u32string const &, unsigned long maxResults ) override; - sptr< WordSearchRequest > findHeadwordsForSynonym( wstring const & ) override; + sptr< WordSearchRequest > findHeadwordsForSynonym( std::u32string const & ) override; - sptr< DataRequest > getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ) override; + sptr< DataRequest > + getArticle( std::u32string const &, vector< std::u32string > const & alts, std::u32string const &, bool ) override; bool isLocalDictionary() override { return true; } - vector< wstring > getAlternateWritings( const wstring & word ) noexcept override; + vector< std::u32string > getAlternateWritings( const std::u32string & word ) noexcept override; protected: @@ -94,25 +94,25 @@ private: /// Encodes the given string to be passed to the hunspell object. May throw /// Iconv::Ex -string encodeToHunspell( Hunspell &, wstring const & ); +string encodeToHunspell( Hunspell &, std::u32string const & ); /// Decodes the given string returned by the hunspell object. May throw /// Iconv::Ex -wstring decodeFromHunspell( Hunspell &, char const * ); +std::u32string decodeFromHunspell( Hunspell &, char const * ); /// Generates suggestions via hunspell -QList< wstring > suggest( wstring & word, QMutex & hunspellMutex, Hunspell & hunspell ); +QList< std::u32string > suggest( std::u32string & word, QMutex & hunspellMutex, Hunspell & hunspell ); /// Generates suggestions for compound expression -void getSuggestionsForExpression( wstring const & expression, - vector< wstring > & suggestions, +void getSuggestionsForExpression( std::u32string const & expression, + vector< std::u32string > & suggestions, QMutex & hunspellMutex, Hunspell & hunspell ); /// Returns true if the string contains whitespace, false otherwise -bool containsWhitespace( wstring const & str ) +bool containsWhitespace( std::u32string const & str ) { - wchar const * next = str.c_str(); + char32_t const * next = str.c_str(); for ( ; *next; ++next ) { if ( Folding::isWhitespace( *next ) ) { @@ -142,9 +142,9 @@ void HunspellDictionary::loadIcon() noexcept dictionaryIconLoaded = true; } -vector< wstring > HunspellDictionary::getAlternateWritings( wstring const & word ) noexcept +vector< std::u32string > HunspellDictionary::getAlternateWritings( std::u32string const & word ) noexcept { - vector< wstring > results; + vector< std::u32string > results; if ( containsWhitespace( word ) ) { getSuggestionsForExpression( word, results, getHunspellMutex(), hunspell ); @@ -160,14 +160,14 @@ class HunspellArticleRequest: public Dictionary::DataRequest QMutex & hunspellMutex; Hunspell & hunspell; - wstring word; + std::u32string word; QAtomicInt isCancelled; QFuture< void > f; public: - HunspellArticleRequest( wstring const & word_, QMutex & hunspellMutex_, Hunspell & hunspell_ ): + HunspellArticleRequest( std::u32string const & word_, QMutex & hunspellMutex_, Hunspell & hunspell_ ): hunspellMutex( hunspellMutex_ ), hunspell( hunspell_ ), word( word_ ) @@ -201,7 +201,7 @@ void HunspellArticleRequest::run() vector< string > suggestions; try { - wstring trimmedWord = Folding::trimWhitespaceOrPunct( word ); + std::u32string trimmedWord = Folding::trimWhitespaceOrPunct( word ); if ( containsWhitespace( trimmedWord ) ) { // For now we don't analyze whitespace-containing phrases @@ -226,10 +226,10 @@ void HunspellArticleRequest::run() string result = "
" + Html::escape( QCoreApplication::translate( "Hunspell", "Spelling suggestions: " ).toUtf8().data() ); - wstring lowercasedWord = Folding::applySimpleCaseOnly( word ); + std::u32string lowercasedWord = Folding::applySimpleCaseOnly( word ); for ( vector< string >::size_type x = 0; x < suggestions.size(); ++x ) { - wstring suggestion = decodeFromHunspell( hunspell, suggestions[ x ].c_str() ); + std::u32string suggestion = decodeFromHunspell( hunspell, suggestions[ x ].c_str() ); if ( Folding::applySimpleCaseOnly( suggestion ) == lowercasedWord ) { // If among suggestions we see the same word just with the different @@ -240,7 +240,7 @@ void HunspellArticleRequest::run() return; } - string suggestionUtf8 = Utf8::encode( suggestion ); + string suggestionUtf8 = Text::toUtf8( suggestion ); result += ""; @@ -268,8 +268,10 @@ void HunspellArticleRequest::run() finish(); } -sptr< DataRequest > -HunspellDictionary::getArticle( wstring const & word, vector< wstring > const &, wstring const &, bool ) +sptr< DataRequest > HunspellDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const &, + std::u32string const &, + bool ) { return std::make_shared< HunspellArticleRequest >( word, getHunspellMutex(), hunspell ); @@ -282,7 +284,7 @@ class HunspellHeadwordsRequest: public Dictionary::WordSearchRequest QMutex & hunspellMutex; Hunspell & hunspell; - wstring word; + std::u32string word; QAtomicInt isCancelled; QFuture< void > f; @@ -290,7 +292,7 @@ class HunspellHeadwordsRequest: public Dictionary::WordSearchRequest public: - HunspellHeadwordsRequest( wstring const & word_, QMutex & hunspellMutex_, Hunspell & hunspell_ ): + HunspellHeadwordsRequest( std::u32string const & word_, QMutex & hunspellMutex_, Hunspell & hunspell_ ): hunspellMutex( hunspellMutex_ ), hunspell( hunspell_ ), word( word_ ) @@ -322,7 +324,7 @@ void HunspellHeadwordsRequest::run() return; } - wstring trimmedWord = Folding::trimWhitespaceOrPunct( word ); + std::u32string trimmedWord = Folding::trimWhitespaceOrPunct( word ); if ( trimmedWord.size() > 80 ) { // We won't do anything for overly long sentences since that would probably @@ -332,7 +334,7 @@ void HunspellHeadwordsRequest::run() } if ( containsWhitespace( trimmedWord ) ) { - vector< wstring > results; + vector< std::u32string > results; getSuggestionsForExpression( trimmedWord, results, hunspellMutex, hunspell ); @@ -342,7 +344,7 @@ void HunspellHeadwordsRequest::run() } } else { - QList< wstring > suggestions = suggest( trimmedWord, hunspellMutex, hunspell ); + QList< std::u32string > suggestions = suggest( trimmedWord, hunspellMutex, hunspell ); if ( !suggestions.empty() ) { QMutexLocker _( &dataMutex ); @@ -356,9 +358,9 @@ void HunspellHeadwordsRequest::run() finish(); } -QList< wstring > suggest( wstring & word, QMutex & hunspellMutex, Hunspell & hunspell ) +QList< std::u32string > suggest( std::u32string & word, QMutex & hunspellMutex, Hunspell & hunspell ) { - QList< wstring > result; + QList< std::u32string > result; vector< string > suggestions; @@ -371,7 +373,7 @@ QList< wstring > suggest( wstring & word, QMutex & hunspellMutex, Hunspell & hun if ( !suggestions.empty() ) { // There were some suggestions made for us. Make an appropriate output. - wstring lowercasedWord = Folding::applySimpleCaseOnly( word ); + std::u32string lowercasedWord = Folding::applySimpleCaseOnly( word ); static QRegularExpression cutStem( R"(^\s*st:(((\s+(?!\w{2}:)(?!-)(?!\+))|\S+)+))" ); @@ -388,7 +390,7 @@ QList< wstring > suggest( wstring & word, QMutex & hunspellMutex, Hunspell & hun auto match = cutStem.match( suggestion.trimmed() ); if ( match.hasMatch() ) { - wstring alt = match.captured( 1 ).toStdU32String(); + std::u32string alt = match.captured( 1 ).toStdU32String(); if ( Folding::applySimpleCaseOnly( alt ) != lowercasedWord ) // No point in providing same word { @@ -406,7 +408,7 @@ QList< wstring > suggest( wstring & word, QMutex & hunspellMutex, Hunspell & hun } -sptr< WordSearchRequest > HunspellDictionary::findHeadwordsForSynonym( wstring const & word ) +sptr< WordSearchRequest > HunspellDictionary::findHeadwordsForSynonym( std::u32string const & word ) { return std::make_shared< HunspellHeadwordsRequest >( word, getHunspellMutex(), hunspell ); @@ -420,14 +422,14 @@ class HunspellPrefixMatchRequest: public Dictionary::WordSearchRequest QMutex & hunspellMutex; Hunspell & hunspell; - wstring word; + std::u32string word; QAtomicInt isCancelled; QFuture< void > f; public: - HunspellPrefixMatchRequest( wstring const & word_, QMutex & hunspellMutex_, Hunspell & hunspell_ ): + HunspellPrefixMatchRequest( std::u32string const & word_, QMutex & hunspellMutex_, Hunspell & hunspell_ ): hunspellMutex( hunspellMutex_ ), hunspell( hunspell_ ), word( word_ ) @@ -460,7 +462,7 @@ void HunspellPrefixMatchRequest::run() } try { - wstring trimmedWord = Folding::trimWhitespaceOrPunct( word ); + std::u32string trimmedWord = Folding::trimWhitespaceOrPunct( word ); if ( trimmedWord.empty() || containsWhitespace( trimmedWord ) ) { // For now we don't analyze whitespace-containing phrases @@ -487,14 +489,14 @@ void HunspellPrefixMatchRequest::run() finish(); } -sptr< WordSearchRequest > HunspellDictionary::prefixMatch( wstring const & word, unsigned long /*maxResults*/ ) +sptr< WordSearchRequest > HunspellDictionary::prefixMatch( std::u32string const & word, unsigned long /*maxResults*/ ) { return std::make_shared< HunspellPrefixMatchRequest >( word, getHunspellMutex(), hunspell ); } -void getSuggestionsForExpression( wstring const & expression, - vector< wstring > & suggestions, +void getSuggestionsForExpression( std::u32string const & expression, + vector< std::u32string > & suggestions, QMutex & hunspellMutex, Hunspell & hunspell ) { @@ -502,15 +504,15 @@ void getSuggestionsForExpression( wstring const & expression, // This is useful for compound expressions where some words is // in different form, e.g. "dozing off" -> "doze off". - wstring trimmedWord = Folding::trimWhitespaceOrPunct( expression ); - wstring word, punct; - QList< wstring > words; + std::u32string trimmedWord = Folding::trimWhitespaceOrPunct( expression ); + std::u32string word, punct; + QList< std::u32string > words; suggestions.clear(); // Parse string to separate words - for ( wchar const * c = trimmedWord.c_str();; ++c ) { + for ( char32_t const * c = trimmedWord.c_str();; ++c ) { if ( !*c || Folding::isPunct( *c ) || Folding::isWhitespace( *c ) ) { if ( word.size() ) { words.push_back( word ); @@ -541,7 +543,7 @@ void getSuggestionsForExpression( wstring const & expression, // Combine result strings from suggestions - QList< wstring > results; + QList< std::u32string > results; for ( const auto & i : words ) { word = i; @@ -551,13 +553,13 @@ void getSuggestionsForExpression( wstring const & expression, } } else { - QList< wstring > sugg = suggest( word, hunspellMutex, hunspell ); + QList< std::u32string > sugg = suggest( word, hunspellMutex, hunspell ); int suggNum = sugg.size() + 1; if ( suggNum > 3 ) { suggNum = 3; } int resNum = results.size(); - wstring resultStr; + std::u32string resultStr; if ( resNum == 0 ) { for ( int k = 0; k < suggNum; k++ ) { @@ -587,12 +589,12 @@ void getSuggestionsForExpression( wstring const & expression, } } -string encodeToHunspell( Hunspell & hunspell, wstring const & str ) +string encodeToHunspell( Hunspell & hunspell, std::u32string const & str ) { Iconv conv( Iconv::GdWchar ); void const * in = str.data(); - size_t inLeft = str.size() * sizeof( wchar ); + size_t inLeft = str.size() * sizeof( char32_t ); vector< char > result( str.size() * 4 + 1 ); // +1 isn't actually needed, // but then iconv complains on empty @@ -605,17 +607,17 @@ string encodeToHunspell( Hunspell & hunspell, wstring const & str ) return convStr.toStdString(); } -wstring decodeFromHunspell( Hunspell & hunspell, char const * str ) +std::u32string decodeFromHunspell( Hunspell & hunspell, char const * str ) { Iconv conv( hunspell.get_dic_encoding() ); void const * in = str; size_t inLeft = strlen( str ); - vector< wchar > result( inLeft + 1 ); // +1 isn't needed, but see above + vector< char32_t > result( inLeft + 1 ); // +1 isn't needed, but see above void * out = &result.front(); - size_t outLeft = result.size() * sizeof( wchar ); + size_t outLeft = result.size() * sizeof( char32_t ); QString convStr = conv.convert( in, inLeft ); return convStr.toStdU32String(); diff --git a/src/dict/lingualibre.cc b/src/dict/lingualibre.cc index e63d27af..e77b7fd1 100644 --- a/src/dict/lingualibre.cc +++ b/src/dict/lingualibre.cc @@ -1,5 +1,5 @@ #include "lingualibre.hh" -#include "utf8.hh" +#include "text.hh" #include "audiolink.hh" #include @@ -40,8 +40,8 @@ class LinguaArticleRequest: public Dictionary::DataRequest public: - LinguaArticleRequest( wstring const & word, - vector< wstring > const & alts, + LinguaArticleRequest( std::u32string const & word, + vector< std::u32string > const & alts, QString const & languageCode_, QString const & langWikipediaID_, string const & dictionaryId_, @@ -51,7 +51,7 @@ public: private: - void addQuery( QNetworkAccessManager & mgr, wstring const & word ); + void addQuery( QNetworkAccessManager & mgr, std::u32string const & word ); private slots: virtual void requestFinished( QNetworkReply * ); @@ -175,7 +175,7 @@ WHERE { return 0; } - sptr< WordSearchRequest > prefixMatch( wstring const & /*word*/, unsigned long /*maxResults*/ ) override + sptr< WordSearchRequest > prefixMatch( std::u32string const & /*word*/, unsigned long /*maxResults*/ ) override { sptr< WordSearchRequestInstant > sr = std::make_shared< WordSearchRequestInstant >(); @@ -184,7 +184,10 @@ WHERE { return sr; } - sptr< DataRequest > getArticle( wstring const & word, vector< wstring > const & alts, wstring const &, bool ) override + sptr< DataRequest > getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, + bool ) override { if ( word.size() < 50 ) { return std::make_shared< LinguaArticleRequest >( word, alts, languageCode, langWikipediaID, getId(), netMgr ); @@ -231,8 +234,8 @@ void LinguaArticleRequest::cancel() finish(); } -LinguaArticleRequest::LinguaArticleRequest( const wstring & str, - const vector< wstring > & alts, +LinguaArticleRequest::LinguaArticleRequest( const std::u32string & str, + const vector< std::u32string > & alts, const QString & languageCode_, const QString & langWikipediaID, const string & dictionaryId_, @@ -245,7 +248,7 @@ LinguaArticleRequest::LinguaArticleRequest( const wstring & str, addQuery( mgr, str ); } -void LinguaArticleRequest::addQuery( QNetworkAccessManager & mgr, const wstring & word ) +void LinguaArticleRequest::addQuery( QNetworkAccessManager & mgr, const std::u32string & word ) { // Doc of the @@ -273,7 +276,7 @@ void LinguaArticleRequest::addQuery( QNetworkAccessManager & mgr, const wstring auto netReply = std::shared_ptr< QNetworkReply >( mgr.get( netRequest ) ); - netReplies.emplace_back( netReply, Utf8::encode( word ) ); + netReplies.emplace_back( netReply, Text::toUtf8( word ) ); } diff --git a/src/dict/lsa.cc b/src/dict/lsa.cc index d6614e33..1af16bdf 100644 --- a/src/dict/lsa.cc +++ b/src/dict/lsa.cc @@ -5,7 +5,7 @@ #include "dictfile.hh" #include "iconv.hh" #include "folding.hh" -#include "utf8.hh" +#include "text.hh" #include "btreeidx.hh" #include "audiolink.hh" @@ -24,7 +24,6 @@ namespace Lsa { using std::string; -using gd::wstring; using std::map; using std::multimap; using std::set; @@ -169,8 +168,10 @@ public: return getArticleCount(); } - sptr< Dictionary::DataRequest > - getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; + sptr< Dictionary::DataRequest > getArticle( std::u32string const &, + vector< std::u32string > const & alts, + std::u32string const &, + bool ignoreDiacritics ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override; @@ -199,9 +200,9 @@ LsaDictionary::LsaDictionary( string const & id, string const & indexFile, vecto openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, idxHeader.indexRootOffset ), idx, idxMutex ); } -sptr< Dictionary::DataRequest > LsaDictionary::getArticle( wstring const & word, - vector< wstring > const & alts, - wstring const &, +sptr< Dictionary::DataRequest > LsaDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, bool ignoreDiacritics ) { @@ -215,13 +216,13 @@ sptr< Dictionary::DataRequest > LsaDictionary::getArticle( wstring const & word, chain.insert( chain.end(), altChain.begin(), altChain.end() ); } - multimap< wstring, string > mainArticles, alternateArticles; + multimap< std::u32string, string > mainArticles, alternateArticles; set< uint32_t > articlesIncluded; // Some synonims make it that the articles // appear several times. We combat this // by only allowing them to appear once. - wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); + std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word ); if ( ignoreDiacritics ) { wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); } @@ -236,12 +237,13 @@ sptr< Dictionary::DataRequest > LsaDictionary::getArticle( wstring const & word, // We do the case-folded comparison here. - wstring headwordStripped = Folding::applySimpleCaseOnly( x.word ); + std::u32string headwordStripped = Folding::applySimpleCaseOnly( x.word ); if ( ignoreDiacritics ) { headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); } - multimap< wstring, string > & mapToUse = ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; + multimap< std::u32string, string > & mapToUse = + ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; mapToUse.insert( std::pair( Folding::applySimpleCaseOnly( x.word ), x.word ) ); @@ -254,7 +256,7 @@ sptr< Dictionary::DataRequest > LsaDictionary::getArticle( wstring const & word, string result; - multimap< wstring, string >::const_iterator i; + multimap< std::u32string, string >::const_iterator i; result += ""; for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { @@ -389,7 +391,7 @@ sptr< Dictionary::DataRequest > LsaDictionary::getResource( string const & name string strippedName = Utils::endsWithIgnoreCase( name, ".wav" ) ? string( name, 0, name.size() - 4 ) : name; - vector< WordArticleLink > chain = findArticles( Utf8::decode( strippedName ) ); + vector< WordArticleLink > chain = findArticles( Text::toUtf32( strippedName ) ); if ( chain.empty() ) { return std::make_shared< Dictionary::DataRequestInstant >( false ); // No such resource @@ -572,7 +574,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f // Insert new entry into an index - indexedWords.addWord( Utf8::decode( e.name ), offset ); + indexedWords.addWord( Text::toUtf32( e.name ), offset ); } idxHeader.vorbisOffset = f.tell(); diff --git a/src/dict/mdx.cc b/src/dict/mdx.cc index 99aba846..49a15bb1 100644 --- a/src/dict/mdx.cc +++ b/src/dict/mdx.cc @@ -4,10 +4,9 @@ #include "mdx.hh" #include "btreeidx.hh" #include "folding.hh" -#include "utf8.hh" +#include "text.hh" #include "dictfile.hh" -#include "wstring.hh" -#include "wstring_qt.hh" +#include "text.hh" #include "chunkedstorage.hh" #include "langcoder.hh" #include "audiolink.hh" @@ -37,8 +36,6 @@ namespace Mdx { using std::map; using std::multimap; using std::set; -using gd::wstring; -using gd::wchar; using std::list; using std::pair; using std::string; @@ -129,7 +126,7 @@ public: /// Checks whether the given file exists in the mdd file or not. /// Note that this function is thread-safe, since it does not access mdd file. - bool hasFile( gd::wstring const & name ) + bool hasFile( std::u32string const & name ) { if ( !isFileOpen ) { return false; @@ -140,7 +137,7 @@ public: /// Attempts loading the given file into the given vector. Returns true on /// success, false otherwise. - bool loadFile( gd::wstring const & name, std::vector< char > & result ) + bool loadFile( std::u32string const & name, std::vector< char > & result ) { if ( !isFileOpen ) { return false; @@ -232,8 +229,10 @@ public: return idxHeader.langTo; } - sptr< Dictionary::DataRequest > - getArticle( wstring const & word, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; + sptr< Dictionary::DataRequest > getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, + bool ignoreDiacritics ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override; QString const & getDescription() override; @@ -281,7 +280,7 @@ private: friend class MdxArticleRequest; friend class MddResourceRequest; - void loadResourceFile( const wstring & resourceName, vector< char > & data ); + void loadResourceFile( const std::u32string & resourceName, vector< char > & data ); }; MdxDictionary::MdxDictionary( string const & id, string const & indexFile, vector< string > const & dictionaryFiles ): @@ -488,8 +487,8 @@ sptr< Dictionary::DataRequest > MdxDictionary::getSearchResults( QString const & class MdxArticleRequest: public Dictionary::DataRequest { - wstring word; - vector< wstring > alts; + std::u32string word; + vector< std::u32string > alts; MdxDictionary & dict; bool ignoreDiacritics; @@ -498,8 +497,8 @@ class MdxArticleRequest: public Dictionary::DataRequest public: - MdxArticleRequest( wstring const & word_, - vector< wstring > const & alts_, + MdxArticleRequest( std::u32string const & word_, + vector< std::u32string > const & alts_, MdxDictionary & dict_, bool ignoreDiacritics_ ): word( word_ ), @@ -602,8 +601,8 @@ void MdxArticleRequest::run() // Handle internal redirects if ( strncmp( articleBody.c_str(), "@@@LINK=", 8 ) == 0 ) { - wstring target = Utf8::decode( articleBody.c_str() + 8 ); - target = Folding::trimWhitespace( target ); + std::u32string target = Text::toUtf32( articleBody.c_str() + 8 ); + target = Folding::trimWhitespace( target ); // Make an additional query for this redirection vector< WordArticleLink > altChain = dict.findArticles( target ); chain.insert( chain.end(), altChain.begin(), altChain.end() ); @@ -626,9 +625,9 @@ void MdxArticleRequest::run() finish(); } -sptr< Dictionary::DataRequest > MdxDictionary::getArticle( const wstring & word, - const vector< wstring > & alts, - const wstring &, +sptr< Dictionary::DataRequest > MdxDictionary::getArticle( const std::u32string & word, + const vector< std::u32string > & alts, + const std::u32string &, bool ignoreDiacritics ) { return std::make_shared< MdxArticleRequest >( word, alts, *this, ignoreDiacritics ); @@ -638,7 +637,7 @@ sptr< Dictionary::DataRequest > MdxDictionary::getArticle( const wstring & word, class MddResourceRequest: public Dictionary::DataRequest { MdxDictionary & dict; - wstring resourceName; + std::u32string resourceName; QAtomicInt isCancelled; QFuture< void > f; @@ -647,7 +646,7 @@ public: MddResourceRequest( MdxDictionary & dict_, string const & resourceName_ ): Dictionary::DataRequest( &dict_ ), dict( dict_ ), - resourceName( Utf8::decode( resourceName_ ) ) + resourceName( Text::toUtf32( resourceName_ ) ) { f = QtConcurrent::run( [ this ]() { this->run(); @@ -722,7 +721,7 @@ void MddResourceRequest::run() } // In order to prevent recursive internal redirection... - set< wstring, std::less<> > resourceIncluded; + set< std::u32string, std::less<> > resourceIncluded; for ( ;; ) { // Some runnables linger enough that they are cancelled before they start @@ -730,7 +729,7 @@ void MddResourceRequest::run() finish(); return; } - string u8ResourceName = Utf8::encode( resourceName ); + string u8ResourceName = Text::toUtf8( resourceName ); if ( !resourceIncluded.insert( resourceName ).second ) { finish(); return; @@ -1151,11 +1150,11 @@ QString MdxDictionary::getCachedFileName( QString filename ) qWarning( R"(Mdx: file "%s" creating error: "%s")", fullName.toUtf8().data(), f.errorString().toUtf8().data() ); return QString(); } - gd::wstring resourceName = filename.toStdU32String(); + std::u32string resourceName = filename.toStdU32String(); vector< char > data; // In order to prevent recursive internal redirection... - set< wstring, std::less<> > resourceIncluded; + set< std::u32string, std::less<> > resourceIncluded; for ( ;; ) { if ( !resourceIncluded.insert( resourceName ).second ) { @@ -1194,10 +1193,10 @@ QString MdxDictionary::getCachedFileName( QString filename ) return fullName; } -void MdxDictionary::loadResourceFile( const wstring & resourceName, vector< char > & data ) +void MdxDictionary::loadResourceFile( const std::u32string & resourceName, vector< char > & data ) { - wstring newResourceName = resourceName; - string u8ResourceName = Utf8::encode( resourceName ); + std::u32string newResourceName = resourceName; + string u8ResourceName = Text::toUtf8( resourceName ); // Convert to the Windows separator std::replace( newResourceName.begin(), newResourceName.end(), '/', '\\' ); diff --git a/src/dict/mediawiki.cc b/src/dict/mediawiki.cc index b7a4a65e..c0865d94 100644 --- a/src/dict/mediawiki.cc +++ b/src/dict/mediawiki.cc @@ -2,7 +2,6 @@ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ #include "mediawiki.hh" -#include "wstring_qt.hh" #include #include #include @@ -66,9 +65,10 @@ public: return 0; } - sptr< WordSearchRequest > prefixMatch( wstring const &, unsigned long maxResults ) override; + sptr< WordSearchRequest > prefixMatch( std::u32string const &, unsigned long maxResults ) override; - sptr< DataRequest > getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ) override; + sptr< DataRequest > + getArticle( std::u32string const &, vector< std::u32string > const & alts, std::u32string const &, bool ) override; quint32 getLangFrom() const override { @@ -133,7 +133,10 @@ class MediaWikiWordSearchRequest: public MediaWikiWordSearchRequestSlots public: - MediaWikiWordSearchRequest( wstring const &, QString const & url, QString const & lang, QNetworkAccessManager & mgr ); + MediaWikiWordSearchRequest( std::u32string const &, + QString const & url, + QString const & lang, + QNetworkAccessManager & mgr ); ~MediaWikiWordSearchRequest(); @@ -144,7 +147,7 @@ private: void downloadFinished() override; }; -MediaWikiWordSearchRequest::MediaWikiWordSearchRequest( wstring const & str, +MediaWikiWordSearchRequest::MediaWikiWordSearchRequest( std::u32string const & str, QString const & url, QString const & lang, QNetworkAccessManager & mgr ): @@ -390,8 +393,8 @@ class MediaWikiArticleRequest: public MediaWikiDataRequestSlots public: - MediaWikiArticleRequest( wstring const & word, - vector< wstring > const & alts, + MediaWikiArticleRequest( std::u32string const & word, + vector< std::u32string > const & alts, QString const & url, QString const & lang, QNetworkAccessManager & mgr, @@ -401,7 +404,7 @@ public: private: - void addQuery( QNetworkAccessManager & mgr, wstring const & word ); + void addQuery( QNetworkAccessManager & mgr, std::u32string const & word ); void requestFinished( QNetworkReply * ) override; @@ -435,8 +438,8 @@ void MediaWikiArticleRequest::cancel() finish(); } -MediaWikiArticleRequest::MediaWikiArticleRequest( wstring const & str, - vector< wstring > const & alts, +MediaWikiArticleRequest::MediaWikiArticleRequest( std::u32string const & str, + vector< std::u32string > const & alts, QString const & url_, QString const & lang_, QNetworkAccessManager & mgr, @@ -458,7 +461,7 @@ MediaWikiArticleRequest::MediaWikiArticleRequest( wstring const & str, } } -void MediaWikiArticleRequest::addQuery( QNetworkAccessManager & mgr, wstring const & str ) +void MediaWikiArticleRequest::addQuery( QNetworkAccessManager & mgr, std::u32string const & str ) { qDebug( "MediaWiki: requesting article %s", QString::fromStdU32String( str ).toUtf8().data() ); @@ -705,7 +708,7 @@ void MediaWikiArticleRequest::requestFinished( QNetworkReply * r ) } } -sptr< WordSearchRequest > MediaWikiDictionary::prefixMatch( wstring const & word, unsigned long maxResults ) +sptr< WordSearchRequest > MediaWikiDictionary::prefixMatch( std::u32string const & word, unsigned long maxResults ) { (void)maxResults; @@ -719,8 +722,10 @@ sptr< WordSearchRequest > MediaWikiDictionary::prefixMatch( wstring const & word } } -sptr< DataRequest > -MediaWikiDictionary::getArticle( wstring const & word, vector< wstring > const & alts, wstring const &, bool ) +sptr< DataRequest > MediaWikiDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, + bool ) { if ( word.size() > 80 ) { diff --git a/src/dict/programs.cc b/src/dict/programs.cc index 13cd69cf..9052d492 100644 --- a/src/dict/programs.cc +++ b/src/dict/programs.cc @@ -4,8 +4,7 @@ #include "programs.hh" #include "audiolink.hh" #include "htmlescape.hh" -#include "utf8.hh" -#include "wstring_qt.hh" +#include "text.hh" #include "iconv.hh" #include "utils.hh" #include "globalbroadcaster.hh" @@ -46,16 +45,17 @@ public: return 0; } - sptr< WordSearchRequest > prefixMatch( wstring const & word, unsigned long maxResults ) override; + sptr< WordSearchRequest > prefixMatch( std::u32string const & word, unsigned long maxResults ) override; - sptr< DataRequest > getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ) override; + sptr< DataRequest > + getArticle( std::u32string const &, vector< std::u32string > const & alts, std::u32string const &, bool ) override; protected: void loadIcon() noexcept override; }; -sptr< WordSearchRequest > ProgramsDictionary::prefixMatch( wstring const & word, unsigned long /*maxResults*/ ) +sptr< WordSearchRequest > ProgramsDictionary::prefixMatch( std::u32string const & word, unsigned long /*maxResults*/ ) { if ( prg.type == Config::Program::PrefixMatch ) { @@ -70,8 +70,10 @@ sptr< WordSearchRequest > ProgramsDictionary::prefixMatch( wstring const & word, } } -sptr< Dictionary::DataRequest > -ProgramsDictionary::getArticle( wstring const & word, vector< wstring > const &, wstring const &, bool ) +sptr< Dictionary::DataRequest > ProgramsDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const &, + std::u32string const &, + bool ) { switch ( prg.type ) { @@ -79,7 +81,7 @@ ProgramsDictionary::getArticle( wstring const & word, vector< wstring > const &, // Audio results are instantaneous string result; - string wordUtf8( Utf8::encode( word ) ); + string wordUtf8( Text::toUtf8( word ) ); result += "
"; diff --git a/src/dict/programs.hh b/src/dict/programs.hh index 772cecf9..7814f5a4 100644 --- a/src/dict/programs.hh +++ b/src/dict/programs.hh @@ -6,14 +6,13 @@ #include #include "dictionary.hh" #include "config.hh" -#include "wstring.hh" +#include "text.hh" /// Support for arbitrary programs. namespace Programs { using std::vector; using std::string; -using gd::wstring; vector< sptr< Dictionary::Class > > makeDictionaries( Config::Programs const & ); diff --git a/src/dict/sdict.cc b/src/dict/sdict.cc index fd4877a0..65c7f715 100644 --- a/src/dict/sdict.cc +++ b/src/dict/sdict.cc @@ -9,7 +9,7 @@ #include "htmlescape.hh" #include "langcoder.hh" #include "sdict.hh" -#include "utf8.hh" +#include "text.hh" #include #include #include @@ -26,7 +26,6 @@ using std::multimap; using std::pair; using std::set; using std::string; -using gd::wstring; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; @@ -133,8 +132,10 @@ public: return idxHeader.langTo; } - sptr< Dictionary::DataRequest > - getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; + sptr< Dictionary::DataRequest > getArticle( std::u32string const &, + vector< std::u32string > const & alts, + std::u32string const &, + bool ignoreDiacritics ) override; QString const & getDescription() override; @@ -416,8 +417,8 @@ SdictDictionary::getSearchResults( QString const & searchString, int searchMode, class SdictArticleRequest: public Dictionary::DataRequest { - wstring word; - vector< wstring > alts; + std::u32string word; + vector< std::u32string > alts; SdictDictionary & dict; bool ignoreDiacritics; @@ -427,8 +428,8 @@ class SdictArticleRequest: public Dictionary::DataRequest public: - SdictArticleRequest( wstring const & word_, - vector< wstring > const & alts_, + SdictArticleRequest( std::u32string const & word_, + vector< std::u32string > const & alts_, SdictDictionary & dict_, bool ignoreDiacritics_ ): word( word_ ), @@ -472,13 +473,13 @@ void SdictArticleRequest::run() chain.insert( chain.end(), altChain.begin(), altChain.end() ); } - multimap< wstring, pair< string, string > > mainArticles, alternateArticles; + multimap< std::u32string, pair< string, string > > mainArticles, alternateArticles; set< uint32_t > articlesIncluded; // Some synonims make it that the articles // appear several times. We combat this // by only allowing them to appear once. - wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); + std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word ); if ( ignoreDiacritics ) { wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); } @@ -507,12 +508,12 @@ void SdictArticleRequest::run() // We do the case-folded comparison here. - wstring headwordStripped = Folding::applySimpleCaseOnly( headword ); + std::u32string headwordStripped = Folding::applySimpleCaseOnly( headword ); if ( ignoreDiacritics ) { headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); } - multimap< wstring, pair< string, string > > & mapToUse = + multimap< std::u32string, pair< string, string > > & mapToUse = ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) ); @@ -532,7 +533,7 @@ void SdictArticleRequest::run() string result; - multimap< wstring, pair< string, string > >::const_iterator i; + multimap< std::u32string, pair< string, string > >::const_iterator i; for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { result += dict.isFromLanguageRTL() ? "

" : "

"; @@ -561,9 +562,9 @@ void SdictArticleRequest::run() finish(); } -sptr< Dictionary::DataRequest > SdictDictionary::getArticle( wstring const & word, - vector< wstring > const & alts, - wstring const &, +sptr< Dictionary::DataRequest > SdictDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, bool ignoreDiacritics ) { @@ -741,7 +742,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f // Insert new entry - indexedWords.addWord( Utf8::decode( string( data.data(), size ) ), articleOffset ); + indexedWords.addWord( Text::toUtf32( string( data.data(), size ) ), articleOffset ); pos += el.nextWord; } diff --git a/src/dict/slob.cc b/src/dict/slob.cc index e6b82ade..2a2fc588 100644 --- a/src/dict/slob.cc +++ b/src/dict/slob.cc @@ -6,7 +6,7 @@ #include "btreeidx.hh" #include "folding.hh" -#include "utf8.hh" +#include "text.hh" #include "decompress.hh" #include "langcoder.hh" #include "ftshelpers.hh" @@ -40,7 +40,6 @@ using std::vector; using std::multimap; using std::pair; using std::set; -using gd::wstring; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; @@ -630,8 +629,10 @@ public: return idxHeader.langTo; } - sptr< Dictionary::DataRequest > - getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; + sptr< Dictionary::DataRequest > getArticle( std::u32string const &, + vector< std::u32string > const & alts, + std::u32string const &, + bool ignoreDiacritics ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override; @@ -853,7 +854,7 @@ void SlobDictionary::loadResource( std::string & resourceName, string & data ) vector< WordArticleLink > link; RefEntry entry; - link = resourceIndex.findArticles( Utf8::decode( resourceName ) ); + link = resourceIndex.findArticles( Text::toUtf32( resourceName ) ); if ( link.empty() ) { return; @@ -989,8 +990,8 @@ SlobDictionary::getSearchResults( QString const & searchString, int searchMode, class SlobArticleRequest: public Dictionary::DataRequest { - wstring word; - vector< wstring > alts; + std::u32string word; + vector< std::u32string > alts; SlobDictionary & dict; bool ignoreDiacritics; @@ -999,8 +1000,8 @@ class SlobArticleRequest: public Dictionary::DataRequest public: - SlobArticleRequest( wstring const & word_, - vector< wstring > const & alts_, + SlobArticleRequest( std::u32string const & word_, + vector< std::u32string > const & alts_, SlobDictionary & dict_, bool ignoreDiacritics_ ): word( word_ ), @@ -1045,13 +1046,13 @@ void SlobArticleRequest::run() chain.insert( chain.end(), altChain.begin(), altChain.end() ); } - multimap< wstring, pair< string, string > > mainArticles, alternateArticles; + multimap< std::u32string, pair< string, string > > mainArticles, alternateArticles; set< quint64 > articlesIncluded; // Some synonims make it that the articles // appear several times. We combat this // by only allowing them to appear once. - wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); + std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word ); if ( ignoreDiacritics ) { wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); } @@ -1084,12 +1085,12 @@ void SlobArticleRequest::run() // We do the case-folded comparison here. - wstring headwordStripped = Folding::applySimpleCaseOnly( headword ); + std::u32string headwordStripped = Folding::applySimpleCaseOnly( headword ); if ( ignoreDiacritics ) { headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); } - multimap< wstring, pair< string, string > > & mapToUse = + multimap< std::u32string, pair< string, string > > & mapToUse = ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) ); @@ -1105,7 +1106,7 @@ void SlobArticleRequest::run() string result; - multimap< wstring, pair< string, string > >::const_iterator i; + multimap< std::u32string, pair< string, string > >::const_iterator i; for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { result += R"(

)"; @@ -1128,9 +1129,9 @@ void SlobArticleRequest::run() finish(); } -sptr< Dictionary::DataRequest > SlobDictionary::getArticle( wstring const & word, - vector< wstring > const & alts, - wstring const &, +sptr< Dictionary::DataRequest > SlobDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, bool ignoreDiacritics ) { diff --git a/src/dict/sounddir.cc b/src/dict/sounddir.cc index 7806bd2b..23a707f2 100644 --- a/src/dict/sounddir.cc +++ b/src/dict/sounddir.cc @@ -3,13 +3,12 @@ #include "sounddir.hh" #include "folding.hh" -#include "utf8.hh" +#include "text.hh" #include "btreeidx.hh" #include "chunkedstorage.hh" #include "filetype.hh" #include "htmlescape.hh" #include "audiolink.hh" -#include "wstring_qt.hh" #include "utils.hh" @@ -21,7 +20,6 @@ namespace SoundDir { using std::string; -using gd::wstring; using std::map; using std::multimap; using std::set; @@ -85,8 +83,10 @@ public: return getArticleCount(); } - sptr< Dictionary::DataRequest > - getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; + sptr< Dictionary::DataRequest > getArticle( std::u32string const &, + vector< std::u32string > const & alts, + std::u32string const &, + bool ignoreDiacritics ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override; @@ -114,9 +114,9 @@ SoundDirDictionary::SoundDirDictionary( string const & id, openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, idxHeader.indexRootOffset ), idx, idxMutex ); } -sptr< Dictionary::DataRequest > SoundDirDictionary::getArticle( wstring const & word, - vector< wstring > const & alts, - wstring const &, +sptr< Dictionary::DataRequest > SoundDirDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, bool ignoreDiacritics ) { vector< WordArticleLink > chain = findArticles( word, ignoreDiacritics ); @@ -130,13 +130,13 @@ sptr< Dictionary::DataRequest > SoundDirDictionary::getArticle( wstring const & } // maps to the chain number - multimap< wstring, unsigned > mainArticles, alternateArticles; + multimap< std::u32string, unsigned > mainArticles, alternateArticles; set< uint32_t > articlesIncluded; // Some synonims make it that the articles // appear several times. We combat this // by only allowing them to appear once. - wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); + std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word ); if ( ignoreDiacritics ) { wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); } @@ -151,12 +151,12 @@ sptr< Dictionary::DataRequest > SoundDirDictionary::getArticle( wstring const & // We do the case-folded comparison here. - wstring headwordStripped = Folding::applySimpleCaseOnly( chain[ x ].word ); + std::u32string headwordStripped = Folding::applySimpleCaseOnly( chain[ x ].word ); if ( ignoreDiacritics ) { headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); } - multimap< wstring, unsigned > & mapToUse = + multimap< std::u32string, unsigned > & mapToUse = ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; mapToUse.insert( std::pair( Folding::applySimpleCaseOnly( chain[ x ].word ), x ) ); @@ -170,7 +170,7 @@ sptr< Dictionary::DataRequest > SoundDirDictionary::getArticle( wstring const & string result; - multimap< wstring, uint32_t >::const_iterator i; + multimap< std::u32string, uint32_t >::const_iterator i; string displayedName; vector< char > chunk; @@ -399,11 +399,11 @@ void addDir( QDir const & baseDir, const uint32_t articleOffset = chunks.startNewBlock(); chunks.addToBlock( fileName.c_str(), fileName.size() + 1 ); - wstring name = i->fileName().toStdU32String(); + std::u32string name = i->fileName().toStdU32String(); - const wstring::size_type pos = name.rfind( L'.' ); + const std::u32string::size_type pos = name.rfind( L'.' ); - if ( pos != wstring::npos ) { + if ( pos != std::u32string::npos ) { name.erase( pos ); } diff --git a/src/dict/stardict.cc b/src/dict/stardict.cc index cb42f2a7..d8c718c6 100644 --- a/src/dict/stardict.cc +++ b/src/dict/stardict.cc @@ -4,7 +4,7 @@ #include "stardict.hh" #include "btreeidx.hh" #include "folding.hh" -#include "utf8.hh" +#include "text.hh" #include "chunkedstorage.hh" #include "dictzip.hh" #include "xdxf2html.hh" @@ -42,7 +42,6 @@ using std::multimap; using std::pair; using std::set; using std::string; -using gd::wstring; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; @@ -154,10 +153,12 @@ public: return idxHeader.langTo; } - sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & ) override; + sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( std::u32string const & ) override; - sptr< Dictionary::DataRequest > - getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; + sptr< Dictionary::DataRequest > getArticle( std::u32string const &, + vector< std::u32string > const & alts, + std::u32string const &, + bool ignoreDiacritics ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override; @@ -1164,7 +1165,7 @@ sptr< Dictionary::DataRequest > StardictDictionary::getSearchResults( QString co class StardictHeadwordsRequest: public Dictionary::WordSearchRequest { - wstring word; + std::u32string word; StardictDictionary & dict; QAtomicInt isCancelled; @@ -1172,7 +1173,7 @@ class StardictHeadwordsRequest: public Dictionary::WordSearchRequest public: - StardictHeadwordsRequest( wstring const & word_, StardictDictionary & dict_ ): + StardictHeadwordsRequest( std::u32string const & word_, StardictDictionary & dict_ ): word( word_ ), dict( dict_ ) { @@ -1207,7 +1208,7 @@ void StardictHeadwordsRequest::run() //limited the synomys to at most 10 entries vector< WordArticleLink > chain = dict.findArticles( word, false, 10 ); - wstring caseFolded = Folding::applySimpleCaseOnly( word ); + std::u32string caseFolded = Folding::applySimpleCaseOnly( word ); for ( auto & x : chain ) { if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) { @@ -1219,7 +1220,7 @@ void StardictHeadwordsRequest::run() dict.loadArticle( x.articleOffset, headword, articleText ); - wstring headwordDecoded = Utf8::decode( headword ); + std::u32string headwordDecoded = Text::toUtf32( headword ); if ( caseFolded != Folding::applySimpleCaseOnly( headwordDecoded ) ) { // The headword seems to differ from the input word, which makes the @@ -1237,7 +1238,7 @@ void StardictHeadwordsRequest::run() finish(); } -sptr< Dictionary::WordSearchRequest > StardictDictionary::findHeadwordsForSynonym( wstring const & word ) +sptr< Dictionary::WordSearchRequest > StardictDictionary::findHeadwordsForSynonym( std::u32string const & word ) { return synonymSearchEnabled ? std::make_shared< StardictHeadwordsRequest >( word, *this ) : Class::findHeadwordsForSynonym( word ); @@ -1250,8 +1251,8 @@ sptr< Dictionary::WordSearchRequest > StardictDictionary::findHeadwordsForSynony class StardictArticleRequest: public Dictionary::DataRequest { - wstring word; - vector< wstring > alts; + std::u32string word; + vector< std::u32string > alts; StardictDictionary & dict; bool ignoreDiacritics; @@ -1261,8 +1262,8 @@ class StardictArticleRequest: public Dictionary::DataRequest public: - StardictArticleRequest( wstring const & word_, - vector< wstring > const & alts_, + StardictArticleRequest( std::u32string const & word_, + vector< std::u32string > const & alts_, StardictDictionary & dict_, bool ignoreDiacritics_ ): word( word_ ), @@ -1312,13 +1313,13 @@ void StardictArticleRequest::run() } } - multimap< wstring, pair< string, string > > mainArticles, alternateArticles; + multimap< std::u32string, pair< string, string > > mainArticles, alternateArticles; set< uint32_t > articlesIncluded; // Some synonyms make it that the articles // appear several times. We combat this // by only allowing them to appear once. - wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); + std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word ); if ( ignoreDiacritics ) { wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); } @@ -1345,12 +1346,12 @@ void StardictArticleRequest::run() // We do the case-folded comparison here. - wstring headwordStripped = Folding::applySimpleCaseOnly( headword ); + std::u32string headwordStripped = Folding::applySimpleCaseOnly( headword ); if ( ignoreDiacritics ) { headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); } - multimap< wstring, pair< string, string > > & mapToUse = + multimap< std::u32string, pair< string, string > > & mapToUse = ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) ); @@ -1366,7 +1367,7 @@ void StardictArticleRequest::run() string result; - multimap< wstring, pair< string, string > >::const_iterator i; + multimap< std::u32string, pair< string, string > >::const_iterator i; string cleaner = Utils::Html::getHtmlCleaner(); @@ -1409,9 +1410,9 @@ void StardictArticleRequest::run() finish(); } -sptr< Dictionary::DataRequest > StardictDictionary::getArticle( wstring const & word, - vector< wstring > const & alts, - wstring const &, +sptr< Dictionary::DataRequest > StardictDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, bool ignoreDiacritics ) { @@ -1569,7 +1570,7 @@ void StardictResourceRequest::run() if ( dict.resourceZip.isOpen() ) { QMutexLocker _( &dataMutex ); - if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) ) { + if ( !dict.resourceZip.loadFile( Text::toUtf32( resourceName ), data ) ) { throw; // Make it fail since we couldn't read the archive } } @@ -1801,10 +1802,10 @@ static void handleIdxSynFile( string const & fileName, // Insert new entry into an index if ( parseHeadwords ) { - indexedWords.addWord( Utf8::decode( word ), offset ); + indexedWords.addWord( Text::toUtf32( word ), offset ); } else { - indexedWords.addSingleWord( Utf8::decode( word ), offset ); + indexedWords.addSingleWord( Text::toUtf32( word ), offset ); } } diff --git a/src/dict/transliteration/chinese.cc b/src/dict/transliteration/chinese.cc index 8bf78b76..f6c485b8 100644 --- a/src/dict/transliteration/chinese.cc +++ b/src/dict/transliteration/chinese.cc @@ -7,7 +7,7 @@ #include #include "folding.hh" #include "transliteration.hh" -#include "utf8.hh" +#include "text.hh" namespace ChineseTranslit { @@ -27,7 +27,7 @@ public: QString const & openccConfig ); ~CharacterConversionDictionary(); - std::vector< gd::wstring > getAlternateWritings( gd::wstring const & ) noexcept override; + std::vector< std::u32string > getAlternateWritings( std::u32string const & ) noexcept override; }; CharacterConversionDictionary::CharacterConversionDictionary( std::string const & id, @@ -68,15 +68,15 @@ CharacterConversionDictionary::~CharacterConversionDictionary() // #endif } -std::vector< gd::wstring > CharacterConversionDictionary::getAlternateWritings( gd::wstring const & str ) noexcept +std::vector< std::u32string > CharacterConversionDictionary::getAlternateWritings( std::u32string const & str ) noexcept { - std::vector< gd::wstring > results; + std::vector< std::u32string > results; if ( converter != NULL ) { - gd::wstring folded = Folding::applySimpleCaseOnly( str ); - std::string input = Utf8::encode( folded ); + std::u32string folded = Folding::applySimpleCaseOnly( str ); + std::string input = Text::toUtf8( folded ); std::string output; - gd::wstring result; + std::u32string result; try { // #ifdef Q_OS_MAC @@ -93,7 +93,7 @@ std::vector< gd::wstring > CharacterConversionDictionary::getAlternateWritings( // #else // output = converter->Convert( input ); // #endif - result = Utf8::decode( output ); + result = Text::toUtf32( output ); } catch ( std::exception & ex ) { qWarning( "OpenCC: conversion failed %s", ex.what() ); diff --git a/src/dict/transliteration/transliteration.cc b/src/dict/transliteration/transliteration.cc index b39d27bb..e41b9ec2 100644 --- a/src/dict/transliteration/transliteration.cc +++ b/src/dict/transliteration/transliteration.cc @@ -2,12 +2,11 @@ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ #include "transliteration.hh" -#include "utf8.hh" +#include "text.hh" #include "folding.hh" namespace Transliteration { -using gd::wchar; BaseTransliterationDictionary::BaseTransliterationDictionary( string const & id, string const & name_, @@ -36,24 +35,28 @@ unsigned long BaseTransliterationDictionary::getWordCount() noexcept return 0; } -sptr< Dictionary::WordSearchRequest > BaseTransliterationDictionary::prefixMatch( wstring const &, unsigned long ) +sptr< Dictionary::WordSearchRequest > BaseTransliterationDictionary::prefixMatch( std::u32string const &, + unsigned long ) { return std::make_shared< Dictionary::WordSearchRequestInstant >(); } -sptr< Dictionary::DataRequest > -BaseTransliterationDictionary::getArticle( wstring const &, vector< wstring > const &, wstring const &, bool ) +sptr< Dictionary::DataRequest > BaseTransliterationDictionary::getArticle( std::u32string const &, + vector< std::u32string > const &, + std::u32string const &, + bool ) { return std::make_shared< Dictionary::DataRequestInstant >( false ); } -sptr< Dictionary::WordSearchRequest > BaseTransliterationDictionary::findHeadwordsForSynonym( wstring const & str ) +sptr< Dictionary::WordSearchRequest > +BaseTransliterationDictionary::findHeadwordsForSynonym( std::u32string const & str ) { sptr< Dictionary::WordSearchRequestInstant > result = std::make_shared< Dictionary::WordSearchRequestInstant >(); - vector< wstring > alts = getAlternateWritings( str ); + vector< std::u32string > alts = getAlternateWritings( str ); qDebug( "alts = %u", (unsigned)alts.size() ); @@ -67,13 +70,13 @@ sptr< Dictionary::WordSearchRequest > BaseTransliterationDictionary::findHeadwor void Table::ins( char const * from, char const * to ) { - wstring fr = Utf8::decode( std::string( from ) ); + std::u32string fr = Text::toUtf32( std::string( from ) ); if ( fr.size() > maxEntrySize ) { maxEntrySize = fr.size(); } - insert( std::pair< wstring, wstring >( fr, Utf8::decode( std::string( to ) ) ) ); + insert( std::pair< std::u32string, std::u32string >( fr, Text::toUtf32( std::string( to ) ) ) ); } @@ -84,12 +87,12 @@ TransliterationDictionary::TransliterationDictionary( { } -vector< wstring > TransliterationDictionary::getAlternateWritings( wstring const & str ) noexcept +vector< std::u32string > TransliterationDictionary::getAlternateWritings( std::u32string const & str ) noexcept { - vector< wstring > results; + vector< std::u32string > results; - wstring result, folded; - wstring const * target; + std::u32string result, folded; + std::u32string const * target; if ( caseSensitive ) { // Don't do any transform -- the transliteration is case-sensitive @@ -100,8 +103,8 @@ vector< wstring > TransliterationDictionary::getAlternateWritings( wstring const target = &folded; } - wchar const * ptr = target->c_str(); - size_t left = target->size(); + char32_t const * ptr = target->c_str(); + size_t left = target->size(); Table::const_iterator i; @@ -110,7 +113,7 @@ vector< wstring > TransliterationDictionary::getAlternateWritings( wstring const for ( x = table.getMaxEntrySize(); x >= 1; --x ) { if ( left >= x ) { - i = table.find( wstring( ptr, x ) ); + i = table.find( std::u32string( ptr, x ) ); if ( i != table.end() ) { result.append( i->second ); diff --git a/src/dict/transliteration/transliteration.hh b/src/dict/transliteration/transliteration.hh index 7003cd73..04335fea 100644 --- a/src/dict/transliteration/transliteration.hh +++ b/src/dict/transliteration/transliteration.hh @@ -9,7 +9,6 @@ namespace Transliteration { using std::map; -using gd::wstring; using std::string; using std::vector; @@ -32,18 +31,18 @@ public: virtual unsigned long getWordCount() noexcept; - virtual vector< wstring > getAlternateWritings( wstring const & ) noexcept = 0; + virtual vector< std::u32string > getAlternateWritings( std::u32string const & ) noexcept = 0; - virtual sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & ); + virtual sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( std::u32string const & ); - virtual sptr< Dictionary::WordSearchRequest > prefixMatch( wstring const &, unsigned long ); + virtual sptr< Dictionary::WordSearchRequest > prefixMatch( std::u32string const &, unsigned long ); virtual sptr< Dictionary::DataRequest > - getArticle( wstring const &, vector< wstring > const &, wstring const &, bool ); + getArticle( std::u32string const &, vector< std::u32string > const &, std::u32string const &, bool ); }; -class Table: public map< wstring, wstring > +class Table: public map< std::u32string, std::u32string > { unsigned maxEntrySize; @@ -77,7 +76,7 @@ public: TransliterationDictionary( string const & id, string const & name, QIcon icon, Table const & table, bool caseSensitive = true ); - virtual vector< wstring > getAlternateWritings( wstring const & ) noexcept; + virtual vector< std::u32string > getAlternateWritings( std::u32string const & ) noexcept; }; } // namespace Transliteration diff --git a/src/dict/utils/indexedzip.cc b/src/dict/utils/indexedzip.cc index 90cdaf57..c1aa4c60 100644 --- a/src/dict/utils/indexedzip.cc +++ b/src/dict/utils/indexedzip.cc @@ -4,9 +4,8 @@ #include "indexedzip.hh" #include "zipfile.hh" #include -#include "utf8.hh" +#include "text.hh" #include "iconv.hh" -#include "wstring_qt.hh" #include #include @@ -23,7 +22,7 @@ bool IndexedZip::openZipFile( QString const & name ) return zipIsOpen; } -bool IndexedZip::hasFile( gd::wstring const & name ) +bool IndexedZip::hasFile( std::u32string const & name ) { if ( !zipIsOpen ) { return false; @@ -34,7 +33,7 @@ bool IndexedZip::hasFile( gd::wstring const & name ) return !links.empty(); } -bool IndexedZip::loadFile( gd::wstring const & name, vector< char > & data ) +bool IndexedZip::loadFile( std::u32string const & name, vector< char > & data ) { if ( !zipIsOpen ) { return false; @@ -180,7 +179,7 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32 if ( !hasNonAscii ) { // Add entry as is - zipFileNames.addSingleWord( Utf8::decode( entry.fileName.data() ), entry.localHeaderOffset ); + zipFileNames.addSingleWord( Text::toUtf32( entry.fileName.data() ), entry.localHeaderOffset ); if ( filesCount ) { *filesCount += 1; } @@ -192,7 +191,7 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32 // Utf8 try { - wstring decoded = Utf8::decode( entry.fileName.constData() ); + std::u32string decoded = Text::toUtf32( entry.fileName.constData() ); zipFileNames.addSingleWord( decoded, entry.localHeaderOffset ); if ( filesCount != 0 && !alreadyCounted ) { @@ -200,12 +199,12 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32 alreadyCounted = true; } } - catch ( Utf8::exCantDecode & ) { + catch ( Text::exCantDecode & ) { // Failed to decode } if ( !entry.fileNameInUTF8 ) { - wstring nameInSystemLocale; + std::u32string nameInSystemLocale; // System locale if ( localeCodec ) { @@ -224,7 +223,7 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32 // CP866 try { - wstring decoded = Iconv::toWstring( "CP866", entry.fileName.constData(), entry.fileName.size() ); + std::u32string decoded = Iconv::toWstring( "CP866", entry.fileName.constData(), entry.fileName.size() ); if ( nameInSystemLocale != decoded ) { zipFileNames.addSingleWord( decoded, entry.localHeaderOffset ); @@ -241,7 +240,7 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32 // CP1251 try { - wstring decoded = Iconv::toWstring( "CP1251", entry.fileName.constData(), entry.fileName.size() ); + std::u32string decoded = Iconv::toWstring( "CP1251", entry.fileName.constData(), entry.fileName.size() ); if ( nameInSystemLocale != decoded ) { zipFileNames.addSingleWord( decoded, entry.localHeaderOffset ); diff --git a/src/dict/utils/indexedzip.hh b/src/dict/utils/indexedzip.hh index 750f2b15..41337b2d 100644 --- a/src/dict/utils/indexedzip.hh +++ b/src/dict/utils/indexedzip.hh @@ -37,11 +37,11 @@ public: /// Checks whether the given file exists in the zip file or not. /// Note that this function is thread-safe, since it does not access zip file. - bool hasFile( gd::wstring const & name ); + bool hasFile( std::u32string const & name ); /// Attempts loading the given file into the given vector. Returns true on /// success, false otherwise. - bool loadFile( gd::wstring const & name, std::vector< char > & ); + bool loadFile( std::u32string const & name, std::vector< char > & ); bool loadFile( uint32_t offset, std::vector< char > & ); /// Index compressed files in zip file diff --git a/src/dict/voiceengines.cc b/src/dict/voiceengines.cc index 8e01138b..bba36760 100644 --- a/src/dict/voiceengines.cc +++ b/src/dict/voiceengines.cc @@ -5,8 +5,7 @@ #include "voiceengines.hh" #include "audiolink.hh" #include "htmlescape.hh" - #include "utf8.hh" - #include "wstring_qt.hh" + #include "text.hh" #include #include @@ -21,6 +20,7 @@ namespace VoiceEngines { using namespace Dictionary; using std::string; +using std::u32string; using std::map; inline string toMd5( QByteArray const & b ) @@ -58,16 +58,18 @@ public: return 0; } - sptr< WordSearchRequest > prefixMatch( wstring const & word, unsigned long maxResults ) override; + sptr< WordSearchRequest > prefixMatch( u32string const & word, unsigned long maxResults ) override; - sptr< DataRequest > getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ) override; + sptr< DataRequest > + getArticle( u32string const &, vector< u32string > const & alts, u32string const &, bool ) override; protected: void loadIcon() noexcept override; }; -sptr< WordSearchRequest > VoiceEnginesDictionary::prefixMatch( wstring const & /*word*/, unsigned long /*maxResults*/ ) +sptr< WordSearchRequest > VoiceEnginesDictionary::prefixMatch( u32string const & /*word*/, + unsigned long /*maxResults*/ ) { WordSearchRequestInstant * sr = new WordSearchRequestInstant(); @@ -76,11 +78,11 @@ sptr< WordSearchRequest > VoiceEnginesDictionary::prefixMatch( wstring const & / } sptr< Dictionary::DataRequest > -VoiceEnginesDictionary::getArticle( wstring const & word, vector< wstring > const &, wstring const &, bool ) +VoiceEnginesDictionary::getArticle( u32string const & word, vector< u32string > const &, u32string const &, bool ) { string result; - string wordUtf8( Utf8::encode( word ) ); + string wordUtf8( Text::toUtf8( word ) ); result += "

"; @@ -135,4 +137,4 @@ vector< sptr< Dictionary::Class > > makeDictionaries( Config::VoiceEngines const } // namespace VoiceEngines -#endif \ No newline at end of file +#endif diff --git a/src/dict/voiceengines.hh b/src/dict/voiceengines.hh index 17e9a8e3..bf0981b1 100644 --- a/src/dict/voiceengines.hh +++ b/src/dict/voiceengines.hh @@ -5,16 +5,13 @@ #include "dictionary.hh" #include "config.hh" - #include "wstring.hh" - + #include "text.hh" #include - namespace VoiceEngines { using std::vector; using std::string; -using gd::wstring; vector< sptr< Dictionary::Class > > makeDictionaries( Config::VoiceEngines const & voiceEngines ); diff --git a/src/dict/website.cc b/src/dict/website.cc index 4f229a4d..f4d40947 100644 --- a/src/dict/website.cc +++ b/src/dict/website.cc @@ -2,8 +2,7 @@ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ #include "website.hh" -#include "wstring_qt.hh" -#include "utf8.hh" +#include "text.hh" #include #include #include @@ -62,10 +61,12 @@ public: return 0; } - sptr< WordSearchRequest > prefixMatch( wstring const & word, unsigned long ) override; + sptr< WordSearchRequest > prefixMatch( std::u32string const & word, unsigned long ) override; - sptr< DataRequest > - getArticle( wstring const &, vector< wstring > const & alts, wstring const & context, bool ) override; + sptr< DataRequest > getArticle( std::u32string const &, + vector< std::u32string > const & alts, + std::u32string const & context, + bool ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override; @@ -90,7 +91,7 @@ protected slots: virtual void requestFinished( QNetworkReply * ) {} }; -sptr< WordSearchRequest > WebSiteDictionary::prefixMatch( wstring const & /*word*/, unsigned long ) +sptr< WordSearchRequest > WebSiteDictionary::prefixMatch( std::u32string const & /*word*/, unsigned long ) { sptr< WordSearchRequestInstant > sr = std::make_shared< WordSearchRequestInstant >(); @@ -308,9 +309,9 @@ void WebSiteArticleRequest::requestFinished( QNetworkReply * r ) finish(); } -sptr< DataRequest > WebSiteDictionary::getArticle( wstring const & str, - vector< wstring > const & /*alts*/, - wstring const & context, +sptr< DataRequest > WebSiteDictionary::getArticle( std::u32string const & str, + vector< std::u32string > const & /*alts*/, + std::u32string const & context, bool /*ignoreDiacritics*/ ) { QString urlString = Utils::WebSite::urlReplaceWord( QString( urlTemplate ), QString::fromStdU32String( str ) ); diff --git a/src/dict/xdxf.cc b/src/dict/xdxf.cc index 2fae7ff2..2b717260 100644 --- a/src/dict/xdxf.cc +++ b/src/dict/xdxf.cc @@ -4,7 +4,7 @@ #include "xdxf.hh" #include "btreeidx.hh" #include "folding.hh" -#include "utf8.hh" +#include "text.hh" #include "chunkedstorage.hh" #include "dictzip.hh" #include "htmlescape.hh" @@ -39,7 +39,6 @@ using std::multimap; using std::pair; using std::set; using std::string; -using gd::wstring; using std::vector; using std::list; @@ -160,8 +159,10 @@ public: return idxHeader.langTo; } - sptr< Dictionary::DataRequest > - getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; + sptr< Dictionary::DataRequest > getArticle( std::u32string const &, + vector< std::u32string > const & alts, + std::u32string const &, + bool ignoreDiacritics ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override; @@ -411,8 +412,8 @@ XdxfDictionary::getSearchResults( QString const & searchString, int searchMode, class XdxfArticleRequest: public Dictionary::DataRequest { - wstring word; - vector< wstring > alts; + std::u32string word; + vector< std::u32string > alts; XdxfDictionary & dict; bool ignoreDiacritics; @@ -421,8 +422,8 @@ class XdxfArticleRequest: public Dictionary::DataRequest public: - XdxfArticleRequest( wstring const & word_, - vector< wstring > const & alts_, + XdxfArticleRequest( std::u32string const & word_, + vector< std::u32string > const & alts_, XdxfDictionary & dict_, bool ignoreDiacritics_ ): word( word_ ), @@ -467,13 +468,13 @@ void XdxfArticleRequest::run() chain.insert( chain.end(), altChain.begin(), altChain.end() ); } - multimap< wstring, pair< string, string > > mainArticles, alternateArticles; + multimap< std::u32string, pair< string, string > > mainArticles, alternateArticles; set< uint32_t > articlesIncluded; // Some synonims make it that the articles // appear several times. We combat this // by only allowing them to appear once. - wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); + std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word ); if ( ignoreDiacritics ) { wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); } @@ -502,12 +503,12 @@ void XdxfArticleRequest::run() // We do the case-folded comparison here. - wstring headwordStripped = Folding::applySimpleCaseOnly( headword ); + std::u32string headwordStripped = Folding::applySimpleCaseOnly( headword ); if ( ignoreDiacritics ) { headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); } - multimap< wstring, pair< string, string > > & mapToUse = + multimap< std::u32string, pair< string, string > > & mapToUse = ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) ); @@ -527,7 +528,7 @@ void XdxfArticleRequest::run() string result; - multimap< wstring, pair< string, string > >::const_iterator i; + multimap< std::u32string, pair< string, string > >::const_iterator i; string cleaner = Utils::Html::getHtmlCleaner(); @@ -554,9 +555,9 @@ void XdxfArticleRequest::run() finish(); } -sptr< Dictionary::DataRequest > XdxfDictionary::getArticle( wstring const & word, - vector< wstring > const & alts, - wstring const &, +sptr< Dictionary::DataRequest > XdxfDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, bool ignoreDiacritics ) { @@ -973,7 +974,7 @@ void XdxfResourceRequest::run() if ( dict.resourceZip.isOpen() ) { QMutexLocker _( &dataMutex ); - if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) ) { + if ( !dict.resourceZip.loadFile( Text::toUtf32( resourceName ), data ) ) { throw; // Make it fail since we couldn't read the archive } } @@ -1194,7 +1195,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f else if ( stream.name() == u"abbreviations" ) { QString s; string value; - list< wstring > keys; + list< std::u32string > keys; while ( !( stream.isEndElement() && stream.name() == u"abbreviations" ) && !stream.atEnd() ) { if ( !stream.readNextStartElement() ) { break; @@ -1210,7 +1211,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f s = readElementText( stream ); value = Folding::trimWhitespace( s ).toStdString(); for ( const auto & key : keys ) { - abrv[ Utf8::encode( Folding::trimWhitespace( key ) ) ] = value; + abrv[ Text::toUtf8( Folding::trimWhitespace( key ) ) ] = value; } keys.clear(); } @@ -1230,7 +1231,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f s = readElementText( stream ); value = Folding::trimWhitespace( s ).toStdString(); for ( const auto & key : keys ) { - abrv[ Utf8::encode( Folding::trimWhitespace( key ) ) ] = value; + abrv[ Text::toUtf8( Folding::trimWhitespace( key ) ) ] = value; } keys.clear(); } diff --git a/src/dict/xdxf2html.cc b/src/dict/xdxf2html.cc index 7c7d9875..1508858e 100644 --- a/src/dict/xdxf2html.cc +++ b/src/dict/xdxf2html.cc @@ -3,8 +3,7 @@ #include "xdxf2html.hh" #include -#include "utf8.hh" -#include "wstring_qt.hh" +#include "text.hh" #include "folding.hh" #include "audiolink.hh" @@ -442,7 +441,7 @@ string convert( string const & in, if ( i != pAbrv->end() ) { string title; - if ( Utf8::decode( i->second ).size() < 70 ) { + if ( Text::toUtf32( i->second ).size() < 70 ) { // Replace all spaces with non-breakable ones, since that's how Lingvo shows tooltips title.reserve( i->second.size() ); @@ -466,7 +465,7 @@ string convert( string const & in, else { title = i->second; } - el.setAttribute( "title", QString::fromStdU32String( Utf8::decode( title ) ) ); + el.setAttribute( "title", QString::fromStdU32String( Text::toUtf32( title ) ) ); } } } @@ -628,7 +627,7 @@ string convert( string const & in, // if( type == XDXF && dictPtr != NULL && !el.hasAttribute( "start" ) ) if ( dictPtr != NULL && !el.hasAttribute( "start" ) ) { - string filename = Utf8::encode( el.text().toStdU32String() ); + string filename = Text::toUtf8( el.text().toStdU32String() ); if ( Filetype::isNameOfPicture( filename ) ) { QUrl url; diff --git a/src/dict/zim.cc b/src/dict/zim.cc index 4efe6316..07cee834 100644 --- a/src/dict/zim.cc +++ b/src/dict/zim.cc @@ -6,7 +6,7 @@ #include "zim.hh" #include "btreeidx.hh" #include "folding.hh" - #include "utf8.hh" + #include "text.hh" #include "langcoder.hh" #include "filetype.hh" #include "dictfile.hh" @@ -38,12 +38,12 @@ namespace Zim { using std::string; +using std::u32string; using std::map; using std::vector; using std::multimap; using std::pair; using std::set; -using gd::wstring; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; @@ -182,7 +182,7 @@ public: } sptr< Dictionary::DataRequest > - getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; + getArticle( u32string const &, vector< u32string > const & alts, u32string const &, bool ignoreDiacritics ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override; @@ -519,8 +519,8 @@ ZimDictionary::getSearchResults( QString const & searchString, int searchMode, b class ZimArticleRequest: public Dictionary::DataRequest { - wstring word; - vector< wstring > alts; + u32string word; + vector< u32string > alts; ZimDictionary & dict; bool ignoreDiacritics; @@ -529,7 +529,10 @@ class ZimArticleRequest: public Dictionary::DataRequest public: - ZimArticleRequest( wstring word_, vector< wstring > const & alts_, ZimDictionary & dict_, bool ignoreDiacritics_ ): + ZimArticleRequest( u32string word_, + vector< u32string > const & alts_, + ZimDictionary & dict_, + bool ignoreDiacritics_ ): word( std::move( word_ ) ), alts( alts_ ), dict( dict_ ), @@ -571,13 +574,13 @@ void ZimArticleRequest::run() chain.insert( chain.end(), altChain.begin(), altChain.end() ); } - multimap< wstring, pair< string, string > > mainArticles, alternateArticles; + multimap< u32string, pair< string, string > > mainArticles, alternateArticles; set< quint32 > articlesIncluded; // Some synonyms make it that the articles // appear several times. We combat this // by only allowing them to appear once. - wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); + u32string wordCaseFolded = Folding::applySimpleCaseOnly( word ); if ( ignoreDiacritics ) { wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); } @@ -614,12 +617,12 @@ void ZimArticleRequest::run() // We do the case-folded comparison here. - wstring headwordStripped = Folding::applySimpleCaseOnly( headword ); + u32string headwordStripped = Folding::applySimpleCaseOnly( headword ); if ( ignoreDiacritics ) { headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); } - multimap< wstring, pair< string, string > > & mapToUse = + multimap< u32string, pair< string, string > > & mapToUse = ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) ); @@ -638,7 +641,7 @@ void ZimArticleRequest::run() // See Issue #271: A mechanism to clean-up invalid HTML cards. string cleaner = Utils::Html::getHtmlCleaner(); - multimap< wstring, pair< string, string > >::const_iterator i; + multimap< u32string, pair< string, string > >::const_iterator i; for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { @@ -666,9 +669,9 @@ void ZimArticleRequest::run() finish(); } -sptr< Dictionary::DataRequest > ZimDictionary::getArticle( wstring const & word, - vector< wstring > const & alts, - wstring const &, +sptr< Dictionary::DataRequest > ZimDictionary::getArticle( u32string const & word, + vector< u32string > const & alts, + u32string const &, bool ignoreDiacritics ) { @@ -766,7 +769,7 @@ sptr< Dictionary::DataRequest > ZimDictionary::getResource( string const & name return std::make_shared< ZimResourceRequest >( *this, noLeadingDot.toStdString() ); } -wstring normalizeWord( const std::string & url ); +u32string normalizeWord( const std::string & url ); vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, string const & indicesDir, Dictionary::Initializing & initializing, @@ -849,7 +852,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f if ( maxHeadwordsToExpand > 0 && ( articleCount >= maxHeadwordsToExpand ) ) { if ( !title.empty() ) { - wstring word = Utf8::decode( title ); + u32string word = Text::toUtf32( title ); indexedWords.addSingleWord( word, index ); } else if ( !url.empty() ) { @@ -858,7 +861,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f } else { if ( !title.empty() ) { - auto word = Utf8::decode( title ); + auto word = Text::toUtf32( title ); indexedWords.addWord( word, index ); wordCount++; } @@ -903,7 +906,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f } return dictionaries; } -wstring normalizeWord( const std::string & url ) +u32string normalizeWord( const std::string & url ) { auto formattedUrl = QString::fromStdString( url ).remove( RX::Zim::leadingDotSlash ); return formattedUrl.toStdU32String(); diff --git a/src/dict/zipsounds.cc b/src/dict/zipsounds.cc index b6e73b28..50d6f444 100644 --- a/src/dict/zipsounds.cc +++ b/src/dict/zipsounds.cc @@ -4,7 +4,7 @@ #include "zipsounds.hh" #include "dictfile.hh" #include "folding.hh" -#include "utf8.hh" +#include "text.hh" #include "btreeidx.hh" #include "audiolink.hh" @@ -24,7 +24,6 @@ namespace ZipSounds { using std::string; -using gd::wstring; using std::map; using std::multimap; using std::set; @@ -64,19 +63,19 @@ bool indexIsOldOrBad( string const & indexFile ) || header.formatVersion != CurrentFormatVersion; } -wstring stripExtension( string const & str ) +std::u32string stripExtension( string const & str ) { - wstring name; + std::u32string name; try { - name = Utf8::decode( str ); + name = Text::toUtf32( str ); } - catch ( Utf8::exCantDecode & ) { + catch ( Text::exCantDecode & ) { return name; } if ( Filetype::isNameOfSound( str ) ) { - wstring::size_type pos = name.rfind( L'.' ); - if ( pos != wstring::npos ) { + std::u32string::size_type pos = name.rfind( L'.' ); + if ( pos != std::u32string::npos ) { name.erase( pos ); } @@ -118,8 +117,10 @@ public: return getArticleCount(); } - sptr< Dictionary::DataRequest > - getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; + sptr< Dictionary::DataRequest > getArticle( std::u32string const &, + vector< std::u32string > const & alts, + std::u32string const &, + bool ignoreDiacritics ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override; @@ -157,9 +158,9 @@ string ZipSoundsDictionary::getName() noexcept return result; } -sptr< Dictionary::DataRequest > ZipSoundsDictionary::getArticle( wstring const & word, - vector< wstring > const & alts, - wstring const &, +sptr< Dictionary::DataRequest > ZipSoundsDictionary::getArticle( std::u32string const & word, + vector< std::u32string > const & alts, + std::u32string const &, bool ignoreDiacritics ) { @@ -173,13 +174,13 @@ sptr< Dictionary::DataRequest > ZipSoundsDictionary::getArticle( wstring const & chain.insert( chain.end(), altChain.begin(), altChain.end() ); } - multimap< wstring, uint32_t > mainArticles, alternateArticles; + multimap< std::u32string, uint32_t > mainArticles, alternateArticles; set< uint32_t > articlesIncluded; // Some synonims make it that the articles // appear several times. We combat this // by only allowing them to appear once. - wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); + std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word ); if ( ignoreDiacritics ) { wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); } @@ -194,12 +195,12 @@ sptr< Dictionary::DataRequest > ZipSoundsDictionary::getArticle( wstring const & // We do the case-folded comparison here. - wstring headwordStripped = Folding::applySimpleCaseOnly( x.word ); + std::u32string headwordStripped = Folding::applySimpleCaseOnly( x.word ); if ( ignoreDiacritics ) { headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); } - multimap< wstring, uint32_t > & mapToUse = + multimap< std::u32string, uint32_t > & mapToUse = ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; mapToUse.insert( std::pair( Folding::applySimpleCaseOnly( x.word ), x.articleOffset ) ); @@ -213,7 +214,7 @@ sptr< Dictionary::DataRequest > ZipSoundsDictionary::getArticle( wstring const & string result; - multimap< wstring, uint32_t >::const_iterator i; + multimap< std::u32string, uint32_t >::const_iterator i; result += "
"; @@ -244,7 +245,7 @@ sptr< Dictionary::DataRequest > ZipSoundsDictionary::getArticle( wstring const & nameBlock += sz; string displayedName = - mainArticles.size() + alternateArticles.size() > 1 ? name : Utf8::encode( stripExtension( name ) ); + mainArticles.size() + alternateArticles.size() > 1 ? name : Text::toUtf8( stripExtension( name ) ); result += ""; @@ -286,7 +287,7 @@ sptr< Dictionary::DataRequest > ZipSoundsDictionary::getArticle( wstring const & nameBlock += sz; string displayedName = - mainArticles.size() + alternateArticles.size() > 1 ? name : Utf8::encode( stripExtension( name ) ); + mainArticles.size() + alternateArticles.size() > 1 ? name : Text::toUtf8( stripExtension( name ) ); result += ""; @@ -316,7 +317,7 @@ sptr< Dictionary::DataRequest > ZipSoundsDictionary::getResource( string const & { // Remove extension for sound files (like in sound dirs) - wstring strippedName = stripExtension( name ); + std::u32string strippedName = stripExtension( name ); vector< WordArticleLink > chain = findArticles( strippedName ); @@ -430,7 +431,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f // Remove extension for sound files (like in sound dirs) - wstring word = stripExtension( link.word ); + std::u32string word = stripExtension( link.word ); if ( !word.empty() ) { names.addWord( word, offset ); } diff --git a/src/ftshelpers.cc b/src/ftshelpers.cc index fe110318..f190ae02 100644 --- a/src/ftshelpers.cc +++ b/src/ftshelpers.cc @@ -5,7 +5,6 @@ #include #include "fulltextsearch.hh" #include "ftshelpers.hh" -#include "wstring_qt.hh" #include "dictfile.hh" #include "folding.hh" #include "utils.hh" diff --git a/src/ftshelpers.hh b/src/ftshelpers.hh index fa786af0..6a6829cb 100644 --- a/src/ftshelpers.hh +++ b/src/ftshelpers.hh @@ -7,7 +7,6 @@ #include "btreeidx.hh" #include "fulltextsearch.hh" #include "folding.hh" -#include "wstring_qt.hh" namespace FtsHelpers { @@ -44,7 +43,7 @@ public: { if ( ignoreDiacritics_ ) searchString = - QString::fromStdU32String( Folding::applyDiacriticsOnly( gd::removeTrailingZero( searchString_ ) ) ); + QString::fromStdU32String( Folding::applyDiacriticsOnly( Text::removeTrailingZero( searchString_ ) ) ); foundHeadwords = new QList< FTS::FtsHeadword >; results = 0; diff --git a/src/headwordsmodel.cc b/src/headwordsmodel.cc index 20054d14..1eda7b9d 100644 --- a/src/headwordsmodel.cc +++ b/src/headwordsmodel.cc @@ -1,5 +1,4 @@ #include "headwordsmodel.hh" -#include "wstring_qt.hh" HeadwordListModel::HeadwordListModel( QObject * parent ): QAbstractListModel( parent ), @@ -67,7 +66,7 @@ void HeadwordListModel::setFilter( const QRegularExpression & reg ) } } filterWords.clear(); - auto sr = _dict->prefixMatch( gd::removeTrailingZero( reg.pattern() ), maxFilterResults ); + auto sr = _dict->prefixMatch( Text::removeTrailingZero( reg.pattern() ), maxFilterResults ); connect( sr.get(), &Dictionary::Request::finished, this, &HeadwordListModel::requestFinished, Qt::QueuedConnection ); queuedRequests.push_back( sr ); } diff --git a/src/langcoder.cc b/src/langcoder.cc index 04a3c093..95c33581 100644 --- a/src/langcoder.cc +++ b/src/langcoder.cc @@ -3,7 +3,7 @@ #include "langcoder.hh" #include "language.hh" -#include "utf8.hh" +#include "text.hh" #include #include @@ -226,9 +226,9 @@ QString LangCoder::intToCode2( quint32 val ) return QString::fromLatin1( ba ); } -quint32 LangCoder::findIdForLanguage( gd::wstring const & lang ) +quint32 LangCoder::findIdForLanguage( std::u32string const & lang ) { - const auto langFolded = QByteArrayView( Utf8::encode( lang ) ); + const auto langFolded = QByteArrayView( Text::toUtf8( lang ) ); for ( auto const & lc : LANG_CODE_MAP ) { if ( langFolded.compare( lc.lang, Qt::CaseInsensitive ) == 0 ) { diff --git a/src/langcoder.hh b/src/langcoder.hh index 647730f8..797b702c 100644 --- a/src/langcoder.hh +++ b/src/langcoder.hh @@ -2,7 +2,7 @@ #include #include -#include "wstring.hh" +#include "text.hh" struct GDLangCode { @@ -34,7 +34,7 @@ public: /// Finds the id for the given language name, written in english. The search /// is case- and punctuation insensitive. - static quint32 findIdForLanguage( gd::wstring const & ); + static quint32 findIdForLanguage( std::u32string const & ); static quint32 findIdForLanguageCode3( std::string const & ); diff --git a/src/language.cc b/src/language.cc index 72d894fe..dded6763 100644 --- a/src/language.cc +++ b/src/language.cc @@ -465,7 +465,7 @@ BabylonLang getBabylonLangByIndex( int index ) return BabylonDb[ index ]; } -quint32 findBlgLangIDByEnglishName( gd::wstring const & lang ) +quint32 findBlgLangIDByEnglishName( std::u32string const & lang ) { QString enName = QString::fromStdU32String( lang ); for ( const auto & idx : BabylonDb ) { diff --git a/src/language.hh b/src/language.hh index 01e974bb..50486cfb 100644 --- a/src/language.hh +++ b/src/language.hh @@ -4,7 +4,6 @@ #pragma once #include -#include "wstring_qt.hh" /// Language-specific stuff - codes, names, ids etc. namespace Language { @@ -47,5 +46,5 @@ struct BabylonLang const char * localizedName; }; BabylonLang getBabylonLangByIndex( int index ); -quint32 findBlgLangIDByEnglishName( gd::wstring const & lang ); +quint32 findBlgLangIDByEnglishName( std::u32string const & lang ); } // namespace Language diff --git a/src/ui/articleview.cc b/src/ui/articleview.cc index 9c633241..7ed39335 100644 --- a/src/ui/articleview.cc +++ b/src/ui/articleview.cc @@ -10,7 +10,6 @@ #include "utils.hh" #include "webmultimediadownload.hh" #include "wildcard.hh" -#include "wstring_qt.hh" #include #include #include diff --git a/src/wordfinder.cc b/src/wordfinder.cc index 3a9341fb..e2dcc748 100644 --- a/src/wordfinder.cc +++ b/src/wordfinder.cc @@ -3,14 +3,11 @@ #include "wordfinder.hh" #include "folding.hh" -#include "wstring_qt.hh" #include using std::vector; using std::list; -using gd::wstring; -using gd::wchar; using std::map; using std::pair; @@ -134,7 +131,7 @@ void WordFinder::startSearch() allWordWritings[ 0 ] = inputWord.toStdU32String(); for ( const auto & inputDict : *inputDicts ) { - vector< wstring > writings = inputDict->getAlternateWritings( allWordWritings[ 0 ] ); + vector< std::u32string > writings = inputDict->getAlternateWritings( allWordWritings[ 0 ] ); allWordWritings.insert( allWordWritings.end(), writings.begin(), writings.end() ); } @@ -255,7 +252,9 @@ unsigned saturated( unsigned x ) /// both sides by either whitespace, punctuation or begin/end of string. /// If true is returned, pos holds the offset in the haystack. If the offset /// is larger than 255, it is set to 255. -bool hasSurroundedWithWs( wstring const & haystack, wstring const & needle, wstring::size_type & pos ) +bool hasSurroundedWithWs( std::u32string const & haystack, + std::u32string const & needle, + std::u32string::size_type & pos ) { if ( haystack.size() < needle.size() ) { return false; // Needle won't even fit into a haystack @@ -264,7 +263,7 @@ bool hasSurroundedWithWs( wstring const & haystack, wstring const & needle, wstr for ( pos = 0;; ++pos ) { pos = haystack.find( needle, pos ); - if ( pos == wstring::npos ) { + if ( pos == std::u32string::npos ) { return false; // Not found } @@ -290,13 +289,13 @@ void WordFinder::updateResults() updateResultsTimer.stop(); // Can happen when we were done before it'd expire } - wstring original = Folding::applySimpleCaseOnly( allWordWritings[ 0 ] ); + std::u32string original = Folding::applySimpleCaseOnly( allWordWritings[ 0 ] ); for ( auto i = finishedRequests.begin(); i != finishedRequests.end(); ) { for ( size_t count = ( *i )->matchesCount(), x = 0; x < count; ++x ) { - wstring match = ( **i )[ x ].word; - int weight = ( **i )[ x ].weight; - wstring lowerCased = Folding::applySimpleCaseOnly( match ); + std::u32string match = ( **i )[ x ].word; + int weight = ( **i )[ x ].weight; + std::u32string lowerCased = Folding::applySimpleCaseOnly( match ); if ( searchType == ExpressionMatch ) { unsigned ws; @@ -320,7 +319,7 @@ void WordFinder::updateResults() weight = ws; } auto insertResult = - resultsIndex.insert( pair< wstring, ResultsArray::iterator >( lowerCased, resultsArray.end() ) ); + resultsIndex.insert( pair< std::u32string, ResultsArray::iterator >( lowerCased, resultsArray.end() ) ); if ( !insertResult.second ) { // Wasn't inserted since there was already an item -- check the case @@ -369,16 +368,16 @@ void WordFinder::updateResults() }; for ( const auto & allWordWriting : allWordWritings ) { - wstring target = Folding::applySimpleCaseOnly( allWordWriting ); - wstring targetNoFullCase = Folding::applyFullCaseOnly( target ); - wstring targetNoDia = Folding::applyDiacriticsOnly( targetNoFullCase ); - wstring targetNoPunct = Folding::applyPunctOnly( targetNoDia ); - wstring targetNoWs = Folding::applyWhitespaceOnly( targetNoPunct ); + std::u32string target = Folding::applySimpleCaseOnly( allWordWriting ); + std::u32string targetNoFullCase = Folding::applyFullCaseOnly( target ); + std::u32string targetNoDia = Folding::applyDiacriticsOnly( targetNoFullCase ); + std::u32string targetNoPunct = Folding::applyPunctOnly( targetNoDia ); + std::u32string targetNoWs = Folding::applyWhitespaceOnly( targetNoPunct ); - wstring::size_type matchPos = 0; + std::u32string::size_type matchPos = 0; for ( const auto & i : resultsIndex ) { - wstring resultNoFullCase, resultNoDia, resultNoPunct, resultNoWs; + std::u32string resultNoFullCase, resultNoDia, resultNoPunct, resultNoWs; int rank; @@ -441,14 +440,14 @@ void WordFinder::updateResults() // only the first one, storing it in rank. Then we sort the results using // SortByRankAndLength. for ( const auto & allWordWriting : allWordWritings ) { - wstring target = Folding::apply( allWordWriting ); + std::u32string target = Folding::apply( allWordWriting ); for ( const auto & i : resultsIndex ) { - wstring resultFolded = Folding::apply( i.first ); + std::u32string resultFolded = Folding::apply( i.first ); int charsInCommon = 0; - for ( wchar const *t = target.c_str(), *r = resultFolded.c_str(); *t && *t == *r; + for ( char32_t const *t = target.c_str(), *r = resultFolded.c_str(); *t && *t == *r; ++t, ++r, ++charsInCommon ) { ; } diff --git a/src/wordfinder.hh b/src/wordfinder.hh index 49f938bc..189d8098 100644 --- a/src/wordfinder.hh +++ b/src/wordfinder.hh @@ -48,11 +48,11 @@ private: std::vector< sptr< Dictionary::Class > > const * inputDicts; - std::vector< gd::wstring > allWordWritings; // All writings of the inputWord + std::vector< std::u32string > allWordWritings; // All writings of the inputWord struct OneResult { - gd::wstring word; + std::u32string word; int rank; bool wasSuggested; }; @@ -60,7 +60,7 @@ private: // Maps lowercased string to the original one. This catches all duplicates // without case sensitivity. Made as an array and a map indexing that array. using ResultsArray = std::list< OneResult >; - using ResultsIndex = std::map< gd::wstring, ResultsArray::iterator >; + using ResultsIndex = std::map< std::u32string, ResultsArray::iterator >; ResultsArray resultsArray; ResultsIndex resultsIndex;