Compare commits

...

3 commits

Author SHA1 Message Date
shenleban tongying 0c42c300e1
Merge pull request #1987 from shenlebantongying/refactor/gd-text
Some checks are pending
SonarCloud / Build and analyze (push) Waiting to run
refactor: use standard string types and merge wstring(-qt)/utf8/ namespaces to Text
2024-11-23 08:26:21 -05:00
shenleban tongying 1471bc3926 ignore last commit 2024-11-23 08:19:03 -05:00
shenleban tongying f1e158578f refactor: use standard string types and merge string namespaces to Text 2024-11-23 08:15:43 -05:00
62 changed files with 1054 additions and 1078 deletions

View file

@ -21,3 +21,6 @@ c8af0450f1f7f8188004db96e3f53e7e33e2ccad
# remove gddebug.hh and associated functions # remove gddebug.hh and associated functions
76aaed116bdc3aeb53fd61553aedb877baf9b510 76aaed116bdc3aeb53fd61553aedb877baf9b510
# wstring & wchar -> std::u32string & char32_t
f1e158578f62c96059bef1a616b75495adb6e2c6

View file

@ -9,7 +9,6 @@
#include "htmlescape.hh" #include "htmlescape.hh"
#include "langcoder.hh" #include "langcoder.hh"
#include "utils.hh" #include "utils.hh"
#include "wstring_qt.hh"
#include <QDir> #include <QDir>
#include <QFile> #include <QFile>
#include <QTextDocumentFragment> #include <QTextDocumentFragment>
@ -21,7 +20,6 @@
using std::vector; using std::vector;
using std::string; using std::string;
using gd::wstring;
using std::set; using std::set;
using std::list; using std::list;
@ -484,7 +482,7 @@ ArticleRequest::ArticleRequest( QString const & word,
// Accumulate main forms // Accumulate main forms
for ( const auto & activeDict : activeDicts ) { for ( const auto & activeDict : activeDicts ) {
auto const s = activeDict->findHeadwordsForSynonym( gd::removeTrailingZero( word ) ); auto const s = activeDict->findHeadwordsForSynonym( Text::removeTrailingZero( word ) );
connect( s.get(), &Dictionary::Request::finished, this, &ArticleRequest::altSearchFinished, Qt::QueuedConnection ); connect( s.get(), &Dictionary::Request::finished, this, &ArticleRequest::altSearchFinished, Qt::QueuedConnection );
@ -521,9 +519,9 @@ void ArticleRequest::altSearchFinished()
altsDone = true; // So any pending signals in queued mode won't mess us up altsDone = true; // So any pending signals in queued mode won't mess us up
vector< wstring > altsVector( alts.begin(), alts.end() ); vector< std::u32string > altsVector( alts.begin(), alts.end() );
wstring wordStd = word.toStdU32String(); std::u32string wordStd = word.toStdU32String();
if ( activeDicts.size() <= 1 ) { if ( activeDicts.size() <= 1 ) {
articleSizeLimit = -1; // Don't collapse article if only one dictionary presented articleSizeLimit = -1; // Don't collapse article if only one dictionary presented
@ -534,7 +532,7 @@ void ArticleRequest::altSearchFinished()
sptr< Dictionary::DataRequest > r = activeDict->getArticle( sptr< Dictionary::DataRequest > r = activeDict->getArticle(
wordStd, wordStd,
altsVector, altsVector,
gd::removeTrailingZero( contexts.value( QString::fromStdString( activeDict->getId() ) ) ), Text::removeTrailingZero( contexts.value( QString::fromStdString( activeDict->getId() ) ) ),
ignoreDiacritics ); ignoreDiacritics );
connect( r.get(), &Dictionary::Request::finished, this, &ArticleRequest::bodyFinished, Qt::QueuedConnection ); connect( r.get(), &Dictionary::Request::finished, this, &ArticleRequest::bodyFinished, Qt::QueuedConnection );
@ -1008,7 +1006,7 @@ void ArticleRequest::individualWordFinished()
WordFinder::SearchResults const & results = stemmedWordFinder->getResults(); WordFinder::SearchResults const & results = stemmedWordFinder->getResults();
if ( results.size() ) { if ( results.size() ) {
wstring source = Folding::applySimpleCaseOnly( currentSplittedWordCompound ); std::u32string source = Folding::applySimpleCaseOnly( currentSplittedWordCompound );
bool hadSomething = false; bool hadSomething = false;
@ -1022,7 +1020,7 @@ void ArticleRequest::individualWordFinished()
// Prefix match found. Check if the aliases are acceptable. // Prefix match found. Check if the aliases are acceptable.
wstring result( Folding::applySimpleCaseOnly( results[ x ].first ) ); std::u32string result( Folding::applySimpleCaseOnly( results[ x ].first ) );
if ( source.size() <= result.size() && result.compare( 0, source.size(), source ) == 0 ) { if ( source.size() <= result.size() && result.compare( 0, source.size(), source ) == 0 ) {
// The resulting string begins with the source one // The resulting string begins with the source one

View file

@ -88,7 +88,7 @@ class ArticleRequest: public Dictionary::DataRequest
QMap< QString, QString > contexts; QMap< QString, QString > contexts;
std::vector< sptr< Dictionary::Class > > activeDicts; std::vector< sptr< Dictionary::Class > > activeDicts;
std::set< gd::wstring, std::less<> > alts; // Accumulated main forms std::set< std::u32string, std::less<> > alts; // Accumulated main forms
std::list< sptr< Dictionary::WordSearchRequest > > altSearches; std::list< sptr< Dictionary::WordSearchRequest > > altSearches;
std::list< sptr< Dictionary::DataRequest > > bodyRequests; std::list< sptr< Dictionary::DataRequest > > bodyRequests;
bool altsDone{ false }; bool altsDone{ false };

View file

@ -2,7 +2,7 @@
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "filetype.hh" #include "filetype.hh"
#include "utf8.hh" #include "text.hh"
#include <ctype.h> #include <ctype.h>
namespace Filetype { namespace Filetype {
@ -26,13 +26,13 @@ string simplifyString( string const & str, bool lowercase )
size_t beginPos = 0; size_t beginPos = 0;
while ( beginPos < str.size() && Utf8::isspace( str[ beginPos ] ) ) { while ( beginPos < str.size() && Text::isspace( str[ beginPos ] ) ) {
++beginPos; ++beginPos;
} }
size_t endPos = str.size(); size_t endPos = str.size();
while ( endPos && Utf8::isspace( str[ endPos - 1 ] ) ) { while ( endPos && Text::isspace( str[ endPos - 1 ] ) ) {
--endPos; --endPos;
} }

View file

@ -3,7 +3,7 @@
#include "folding.hh" #include "folding.hh"
#include "utf8.hh" #include "text.hh"
#include "globalregex.hh" #include "globalregex.hh"
#include "inc_case_folding.hh" #include "inc_case_folding.hh"
@ -13,12 +13,12 @@ namespace Folding {
/// caught by the diacritics folding table, but they are only handled there /// caught by the diacritics folding table, but they are only handled there
/// when they come with their main characters, not by themselves. The rest /// when they come with their main characters, not by themselves. The rest
/// are caught here. /// are caught here.
bool isCombiningMark( wchar ch ) bool isCombiningMark( char32_t ch )
{ {
return QChar::isMark( ch ); return QChar::isMark( ch );
} }
wstring apply( wstring const & in, bool preserveWildcards ) std::u32string apply( std::u32string const & in, bool preserveWildcards )
{ {
// remove diacritics (normalization), white space, punt, // remove diacritics (normalization), white space, punt,
auto temp = QString::fromStdU32String( in ) auto temp = QString::fromStdU32String( in )
@ -32,7 +32,7 @@ wstring apply( wstring const & in, bool preserveWildcards )
// case folding // case folding
std::u32string caseFolded; std::u32string caseFolded;
caseFolded.reserve( temp.size() ); caseFolded.reserve( temp.size() );
wchar buf[ foldCaseMaxOut ]; char32_t buf[ foldCaseMaxOut ];
for ( const char32_t ch : temp ) { for ( const char32_t ch : temp ) {
auto n = foldCase( ch, buf ); auto n = foldCase( ch, buf );
caseFolded.append( buf, n ); caseFolded.append( buf, n );
@ -40,11 +40,11 @@ wstring apply( wstring const & in, bool preserveWildcards )
return caseFolded; return caseFolded;
} }
wstring applySimpleCaseOnly( wstring const & in ) std::u32string applySimpleCaseOnly( std::u32string const & in )
{ {
wchar const * nextChar = in.data(); char32_t const * nextChar = in.data();
wstring out; std::u32string out;
out.reserve( in.size() ); out.reserve( in.size() );
@ -55,27 +55,27 @@ wstring applySimpleCaseOnly( wstring const & in )
return out; return out;
} }
wstring applySimpleCaseOnly( QString const & in ) std::u32string applySimpleCaseOnly( QString const & in )
{ {
//qt only support simple case folding. //qt only support simple case folding.
return in.toCaseFolded().toStdU32String(); return in.toCaseFolded().toStdU32String();
} }
wstring applySimpleCaseOnly( std::string const & in ) std::u32string applySimpleCaseOnly( std::string const & in )
{ {
return applySimpleCaseOnly( Utf8::decode( in ) ); return applySimpleCaseOnly( Text::toUtf32( in ) );
// return QString::fromStdString( in ).toCaseFolded().toStdU32String(); // return QString::fromStdString( in ).toCaseFolded().toStdU32String();
} }
wstring applyFullCaseOnly( wstring const & in ) std::u32string applyFullCaseOnly( std::u32string const & in )
{ {
wstring caseFolded; std::u32string caseFolded;
caseFolded.reserve( in.size() * foldCaseMaxOut ); caseFolded.reserve( in.size() * foldCaseMaxOut );
wchar const * nextChar = in.data(); char32_t const * nextChar = in.data();
wchar buf[ foldCaseMaxOut ]; char32_t buf[ foldCaseMaxOut ];
for ( size_t left = in.size(); left--; ) { for ( size_t left = in.size(); left--; ) {
caseFolded.append( buf, foldCase( *nextChar++, buf ) ); caseFolded.append( buf, foldCase( *nextChar++, buf ) );
@ -84,17 +84,17 @@ wstring applyFullCaseOnly( wstring const & in )
return caseFolded; return caseFolded;
} }
wstring applyDiacriticsOnly( wstring const & in ) std::u32string applyDiacriticsOnly( std::u32string const & in )
{ {
auto noAccent = QString::fromStdU32String( in ).normalized( QString::NormalizationForm_KD ).remove( RX::accentMark ); auto noAccent = QString::fromStdU32String( in ).normalized( QString::NormalizationForm_KD ).remove( RX::accentMark );
return noAccent.toStdU32String(); return noAccent.toStdU32String();
} }
wstring applyPunctOnly( wstring const & in ) std::u32string applyPunctOnly( std::u32string const & in )
{ {
wchar const * nextChar = in.data(); char32_t const * nextChar = in.data();
wstring out; std::u32string out;
out.reserve( in.size() ); out.reserve( in.size() );
@ -119,11 +119,11 @@ QString applyPunctOnly( QString const & in )
return out; return out;
} }
wstring applyWhitespaceOnly( wstring const & in ) std::u32string applyWhitespaceOnly( std::u32string const & in )
{ {
wchar const * nextChar = in.data(); char32_t const * nextChar = in.data();
wstring out; std::u32string out;
out.reserve( in.size() ); out.reserve( in.size() );
@ -136,11 +136,11 @@ wstring applyWhitespaceOnly( wstring const & in )
return out; return out;
} }
wstring applyWhitespaceAndPunctOnly( wstring const & in ) std::u32string applyWhitespaceAndPunctOnly( std::u32string const & in )
{ {
wchar const * nextChar = in.data(); char32_t const * nextChar = in.data();
wstring out; std::u32string out;
out.reserve( in.size() ); out.reserve( in.size() );
@ -153,26 +153,26 @@ wstring applyWhitespaceAndPunctOnly( wstring const & in )
return out; return out;
} }
bool isWhitespace( wchar ch ) bool isWhitespace( char32_t ch )
{ {
//invisible character should be treated as whitespace as well. //invisible character should be treated as whitespace as well.
return QChar::isSpace( ch ) || !QChar::isPrint( ch ); return QChar::isSpace( ch ) || !QChar::isPrint( ch );
} }
bool isWhitespaceOrPunct( wchar ch ) bool isWhitespaceOrPunct( char32_t ch )
{ {
return isWhitespace( ch ) || QChar::isPunct( ch ); return isWhitespace( ch ) || QChar::isPunct( ch );
} }
bool isPunct( wchar ch ) bool isPunct( char32_t ch )
{ {
return QChar::isPunct( ch ); return QChar::isPunct( ch );
} }
wstring trimWhitespaceOrPunct( wstring const & in ) std::u32string trimWhitespaceOrPunct( std::u32string const & in )
{ {
wchar const * wordBegin = in.c_str(); char32_t const * wordBegin = in.c_str();
wstring::size_type wordSize = in.size(); std::u32string::size_type wordSize = in.size();
// Skip any leading whitespace // Skip any leading whitespace
while ( *wordBegin && Folding::isWhitespaceOrPunct( *wordBegin ) ) { while ( *wordBegin && Folding::isWhitespaceOrPunct( *wordBegin ) ) {
@ -185,7 +185,7 @@ wstring trimWhitespaceOrPunct( wstring const & in )
--wordSize; --wordSize;
} }
return wstring( wordBegin, wordSize ); return std::u32string( wordBegin, wordSize );
} }
QString trimWhitespaceOrPunct( QString const & in ) QString trimWhitespaceOrPunct( QString const & in )
@ -209,13 +209,13 @@ QString trimWhitespaceOrPunct( QString const & in )
return in.mid( wordBegin, wordSize ); return in.mid( wordBegin, wordSize );
} }
wstring trimWhitespace( wstring const & in ) std::u32string trimWhitespace( std::u32string const & in )
{ {
if ( in.empty() ) { if ( in.empty() ) {
return in; return in;
} }
wchar const * wordBegin = in.c_str(); char32_t const * wordBegin = in.c_str();
wstring::size_type wordSize = in.size(); std::u32string::size_type wordSize = in.size();
// Skip any leading whitespace // Skip any leading whitespace
while ( *wordBegin && Folding::isWhitespace( *wordBegin ) ) { while ( *wordBegin && Folding::isWhitespace( *wordBegin ) ) {
@ -228,7 +228,7 @@ wstring trimWhitespace( wstring const & in )
--wordSize; --wordSize;
} }
return wstring( wordBegin, wordSize ); return std::u32string( wordBegin, wordSize );
} }
QString trimWhitespace( QString const & in ) QString trimWhitespace( QString const & in )

View file

@ -3,7 +3,7 @@
#pragma once #pragma once
#include "wstring.hh" #include "text.hh"
#include <QString> #include <QString>
/// Folding provides means to translate several possible ways to write a /// Folding provides means to translate several possible ways to write a
@ -17,8 +17,6 @@
namespace Folding { namespace Folding {
using gd::wstring;
using gd::wchar;
/// The algorithm's version. /// The algorithm's version.
enum { enum {
@ -27,48 +25,48 @@ enum {
/// Applies the folding algorithm to each character in the given string, /// Applies the folding algorithm to each character in the given string,
/// making another one as a result. /// making another one as a result.
wstring apply( wstring const &, bool preserveWildcards = false ); std::u32string apply( std::u32string const &, bool preserveWildcards = false );
/// Applies only simple case folding algorithm. Since many dictionaries have /// Applies only simple case folding algorithm. Since many dictionaries have
/// different case style, we interpret words differing only by case as synonyms. /// different case style, we interpret words differing only by case as synonyms.
wstring applySimpleCaseOnly( wstring const & ); std::u32string applySimpleCaseOnly( std::u32string const & );
wstring applySimpleCaseOnly( QString const & in ); std::u32string applySimpleCaseOnly( QString const & in );
wstring applySimpleCaseOnly( std::string const & in ); std::u32string applySimpleCaseOnly( std::string const & in );
/// Applies only full case folding algorithm. This includes simple case, but also /// Applies only full case folding algorithm. This includes simple case, but also
/// decomposing ligatures and complex letters. /// decomposing ligatures and complex letters.
wstring applyFullCaseOnly( wstring const & ); std::u32string applyFullCaseOnly( std::u32string const & );
/// Applies only diacritics folding algorithm. /// Applies only diacritics folding algorithm.
wstring applyDiacriticsOnly( wstring const & ); std::u32string applyDiacriticsOnly( std::u32string const & );
/// Applies only punctuation folding algorithm. /// Applies only punctuation folding algorithm.
wstring applyPunctOnly( wstring const & ); std::u32string applyPunctOnly( std::u32string const & );
QString applyPunctOnly( QString const & in ); QString applyPunctOnly( QString const & in );
/// Applies only whitespace folding algorithm. /// Applies only whitespace folding algorithm.
wstring applyWhitespaceOnly( wstring const & ); std::u32string applyWhitespaceOnly( std::u32string const & );
/// Applies only whitespace&punctuation folding algorithm. /// Applies only whitespace&punctuation folding algorithm.
wstring applyWhitespaceAndPunctOnly( wstring const & ); std::u32string applyWhitespaceAndPunctOnly( std::u32string const & );
/// Returns true if the given character is any form of whitespace, false /// Returns true if the given character is any form of whitespace, false
/// otherwise. Whitespace corresponds to Zl/Zp/Zs Unicode classes, and also /// otherwise. Whitespace corresponds to Zl/Zp/Zs Unicode classes, and also
/// includes \n, \r and \t. /// includes \n, \r and \t.
bool isWhitespace( wchar ch ); bool isWhitespace( char32_t ch );
bool isWhitespaceOrPunct( wchar ch ); bool isWhitespaceOrPunct( char32_t ch );
/// Returns true if the given character is any form of punctuation, false /// Returns true if the given character is any form of punctuation, false
/// otherwise. Punctuation corresponds to Pc/Pd/Pe/Pf/Pi/Po/Ps classes. /// otherwise. Punctuation corresponds to Pc/Pd/Pe/Pf/Pi/Po/Ps classes.
bool isPunct( wchar ch ); bool isPunct( char32_t ch );
/// Removes any whitespace or punctuation from the beginning and the end of /// Removes any whitespace or punctuation from the beginning and the end of
/// the word. /// the word.
wstring trimWhitespaceOrPunct( wstring const & ); std::u32string trimWhitespaceOrPunct( std::u32string const & );
QString trimWhitespaceOrPunct( QString const & in ); QString trimWhitespaceOrPunct( QString const & in );
/// Removes any whitespace from the beginning and the end of /// Removes any whitespace from the beginning and the end of
/// the word. /// the word.
wstring trimWhitespace( wstring const & ); std::u32string trimWhitespace( std::u32string const & );
QString trimWhitespace( QString const & in ); QString trimWhitespace( QString const & in );
/// Same as apply( wstring ), but without any heap operations, therefore /// Same as apply( wstring ), but without any heap operations, therefore
@ -86,6 +84,6 @@ QString unescapeWildcardSymbols( QString const & );
QString escapeWildcardSymbols( QString const & ); QString escapeWildcardSymbols( QString const & );
/// Tests if the given char is one of the Unicode combining marks. /// Tests if the given char is one of the Unicode combining marks.
bool isCombiningMark( wchar ch ); bool isCombiningMark( char32_t ch );
} // namespace Folding } // namespace Folding

View file

@ -5,7 +5,6 @@
#include <vector> #include <vector>
#include <errno.h> #include <errno.h>
#include <string.h> #include <string.h>
#include "wstring_qt.hh"
char const * const Iconv::GdWchar = "UTF-32LE"; char const * const Iconv::GdWchar = "UTF-32LE";
char const * const Iconv::Utf16Le = "UTF-16LE"; char const * const Iconv::Utf16Le = "UTF-16LE";
@ -80,7 +79,7 @@ QString Iconv::convert( void const *& inBuf, size_t & inBytesLeft )
return QString::fromUtf8( &outBuf.front(), datasize ); return QString::fromUtf8( &outBuf.front(), datasize );
} }
gd::wstring Iconv::toWstring( char const * fromEncoding, void const * fromData, size_t dataSize ) std::u32string Iconv::toWstring( char const * fromEncoding, void const * fromData, size_t dataSize )
{ {
/// Special-case the dataSize == 0 to avoid any kind of iconv-specific /// Special-case the dataSize == 0 to avoid any kind of iconv-specific

View file

@ -5,7 +5,7 @@
#include <QString> #include <QString>
#include "wstring.hh" #include "text.hh"
#include "ex.hh" #include "ex.hh"
#include <iconv.h> #include <iconv.h>
@ -35,7 +35,7 @@ public:
QString convert( void const *& inBuf, size_t & inBytesLeft ); QString convert( void const *& inBuf, size_t & inBytesLeft );
// Converts a given block of data from the given encoding to a wide string. // Converts a given block of data from the given encoding to a wide string.
static gd::wstring toWstring( char const * fromEncoding, void const * fromData, size_t dataSize ); static std::u32string toWstring( char const * fromEncoding, void const * fromData, size_t dataSize );
// Converts a given block of data from the given encoding to an utf8-encoded // Converts a given block of data from the given encoding to an utf8-encoded
// string. // string.

View file

@ -1,15 +1,21 @@
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org> /* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "utf8.hh" #include "text.hh"
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>
#include <QByteArray> #include <QByteArray>
#include <QString> #include <QString>
#include <QList>
namespace Utf8 { namespace Text {
size_t encode( wchar const * in, size_t inSize, char * out_ )
/// Encodes the given UTF-32 into UTF-8. The inSize specifies the number
/// of wide characters the 'in' pointer points to. The 'out' buffer must be
/// at least inSize * 4 bytes long. The function returns the number of chars
/// stored in the 'out' buffer. The result is not 0-terminated.
size_t encode( char32_t const * in, size_t inSize, char * out_ )
{ {
unsigned char * out = (unsigned char *)out_; unsigned char * out = (unsigned char *)out_;
@ -37,13 +43,18 @@ size_t encode( wchar const * in, size_t inSize, char * out_ )
return out - (unsigned char *)out_; return out - (unsigned char *)out_;
} }
long decode( char const * in_, size_t inSize, wchar * out_ ) /// Decodes the given UTF-8 into UTF-32. The inSize specifies the number
/// of bytes the 'in' pointer points to. The 'out' buffer must be at least
/// inSize wide characters long. If the given UTF-8 is invalid, the decode
/// function returns -1, otherwise it returns the number of wide characters
/// stored in the 'out' buffer. The result is not 0-terminated.
long decode( char const * in_, size_t inSize, char32_t * out_ )
{ {
unsigned char const * in = (unsigned char const *)in_; unsigned char const * in = (unsigned char const *)in_;
wchar * out = out_; char32_t * out = out_;
while ( inSize-- ) { while ( inSize-- ) {
wchar result; char32_t result;
if ( *in & 0x80 ) { if ( *in & 0x80 ) {
if ( *in & 0x40 ) { if ( *in & 0x40 ) {
@ -61,22 +72,22 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
inSize -= 3; inSize -= 3;
result = ( (wchar)*in++ & 7 ) << 18; result = ( (char32_t)*in++ & 7 ) << 18;
if ( ( *in & 0xC0 ) != 0x80 ) { if ( ( *in & 0xC0 ) != 0x80 ) {
return -1; return -1;
} }
result |= ( (wchar)*in++ & 0x3F ) << 12; result |= ( (char32_t)*in++ & 0x3F ) << 12;
if ( ( *in & 0xC0 ) != 0x80 ) { if ( ( *in & 0xC0 ) != 0x80 ) {
return -1; return -1;
} }
result |= ( (wchar)*in++ & 0x3F ) << 6; result |= ( (char32_t)*in++ & 0x3F ) << 6;
if ( ( *in & 0xC0 ) != 0x80 ) { if ( ( *in & 0xC0 ) != 0x80 ) {
return -1; return -1;
} }
result |= (wchar)*in++ & 0x3F; result |= (char32_t)*in++ & 0x3F;
} }
else { else {
// Three-byte sequence // Three-byte sequence
@ -87,17 +98,17 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
inSize -= 2; inSize -= 2;
result = ( (wchar)*in++ & 0xF ) << 12; result = ( (char32_t)*in++ & 0xF ) << 12;
if ( ( *in & 0xC0 ) != 0x80 ) { if ( ( *in & 0xC0 ) != 0x80 ) {
return -1; return -1;
} }
result |= ( (wchar)*in++ & 0x3F ) << 6; result |= ( (char32_t)*in++ & 0x3F ) << 6;
if ( ( *in & 0xC0 ) != 0x80 ) { if ( ( *in & 0xC0 ) != 0x80 ) {
return -1; return -1;
} }
result |= (wchar)*in++ & 0x3F; result |= (char32_t)*in++ & 0x3F;
} }
} }
else { else {
@ -108,12 +119,12 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
--inSize; --inSize;
result = ( (wchar)*in++ & 0x1F ) << 6; result = ( (char32_t)*in++ & 0x1F ) << 6;
if ( ( *in & 0xC0 ) != 0x80 ) { if ( ( *in & 0xC0 ) != 0x80 ) {
return -1; return -1;
} }
result |= (wchar)*in++ & 0x3F; result |= (char32_t)*in++ & 0x3F;
} }
} }
else { else {
@ -132,7 +143,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
return out - out_; return out - out_;
} }
string encode( wstring const & in ) noexcept std::string toUtf8( std::u32string const & in ) noexcept
{ {
if ( in.empty() ) { if ( in.empty() ) {
return {}; return {};
@ -140,16 +151,16 @@ string encode( wstring const & in ) noexcept
std::vector< char > buffer( in.size() * 4 ); std::vector< char > buffer( in.size() * 4 );
return string( &buffer.front(), encode( in.data(), in.size(), &buffer.front() ) ); return { &buffer.front(), encode( in.data(), in.size(), &buffer.front() ) };
} }
wstring decode( string const & in ) std::u32string toUtf32( std::string const & in )
{ {
if ( in.empty() ) { if ( in.empty() ) {
return {}; return {};
} }
std::vector< wchar > buffer( in.size() ); std::vector< char32_t > buffer( in.size() );
long result = decode( in.data(), in.size(), &buffer.front() ); long result = decode( in.data(), in.size(), &buffer.front() );
@ -157,7 +168,7 @@ wstring decode( string const & in )
throw exCantDecode( in ); throw exCantDecode( in );
} }
return wstring( &buffer.front(), result ); return std::u32string( &buffer.front(), result );
} }
bool isspace( int c ) bool isspace( int c )
@ -247,29 +258,29 @@ LineFeed initLineFeed( const Encoding e )
{ {
LineFeed lf{}; LineFeed lf{};
switch ( e ) { switch ( e ) {
case Utf8::Utf32LE: case Utf32LE:
lf.lineFeed = new char[ 4 ]{ 0x0A, 0, 0, 0 }; lf.lineFeed = new char[ 4 ]{ 0x0A, 0, 0, 0 };
lf.length = 4; lf.length = 4;
break; break;
case Utf8::Utf32BE: case Utf32BE:
lf.lineFeed = new char[ 4 ]{ 0, 0, 0, 0x0A }; lf.lineFeed = new char[ 4 ]{ 0, 0, 0, 0x0A };
lf.length = 4; lf.length = 4;
break; break;
case Utf8::Utf16LE: case Utf16LE:
lf.lineFeed = new char[ 2 ]{ 0x0A, 0 }; lf.lineFeed = new char[ 2 ]{ 0x0A, 0 };
lf.length = 2; lf.length = 2;
break; break;
case Utf8::Utf16BE: case Utf16BE:
lf.lineFeed = new char[ 2 ]{ 0, 0x0A }; lf.lineFeed = new char[ 2 ]{ 0, 0x0A };
lf.length = 2; lf.length = 2;
break; break;
case Utf8::Windows1252: case Windows1252:
case Utf8::Windows1251: case Windows1251:
case Utf8::Utf8: case Utf8:
case Utf8::Windows1250: case Windows1250:
default: default:
lf.length = 1; lf.length = 1;
lf.lineFeed = new char[ 1 ]{ 0x0A }; lf.lineFeed = new char[ 1 ]{ 0x0A };
@ -277,4 +288,36 @@ LineFeed initLineFeed( const Encoding e )
return lf; return lf;
} }
} // namespace Utf8 // When non-BMP characters are converted to a UTF-32 string, the result may end with \0. This function removes the trailing \0 from the string,
// because \0 is significant in the index. It is intended only for index-related operations such as store/query.
/// Returns a copy of `v` with every trailing U+0000 character removed.
/// Zero characters that are followed by a non-zero character are kept.
/// An empty or all-zero input yields an empty string.
std::u32string removeTrailingZero( std::u32string const & v )
{
  // Work in the string's own size_type so the length is never narrowed
  // from size_t to int.
  auto const lastKept = v.find_last_not_of( U'\0' );
  if ( lastKept == std::u32string::npos ) {
    // Nothing but zeros (or nothing at all).
    return {};
  }
  return v.substr( 0, lastKept + 1 );
}
/// Converts `in` to UTF-32 and removes every trailing U+0000 character
/// from the result.
std::u32string removeTrailingZero( QString const & in )
{
  // toStdU32String() carries the same UCS-4 data as toUcs4(), so the
  // intermediate QList, the int-narrowed length and the cast from
  // unsigned int * to char32_t * are not needed; the stripping itself is
  // shared with the std::u32string overload.
  return removeTrailingZero( in.toStdU32String() );
}
}
std::u32string normalize( const std::u32string & str )
{
return QString::fromStdU32String( str ).normalized( QString::NormalizationForm_C ).toStdU32String();
}
} // namespace Text

50
src/common/text.hh Normal file
View file

@ -0,0 +1,50 @@
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#pragma once
#include <cstdio>
#include <QByteArray>
#include <string>
#include "ex.hh"
/// Facilities to process Text, focusing on Unicode:
/// UTF-8 <-> UTF-32 conversion, encoding identifiers, line-feed byte
/// sequences, and small UTF-32 string helpers.
namespace Text {
/// Thrown by toUtf32() when the given bytes cannot be decoded.
DEF_EX_STR( exCantDecode, "Can't decode the given string from Utf8:", std::exception )
// Those are possible encodings for .dsl files
enum Encoding {
Utf16LE,
Utf16BE,
Windows1252,
Windows1251,
Windows1250,
Utf8, // NOTE(review): previously documented as an extension detected solely by the UTF-8 BOM — confirm this still holds
Utf32BE,
Utf32LE,
};
/// Encodes the given UTF-32 string into UTF-8. An empty input gives an
/// empty result.
std::string toUtf8( std::u32string const & ) noexcept;
/// Decodes the given UTF-8 string into UTF-32. An empty input gives an
/// empty result; throws exCantDecode if the input is not valid UTF-8.
std::u32string toUtf32( std::string const & );
/// Since the standard isspace() is locale-specific, we need something
/// that would never mess up our utf8 input. The stock one worked fine under
/// Linux but was messing up strings under Windows.
bool isspace( int c );
// Gets the first line in string s1; returns -1 if not found.
// NOTE(review): the role of s2/s2length is not documented here — presumably
// the line-feed byte sequence to search for; confirm against the definition.
int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length );
/// Returns the name of the given encoding as a C string.
char const * getEncodingNameFor( Encoding e );
/// Returns the Encoding that corresponds to the given name.
Encoding getEncodingForName( const QByteArray & name );
/// The byte sequence that represents a line feed (0x0A) in some encoding.
struct LineFeed
{
int length; // number of bytes pointed to by lineFeed
char * lineFeed; // allocated with new[] by initLineFeed(); this struct does not free it
};
/// Builds the line-feed byte sequence for the given encoding: 4 bytes for
/// the UTF-32 variants, 2 bytes for the UTF-16 variants, 1 byte otherwise.
LineFeed initLineFeed( Encoding e );
/// Returns the string without its trailing \0 characters. Intended for
/// index-related operations (store/query), where \0 is significant.
std::u32string removeTrailingZero( std::u32string const & v );
/// Same as above, but converts the QString to UTF-32 first.
std::u32string removeTrailingZero( QString const & in );
/// Returns the string converted to Unicode normalization form C.
std::u32string normalize( std::u32string const & );
} // namespace Text

View file

@ -1,68 +0,0 @@
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#pragma once
#include <cstdio>
#include <QByteArray>
#include <string>
#include "ex.hh"
#include "wstring.hh"
/// A simple UTF-8 encoder/decoder. Some dictionary backends only require
/// utf8, so we have this separately, removing the iconv dependency for them.
/// Besides, utf8 is quite ubiquitous now, and its use is spreaded over many
/// places.
namespace Utf8 {
// Those are possible encodings for .dsl files
enum Encoding {
Utf16LE,
Utf16BE,
Windows1252,
Windows1251,
Windows1250,
Utf8, // This is an extension. Detected solely by the UTF8 BOM.
Utf32BE,
Utf32LE,
};
using std::string;
using gd::wstring;
using gd::wchar;
DEF_EX_STR( exCantDecode, "Can't decode the given string from Utf8:", std::exception )
/// Encodes the given UCS-4 into UTF-8. The inSize specifies the number
/// of wide characters the 'in' pointer points to. The 'out' buffer must be
/// at least inSize * 4 bytes long. The function returns the number of chars
/// stored in the 'out' buffer. The result is not 0-terminated.
size_t encode( wchar const * in, size_t inSize, char * out );
/// Decodes the given UTF-8 into UCS-32. The inSize specifies the number
/// of bytes the 'in' pointer points to. The 'out' buffer must be at least
/// inSize wide characters long. If the given UTF-8 is invalid, the decode
/// function returns -1, otherwise it returns the number of wide characters
/// stored in the 'out' buffer. The result is not 0-terminated.
long decode( char const * in, size_t inSize, wchar * out );
/// Versions for non time-critical code.
string encode( wstring const & ) noexcept;
wstring decode( string const & );
/// Since the standard isspace() is locale-specific, we need something
/// that would never mess up our utf8 input. The stock one worked fine under
/// Linux but was messing up strings under Windows.
bool isspace( int c );
//get the first line in string s1. -1 if not found
int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length );
char const * getEncodingNameFor( Encoding e );
Encoding getEncodingForName( const QByteArray & name );
struct LineFeed
{
int length;
char * lineFeed;
};
LineFeed initLineFeed( Encoding e );
} // namespace Utf8

View file

@ -1,17 +0,0 @@
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#pragma once
#include <string>
///
/// Aliases for legacy reasons.
///
/// For new code, just use the standardized std::u32string for UTF-32 strings instead.
///
namespace gd {
using wchar = char32_t;
using wstring = std::u32string;
} // namespace gd

View file

@ -1,38 +0,0 @@
#include "wstring_qt.hh"
#include <QList>
namespace gd {
// When convert non-BMP characters to wstring,the ending char maybe \0 .This method remove the tailing \0 from the wstring
// as \0 is sensitive in the index. This method will be only used with index related operations like store/query.
wstring removeTrailingZero( wstring const & v )
{
int n = v.size();
while ( n > 0 && v[ n - 1 ] == 0 ) {
n--;
}
return wstring( v.data(), n );
}
wstring removeTrailingZero( QString const & in )
{
QList< unsigned int > v = in.toUcs4();
int n = v.size();
while ( n > 0 && v[ n - 1 ] == 0 ) {
n--;
}
if ( n != v.size() ) {
v.resize( n );
}
return wstring( (const wchar *)v.constData(), v.size() );
}
wstring normalize( const wstring & str )
{
return QString::fromStdU32String( str ).normalized( QString::NormalizationForm_C ).toStdU32String();
}
} // namespace gd

View file

@ -1,16 +0,0 @@
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#pragma once
/// This file adds conversions between gd::wstring and QString. See wstring.hh
/// for more details on gd::wstring.
#include "wstring.hh"
#include <QString>
namespace gd {
/// Returns a copy of v with all trailing \0 characters removed.
wstring removeTrailingZero( wstring const & v );
/// Converts in to UCS-4 and returns the result with all trailing \0 characters removed.
wstring removeTrailingZero( QString const & in );
/// Returns the given string converted to Unicode Normalization Form C.
wstring normalize( wstring const & );
} // namespace gd

View file

@ -4,7 +4,7 @@
#include "aard.hh" #include "aard.hh"
#include "btreeidx.hh" #include "btreeidx.hh"
#include "folding.hh" #include "folding.hh"
#include "utf8.hh" #include "text.hh"
#include "chunkedstorage.hh" #include "chunkedstorage.hh"
#include "langcoder.hh" #include "langcoder.hh"
#include "decompress.hh" #include "decompress.hh"
@ -29,7 +29,6 @@ using std::multimap;
using std::pair; using std::pair;
using std::set; using std::set;
using std::string; using std::string;
using gd::wstring;
using BtreeIndexing::WordArticleLink; using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords; using BtreeIndexing::IndexedWords;
@ -236,8 +235,10 @@ public:
return idxHeader.langTo; return idxHeader.langTo;
} }
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest > getArticle( std::u32string const &,
getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; vector< std::u32string > const & alts,
std::u32string const &,
bool ignoreDiacritics ) override;
QString const & getDescription() override; QString const & getDescription() override;
@ -601,8 +602,8 @@ AardDictionary::getSearchResults( QString const & searchString, int searchMode,
class AardArticleRequest: public Dictionary::DataRequest class AardArticleRequest: public Dictionary::DataRequest
{ {
wstring word; std::u32string word;
vector< wstring > alts; vector< std::u32string > alts;
AardDictionary & dict; AardDictionary & dict;
bool ignoreDiacritics; bool ignoreDiacritics;
@ -611,8 +612,8 @@ class AardArticleRequest: public Dictionary::DataRequest
public: public:
AardArticleRequest( wstring const & word_, AardArticleRequest( std::u32string const & word_,
vector< wstring > const & alts_, vector< std::u32string > const & alts_,
AardDictionary & dict_, AardDictionary & dict_,
bool ignoreDiacritics_ ): bool ignoreDiacritics_ ):
word( word_ ), word( word_ ),
@ -656,13 +657,13 @@ void AardArticleRequest::run()
chain.insert( chain.end(), altChain.begin(), altChain.end() ); chain.insert( chain.end(), altChain.begin(), altChain.end() );
} }
multimap< wstring, pair< string, string > > mainArticles, alternateArticles; multimap< std::u32string, pair< string, string > > mainArticles, alternateArticles;
set< quint32 > articlesIncluded; // Some synonims make it that the articles set< quint32 > articlesIncluded; // Some synonims make it that the articles
// appear several times. We combat this // appear several times. We combat this
// by only allowing them to appear once. // by only allowing them to appear once.
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
} }
@ -693,12 +694,12 @@ void AardArticleRequest::run()
// We do the case-folded comparison here. // We do the case-folded comparison here.
wstring headwordStripped = Folding::applySimpleCaseOnly( headword ); std::u32string headwordStripped = Folding::applySimpleCaseOnly( headword );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
} }
multimap< wstring, pair< string, string > > & mapToUse = multimap< std::u32string, pair< string, string > > & mapToUse =
( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles;
mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) ); mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) );
@ -714,7 +715,7 @@ void AardArticleRequest::run()
string result; string result;
multimap< wstring, pair< string, string > >::const_iterator i; multimap< std::u32string, pair< string, string > >::const_iterator i;
for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) {
result += "<h3>"; result += "<h3>";
@ -737,9 +738,9 @@ void AardArticleRequest::run()
finish(); finish();
} }
sptr< Dictionary::DataRequest > AardDictionary::getArticle( wstring const & word, sptr< Dictionary::DataRequest > AardDictionary::getArticle( std::u32string const & word,
vector< wstring > const & alts, vector< std::u32string > const & alts,
wstring const &, std::u32string const &,
bool ignoreDiacritics ) bool ignoreDiacritics )
{ {
@ -915,7 +916,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
} }
// Insert new entry // Insert new entry
wstring word = Utf8::decode( string( data.data(), wordSize ) ); std::u32string word = Text::toUtf32( string( data.data(), wordSize ) );
if ( maxHeadwordsToExpand && dictHeader.wordsCount >= maxHeadwordsToExpand ) { if ( maxHeadwordsToExpand && dictHeader.wordsCount >= maxHeadwordsToExpand ) {
indexedWords.addSingleWord( word, articleOffset ); indexedWords.addSingleWord( word, articleOffset );
} }

View file

@ -11,7 +11,7 @@
#include "htmlescape.hh" #include "htmlescape.hh"
#include "langcoder.hh" #include "langcoder.hh"
#include "language.hh" #include "language.hh"
#include "utf8.hh" #include "text.hh"
#include "utils.hh" #include "utils.hh"
#include <ctype.h> #include <ctype.h>
#include <list> #include <list>
@ -30,8 +30,6 @@ namespace Bgl {
using std::map; using std::map;
using std::multimap; using std::multimap;
using std::set; using std::set;
using gd::wstring;
using gd::wchar;
using std::list; using std::list;
using std::pair; using std::pair;
using std::string; using std::string;
@ -111,7 +109,7 @@ void trimWs( string & word )
if ( word.size() ) { if ( word.size() ) {
unsigned begin = 0; unsigned begin = 0;
while ( begin < word.size() && Utf8::isspace( word[ begin ] ) ) { while ( begin < word.size() && Text::isspace( word[ begin ] ) ) {
++begin; ++begin;
} }
@ -123,7 +121,7 @@ void trimWs( string & word )
// Doesn't consist of ws entirely, so must end with just isspace() // Doesn't consist of ws entirely, so must end with just isspace()
// condition. // condition.
while ( Utf8::isspace( word[ end - 1 ] ) ) { while ( Text::isspace( word[ end - 1 ] ) ) {
--end; --end;
} }
@ -137,7 +135,7 @@ void trimWs( string & word )
void addEntryToIndex( string & word, void addEntryToIndex( string & word,
uint32_t articleOffset, uint32_t articleOffset,
IndexedWords & indexedWords, IndexedWords & indexedWords,
vector< wchar > & wcharBuffer ) vector< char32_t > & wcharBuffer )
{ {
// Strip any leading or trailing whitespaces // Strip any leading or trailing whitespaces
trimWs( word ); trimWs( word );
@ -159,7 +157,7 @@ void addEntryToIndex( string & word,
} }
// Convert the word from utf8 to wide chars // Convert the word from utf8 to wide chars
indexedWords.addWord( Utf8::decode( word ), articleOffset ); indexedWords.addWord( Text::toUtf32( word ), articleOffset );
} }
class BglDictionary: public BtreeIndexing::BtreeDictionary class BglDictionary: public BtreeIndexing::BtreeDictionary
@ -193,10 +191,12 @@ public:
return idxHeader.langTo; return idxHeader.langTo;
} }
sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & ) override; sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( std::u32string const & ) override;
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest > getArticle( std::u32string const &,
getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; vector< std::u32string > const & alts,
std::u32string const &,
bool ignoreDiacritics ) override;
sptr< Dictionary::DataRequest > getResource( string const & name ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override;
@ -387,7 +387,7 @@ void BglDictionary::getArticleText( uint32_t articleAddress, QString & headword,
headword = QString::fromUtf8( headwordStr.data(), headwordStr.size() ); headword = QString::fromUtf8( headwordStr.data(), headwordStr.size() );
wstring wstr = Utf8::decode( articleStr ); std::u32string wstr = Text::toUtf32( articleStr );
if ( getLangTo() == LangCoder::code2toInt( "he" ) ) { if ( getLangTo() == LangCoder::code2toInt( "he" ) ) {
for ( char32_t & i : wstr ) { for ( char32_t & i : wstr ) {
@ -436,7 +436,7 @@ void BglDictionary::makeFTSIndex( QAtomicInt & isCancelled )
class BglHeadwordsRequest: public Dictionary::WordSearchRequest class BglHeadwordsRequest: public Dictionary::WordSearchRequest
{ {
wstring str; std::u32string str;
BglDictionary & dict; BglDictionary & dict;
QAtomicInt isCancelled; QAtomicInt isCancelled;
@ -444,7 +444,7 @@ class BglHeadwordsRequest: public Dictionary::WordSearchRequest
public: public:
BglHeadwordsRequest( wstring const & word_, BglDictionary & dict_ ): BglHeadwordsRequest( std::u32string const & word_, BglDictionary & dict_ ):
str( word_ ), str( word_ ),
dict( dict_ ) dict( dict_ )
{ {
@ -476,7 +476,7 @@ void BglHeadwordsRequest::run()
vector< WordArticleLink > chain = dict.findArticles( str ); vector< WordArticleLink > chain = dict.findArticles( str );
wstring caseFolded = Folding::applySimpleCaseOnly( str ); std::u32string caseFolded = Folding::applySimpleCaseOnly( str );
for ( auto & x : chain ) { for ( auto & x : chain ) {
if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) { if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) {
@ -488,11 +488,11 @@ void BglHeadwordsRequest::run()
dict.loadArticle( x.articleOffset, headword, displayedHeadword, articleText ); dict.loadArticle( x.articleOffset, headword, displayedHeadword, articleText );
wstring headwordDecoded; std::u32string headwordDecoded;
try { try {
headwordDecoded = Utf8::decode( removePostfix( headword ) ); headwordDecoded = Text::toUtf32( removePostfix( headword ) );
} }
catch ( Utf8::exCantDecode & ) { catch ( Text::exCantDecode & ) {
} }
if ( caseFolded != Folding::applySimpleCaseOnly( headwordDecoded ) && !headwordDecoded.empty() ) { if ( caseFolded != Folding::applySimpleCaseOnly( headwordDecoded ) && !headwordDecoded.empty() ) {
@ -507,7 +507,7 @@ void BglHeadwordsRequest::run()
finish(); finish();
} }
sptr< Dictionary::WordSearchRequest > BglDictionary::findHeadwordsForSynonym( wstring const & word ) sptr< Dictionary::WordSearchRequest > BglDictionary::findHeadwordsForSynonym( std::u32string const & word )
{ {
return synonymSearchEnabled ? std::make_shared< BglHeadwordsRequest >( word, *this ) : return synonymSearchEnabled ? std::make_shared< BglHeadwordsRequest >( word, *this ) :
@ -547,8 +547,8 @@ string postfixToSuperscript( string const & in )
class BglArticleRequest: public Dictionary::DataRequest class BglArticleRequest: public Dictionary::DataRequest
{ {
wstring word; std::u32string word;
vector< wstring > alts; vector< std::u32string > alts;
BglDictionary & dict; BglDictionary & dict;
QAtomicInt isCancelled; QAtomicInt isCancelled;
@ -557,8 +557,8 @@ class BglArticleRequest: public Dictionary::DataRequest
public: public:
BglArticleRequest( wstring const & word_, BglArticleRequest( std::u32string const & word_,
vector< wstring > const & alts_, vector< std::u32string > const & alts_,
BglDictionary & dict_, BglDictionary & dict_,
bool ignoreDiacritics_ ): bool ignoreDiacritics_ ):
word( word_ ), word( word_ ),
@ -590,11 +590,11 @@ public:
void BglArticleRequest::fixHebString( string & hebStr ) // Hebrew support - convert non-unicode to unicode void BglArticleRequest::fixHebString( string & hebStr ) // Hebrew support - convert non-unicode to unicode
{ {
wstring hebWStr; std::u32string hebWStr;
try { try {
hebWStr = Utf8::decode( hebStr ); hebWStr = Text::toUtf32( hebStr );
} }
catch ( Utf8::exCantDecode & ) { catch ( Text::exCantDecode & ) {
hebStr = "Utf-8 decoding error"; hebStr = "Utf-8 decoding error";
return; return;
} }
@ -608,7 +608,7 @@ void BglArticleRequest::fixHebString( string & hebStr ) // Hebrew support - conv
i += 1488 - 224; // Convert to Hebrew unicode i += 1488 - 224; // Convert to Hebrew unicode
} }
} }
hebStr = Utf8::encode( hebWStr ); hebStr = Text::toUtf8( hebWStr );
} }
void BglArticleRequest::fixHebArticle( string & hebArticle ) // Hebrew support - remove extra chars at the end void BglArticleRequest::fixHebArticle( string & hebArticle ) // Hebrew support - remove extra chars at the end
@ -644,7 +644,7 @@ void BglArticleRequest::run()
chain.insert( chain.end(), altChain.begin(), altChain.end() ); chain.insert( chain.end(), altChain.begin(), altChain.end() );
} }
multimap< wstring, pair< string, string > > mainArticles, alternateArticles; multimap< std::u32string, pair< string, string > > mainArticles, alternateArticles;
set< uint32_t > articlesIncluded; // Some synonims make it that the articles set< uint32_t > articlesIncluded; // Some synonims make it that the articles
// appear several times. We combat this // appear several times. We combat this
@ -653,7 +653,7 @@ void BglArticleRequest::run()
// the bodies to account for this. // the bodies to account for this.
set< QByteArray > articleBodiesIncluded; set< QByteArray > articleBodiesIncluded;
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
} }
@ -681,7 +681,7 @@ void BglArticleRequest::run()
// We do the case-folded and postfix-less comparison here. // We do the case-folded and postfix-less comparison here.
wstring headwordStripped = Folding::applySimpleCaseOnly( removePostfix( headword ) ); std::u32string headwordStripped = Folding::applySimpleCaseOnly( removePostfix( headword ) );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
} }
@ -704,7 +704,7 @@ void BglArticleRequest::run()
continue; // Already had this body continue; // Already had this body
} }
multimap< wstring, pair< string, string > > & mapToUse = multimap< std::u32string, pair< string, string > > & mapToUse =
( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles;
mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( targetHeadword, articleText ) ) ); mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( targetHeadword, articleText ) ) );
@ -725,7 +725,7 @@ void BglArticleRequest::run()
string result; string result;
multimap< wstring, pair< string, string > >::const_iterator i; multimap< std::u32string, pair< string, string > >::const_iterator i;
string cleaner = Utils::Html::getHtmlCleaner(); string cleaner = Utils::Html::getHtmlCleaner();
for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) {
@ -802,9 +802,9 @@ void BglArticleRequest::run()
finish(); finish();
} }
sptr< Dictionary::DataRequest > BglDictionary::getArticle( wstring const & word, sptr< Dictionary::DataRequest > BglDictionary::getArticle( std::u32string const & word,
vector< wstring > const & alts, vector< std::u32string > const & alts,
wstring const &, std::u32string const &,
bool ignoreDiacritics ) bool ignoreDiacritics )
{ {
@ -1085,7 +1085,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
IndexedWords indexedWords; IndexedWords indexedWords;
// We use this buffer to decode utf8 into it. // We use this buffer to decode utf8 into it.
vector< wchar > wcharBuffer; vector< char32_t > wcharBuffer;
ChunkedStorage::Writer chunks( idx ); ChunkedStorage::Writer chunks( idx );

View file

@ -3,11 +3,10 @@
#include "btreeidx.hh" #include "btreeidx.hh"
#include "folding.hh" #include "folding.hh"
#include "utf8.hh" #include "text.hh"
#include <math.h> #include <math.h>
#include <string.h> #include <string.h>
#include <stdlib.h> #include <stdlib.h>
#include "wstring_qt.hh"
#include "utils.hh" #include "utils.hh"
#include <QRegularExpression> #include <QRegularExpression>
@ -19,8 +18,6 @@
namespace BtreeIndexing { namespace BtreeIndexing {
using gd::wstring;
using gd::wchar;
using std::pair; using std::pair;
enum { enum {
@ -59,14 +56,14 @@ void BtreeIndex::openIndex( IndexInfo const & indexInfo, File::Index & file, QMu
} }
vector< WordArticleLink > vector< WordArticleLink >
BtreeIndex::findArticles( wstring const & search_word, bool ignoreDiacritics, uint32_t maxMatchCount ) BtreeIndex::findArticles( std::u32string const & search_word, bool ignoreDiacritics, uint32_t maxMatchCount )
{ {
//First trim ending zero //First trim ending zero
wstring word = gd::removeTrailingZero( search_word ); std::u32string word = Text::removeTrailingZero( search_word );
vector< WordArticleLink > result; vector< WordArticleLink > result;
try { try {
wstring folded = Folding::apply( word ); std::u32string folded = Folding::apply( word );
if ( folded.empty() ) { if ( folded.empty() ) {
folded = Folding::applyWhitespaceOnly( word ); folded = Folding::applyWhitespaceOnly( word );
} }
@ -100,7 +97,7 @@ BtreeIndex::findArticles( wstring const & search_word, bool ignoreDiacritics, ui
BtreeWordSearchRequest::BtreeWordSearchRequest( BtreeDictionary & dict_, BtreeWordSearchRequest::BtreeWordSearchRequest( BtreeDictionary & dict_,
wstring const & str_, std::u32string const & str_,
unsigned minLength_, unsigned minLength_,
int maxSuffixVariation_, int maxSuffixVariation_,
bool allowMiddleMatches_, bool allowMiddleMatches_,
@ -137,11 +134,11 @@ void BtreeWordSearchRequest::findMatches()
bool useWildcards = false; bool useWildcards = false;
if ( allowMiddleMatches ) { if ( allowMiddleMatches ) {
useWildcards = ( str.find( '*' ) != wstring::npos || str.find( '?' ) != wstring::npos useWildcards = ( str.find( '*' ) != std::u32string::npos || str.find( '?' ) != std::u32string::npos
|| str.find( '[' ) != wstring::npos || str.find( ']' ) != wstring::npos ); || str.find( '[' ) != std::u32string::npos || str.find( ']' ) != std::u32string::npos );
} }
wstring folded = Folding::apply( str ); std::u32string folded = Folding::apply( str );
int minMatchLength = 0; int minMatchLength = 0;
@ -154,7 +151,7 @@ void BtreeWordSearchRequest::findMatches()
regexp.setPatternOptions( QRegularExpression::CaseInsensitiveOption ); regexp.setPatternOptions( QRegularExpression::CaseInsensitiveOption );
bool bNoLetters = folded.empty(); bool bNoLetters = folded.empty();
wstring foldedWithWildcards; std::u32string foldedWithWildcards;
if ( bNoLetters ) { if ( bNoLetters ) {
foldedWithWildcards = Folding::applyWhitespaceOnly( str ); foldedWithWildcards = Folding::applyWhitespaceOnly( str );
@ -268,9 +265,9 @@ void BtreeWordSearchRequest::findMatches()
vector< WordArticleLink > chain = dict.readChain( chainOffset ); vector< WordArticleLink > chain = dict.readChain( chainOffset );
wstring chainHead = Utf8::decode( chain[ 0 ].word ); std::u32string chainHead = Text::toUtf32( chain[ 0 ].word );
wstring resultFolded = Folding::apply( chainHead ); std::u32string resultFolded = Folding::apply( chainHead );
if ( resultFolded.empty() ) { if ( resultFolded.empty() ) {
resultFolded = Folding::applyWhitespaceOnly( chainHead ); resultFolded = Folding::applyWhitespaceOnly( chainHead );
} }
@ -286,9 +283,9 @@ void BtreeWordSearchRequest::findMatches()
break; break;
} }
if ( useWildcards ) { if ( useWildcards ) {
wstring word = Utf8::decode( x.prefix + x.word ); std::u32string word = Text::toUtf32( x.prefix + x.word );
wstring result = Folding::applyDiacriticsOnly( word ); std::u32string result = Folding::applyDiacriticsOnly( word );
if ( result.size() >= (wstring::size_type)minMatchLength ) { if ( result.size() >= (std::u32string::size_type)minMatchLength ) {
QRegularExpressionMatch match = regexp.match( QString::fromStdU32String( result ) ); QRegularExpressionMatch match = regexp.match( QString::fromStdU32String( result ) );
if ( match.hasMatch() && match.capturedStart() == 0 ) { if ( match.hasMatch() && match.capturedStart() == 0 ) {
addMatch( word ); addMatch( word );
@ -298,10 +295,10 @@ void BtreeWordSearchRequest::findMatches()
else { else {
// Skip middle matches, if requested. If suffix variation is specified, // Skip middle matches, if requested. If suffix variation is specified,
// make sure the string isn't larger than requested. // make sure the string isn't larger than requested.
if ( ( allowMiddleMatches || Folding::apply( Utf8::decode( x.prefix ) ).empty() ) if ( ( allowMiddleMatches || Folding::apply( Text::toUtf32( x.prefix ) ).empty() )
&& ( maxSuffixVariation < 0 && ( maxSuffixVariation < 0
|| (int)resultFolded.size() - initialFoldedSize <= maxSuffixVariation ) ) { || (int)resultFolded.size() - initialFoldedSize <= maxSuffixVariation ) ) {
addMatch( Utf8::decode( x.prefix + x.word ) ); addMatch( Text::toUtf32( x.prefix + x.word ) );
} }
} }
if ( matches.size() >= maxResults ) { if ( matches.size() >= maxResults ) {
@ -393,13 +390,14 @@ BtreeWordSearchRequest::~BtreeWordSearchRequest()
f.waitForFinished(); f.waitForFinished();
} }
sptr< Dictionary::WordSearchRequest > BtreeDictionary::prefixMatch( wstring const & str, unsigned long maxResults ) sptr< Dictionary::WordSearchRequest > BtreeDictionary::prefixMatch( std::u32string const & str,
unsigned long maxResults )
{ {
return std::make_shared< BtreeWordSearchRequest >( *this, str, 0, -1, true, maxResults ); return std::make_shared< BtreeWordSearchRequest >( *this, str, 0, -1, true, maxResults );
} }
sptr< Dictionary::WordSearchRequest > BtreeDictionary::stemmedMatch( wstring const & str, sptr< Dictionary::WordSearchRequest > BtreeDictionary::stemmedMatch( std::u32string const & str,
unsigned minLength, unsigned minLength,
unsigned maxSuffixVariation, unsigned maxSuffixVariation,
unsigned long maxResults ) unsigned long maxResults )
@ -437,8 +435,11 @@ void BtreeIndex::readNode( uint32_t offset, vector< char > & out )
} }
} }
char const * BtreeIndex::findChainOffsetExactOrPrefix( char const * BtreeIndex::findChainOffsetExactOrPrefix( std::u32string const & target,
wstring const & target, bool & exactMatch, vector< char > & extLeaf, uint32_t & nextLeaf, char const *& leafEnd ) bool & exactMatch,
vector< char > & extLeaf,
uint32_t & nextLeaf,
char const *& leafEnd )
{ {
if ( !idxFile ) { if ( !idxFile ) {
throw exIndexWasNotOpened(); throw exIndexWasNotOpened();
@ -449,7 +450,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix(
// Lookup the index by traversing the index btree // Lookup the index by traversing the index btree
// vector< wchar > wcharBuffer; // vector< wchar > wcharBuffer;
wstring w_word; std::u32string w_word;
exactMatch = false; exactMatch = false;
// Read a node // Read a node
@ -530,7 +531,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix(
size_t wordSize = strlen( closestString ); size_t wordSize = strlen( closestString );
w_word = Utf8::decode( string( closestString, wordSize ) ); w_word = Text::toUtf32( string( closestString, wordSize ) );
compareResult = target.compare( w_word ); compareResult = target.compare( w_word );
@ -649,9 +650,9 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix(
size_t wordSize = strlen( ptr ); size_t wordSize = strlen( ptr );
w_word = Utf8::decode( string( ptr, wordSize ) ); w_word = Text::toUtf32( string( ptr, wordSize ) );
wstring foldedWord = Folding::apply( w_word ); std::u32string foldedWord = Folding::apply( w_word );
if ( foldedWord.empty() ) { if ( foldedWord.empty() ) {
foldedWord = Folding::applyWhitespaceOnly( w_word ); foldedWord = Folding::applyWhitespaceOnly( w_word );
} }
@ -750,9 +751,9 @@ vector< WordArticleLink > BtreeIndex::readChain( char const *& ptr, uint32_t max
return result; return result;
} }
void BtreeIndex::antialias( wstring const & str, vector< WordArticleLink > & chain, bool ignoreDiacritics ) void BtreeIndex::antialias( std::u32string const & str, vector< WordArticleLink > & chain, bool ignoreDiacritics )
{ {
wstring caseFolded = Folding::applySimpleCaseOnly( gd::normalize( str ) ); std::u32string caseFolded = Folding::applySimpleCaseOnly( Text::normalize( str ) );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
caseFolded = Folding::applyDiacriticsOnly( caseFolded ); caseFolded = Folding::applyDiacriticsOnly( caseFolded );
} }
@ -764,8 +765,8 @@ void BtreeIndex::antialias( wstring const & str, vector< WordArticleLink > & cha
for ( unsigned x = chain.size(); x--; ) { for ( unsigned x = chain.size(); x--; ) {
// If after applying case folding to each word they wouldn't match, we // If after applying case folding to each word they wouldn't match, we
// drop the entry. // drop the entry.
wstring entry = std::u32string entry =
Folding::applySimpleCaseOnly( gd::normalize( Utf8::decode( chain[ x ].prefix + chain[ x ].word ) ) ); Folding::applySimpleCaseOnly( Text::normalize( Text::toUtf32( chain[ x ].prefix + chain[ x ].word ) ) );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
entry = Folding::applyDiacriticsOnly( entry ); entry = Folding::applyDiacriticsOnly( entry );
} }
@ -923,9 +924,9 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex,
return offset; return offset;
} }
void IndexedWords::addWord( wstring const & index_word, uint32_t articleOffset, unsigned int maxHeadwordSize ) void IndexedWords::addWord( std::u32string const & index_word, uint32_t articleOffset, unsigned int maxHeadwordSize )
{ {
wstring word = gd::removeTrailingZero( index_word ); std::u32string word = Text::removeTrailingZero( index_word );
string::size_type wordSize = word.size(); string::size_type wordSize = word.size();
// Safeguard us against various bugs here. Don't attempt adding words // Safeguard us against various bugs here. Don't attempt adding words
@ -945,7 +946,7 @@ void IndexedWords::addWord( wstring const & index_word, uint32_t articleOffset,
wordSize = word.size(); wordSize = word.size();
} }
wchar const * wordBegin = word.c_str(); char32_t const * wordBegin = word.c_str();
// Skip any leading whitespace // Skip any leading whitespace
while ( *wordBegin && Folding::isWhitespace( *wordBegin ) ) { while ( *wordBegin && Folding::isWhitespace( *wordBegin ) ) {
@ -958,7 +959,7 @@ void IndexedWords::addWord( wstring const & index_word, uint32_t articleOffset,
--wordSize; --wordSize;
} }
wchar const * nextChar = wordBegin; char32_t const * nextChar = wordBegin;
vector< char > utfBuffer( wordSize * 4 ); vector< char > utfBuffer( wordSize * 4 );
@ -970,11 +971,11 @@ void IndexedWords::addWord( wstring const & index_word, uint32_t articleOffset,
if ( !*nextChar ) // End of string ends everything if ( !*nextChar ) // End of string ends everything
{ {
if ( wordsAdded == 0 ) { if ( wordsAdded == 0 ) {
wstring folded = Folding::applyWhitespaceOnly( wstring( wordBegin, wordSize ) ); std::u32string folded = Folding::applyWhitespaceOnly( std::u32string( wordBegin, wordSize ) );
if ( !folded.empty() ) { if ( !folded.empty() ) {
auto i = insert( { Utf8::encode( folded ), vector< WordArticleLink >() } ).first; auto i = insert( { Text::toUtf8( folded ), vector< WordArticleLink >() } ).first;
string utfWord = Utf8::encode( wstring( wordBegin, wordSize ) ); string utfWord = Text::toUtf8( std::u32string( wordBegin, wordSize ) );
string utfPrefix; string utfPrefix;
i->second.emplace_back( utfWord, articleOffset, utfPrefix ); i->second.emplace_back( utfWord, articleOffset, utfPrefix );
} }
@ -988,15 +989,15 @@ void IndexedWords::addWord( wstring const & index_word, uint32_t articleOffset,
} }
// Insert this word // Insert this word
wstring folded = Folding::apply( nextChar ); std::u32string folded = Folding::apply( nextChar );
auto name = Utf8::encode( folded ); auto name = Text::toUtf8( folded );
auto i = insert( { std::move( name ), vector< WordArticleLink >() } ).first; auto i = insert( { std::move( name ), vector< WordArticleLink >() } ).first;
if ( ( i->second.size() < 1024 ) || ( nextChar == wordBegin ) ) // Don't overpopulate chains with middle matches if ( ( i->second.size() < 1024 ) || ( nextChar == wordBegin ) ) // Don't overpopulate chains with middle matches
{ {
string utfWord = Utf8::encode( wstring( nextChar, wordSize - ( nextChar - wordBegin ) ) ); string utfWord = Text::toUtf8( std::u32string( nextChar, wordSize - ( nextChar - wordBegin ) ) );
string utfPrefix = Utf8::encode( wstring( wordBegin, nextChar - wordBegin ) ); string utfPrefix = Text::toUtf8( std::u32string( wordBegin, nextChar - wordBegin ) );
i->second.emplace_back( std::move( utfWord ), articleOffset, std::move( utfPrefix ) ); i->second.emplace_back( std::move( utfWord ), articleOffset, std::move( utfPrefix ) );
// reduce the vector reallocation. // reduce the vector reallocation.
@ -1020,14 +1021,14 @@ void IndexedWords::addWord( wstring const & index_word, uint32_t articleOffset,
} }
} }
void IndexedWords::addSingleWord( wstring const & index_word, uint32_t articleOffset ) void IndexedWords::addSingleWord( std::u32string const & index_word, uint32_t articleOffset )
{ {
wstring const & word = gd::removeTrailingZero( index_word ); std::u32string const & word = Text::removeTrailingZero( index_word );
wstring folded = Folding::apply( word ); std::u32string folded = Folding::apply( word );
if ( folded.empty() ) { if ( folded.empty() ) {
folded = Folding::applyWhitespaceOnly( word ); folded = Folding::applyWhitespaceOnly( word );
} }
operator[]( Utf8::encode( folded ) ).emplace_back( Utf8::encode( word ), articleOffset ); operator[]( Text::toUtf8( folded ) ).emplace_back( Text::toUtf8( word ), articleOffset );
} }
IndexInfo buildIndex( IndexedWords const & indexedWords, File::Index & file ) IndexInfo buildIndex( IndexedWords const & indexedWords, File::Index & file )

View file

@ -18,7 +18,6 @@
namespace BtreeIndexing { namespace BtreeIndexing {
using std::string; using std::string;
using gd::wstring;
using std::vector; using std::vector;
using std::map; using std::map;
@ -80,7 +79,8 @@ public:
/// Finds articles that match the given string. A case-insensitive search /// Finds articles that match the given string. A case-insensitive search
/// is performed. /// is performed.
vector< WordArticleLink > findArticles( wstring const &, bool ignoreDiacritics = false, uint32_t maxMatchCount = -1 ); vector< WordArticleLink >
findArticles( std::u32string const &, bool ignoreDiacritics = false, uint32_t maxMatchCount = -1 );
/// Find all unique article links in the index /// Find all unique article links in the index
void findAllArticleLinks( QList< WordArticleLink > & articleLinks ); void findAllArticleLinks( QList< WordArticleLink > & articleLinks );
@ -116,8 +116,11 @@ protected:
/// case, the returned pointer wouldn't belong to 'leaf' at all. To that end, /// case, the returned pointer wouldn't belong to 'leaf' at all. To that end,
/// the leafEnd pointer always holds the pointer to the first byte outside /// the leafEnd pointer always holds the pointer to the first byte outside
/// the node data. /// the node data.
char const * findChainOffsetExactOrPrefix( char const * findChainOffsetExactOrPrefix( std::u32string const & target,
wstring const & target, bool & exactMatch, vector< char > & leaf, uint32_t & nextLeaf, char const *& leafEnd ); bool & exactMatch,
vector< char > & leaf,
uint32_t & nextLeaf,
char const *& leafEnd );
/// Reads a node or leaf at the given offset. Just uncompresses its data /// Reads a node or leaf at the given offset. Just uncompresses its data
/// to the given vector and does nothing more. /// to the given vector and does nothing more.
@ -129,7 +132,7 @@ protected:
/// Drops any aliases which arose due to folding. Only case-folded aliases /// Drops any aliases which arose due to folding. Only case-folded aliases
/// are left. /// are left.
void antialias( wstring const &, vector< WordArticleLink > &, bool ignoreDiactitics ); void antialias( std::u32string const &, vector< WordArticleLink > &, bool ignoreDiactitics );
protected: protected:
@ -161,10 +164,10 @@ public:
/// This function does the search using the btree index. Derivatives usually /// This function does the search using the btree index. Derivatives usually
/// need not to implement this function. /// need not to implement this function.
virtual sptr< Dictionary::WordSearchRequest > prefixMatch( wstring const &, unsigned long ); virtual sptr< Dictionary::WordSearchRequest > prefixMatch( std::u32string const &, unsigned long );
virtual sptr< Dictionary::WordSearchRequest > virtual sptr< Dictionary::WordSearchRequest >
stemmedMatch( wstring const &, unsigned minLength, unsigned maxSuffixVariation, unsigned long maxResults ); stemmedMatch( std::u32string const &, unsigned minLength, unsigned maxSuffixVariation, unsigned long maxResults );
virtual bool isLocalDictionary() virtual bool isLocalDictionary()
{ {
@ -210,7 +213,7 @@ class BtreeWordSearchRequest: public Dictionary::WordSearchRequest
{ {
protected: protected:
BtreeDictionary & dict; BtreeDictionary & dict;
wstring str; std::u32string str;
unsigned long maxResults; unsigned long maxResults;
unsigned minLength; unsigned minLength;
int maxSuffixVariation; int maxSuffixVariation;
@ -221,7 +224,7 @@ protected:
public: public:
BtreeWordSearchRequest( BtreeDictionary & dict_, BtreeWordSearchRequest( BtreeDictionary & dict_,
wstring const & str_, std::u32string const & str_,
unsigned minLength_, unsigned minLength_,
int maxSuffixVariation_, int maxSuffixVariation_,
bool allowMiddleMatches_, bool allowMiddleMatches_,
@ -251,11 +254,11 @@ struct IndexedWords: public map< string, vector< WordArticleLink > >
/// Instead of adding to the map directly, use this function. It does folding /// Instead of adding to the map directly, use this function. It does folding
/// itself, and for phrases/sentences it adds additional entries beginning with /// itself, and for phrases/sentences it adds additional entries beginning with
/// each new word. /// each new word.
void addWord( wstring const & word, uint32_t articleOffset, unsigned int maxHeadwordSize = 100U ); void addWord( std::u32string const & word, uint32_t articleOffset, unsigned int maxHeadwordSize = 100U );
/// Differs from addWord() in that it only adds a single entry. We use this /// Differs from addWord() in that it only adds a single entry. We use this
/// for zip's file names. /// for zip's file names.
void addSingleWord( wstring const & word, uint32_t articleOffset ); void addSingleWord( std::u32string const & word, uint32_t articleOffset );
}; };
/// Builds the index, as a compressed btree. Returns IndexInfo. /// Builds the index, as a compressed btree. Returns IndexInfo.

View file

@ -4,7 +4,7 @@
#include "dictdfiles.hh" #include "dictdfiles.hh"
#include "btreeidx.hh" #include "btreeidx.hh"
#include "folding.hh" #include "folding.hh"
#include "utf8.hh" #include "text.hh"
#include "dictzip.hh" #include "dictzip.hh"
#include "htmlescape.hh" #include "htmlescape.hh"
#include "langcoder.hh" #include "langcoder.hh"
@ -29,7 +29,6 @@ using std::multimap;
using std::pair; using std::pair;
using std::set; using std::set;
using std::string; using std::string;
using gd::wstring;
using std::vector; using std::vector;
using std::list; using std::list;
@ -113,8 +112,10 @@ public:
return idxHeader.langTo; return idxHeader.langTo;
} }
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest > getArticle( std::u32string const &,
getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; vector< std::u32string > const & alts,
std::u32string const &,
bool ignoreDiacritics ) override;
QString const & getDescription() override; QString const & getDescription() override;
@ -234,9 +235,9 @@ uint32_t decodeBase64( string const & str )
return number; return number;
} }
sptr< Dictionary::DataRequest > DictdDictionary::getArticle( wstring const & word, sptr< Dictionary::DataRequest > DictdDictionary::getArticle( std::u32string const & word,
vector< wstring > const & alts, vector< std::u32string > const & alts,
wstring const &, std::u32string const &,
bool ignoreDiacritics ) bool ignoreDiacritics )
{ {
@ -251,13 +252,13 @@ sptr< Dictionary::DataRequest > DictdDictionary::getArticle( wstring const & wor
chain.insert( chain.end(), altChain.begin(), altChain.end() ); chain.insert( chain.end(), altChain.begin(), altChain.end() );
} }
multimap< wstring, string > mainArticles, alternateArticles; multimap< std::u32string, string > mainArticles, alternateArticles;
set< uint32_t > articlesIncluded; // Some synonyms make it that the articles set< uint32_t > articlesIncluded; // Some synonyms make it that the articles
// appear several times. We combat this // appear several times. We combat this
// by only allowing them to appear once. // by only allowing them to appear once.
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
} }
@ -377,12 +378,12 @@ sptr< Dictionary::DataRequest > DictdDictionary::getArticle( wstring const & wor
// We do the case-folded comparison here. // We do the case-folded comparison here.
wstring headwordStripped = Folding::applySimpleCaseOnly( x.word ); std::u32string headwordStripped = Folding::applySimpleCaseOnly( x.word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
} }
multimap< wstring, string > & mapToUse = multimap< std::u32string, string > & mapToUse =
( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles;
mapToUse.insert( pair( Folding::applySimpleCaseOnly( x.word ), articleText ) ); mapToUse.insert( pair( Folding::applySimpleCaseOnly( x.word ), articleText ) );
@ -396,7 +397,7 @@ sptr< Dictionary::DataRequest > DictdDictionary::getArticle( wstring const & wor
string result; string result;
multimap< wstring, string >::const_iterator i; multimap< std::u32string, string >::const_iterator i;
for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) {
result += i->second; result += i->second;
@ -422,7 +423,8 @@ QString const & DictdDictionary::getDescription()
return dictionaryDescription; return dictionaryDescription;
} }
sptr< Dictionary::DataRequest > req = getArticle( U"00databaseinfo", vector< wstring >(), wstring(), false ); sptr< Dictionary::DataRequest > req =
getArticle( U"00databaseinfo", vector< std::u32string >(), std::u32string(), false );
if ( req->dataSize() > 0 ) { if ( req->dataSize() > 0 ) {
dictionaryDescription = QString::fromUtf8( req->getFullData().data(), req->getFullData().size() ); dictionaryDescription = QString::fromUtf8( req->getFullData().data(), req->getFullData().size() );
@ -629,10 +631,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
// Handle the forth entry, if it exists. From dictfmt man: // Handle the forth entry, if it exists. From dictfmt man:
// When --index-keep-orig option is used fourth column is created // When --index-keep-orig option is used fourth column is created
// (if necessary) in .index file. // (if necessary) in .index file.
indexedWords.addWord( Utf8::decode( string( tab3 + 1, strlen( tab3 + 1 ) ) ), curOffset ); indexedWords.addWord( Text::toUtf32( string( tab3 + 1, strlen( tab3 + 1 ) ) ), curOffset );
++idxHeader.wordCount; ++idxHeader.wordCount;
} }
indexedWords.addWord( Utf8::decode( string( buf, strchr( buf, '\t' ) - buf ) ), curOffset ); indexedWords.addWord( Text::toUtf32( string( buf, strchr( buf, '\t' ) - buf ) ), curOffset );
++idxHeader.wordCount; ++idxHeader.wordCount;
++idxHeader.articleCount; ++idxHeader.articleCount;
@ -657,7 +659,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
eol = articleBody; // No headword itself eol = articleBody; // No headword itself
} }
if ( eol ) { if ( eol ) {
while ( *eol && Utf8::isspace( *eol ) ) { while ( *eol && Text::isspace( *eol ) ) {
++eol; // skip spaces ++eol; // skip spaces
} }

View file

@ -177,7 +177,7 @@ void Class::deferredInit()
//base method. //base method.
} }
sptr< WordSearchRequest > Class::stemmedMatch( wstring const & /*str*/, sptr< WordSearchRequest > Class::stemmedMatch( std::u32string const & /*str*/,
unsigned /*minLength*/, unsigned /*minLength*/,
unsigned /*maxSuffixVariation*/, unsigned /*maxSuffixVariation*/,
unsigned long /*maxResults*/ ) unsigned long /*maxResults*/ )
@ -185,12 +185,12 @@ sptr< WordSearchRequest > Class::stemmedMatch( wstring const & /*str*/,
return std::make_shared< WordSearchRequestInstant >(); return std::make_shared< WordSearchRequestInstant >();
} }
sptr< WordSearchRequest > Class::findHeadwordsForSynonym( wstring const & ) sptr< WordSearchRequest > Class::findHeadwordsForSynonym( std::u32string const & )
{ {
return std::make_shared< WordSearchRequestInstant >(); return std::make_shared< WordSearchRequestInstant >();
} }
vector< wstring > Class::getAlternateWritings( wstring const & ) noexcept vector< std::u32string > Class::getAlternateWritings( std::u32string const & ) noexcept
{ {
return {}; return {};
} }

View file

@ -19,7 +19,7 @@
#include "langcoder.hh" #include "langcoder.hh"
#include "sptr.hh" #include "sptr.hh"
#include "utils.hh" #include "utils.hh"
#include "wstring.hh" #include "text.hh"
#include <QtGlobal> #include <QtGlobal>
/// Abstract dictionary-related stuff /// Abstract dictionary-related stuff
@ -27,7 +27,6 @@ namespace Dictionary {
using std::vector; using std::vector;
using std::string; using std::string;
using gd::wstring;
using std::map; using std::map;
DEF_EX( Ex, "Dictionary error", std::exception ) DEF_EX( Ex, "Dictionary error", std::exception )
@ -124,19 +123,19 @@ private:
/// algorithms. Positive values are used by morphology matches. /// algorithms. Positive values are used by morphology matches.
struct WordMatch struct WordMatch
{ {
wstring word; std::u32string word;
int weight; int weight;
WordMatch(): WordMatch():
weight( 0 ) weight( 0 )
{ {
} }
WordMatch( wstring const & word_ ): WordMatch( std::u32string const & word_ ):
word( word_ ), word( word_ ),
weight( 0 ) weight( 0 )
{ {
} }
WordMatch( wstring const & word_, int weight_ ): WordMatch( std::u32string const & word_, int weight_ ):
word( word_ ), word( word_ ),
weight( weight_ ) weight( weight_ )
{ {
@ -431,7 +430,7 @@ public:
/// prefix results should be added. Not more than maxResults results should /// prefix results should be added. Not more than maxResults results should
/// be stored. The whole operation is supposed to be fast, though some /// be stored. The whole operation is supposed to be fast, though some
/// dictionaries, the network ones particularly, may of course be slow. /// dictionaries, the network ones particularly, may of course be slow.
virtual sptr< WordSearchRequest > prefixMatch( wstring const &, unsigned long maxResults ) = 0; virtual sptr< WordSearchRequest > prefixMatch( std::u32string const &, unsigned long maxResults ) = 0;
/// Looks up a given word in the dictionary, aiming to find different forms /// Looks up a given word in the dictionary, aiming to find different forms
/// of the given word by allowing suffix variations. This means allowing words /// of the given word by allowing suffix variations. This means allowing words
@ -442,20 +441,20 @@ public:
/// in the middle of a phrase got matched should be returned. /// in the middle of a phrase got matched should be returned.
/// The default implementation does nothing, returning an empty result. /// The default implementation does nothing, returning an empty result.
virtual sptr< WordSearchRequest > virtual sptr< WordSearchRequest >
stemmedMatch( wstring const &, unsigned minLength, unsigned maxSuffixVariation, unsigned long maxResults ); stemmedMatch( std::u32string const &, unsigned minLength, unsigned maxSuffixVariation, unsigned long maxResults );
/// Finds known headwords for the given word, that is, the words for which /// Finds known headwords for the given word, that is, the words for which
/// the given word is a synonym. If a dictionary can't perform this operation, /// the given word is a synonym. If a dictionary can't perform this operation,
/// it should leave the default implementation which always returns an empty /// it should leave the default implementation which always returns an empty
/// result. /// result.
virtual sptr< WordSearchRequest > findHeadwordsForSynonym( wstring const & ); virtual sptr< WordSearchRequest > findHeadwordsForSynonym( std::u32string const & );
/// For a given word, provides alternate writings of it which are to be looked /// For a given word, provides alternate writings of it which are to be looked
/// up alongside with it. Transliteration dictionaries implement this. The /// up alongside with it. Transliteration dictionaries implement this. The
/// default implementation returns an empty list. Note that this function is /// default implementation returns an empty list. Note that this function is
/// supposed to be very fast and simple, and the results are thus returned /// supposed to be very fast and simple, and the results are thus returned
/// synchronously. /// synchronously.
virtual vector< wstring > getAlternateWritings( wstring const & ) noexcept; virtual vector< std::u32string > getAlternateWritings( std::u32string const & ) noexcept;
/// Returns a definition for the given word. The definition should /// Returns a definition for the given word. The definition should
/// be an html fragment (without html/head/body tags) in an utf8 encoding. /// be an html fragment (without html/head/body tags) in an utf8 encoding.
@ -464,10 +463,10 @@ public:
/// synonyms for the main word. /// synonyms for the main word.
/// context is a dictionary-specific data, currently only used for the /// context is a dictionary-specific data, currently only used for the
/// 'Websites' feature. /// 'Websites' feature.
virtual sptr< DataRequest > getArticle( wstring const &, virtual sptr< DataRequest > getArticle( std::u32string const &,
vector< wstring > const & alts, vector< std::u32string > const & alts,
wstring const & context = wstring(), std::u32string const & context = std::u32string(),
bool ignoreDiacritics = false ) = 0; bool ignoreDiacritics = false ) = 0;
/// Loads contents of a resource named 'name' into the 'data' vector. This is /// Loads contents of a resource named 'name' into the 'data' vector. This is
/// usually a picture file referenced in the article or something like that. /// usually a picture file referenced in the article or something like that.

View file

@ -2,7 +2,6 @@
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "dictserver.hh" #include "dictserver.hh"
#include "wstring_qt.hh"
#include <QTimer> #include <QTimer>
#include <QUrl> #include <QUrl>
#include <QTcpSocket> #include <QTcpSocket>
@ -314,9 +313,10 @@ public:
return 0; return 0;
} }
sptr< WordSearchRequest > prefixMatch( wstring const &, unsigned long maxResults ) override; sptr< WordSearchRequest > prefixMatch( std::u32string const &, unsigned long maxResults ) override;
sptr< DataRequest > getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ) override; sptr< DataRequest >
getArticle( std::u32string const &, vector< std::u32string > const & alts, std::u32string const &, bool ) override;
quint32 getLangFrom() const override quint32 getLangFrom() const override
{ {
@ -387,7 +387,7 @@ class DictServerWordSearchRequest: public Dictionary::WordSearchRequest
{ {
Q_OBJECT Q_OBJECT
QAtomicInt isCancelled; QAtomicInt isCancelled;
wstring word; std::u32string word;
QString errorString; QString errorString;
DictServerDictionary & dict; DictServerDictionary & dict;
@ -402,7 +402,7 @@ class DictServerWordSearchRequest: public Dictionary::WordSearchRequest
public: public:
DictServerWordSearchRequest( wstring word_, DictServerDictionary & dict_ ): DictServerWordSearchRequest( std::u32string word_, DictServerDictionary & dict_ ):
word( std::move( word_ ) ), word( std::move( word_ ) ),
dict( dict_ ), dict( dict_ ),
dictImpl( new DictServerImpl( this, dict_.url, "GoldenDict-w" ) ) dictImpl( new DictServerImpl( this, dict_.url, "GoldenDict-w" ) )
@ -562,7 +562,7 @@ void DictServer::DictServerWordSearchRequest::addMatchedWord( const QString & st
class DictServerArticleRequest: public Dictionary::DataRequest class DictServerArticleRequest: public Dictionary::DataRequest
{ {
QAtomicInt isCancelled; QAtomicInt isCancelled;
wstring word; std::u32string word;
QString errorString; QString errorString;
DictServerDictionary & dict; DictServerDictionary & dict;
string articleData; string articleData;
@ -578,7 +578,7 @@ class DictServerArticleRequest: public Dictionary::DataRequest
public: public:
DictServerImpl * dictImpl; DictServerImpl * dictImpl;
DictServerArticleRequest( wstring word_, DictServerDictionary & dict_ ): DictServerArticleRequest( std::u32string word_, DictServerDictionary & dict_ ):
word( std::move( word_ ) ), word( std::move( word_ ) ),
dict( dict_ ), dict( dict_ ),
dictImpl( new DictServerImpl( this, dict_.url, "GoldenDict-t" ) ) dictImpl( new DictServerImpl( this, dict_.url, "GoldenDict-t" ) )
@ -870,7 +870,7 @@ void DictServerArticleRequest::cancel()
finish(); finish();
} }
sptr< WordSearchRequest > DictServerDictionary::prefixMatch( wstring const & word, unsigned long maxResults ) sptr< WordSearchRequest > DictServerDictionary::prefixMatch( std::u32string const & word, unsigned long maxResults )
{ {
(void)maxResults; (void)maxResults;
if ( word.size() > 80 ) { if ( word.size() > 80 ) {
@ -883,8 +883,10 @@ sptr< WordSearchRequest > DictServerDictionary::prefixMatch( wstring const & wor
} }
} }
sptr< DataRequest > sptr< DataRequest > DictServerDictionary::getArticle( std::u32string const & word,
DictServerDictionary::getArticle( wstring const & word, vector< wstring > const &, wstring const &, bool ) vector< std::u32string > const &,
std::u32string const &,
bool )
{ {
if ( word.size() > 80 ) { if ( word.size() > 80 ) {

View file

@ -5,7 +5,7 @@
#include "dsl_details.hh" #include "dsl_details.hh"
#include "btreeidx.hh" #include "btreeidx.hh"
#include "folding.hh" #include "folding.hh"
#include "utf8.hh" #include "text.hh"
#include "chunkedstorage.hh" #include "chunkedstorage.hh"
#include "dictzip.hh" #include "dictzip.hh"
#include "htmlescape.hh" #include "htmlescape.hh"
@ -13,7 +13,6 @@
#include "filetype.hh" #include "filetype.hh"
#include "audiolink.hh" #include "audiolink.hh"
#include "langcoder.hh" #include "langcoder.hh"
#include "wstring_qt.hh"
#include "indexedzip.hh" #include "indexedzip.hh"
#include "tiff.hh" #include "tiff.hh"
#include "ftshelpers.hh" #include "ftshelpers.hh"
@ -44,11 +43,9 @@ using std::multimap;
using std::pair; using std::pair;
using std::set; using std::set;
using std::string; using std::string;
using gd::wstring;
using gd::wchar;
using std::vector; using std::vector;
using std::list; using std::list;
using Utf8::Encoding; using Text::Encoding;
using BtreeIndexing::WordArticleLink; using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords; using BtreeIndexing::IndexedWords;
@ -100,8 +97,8 @@ struct InsidedCard
{ {
uint32_t offset; uint32_t offset;
uint32_t size; uint32_t size;
QList< wstring > headwords; QList< std::u32string > headwords;
InsidedCard( uint32_t _offset, uint32_t _size, QList< wstring > const & words ): InsidedCard( uint32_t _offset, uint32_t _size, QList< std::u32string > const & words ):
offset( _offset ), offset( _offset ),
size( _size ), size( _size ),
headwords( words ) headwords( words )
@ -144,7 +141,7 @@ class DslDictionary: public BtreeIndexing::BtreeDictionary
int optionalPartNom; int optionalPartNom;
quint8 articleNom; quint8 articleNom;
wstring currentHeadword; std::u32string currentHeadword;
string resourceDir1, resourceDir2; string resourceDir1, resourceDir2;
public: public:
@ -187,8 +184,10 @@ public:
} }
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest > getArticle( std::u32string const &,
getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; vector< std::u32string > const & alts,
std::u32string const &,
bool ignoreDiacritics ) override;
sptr< Dictionary::DataRequest > getResource( string const & name ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override;
@ -232,15 +231,15 @@ private:
/// Loads the article. Does not process the DSL language. /// Loads the article. Does not process the DSL language.
void loadArticle( uint32_t address, void loadArticle( uint32_t address,
wstring const & requestedHeadwordFolded, std::u32string const & requestedHeadwordFolded,
bool ignoreDiacritics, bool ignoreDiacritics,
wstring & tildeValue, std::u32string & tildeValue,
wstring & displayedHeadword, std::u32string & displayedHeadword,
unsigned & headwordIndex, unsigned & headwordIndex,
wstring & articleText ); std::u32string & articleText );
/// Converts DSL language to an Html. /// Converts DSL language to an Html.
string dslToHtml( wstring const &, wstring const & headword = wstring() ); string dslToHtml( std::u32string const &, std::u32string const & headword = std::u32string() );
// Parts of dslToHtml() // Parts of dslToHtml()
string nodeToHtml( ArticleDom::Node const & ); string nodeToHtml( ArticleDom::Node const & );
@ -452,7 +451,7 @@ void DslDictionary::loadIcon() noexcept
/// so nbsp is not a whitespace character for Dsl compiler. /// so nbsp is not a whitespace character for Dsl compiler.
/// For now we have only space and tab, since those are most likely the only /// For now we have only space and tab, since those are most likely the only
/// ones recognized as spaces by that compiler. /// ones recognized as spaces by that compiler.
bool isDslWs( wchar ch ) bool isDslWs( char32_t ch )
{ {
switch ( ch ) { switch ( ch ) {
case ' ': case ' ':
@ -464,14 +463,14 @@ bool isDslWs( wchar ch )
} }
void DslDictionary::loadArticle( uint32_t address, void DslDictionary::loadArticle( uint32_t address,
wstring const & requestedHeadwordFolded, std::u32string const & requestedHeadwordFolded,
bool ignoreDiacritics, bool ignoreDiacritics,
wstring & tildeValue, std::u32string & tildeValue,
wstring & displayedHeadword, std::u32string & displayedHeadword,
unsigned & headwordIndex, unsigned & headwordIndex,
wstring & articleText ) std::u32string & articleText )
{ {
wstring articleData; std::u32string articleData;
{ {
vector< char > chunk; vector< char > chunk;
@ -507,7 +506,7 @@ void DslDictionary::loadArticle( uint32_t address,
else { else {
try { try {
articleData = articleData =
Iconv::toWstring( Utf8::getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ), articleBody, articleSize ); Iconv::toWstring( Text::getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ), articleBody, articleSize );
free( articleBody ); free( articleBody );
// Strip DSL comments // Strip DSL comments
@ -528,27 +527,27 @@ void DslDictionary::loadArticle( uint32_t address,
// Check is we retrieve insided card // Check is we retrieve insided card
bool insidedCard = isDslWs( articleData.at( 0 ) ); bool insidedCard = isDslWs( articleData.at( 0 ) );
wstring tildeValueWithUnsorted; // This one has unsorted parts left std::u32string tildeValueWithUnsorted; // This one has unsorted parts left
for ( headwordIndex = 0;; ) { for ( headwordIndex = 0;; ) {
size_t begin = pos; size_t begin = pos;
pos = articleData.find_first_of( U"\n\r", begin ); pos = articleData.find_first_of( U"\n\r", begin );
if ( pos == wstring::npos ) { if ( pos == std::u32string::npos ) {
pos = articleData.size(); pos = articleData.size();
} }
if ( !foundDisplayedHeadword ) { if ( !foundDisplayedHeadword ) {
// Process the headword // Process the headword
wstring rawHeadword = wstring( articleData, begin, pos - begin ); std::u32string rawHeadword = std::u32string( articleData, begin, pos - begin );
if ( insidedCard && !rawHeadword.empty() && isDslWs( rawHeadword[ 0 ] ) ) { if ( insidedCard && !rawHeadword.empty() && isDslWs( rawHeadword[ 0 ] ) ) {
// Headword of the insided card // Headword of the insided card
wstring::size_type hpos = rawHeadword.find( L'@' ); std::u32string::size_type hpos = rawHeadword.find( L'@' );
if ( hpos != string::npos ) { if ( hpos != string::npos ) {
wstring head = Folding::trimWhitespace( rawHeadword.substr( hpos + 1 ) ); std::u32string head = Folding::trimWhitespace( rawHeadword.substr( hpos + 1 ) );
hpos = head.find( L'~' ); hpos = head.find( L'~' );
while ( hpos != string::npos ) { while ( hpos != string::npos ) {
if ( hpos == 0 || head[ hpos ] != L'\\' ) { if ( hpos == 0 || head[ hpos ] != L'\\' ) {
break; break;
@ -569,7 +568,7 @@ void DslDictionary::loadArticle( uint32_t address,
// We need our tilde expansion value // We need our tilde expansion value
tildeValue = rawHeadword; tildeValue = rawHeadword;
list< wstring > lst; list< std::u32string > lst;
expandOptionalParts( tildeValue, &lst ); expandOptionalParts( tildeValue, &lst );
@ -581,7 +580,7 @@ void DslDictionary::loadArticle( uint32_t address,
processUnsortedParts( tildeValue, false ); processUnsortedParts( tildeValue, false );
} }
wstring str = rawHeadword; std::u32string str = rawHeadword;
if ( hadFirstHeadword ) { if ( hadFirstHeadword ) {
expandTildes( str, tildeValueWithUnsorted ); expandTildes( str, tildeValueWithUnsorted );
@ -591,7 +590,7 @@ void DslDictionary::loadArticle( uint32_t address,
str = Folding::applySimpleCaseOnly( str ); str = Folding::applySimpleCaseOnly( str );
list< wstring > lst; list< std::u32string > lst;
expandOptionalParts( str, &lst ); expandOptionalParts( str, &lst );
// Does one of the results match the requested word? If so, we'd choose // Does one of the results match the requested word? If so, we'd choose
@ -657,15 +656,15 @@ void DslDictionary::loadArticle( uint32_t address,
// Check for begin article text // Check for begin article text
if ( insidedCard ) { if ( insidedCard ) {
// Check for next insided headword // Check for next insided headword
wstring::size_type hpos = articleData.find_first_of( U"\n\r", pos ); std::u32string::size_type hpos = articleData.find_first_of( U"\n\r", pos );
if ( hpos == wstring::npos ) { if ( hpos == std::u32string::npos ) {
hpos = articleData.size(); hpos = articleData.size();
} }
wstring str = wstring( articleData, pos, hpos - pos ); std::u32string str = std::u32string( articleData, pos, hpos - pos );
hpos = str.find( L'@' ); hpos = str.find( L'@' );
if ( hpos == wstring::npos || str[ hpos - 1 ] == L'\\' || !isAtSignFirst( str ) ) { if ( hpos == std::u32string::npos || str[ hpos - 1 ] == L'\\' || !isAtSignFirst( str ) ) {
break; break;
} }
} }
@ -687,18 +686,18 @@ void DslDictionary::loadArticle( uint32_t address,
} }
if ( pos != articleData.size() ) { if ( pos != articleData.size() ) {
articleText = wstring( articleData, pos ); articleText = std::u32string( articleData, pos );
} }
else { else {
articleText.clear(); articleText.clear();
} }
} }
string DslDictionary::dslToHtml( wstring const & str, wstring const & headword ) string DslDictionary::dslToHtml( std::u32string const & str, std::u32string const & headword )
{ {
// Normalize the string // Normalize the string
wstring normalizedStr = gd::normalize( str ); std::u32string normalizedStr = Text::normalize( str );
currentHeadword = headword; currentHeadword = headword;
ArticleDom dom( normalizedStr, getName(), headword ); ArticleDom dom( normalizedStr, getName(), headword );
@ -733,7 +732,7 @@ string DslDictionary::getNodeLink( ArticleDom::Node const & node )
} }
} }
if ( link.empty() ) { if ( link.empty() ) {
link = Html::escape( Filetype::simplifyString( Utf8::encode( node.renderAsText() ), false ) ); link = Html::escape( Filetype::simplifyString( Text::toUtf8( node.renderAsText() ), false ) );
} }
return link; return link;
@ -744,7 +743,7 @@ string DslDictionary::nodeToHtml( ArticleDom::Node const & node )
string result; string result;
if ( !node.isTag ) { if ( !node.isTag ) {
result = Html::escape( Utf8::encode( node.text ) ); result = Html::escape( Text::toUtf8( node.text ) );
// Handle all end-of-line // Handle all end-of-line
@ -784,7 +783,7 @@ string DslDictionary::nodeToHtml( ArticleDom::Node const & node )
result += "<span class=\"c_default_color\">" + processNodeChildren( node ) + "</span>"; result += "<span class=\"c_default_color\">" + processNodeChildren( node ) + "</span>";
} }
else { else {
result += "<font color=\"" + Html::escape( Utf8::encode( node.tagAttrs ) ) + "\">" + processNodeChildren( node ) result += "<font color=\"" + Html::escape( Text::toUtf8( node.tagAttrs ) ) + "\">" + processNodeChildren( node )
+ "</font>"; + "</font>";
} }
} }
@ -797,7 +796,7 @@ string DslDictionary::nodeToHtml( ArticleDom::Node const & node )
result += "<div class=\"dsl_m\">" + processNodeChildren( node ) + "</div>"; result += "<div class=\"dsl_m\">" + processNodeChildren( node ) + "</div>";
} }
else if ( node.tagName.size() == 2 && node.tagName[ 0 ] == L'm' && iswdigit( node.tagName[ 1 ] ) ) { else if ( node.tagName.size() == 2 && node.tagName[ 0 ] == L'm' && iswdigit( node.tagName[ 1 ] ) ) {
result += "<div class=\"dsl_" + Utf8::encode( node.tagName ) + "\">" + processNodeChildren( node ) + "</div>"; result += "<div class=\"dsl_" + Text::toUtf8( node.tagName ) + "\">" + processNodeChildren( node ) + "</div>";
} }
else if ( node.tagName == U"trn" ) { else if ( node.tagName == U"trn" ) {
result += "<span class=\"dsl_trn\">" + processNodeChildren( node ) + "</span>"; result += "<span class=\"dsl_trn\">" + processNodeChildren( node ) + "</span>";
@ -809,7 +808,7 @@ string DslDictionary::nodeToHtml( ArticleDom::Node const & node )
result += "<span class=\"dsl_com\">" + processNodeChildren( node ) + "</span>"; result += "<span class=\"dsl_com\">" + processNodeChildren( node ) + "</span>";
} }
else if ( node.tagName == U"s" || node.tagName == U"video" ) { else if ( node.tagName == U"s" || node.tagName == U"video" ) {
string filename = Filetype::simplifyString( Utf8::encode( node.renderAsText() ), false ); string filename = Filetype::simplifyString( Text::toUtf8( node.renderAsText() ), false );
string n = resourceDir1 + filename; string n = resourceDir1 + filename;
if ( Filetype::isNameOfSound( filename ) ) { if ( Filetype::isNameOfSound( filename ) ) {
@ -888,7 +887,7 @@ string DslDictionary::nodeToHtml( ArticleDom::Node const & node )
else if ( node.tagName == U"p" ) { else if ( node.tagName == U"p" ) {
result += "<span class=\"dsl_p\""; result += "<span class=\"dsl_p\"";
string val = Utf8::encode( node.renderAsText() ); string val = Text::toUtf8( node.renderAsText() );
// If we have such a key, display a title // If we have such a key, display a title
@ -908,7 +907,8 @@ string DslDictionary::nodeToHtml( ArticleDom::Node const & node )
// user could pick up the best suitable option. // user could pick up the best suitable option.
string data = processNodeChildren( node ); string data = processNodeChildren( node );
result += R"(<span class="dsl_stress"><span class="dsl_stress_without_accent">)" + data + "</span>" result += R"(<span class="dsl_stress"><span class="dsl_stress_without_accent">)" + data + "</span>"
+ "<span class=\"dsl_stress_with_accent\">" + data + Utf8::encode( wstring( 1, 0x301 ) ) + "</span></span>"; + "<span class=\"dsl_stress_with_accent\">" + data + Text::toUtf8( std::u32string( 1, 0x301 ) )
+ "</span></span>";
} }
else if ( node.tagName == U"lang" ) { else if ( node.tagName == U"lang" ) {
result += "<span class=\"dsl_lang\""; result += "<span class=\"dsl_lang\"";
@ -944,7 +944,7 @@ string DslDictionary::nodeToHtml( ArticleDom::Node const & node )
url.setScheme( "gdlookup" ); url.setScheme( "gdlookup" );
url.setHost( "localhost" ); url.setHost( "localhost" );
auto nodeStr = Utf8::decode( getNodeLink( node ) ); auto nodeStr = Text::toUtf32( getNodeLink( node ) );
normalizeHeadword( nodeStr ); normalizeHeadword( nodeStr );
url.setPath( Utils::Url::ensureLeadingSlash( QString::fromStdU32String( nodeStr ) ) ); url.setPath( Utils::Url::ensureLeadingSlash( QString::fromStdU32String( nodeStr ) ) );
@ -968,7 +968,7 @@ string DslDictionary::nodeToHtml( ArticleDom::Node const & node )
url.setScheme( "gdlookup" ); url.setScheme( "gdlookup" );
url.setHost( "localhost" ); url.setHost( "localhost" );
wstring nodeStr = node.renderAsText(); std::u32string nodeStr = node.renderAsText();
normalizeHeadword( nodeStr ); normalizeHeadword( nodeStr );
url.setPath( Utils::Url::ensureLeadingSlash( QString::fromStdU32String( nodeStr ) ) ); url.setPath( Utils::Url::ensureLeadingSlash( QString::fromStdU32String( nodeStr ) ) );
@ -1120,7 +1120,7 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
vector< char > chunk; vector< char > chunk;
char * articleProps; char * articleProps;
wstring articleData; std::u32string articleData;
{ {
QMutexLocker _( &idxMutex ); QMutexLocker _( &idxMutex );
@ -1161,7 +1161,7 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
// Skip headword // Skip headword
size_t pos = 0; size_t pos = 0;
wstring articleHeadword, tildeValue; std::u32string articleHeadword, tildeValue;
// Check if we retrieve insided card // Check if we retrieve insided card
bool insidedCard = isDslWs( articleData.at( 0 ) ); bool insidedCard = isDslWs( articleData.at( 0 ) );
@ -1170,20 +1170,20 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
size_t begin = pos; size_t begin = pos;
pos = articleData.find_first_of( U"\n\r", begin ); pos = articleData.find_first_of( U"\n\r", begin );
if ( pos == wstring::npos ) { if ( pos == std::u32string::npos ) {
pos = articleData.size(); pos = articleData.size();
} }
if ( articleHeadword.empty() ) { if ( articleHeadword.empty() ) {
// Process the headword // Process the headword
articleHeadword = wstring( articleData, begin, pos - begin ); articleHeadword = std::u32string( articleData, begin, pos - begin );
if ( insidedCard && !articleHeadword.empty() && isDslWs( articleHeadword[ 0 ] ) ) { if ( insidedCard && !articleHeadword.empty() && isDslWs( articleHeadword[ 0 ] ) ) {
// Headword of the insided card // Headword of the insided card
wstring::size_type hpos = articleHeadword.find( L'@' ); std::u32string::size_type hpos = articleHeadword.find( L'@' );
if ( hpos != string::npos ) { if ( hpos != string::npos ) {
wstring head = Folding::trimWhitespace( articleHeadword.substr( hpos + 1 ) ); std::u32string head = Folding::trimWhitespace( articleHeadword.substr( hpos + 1 ) );
hpos = head.find( L'~' ); hpos = head.find( L'~' );
while ( hpos != string::npos ) { while ( hpos != string::npos ) {
if ( hpos == 0 || head[ hpos ] != L'\\' ) { if ( hpos == 0 || head[ hpos ] != L'\\' ) {
break; break;
@ -1200,7 +1200,7 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
} }
if ( !articleHeadword.empty() ) { if ( !articleHeadword.empty() ) {
list< wstring > lst; list< std::u32string > lst;
tildeValue = articleHeadword; tildeValue = articleHeadword;
@ -1237,15 +1237,15 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
// Check for begin article text // Check for begin article text
if ( insidedCard ) { if ( insidedCard ) {
// Check for next insided headword // Check for next insided headword
wstring::size_type hpos = articleData.find_first_of( U"\n\r", pos ); std::u32string::size_type hpos = articleData.find_first_of( U"\n\r", pos );
if ( hpos == wstring::npos ) { if ( hpos == std::u32string::npos ) {
hpos = articleData.size(); hpos = articleData.size();
} }
wstring str = wstring( articleData, pos, hpos - pos ); std::u32string str = std::u32string( articleData, pos, hpos - pos );
hpos = str.find( L'@' ); hpos = str.find( L'@' );
if ( hpos == wstring::npos || str[ hpos - 1 ] == L'\\' || !isAtSignFirst( str ) ) { if ( hpos == std::u32string::npos || str[ hpos - 1 ] == L'\\' || !isAtSignFirst( str ) ) {
break; break;
} }
} }
@ -1261,17 +1261,17 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
headword = QString::fromStdU32String( articleHeadword ); headword = QString::fromStdU32String( articleHeadword );
} }
wstring articleText; std::u32string articleText;
if ( pos != articleData.size() ) { if ( pos != articleData.size() ) {
articleText = wstring( articleData, pos ); articleText = std::u32string( articleData, pos );
} }
else { else {
articleText.clear(); articleText.clear();
} }
if ( !tildeValue.empty() ) { if ( !tildeValue.empty() ) {
list< wstring > lst; list< std::u32string > lst;
processUnsortedParts( tildeValue, false ); processUnsortedParts( tildeValue, false );
expandOptionalParts( tildeValue, &lst ); expandOptionalParts( tildeValue, &lst );
@ -1377,8 +1377,8 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
class DslArticleRequest: public Dictionary::DataRequest class DslArticleRequest: public Dictionary::DataRequest
{ {
wstring word; std::u32string word;
vector< wstring > alts; vector< std::u32string > alts;
DslDictionary & dict; DslDictionary & dict;
bool ignoreDiacritics; bool ignoreDiacritics;
@ -1387,8 +1387,8 @@ class DslArticleRequest: public Dictionary::DataRequest
public: public:
DslArticleRequest( wstring const & word_, DslArticleRequest( std::u32string const & word_,
vector< wstring > const & alts_, vector< std::u32string > const & alts_,
DslDictionary & dict_, DslDictionary & dict_,
bool ignoreDiacritics_ ): bool ignoreDiacritics_ ):
word( word_ ), word( word_ ),
@ -1444,7 +1444,7 @@ void DslArticleRequest::run()
// index here. // index here.
set< pair< uint32_t, unsigned > > articlesIncluded; set< pair< uint32_t, unsigned > > articlesIncluded;
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word );
for ( auto & x : chain ) { for ( auto & x : chain ) {
// Check if we're cancelled occasionally // Check if we're cancelled occasionally
@ -1455,9 +1455,9 @@ void DslArticleRequest::run()
// Grab that article // Grab that article
wstring tildeValue; std::u32string tildeValue;
wstring displayedHeadword; std::u32string displayedHeadword;
wstring articleBody; std::u32string articleBody;
unsigned headwordIndex; unsigned headwordIndex;
string articleText, articleAfter; string articleText, articleAfter;
@ -1541,9 +1541,9 @@ void DslArticleRequest::run()
finish(); finish();
} }
sptr< Dictionary::DataRequest > DslDictionary::getArticle( wstring const & word, sptr< Dictionary::DataRequest > DslDictionary::getArticle( std::u32string const & word,
vector< wstring > const & alts, vector< std::u32string > const & alts,
wstring const &, std::u32string const &,
bool ignoreDiacritics ) bool ignoreDiacritics )
{ {
@ -1632,7 +1632,7 @@ void DslResourceRequest::run()
if ( dict.resourceZip.isOpen() ) { if ( dict.resourceZip.isOpen() ) {
QMutexLocker _( &dataMutex ); QMutexLocker _( &dataMutex );
if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) ) { if ( !dict.resourceZip.loadFile( Text::toUtf32( resourceName ), data ) ) {
throw; // Make it fail since we couldn't read the archive throw; // Make it fail since we couldn't read the archive
} }
} }
@ -1761,7 +1761,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
} }
// Building the index // Building the index
initializing.indexingDictionary( Utf8::encode( scanner.getDictionaryName() ) ); initializing.indexingDictionary( Text::toUtf8( scanner.getDictionaryName() ) );
qDebug( "Dsl: Building the index for dictionary: %s", qDebug( "Dsl: Building the index for dictionary: %s",
QString::fromStdU32String( scanner.getDictionaryName() ).toUtf8().data() ); QString::fromStdU32String( scanner.getDictionaryName() ).toUtf8().data() );
@ -1777,12 +1777,12 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
idx.write( idxHeader ); idx.write( idxHeader );
string dictionaryName = Utf8::encode( scanner.getDictionaryName() ); string dictionaryName = Text::toUtf8( scanner.getDictionaryName() );
idx.write( (uint32_t)dictionaryName.size() ); idx.write( (uint32_t)dictionaryName.size() );
idx.write( dictionaryName.data(), dictionaryName.size() ); idx.write( dictionaryName.data(), dictionaryName.size() );
string soundDictName = Utf8::encode( scanner.getSoundDictionaryName() ); string soundDictName = Text::toUtf8( scanner.getSoundDictionaryName() );
if ( !soundDictName.empty() ) { if ( !soundDictName.empty() ) {
idxHeader.hasSoundDictionaryName = 1; idxHeader.hasSoundDictionaryName = 1;
idx.write( (uint32_t)soundDictName.size() ); idx.write( (uint32_t)soundDictName.size() );
@ -1803,7 +1803,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
map< string, string > abrv; map< string, string > abrv;
wstring curString; std::u32string curString;
size_t curOffset; size_t curOffset;
for ( ;; ) { for ( ;; ) {
@ -1815,7 +1815,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
continue; continue;
} }
list< wstring > keys; list< std::u32string > keys;
bool eof = false; bool eof = false;
@ -1851,13 +1851,13 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
} }
// If the string has any dsl markup, we strip it // If the string has any dsl markup, we strip it
string value = Utf8::encode( ArticleDom( curString ).root.renderAsText() ); string value = Text::toUtf8( ArticleDom( curString ).root.renderAsText() );
for ( auto & key : keys ) { for ( auto & key : keys ) {
unescapeDsl( key ); unescapeDsl( key );
normalizeHeadword( key ); normalizeHeadword( key );
abrv[ Utf8::encode( Folding::trimWhitespace( key ) ) ] = value; abrv[ Text::toUtf8( Folding::trimWhitespace( key ) ) ] = value;
} }
} }
@ -1885,7 +1885,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
} }
bool hasString = false; bool hasString = false;
wstring curString; std::u32string curString;
size_t curOffset; size_t curOffset;
uint32_t articleCount = 0, wordCount = 0; uint32_t articleCount = 0, wordCount = 0;
@ -1919,7 +1919,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
// Ok, got the headword // Ok, got the headword
list< wstring > allEntryWords; list< std::u32string > allEntryWords;
processUnsortedParts( curString, true ); processUnsortedParts( curString, true );
expandOptionalParts( curString, &allEntryWords ); expandOptionalParts( curString, &allEntryWords );
@ -1972,10 +1972,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
wordCount += allEntryWords.size(); wordCount += allEntryWords.size();
int insideInsided = 0; int insideInsided = 0;
wstring headword; std::u32string headword;
QList< InsidedCard > insidedCards; QList< InsidedCard > insidedCards;
uint32_t offset = curOffset; uint32_t offset = curOffset;
QList< wstring > insidedHeadwords; QList< std::u32string > insidedHeadwords;
unsigned linesInsideCard = 0; unsigned linesInsideCard = 0;
int dogLine = 0; int dogLine = 0;
bool wasEmptyLine = false; bool wasEmptyLine = false;
@ -2018,8 +2018,8 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
// Find embedded cards // Find embedded cards
wstring::size_type n = curString.find( L'@' ); std::u32string::size_type n = curString.find( L'@' );
if ( n == wstring::npos || curString[ n - 1 ] == L'\\' ) { if ( n == std::u32string::npos || curString[ n - 1 ] == L'\\' ) {
if ( insideInsided ) { if ( insideInsided ) {
linesInsideCard++; linesInsideCard++;
} }

View file

@ -6,7 +6,7 @@
#include "folding.hh" #include "folding.hh"
#include "langcoder.hh" #include "langcoder.hh"
#include "ufile.hh" #include "ufile.hh"
#include "utf8.hh" #include "text.hh"
#include <exception> #include <exception>
#include <stdio.h> #include <stdio.h>
@ -17,9 +17,8 @@
namespace Dsl { namespace Dsl {
namespace Details { namespace Details {
using gd::wstring;
using std::list; using std::list;
using Utf8::Encoding; using Text::Encoding;
static QMap< int, string > lang_codes = { static QMap< int, string > lang_codes = {
{ 1, "en" }, { 1033, "en" }, { 2, "ru" }, { 1049, "ru" }, { 1068, "az" }, { 1025, "ar" }, { 1067, "am" }, { 1, "en" }, { 1033, "en" }, { 2, "ru" }, { 1049, "ru" }, { 1068, "az" }, { 1025, "ar" }, { 1067, "am" },
@ -40,7 +39,7 @@ string findCodeForDslId( int id )
return lang_codes[ id ]; return lang_codes[ id ];
} }
bool isAtSignFirst( wstring const & str ) bool isAtSignFirst( std::u32string const & str )
{ {
// Test if '@' is first in string except spaces and dsl tags // Test if '@' is first in string except spaces and dsl tags
QRegularExpression reg( R"([ \t]*(?:\[[^\]]+\][ \t]*)*@)", QRegularExpression::PatternOption::CaseInsensitiveOption ); QRegularExpression reg( R"([ \t]*(?:\[[^\]]+\][ \t]*)*@)", QRegularExpression::PatternOption::CaseInsensitiveOption );
@ -49,13 +48,13 @@ bool isAtSignFirst( wstring const & str )
/////////////// ArticleDom /////////////// ArticleDom
wstring ArticleDom::Node::renderAsText( bool stripTrsTag ) const std::u32string ArticleDom::Node::renderAsText( bool stripTrsTag ) const
{ {
if ( !isTag ) { if ( !isTag ) {
return text; return text;
} }
wstring result; std::u32string result;
for ( const auto & i : *this ) { for ( const auto & i : *this ) {
if ( !stripTrsTag || i.tagName != U"!trs" ) { if ( !stripTrsTag || i.tagName != U"!trs" ) {
@ -69,17 +68,17 @@ wstring ArticleDom::Node::renderAsText( bool stripTrsTag ) const
namespace { namespace {
/// @return true if @p tagName is exactly the two-character name "mN", where N
/// is an ASCII decimal digit ('0'..'9').
///
/// The digit is range-checked on the char32_t value itself rather than being
/// handed to iswdigit(): wint_t may be narrower than char32_t (it is 16 bits
/// wide on Windows), so a code point above U+FFFF could be truncated before it
/// is classified, and iswdigit() is only defined for arguments representable
/// as wchar_t (or WEOF).
bool is_mN( std::u32string const & tagName )
{
  if ( tagName.size() != 2 || tagName[ 0 ] != U'm' ) {
    return false;
  }

  char32_t const digit = tagName[ 1 ];
  return digit >= U'0' && digit <= U'9';
}
bool isAnyM( wstring const & tagName ) bool isAnyM( std::u32string const & tagName )
{ {
return tagName == U"m" || is_mN( tagName ); return tagName == U"m" || is_mN( tagName );
} }
bool checkM( wstring const & dest, wstring const & src ) bool checkM( std::u32string const & dest, std::u32string const & src )
{ {
return src == U"m" && is_mN( dest ); return src == U"m" && is_mN( dest );
} }
@ -97,8 +96,8 @@ struct MustTagBeClosed
} // unnamed namespace } // unnamed namespace
ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring const & headword_ ): ArticleDom::ArticleDom( std::u32string const & str, string const & dictName, std::u32string const & headword_ ):
root( Node::Tag(), wstring(), wstring() ), root( Node::Tag(), std::u32string(), std::u32string() ),
stringPos( str.c_str() ), stringPos( str.c_str() ),
lineStartPos( str.c_str() ), lineStartPos( str.c_str() ),
transcriptionCount( 0 ), transcriptionCount( 0 ),
@ -126,7 +125,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co
} }
else { else {
// Insided card // Insided card
wstring linkTo; std::u32string linkTo;
nextChar(); nextChar();
for ( ;; nextChar() ) { for ( ;; nextChar() ) {
if ( ch == L'\n' ) { if ( ch == L'\n' ) {
@ -142,13 +141,13 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co
linkTo = Folding::trimWhitespace( linkTo ); linkTo = Folding::trimWhitespace( linkTo );
if ( !linkTo.empty() ) { if ( !linkTo.empty() ) {
list< wstring > allLinkEntries; list< std::u32string > allLinkEntries;
processUnsortedParts( linkTo, true ); processUnsortedParts( linkTo, true );
expandOptionalParts( linkTo, &allLinkEntries ); expandOptionalParts( linkTo, &allLinkEntries );
for ( auto entry = allLinkEntries.begin(); entry != allLinkEntries.end(); ) { for ( auto entry = allLinkEntries.begin(); entry != allLinkEntries.end(); ) {
if ( !textNode ) { if ( !textNode ) {
Node text = Node( Node::Text(), wstring() ); Node text = Node( Node::Text(), std::u32string() );
if ( stack.empty() ) { if ( stack.empty() ) {
root.push_back( text ); root.push_back( text );
@ -168,10 +167,10 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co
stack.pop_back(); stack.pop_back();
textNode = 0; textNode = 0;
wstring linkText = Folding::trimWhitespace( *entry ); std::u32string linkText = Folding::trimWhitespace( *entry );
ArticleDom nodeDom( linkText, dictName, headword_ ); ArticleDom nodeDom( linkText, dictName, headword_ );
Node link( Node::Tag(), U"@", wstring() ); Node link( Node::Tag(), U"@", std::u32string() );
for ( auto & n : nodeDom.root ) { for ( auto & n : nodeDom.root ) {
link.push_back( n ); link.push_back( n );
} }
@ -181,13 +180,13 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co
if ( stack.empty() ) { if ( stack.empty() ) {
root.push_back( link ); root.push_back( link );
if ( entry != allLinkEntries.end() ) { // Add line break before next entry if ( entry != allLinkEntries.end() ) { // Add line break before next entry
root.push_back( Node( Node::Tag(), U"br", wstring() ) ); root.push_back( Node( Node::Tag(), U"br", std::u32string() ) );
} }
} }
else { else {
stack.back()->push_back( link ); stack.back()->push_back( link );
if ( entry != allLinkEntries.end() ) { if ( entry != allLinkEntries.end() ) {
stack.back()->push_back( Node( Node::Tag(), U"br", wstring() ) ); stack.back()->push_back( Node( Node::Tag(), U"br", std::u32string() ) );
} }
} }
} }
@ -208,8 +207,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co
if ( ch == L'[' && !escaped ) { if ( ch == L'[' && !escaped ) {
// Beginning of a tag. // Beginning of a tag.
bool isClosing; bool isClosing;
wstring name; std::u32string name;
wstring attrs; std::u32string attrs;
try { try {
do { do {
@ -330,7 +329,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co
nextChar(); nextChar();
} while ( Folding::isWhitespace( ch ) ); } while ( Folding::isWhitespace( ch ) );
wstring linkTo, linkText; std::u32string linkTo, linkText;
for ( ;; nextChar() ) { for ( ;; nextChar() ) {
// Is it the end? // Is it the end?
@ -373,7 +372,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co
processUnsortedParts( linkText, true ); processUnsortedParts( linkText, true );
ArticleDom nodeDom( linkText, dictName, headword_ ); ArticleDom nodeDom( linkText, dictName, headword_ );
Node link( Node::Tag(), U"ref", wstring() ); Node link( Node::Tag(), U"ref", std::u32string() );
for ( auto & n : nodeDom.root ) { for ( auto & n : nodeDom.root ) {
link.push_back( n ); link.push_back( n );
} }
@ -427,7 +426,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co
// If there's currently no text node, open one // If there's currently no text node, open one
if ( !textNode ) { if ( !textNode ) {
Node text = Node( Node::Text(), wstring() ); Node text = Node( Node::Text(), std::u32string() );
if ( stack.empty() ) { if ( stack.empty() ) {
root.push_back( text ); root.push_back( text );
@ -691,7 +690,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, wstring co
} }
} }
void ArticleDom::openTag( wstring const & name, wstring const & attrs, list< Node * > & stack ) void ArticleDom::openTag( std::u32string const & name, std::u32string const & attrs, list< Node * > & stack )
{ {
list< Node > nodesToReopen; list< Node > nodesToReopen;
@ -746,7 +745,7 @@ void ArticleDom::openTag( wstring const & name, wstring const & attrs, list< Nod
} }
} }
void ArticleDom::closeTag( wstring const & name, list< Node * > & stack, bool warn ) void ArticleDom::closeTag( std::u32string const & name, list< Node * > & stack, bool warn )
{ {
// Find the tag which is to be closed // Find the tag which is to be closed
@ -839,13 +838,13 @@ bool ArticleDom::atSignFirstInLine()
return true; return true;
} }
return isAtSignFirst( wstring( lineStartPos ) ); return isAtSignFirst( std::u32string( lineStartPos ) );
} }
/////////////// DslScanner /////////////// DslScanner
DslScanner::DslScanner( string const & fileName ): DslScanner::DslScanner( string const & fileName ):
encoding( Utf8::Utf8 ), encoding( Text::Utf8 ),
readBufferPtr( readBuffer ), readBufferPtr( readBuffer ),
readBufferLeft( 0 ), readBufferLeft( 0 ),
linesRead( 0 ) linesRead( 0 )
@ -876,19 +875,19 @@ DslScanner::DslScanner( string const & fileName ):
guessedEncoding.has_value() ) { guessedEncoding.has_value() ) {
switch ( guessedEncoding.value() ) { switch ( guessedEncoding.value() ) {
case QStringConverter::Utf8: case QStringConverter::Utf8:
encoding = Utf8::Utf8; encoding = Text::Utf8;
break; break;
case QStringConverter::Utf16LE: case QStringConverter::Utf16LE:
encoding = Utf8::Utf16LE; encoding = Text::Utf16LE;
break; break;
case QStringConverter::Utf16BE: case QStringConverter::Utf16BE:
encoding = Utf8::Utf16BE; encoding = Text::Utf16BE;
break; break;
case QStringConverter::Utf32LE: case QStringConverter::Utf32LE:
encoding = Utf8::Utf16LE; encoding = Text::Utf16LE;
break; break;
case QStringConverter::Utf32BE: case QStringConverter::Utf32BE:
encoding = Utf8::Utf32BE; encoding = Text::Utf32BE;
break; break;
default: default:
break; break;
@ -905,10 +904,10 @@ DslScanner::DslScanner( string const & fileName ):
} }
//iconv.reinit( encoding ); //iconv.reinit( encoding );
lineFeed = Utf8::initLineFeed( encoding ); lineFeed = Text::initLineFeed( encoding );
// We now can use our own readNextLine() function // We now can use our own readNextLine() function
wstring str; std::u32string str;
size_t offset; size_t offset;
for ( ;; ) { for ( ;; ) {
@ -946,7 +945,7 @@ DslScanner::DslScanner( string const & fileName ):
size_t beg = str.find_first_of( L'"' ); size_t beg = str.find_first_of( L'"' );
if ( beg == wstring::npos ) { if ( beg == std::u32string::npos ) {
throw exMalformedDslFile( fileName ); throw exMalformedDslFile( fileName );
} }
@ -956,7 +955,7 @@ DslScanner::DslScanner( string const & fileName ):
throw exMalformedDslFile( fileName ); throw exMalformedDslFile( fileName );
} }
wstring arg( str, beg + 1, end - beg - 1 ); std::u32string arg( str, beg + 1, end - beg - 1 );
if ( isName ) { if ( isName ) {
dictionaryName = arg; dictionaryName = arg;
@ -977,13 +976,13 @@ DslScanner::DslScanner( string const & fileName ):
qWarning( "Warning: encoding was specified in a Unicode file, ignoring." ); qWarning( "Warning: encoding was specified in a Unicode file, ignoring." );
} }
else if ( !arg.compare( U"Latin" ) ) { else if ( !arg.compare( U"Latin" ) ) {
encoding = Utf8::Windows1252; encoding = Text::Windows1252;
} }
else if ( !arg.compare( U"Cyrillic" ) ) { else if ( !arg.compare( U"Cyrillic" ) ) {
encoding = Utf8::Windows1251; encoding = Text::Windows1251;
} }
else if ( !arg.compare( U"EasternEuropean" ) ) { else if ( !arg.compare( U"EasternEuropean" ) ) {
encoding = Utf8::Windows1250; encoding = Text::Windows1250;
} }
else { else {
gzclose( f ); gzclose( f );
@ -1009,7 +1008,7 @@ DslScanner::~DslScanner() noexcept
gzclose( f ); gzclose( f );
} }
bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_word ) bool DslScanner::readNextLine( std::u32string & out, size_t & offset, bool only_head_word )
{ {
offset = gztell( f ) - readBufferLeft /*+pos*/; offset = gztell( f ) - readBufferLeft /*+pos*/;
@ -1036,7 +1035,7 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo
return false; return false;
} }
int pos = Utf8::findFirstLinePosition( readBufferPtr, readBufferLeft, lineFeed.lineFeed, lineFeed.length ); int pos = Text::findFirstLinePosition( readBufferPtr, readBufferLeft, lineFeed.lineFeed, lineFeed.length );
if ( pos == -1 ) { if ( pos == -1 ) {
return false; return false;
} }
@ -1057,9 +1056,9 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo
} }
} }
bool DslScanner::readNextLineWithoutComments( wstring & out, size_t & offset, bool only_headword ) bool DslScanner::readNextLineWithoutComments( std::u32string & out, size_t & offset, bool only_headword )
{ {
wstring str; std::u32string str;
bool commentToNextLine = false; bool commentToNextLine = false;
size_t currentOffset; size_t currentOffset;
@ -1087,14 +1086,14 @@ bool DslScanner::readNextLineWithoutComments( wstring & out, size_t & offset, bo
/////////////// DslScanner /////////////// DslScanner
void processUnsortedParts( wstring & str, bool strip ) void processUnsortedParts( std::u32string & str, bool strip )
{ {
int refCount = 0; int refCount = 0;
size_t startPos = 0; size_t startPos = 0;
for ( size_t x = 0; x < str.size(); ) { for ( size_t x = 0; x < str.size(); ) {
wchar ch = str[ x ]; char32_t ch = str[ x ];
if ( ch == L'\\' ) { if ( ch == L'\\' ) {
// Escape code // Escape code
@ -1150,18 +1149,18 @@ void processUnsortedParts( wstring & str, bool strip )
} }
} }
void expandOptionalParts( wstring & str, list< wstring > * result, size_t x, bool inside_recurse ) void expandOptionalParts( std::u32string & str, list< std::u32string > * result, size_t x, bool inside_recurse )
{ {
// if str is too long ,it can never be headwords. // if str is too long ,it can never be headwords.
if ( str.size() > 100 ) { if ( str.size() > 100 ) {
return; return;
} }
list< wstring > expanded; list< std::u32string > expanded;
list< wstring > * headwords; list< std::u32string > * headwords;
headwords = inside_recurse ? result : &expanded; headwords = inside_recurse ? result : &expanded;
for ( ; x < str.size(); ) { for ( ; x < str.size(); ) {
wchar ch = str[ x ]; char32_t ch = str[ x ];
if ( ch == L'\\' ) { if ( ch == L'\\' ) {
// Escape code // Escape code
@ -1174,7 +1173,7 @@ void expandOptionalParts( wstring & str, list< wstring > * result, size_t x, boo
int refCount = 1; int refCount = 1;
for ( size_t y = x + 1; y < str.size(); ++y ) { for ( size_t y = x + 1; y < str.size(); ++y ) {
wchar ch = str[ y ]; char32_t ch = str[ y ];
if ( ch == L'\\' ) { if ( ch == L'\\' ) {
// Escape code // Escape code
@ -1190,7 +1189,7 @@ void expandOptionalParts( wstring & str, list< wstring > * result, size_t x, boo
if ( y != x + 1 ) // Only do for non-empty cases if ( y != x + 1 ) // Only do for non-empty cases
{ {
wstring removed( str, 0, x ); std::u32string removed( str, 0, x );
removed.append( str, y + 1, str.size() - y - 1 ); removed.append( str, y + 1, str.size() - y - 1 );
expandOptionalParts( removed, headwords, x, true ); expandOptionalParts( removed, headwords, x, true );
@ -1204,7 +1203,7 @@ void expandOptionalParts( wstring & str, list< wstring > * result, size_t x, boo
if ( refCount && x != str.size() - 1 ) { if ( refCount && x != str.size() - 1 ) {
// Closing paren not found? Chop it. // Closing paren not found? Chop it.
wstring removed( str, 0, x ); std::u32string removed( str, 0, x );
// Limit the amount of results to avoid excessive resource consumption // Limit the amount of results to avoid excessive resource consumption
if ( headwords->size() < 32 ) { if ( headwords->size() < 32 ) {
@ -1242,10 +1241,10 @@ void expandOptionalParts( wstring & str, list< wstring > * result, size_t x, boo
} }
} }
// Double-brace markers. Presumably the comment delimiters that
// stripComments() searches for -- confirm against its body.
static const std::u32string openBraces  = U"{{";
static const std::u32string closeBraces = U"}}";
void stripComments( wstring & str, bool & nextLine ) void stripComments( std::u32string & str, bool & nextLine )
{ {
string::size_type n = 0, n2 = 0; string::size_type n = 0, n2 = 0;
@ -1269,9 +1268,9 @@ void stripComments( wstring & str, bool & nextLine )
} }
} }
void expandTildes( wstring & str, wstring const & tildeReplacement ) void expandTildes( std::u32string & str, std::u32string const & tildeReplacement )
{ {
wstring tildeValue = Folding::trimWhitespace( tildeReplacement ); std::u32string tildeValue = Folding::trimWhitespace( tildeReplacement );
for ( size_t x = 0; x < str.size(); ) { for ( size_t x = 0; x < str.size(); ) {
if ( str[ x ] == L'\\' ) { if ( str[ x ] == L'\\' ) {
x += 2; x += 2;
@ -1294,7 +1293,7 @@ void expandTildes( wstring & str, wstring const & tildeReplacement )
} }
} }
void unescapeDsl( wstring & str ) void unescapeDsl( std::u32string & str )
{ {
for ( size_t x = 0; x < str.size(); ++x ) { for ( size_t x = 0; x < str.size(); ++x ) {
if ( str[ x ] == L'\\' ) { if ( str[ x ] == L'\\' ) {
@ -1303,7 +1302,7 @@ void unescapeDsl( wstring & str )
} }
} }
void normalizeHeadword( wstring & str ) void normalizeHeadword( std::u32string & str )
{ {
for ( size_t x = str.size(); x-- > 1; ) // >1 -- Don't test the first char for ( size_t x = str.size(); x-- > 1; ) // >1 -- Don't test the first char
{ {
@ -1331,7 +1330,7 @@ void normalizeHeadword( wstring & str )
} }
namespace { namespace {
void cutEnding( wstring & where, wstring const & ending ) void cutEnding( std::u32string & where, std::u32string const & ending )
{ {
if ( where.size() > ending.size() && where.compare( where.size() - ending.size(), ending.size(), ending ) == 0 ) { if ( where.size() > ending.size() && where.compare( where.size() - ending.size(), ending.size(), ending ) == 0 ) {
where.erase( where.size() - ending.size() ); where.erase( where.size() - ending.size() );
@ -1339,17 +1338,17 @@ void cutEnding( wstring & where, wstring const & ending )
} }
} // namespace } // namespace
quint32 dslLanguageToId( wstring const & name ) quint32 dslLanguageToId( std::u32string const & name )
{ {
static wstring newSp( U"newspelling" ); static std::u32string newSp( U"newspelling" );
static wstring st( U"standard" ); static std::u32string st( U"standard" );
static wstring ms( U"modernsort" ); static std::u32string ms( U"modernsort" );
static wstring ts( U"traditionalsort" ); static std::u32string ts( U"traditionalsort" );
static wstring prc( U"prc" ); static std::u32string prc( U"prc" );
// Any of those endings are to be removed // Any of those endings are to be removed
wstring nameStripped = Folding::apply( name ); std::u32string nameStripped = Folding::apply( name );
cutEnding( nameStripped, newSp ); cutEnding( nameStripped, newSp );
cutEnding( nameStripped, st ); cutEnding( nameStripped, st );

View file

@ -11,23 +11,21 @@
#include "iconv.hh" #include "iconv.hh"
#include <QtCore5Compat/QTextCodec> #include <QtCore5Compat/QTextCodec>
#include <QByteArray> #include <QByteArray>
#include "utf8.hh" #include "text.hh"
// Implementation details for Dsl, not part of its interface // Implementation details for Dsl, not part of its interface
namespace Dsl { namespace Dsl {
namespace Details { namespace Details {
using std::string; using std::string;
using gd::wstring;
using gd::wchar;
using std::list; using std::list;
using std::vector; using std::vector;
using Utf8::Encoding; using Text::Encoding;
using Utf8::LineFeed; using Text::LineFeed;
string findCodeForDslId( int id ); string findCodeForDslId( int id );
bool isAtSignFirst( wstring const & str ); bool isAtSignFirst( std::u32string const & str );
/// Parses the DSL language, representing it in its structural DOM form. /// Parses the DSL language, representing it in its structural DOM form.
struct ArticleDom struct ArticleDom
@ -37,23 +35,23 @@ struct ArticleDom
bool isTag; // true if it is a tag with subnodes, false if it's a leaf text bool isTag; // true if it is a tag with subnodes, false if it's a leaf text
// data. // data.
// Those are only used if isTag is true // Those are only used if isTag is true
wstring tagName; std::u32string tagName;
wstring tagAttrs; std::u32string tagAttrs;
wstring text; // This is only used if isTag is false std::u32string text; // This is only used if isTag is false
class Text class Text
{}; {};
class Tag class Tag
{}; {};
Node( Tag, wstring const & name, wstring const & attrs ): Node( Tag, std::u32string const & name, std::u32string const & attrs ):
isTag( true ), isTag( true ),
tagName( name ), tagName( name ),
tagAttrs( attrs ) tagAttrs( attrs )
{ {
} }
Node( Text, wstring const & text_ ): Node( Text, std::u32string const & text_ ):
isTag( false ), isTag( false ),
text( text_ ) text( text_ )
{ {
@ -61,30 +59,32 @@ struct ArticleDom
/// Concatenates all childen text nodes recursively to form all text /// Concatenates all childen text nodes recursively to form all text
/// the node contains stripped of any markup. /// the node contains stripped of any markup.
wstring renderAsText( bool stripTrsTag = false ) const; std::u32string renderAsText( bool stripTrsTag = false ) const;
}; };
/// Does the parse at construction. Refer to the 'root' member variable /// Does the parse at construction. Refer to the 'root' member variable
/// afterwards. /// afterwards.
explicit ArticleDom( wstring const &, string const & dictName = string(), wstring const & headword_ = wstring() ); explicit ArticleDom( std::u32string const &,
string const & dictName = string(),
std::u32string const & headword_ = std::u32string() );
/// Root of DOM's tree /// Root of DOM's tree
Node root; Node root;
private: private:
void openTag( wstring const & name, wstring const & attr, list< Node * > & stack ); void openTag( std::u32string const & name, std::u32string const & attr, list< Node * > & stack );
void closeTag( wstring const & name, list< Node * > & stack, bool warn = true ); void closeTag( std::u32string const & name, list< Node * > & stack, bool warn = true );
bool atSignFirstInLine(); bool atSignFirstInLine();
wchar const *stringPos, *lineStartPos; char32_t const *stringPos, *lineStartPos;
class eot: std::exception class eot: std::exception
{}; {};
wchar ch; char32_t ch;
bool escaped; bool escaped;
unsigned transcriptionCount; // >0 = inside a [t] tag unsigned transcriptionCount; // >0 = inside a [t] tag
unsigned mediaCount; // >0 = inside a [s] tag unsigned mediaCount; // >0 = inside a [s] tag
@ -93,7 +93,7 @@ private:
/// Information for diagnostic purposes /// Information for diagnostic purposes
string dictionaryName; string dictionaryName;
wstring headword; std::u32string headword;
}; };
/// Opens the .dsl or .dsl.dz file and allows line-by-line reading. Auto-detects /// Opens the .dsl or .dsl.dz file and allows line-by-line reading. Auto-detects
@ -103,9 +103,9 @@ class DslScanner
gzFile f; gzFile f;
Encoding encoding; Encoding encoding;
QTextCodec * codec; QTextCodec * codec;
wstring dictionaryName; std::u32string dictionaryName;
wstring langFrom, langTo; std::u32string langFrom, langTo;
wstring soundDictionary; std::u32string soundDictionary;
char readBuffer[ 65536 ]; char readBuffer[ 65536 ];
char * readBufferPtr; char * readBufferPtr;
LineFeed lineFeed; LineFeed lineFeed;
@ -132,25 +132,25 @@ public:
} }
/// Returns the dictionary's name, as was read from file's headers. /// Returns the dictionary's name, as was read from file's headers.
wstring const & getDictionaryName() const std::u32string const & getDictionaryName() const
{ {
return dictionaryName; return dictionaryName;
} }
/// Returns the dictionary's source language, as was read from file's headers. /// Returns the dictionary's source language, as was read from file's headers.
wstring const & getLangFrom() const std::u32string const & getLangFrom() const
{ {
return langFrom; return langFrom;
} }
/// Returns the dictionary's target language, as was read from file's headers. /// Returns the dictionary's target language, as was read from file's headers.
wstring const & getLangTo() const std::u32string const & getLangTo() const
{ {
return langTo; return langTo;
} }
/// Returns the preferred external dictionary with sounds, as was read from file's headers. /// Returns the preferred external dictionary with sounds, as was read from file's headers.
wstring const & getSoundDictionaryName() const std::u32string const & getSoundDictionaryName() const
{ {
return soundDictionary; return soundDictionary;
} }
@ -161,10 +161,10 @@ public:
/// If end of file is reached, false is returned. /// If end of file is reached, false is returned.
/// Reading begins from the first line after the headers (ones which start /// Reading begins from the first line after the headers (ones which start
/// with #). /// with #).
bool readNextLine( wstring &, size_t & offset, bool only_head_word = false ); bool readNextLine( std::u32string &, size_t & offset, bool only_head_word = false );
/// Similar readNextLine but strip all DSL comments {{...}} /// Similar readNextLine but strip all DSL comments {{...}}
bool readNextLineWithoutComments( wstring &, size_t & offset, bool only_headword = false ); bool readNextLineWithoutComments( std::u32string &, size_t & offset, bool only_headword = false );
/// Returns the number of lines read so far from the file. /// Returns the number of lines read so far from the file.
unsigned getLinesRead() const unsigned getLinesRead() const
@ -180,32 +180,35 @@ public:
/// This function either removes parts of string enclosed in braces, or leaves /// This function either removes parts of string enclosed in braces, or leaves
/// them intact. The braces themselves are removed always, though. /// them intact. The braces themselves are removed always, though.
void processUnsortedParts( wstring & str, bool strip ); void processUnsortedParts( std::u32string & str, bool strip );
/// Expands optional parts of a headword (ones marked with parentheses), /// Expands optional parts of a headword (ones marked with parentheses),
/// producing all possible combinations where they are present or absent. /// producing all possible combinations where they are present or absent.
void expandOptionalParts( wstring & str, list< wstring > * result, size_t x = 0, bool inside_recurse = false ); void expandOptionalParts( std::u32string & str,
list< std::u32string > * result,
size_t x = 0,
bool inside_recurse = false );
/// Expands all unescaped tildes, inserting tildeReplacement text instead of /// Expands all unescaped tildes, inserting tildeReplacement text instead of
/// them. /// them.
void expandTildes( wstring & str, wstring const & tildeReplacement ); void expandTildes( std::u32string & str, std::u32string const & tildeReplacement );
/// Unescapes any escaped chars. Be sure to handle all their special meanings /// Unescapes any escaped chars. Be sure to handle all their special meanings
/// before unescaping them. /// before unescaping them.
void unescapeDsl( wstring & str ); void unescapeDsl( std::u32string & str );
/// Normalizes the headword. Currently turns any sequences of consecutive spaces /// Normalizes the headword. Currently turns any sequences of consecutive spaces
/// into a single space. /// into a single space.
void normalizeHeadword( wstring & ); void normalizeHeadword( std::u32string & );
/// Strip DSL {{...}} comments /// Strip DSL {{...}} comments
void stripComments( wstring &, bool & ); void stripComments( std::u32string &, bool & );
inline size_t DslScanner::distanceToBytes( size_t x ) const inline size_t DslScanner::distanceToBytes( size_t x ) const
{ {
switch ( encoding ) { switch ( encoding ) {
case Utf8::Utf16LE: case Text::Utf16LE:
case Utf8::Utf16BE: case Text::Utf16BE:
return x * 2; return x * 2;
default: default:
return x; return x;
@ -214,7 +217,7 @@ inline size_t DslScanner::distanceToBytes( size_t x ) const
/// Converts the given language name taken from Dsl header (i.e. getLangFrom(), /// Converts the given language name taken from Dsl header (i.e. getLangFrom(),
/// getLangTo()) to its proper language id. /// getLangTo()) to its proper language id.
quint32 dslLanguageToId( wstring const & name ); quint32 dslLanguageToId( std::u32string const & name );
} // namespace Details } // namespace Details
} // namespace Dsl } // namespace Dsl

View file

@ -29,7 +29,7 @@ using std::multimap;
using std::vector; using std::vector;
using std::set; using std::set;
using std::pair; using std::pair;
using gd::wstring; using std::u32string;
namespace { namespace {
@ -109,10 +109,10 @@ public:
QString const & getDescription() override; QString const & getDescription() override;
void getHeadwordPos( wstring const & word_, QList< int > & pg, QList< int > & off ); void getHeadwordPos( u32string const & word_, QList< int > & pg, QList< int > & off );
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest >
getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; getArticle( u32string const &, vector< u32string > const & alts, u32string const &, bool ignoreDiacritics ) override;
sptr< Dictionary::DataRequest > getResource( string const & name ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override;
@ -134,16 +134,16 @@ public:
&& ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize ); && ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
} }
static int japaneseWriting( gd::wchar ch ); static int japaneseWriting( char32_t ch );
static bool isSign( gd::wchar ch ); static bool isSign( char32_t ch );
static bool isJapanesePunctiation( gd::wchar ch ); static bool isJapanesePunctiation( char32_t ch );
sptr< Dictionary::WordSearchRequest > prefixMatch( wstring const &, unsigned long ) override; sptr< Dictionary::WordSearchRequest > prefixMatch( u32string const &, unsigned long ) override;
sptr< Dictionary::WordSearchRequest > sptr< Dictionary::WordSearchRequest >
stemmedMatch( wstring const &, unsigned minLength, unsigned maxSuffixVariation, unsigned long maxResults ) override; stemmedMatch( u32string const &, unsigned minLength, unsigned maxSuffixVariation, unsigned long maxResults ) override;
protected: protected:
@ -156,7 +156,7 @@ private:
quint32 address, string & articleHeadword, string & articleText, int & articlePage, int & articleOffset ); quint32 address, string & articleHeadword, string & articleText, int & articlePage, int & articleOffset );
sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & word ) override; sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( u32string const & word ) override;
void loadArticleNextPage( string & articleHeadword, string & articleText, int & articlePage, int & articleOffset ); void loadArticleNextPage( string & articleHeadword, string & articleText, int & articlePage, int & articleOffset );
void void
@ -449,7 +449,7 @@ void EpwingDictionary::getArticleText( uint32_t articleAddress, QString & headwo
class EpwingHeadwordsRequest: public Dictionary::WordSearchRequest class EpwingHeadwordsRequest: public Dictionary::WordSearchRequest
{ {
wstring str; u32string str;
EpwingDictionary & dict; EpwingDictionary & dict;
QAtomicInt isCancelled; QAtomicInt isCancelled;
@ -457,7 +457,7 @@ class EpwingHeadwordsRequest: public Dictionary::WordSearchRequest
public: public:
EpwingHeadwordsRequest( wstring const & word_, EpwingDictionary & dict_ ): EpwingHeadwordsRequest( u32string const & word_, EpwingDictionary & dict_ ):
str( word_ ), str( word_ ),
dict( dict_ ) dict( dict_ )
{ {
@ -533,7 +533,7 @@ void EpwingHeadwordsRequest::run()
finish(); finish();
} }
sptr< Dictionary::WordSearchRequest > EpwingDictionary::findHeadwordsForSynonym( wstring const & word ) sptr< Dictionary::WordSearchRequest > EpwingDictionary::findHeadwordsForSynonym( u32string const & word )
{ {
return synonymSearchEnabled ? std::make_shared< EpwingHeadwordsRequest >( word, *this ) : return synonymSearchEnabled ? std::make_shared< EpwingHeadwordsRequest >( word, *this ) :
Class::findHeadwordsForSynonym( word ); Class::findHeadwordsForSynonym( word );
@ -542,8 +542,8 @@ sptr< Dictionary::WordSearchRequest > EpwingDictionary::findHeadwordsForSynonym(
class EpwingArticleRequest: public Dictionary::DataRequest class EpwingArticleRequest: public Dictionary::DataRequest
{ {
wstring word; u32string word;
vector< wstring > alts; vector< u32string > alts;
EpwingDictionary & dict; EpwingDictionary & dict;
bool ignoreDiacritics; bool ignoreDiacritics;
@ -552,8 +552,8 @@ class EpwingArticleRequest: public Dictionary::DataRequest
public: public:
EpwingArticleRequest( wstring const & word_, EpwingArticleRequest( u32string const & word_,
vector< wstring > const & alts_, vector< u32string > const & alts_,
EpwingDictionary & dict_, EpwingDictionary & dict_,
bool ignoreDiacritics_ ): bool ignoreDiacritics_ ):
word( word_ ), word( word_ ),
@ -568,10 +568,10 @@ public:
void run(); void run();
void getBuiltInArticle( wstring const & word_, void getBuiltInArticle( u32string const & word_,
QList< int > & pages, QList< int > & pages,
QList< int > & offsets, QList< int > & offsets,
multimap< wstring, pair< string, string > > & mainArticles ); multimap< u32string, pair< string, string > > & mainArticles );
void cancel() override void cancel() override
{ {
@ -601,13 +601,13 @@ void EpwingArticleRequest::run()
chain.insert( chain.end(), altChain.begin(), altChain.end() ); chain.insert( chain.end(), altChain.begin(), altChain.end() );
} }
multimap< wstring, pair< string, string > > mainArticles, alternateArticles; multimap< u32string, pair< string, string > > mainArticles, alternateArticles;
set< quint32 > articlesIncluded; // Some synonims make it that the articles set< quint32 > articlesIncluded; // Some synonims make it that the articles
// appear several times. We combat this // appear several times. We combat this
// by only allowing them to appear once. // by only allowing them to appear once.
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); u32string wordCaseFolded = Folding::applySimpleCaseOnly( word );
if ( ignoreDiacritics ) if ( ignoreDiacritics )
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
@ -641,11 +641,11 @@ void EpwingArticleRequest::run()
// We do the case-folded comparison here. // We do the case-folded comparison here.
wstring headwordStripped = Folding::applySimpleCaseOnly( headword ); u32string headwordStripped = Folding::applySimpleCaseOnly( headword );
if ( ignoreDiacritics ) if ( ignoreDiacritics )
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
multimap< wstring, pair< string, string > > & mapToUse = multimap< u32string, pair< string, string > > & mapToUse =
( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles;
mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) ); mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) );
@ -670,7 +670,7 @@ void EpwingArticleRequest::run()
string result = "<div class=\"epwing_article\">"; string result = "<div class=\"epwing_article\">";
multimap< wstring, pair< string, string > >::const_iterator i; multimap< u32string, pair< string, string > >::const_iterator i;
for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) {
result += "<h3>"; result += "<h3>";
@ -719,10 +719,10 @@ void EpwingArticleRequest::run()
finish(); finish();
} }
void EpwingArticleRequest::getBuiltInArticle( wstring const & word_, void EpwingArticleRequest::getBuiltInArticle( u32string const & word_,
QList< int > & pages, QList< int > & pages,
QList< int > & offsets, QList< int > & offsets,
multimap< wstring, pair< string, string > > & mainArticles ) multimap< u32string, pair< string, string > > & mainArticles )
{ {
try { try {
string headword, articleText; string headword, articleText;
@ -756,7 +756,7 @@ void EpwingArticleRequest::getBuiltInArticle( wstring const & word_,
} }
} }
void EpwingDictionary::getHeadwordPos( wstring const & word_, QList< int > & pg, QList< int > & off ) void EpwingDictionary::getHeadwordPos( u32string const & word_, QList< int > & pg, QList< int > & off )
{ {
try { try {
QMutexLocker _( &eBook.getLibMutex() ); QMutexLocker _( &eBook.getLibMutex() );
@ -767,9 +767,9 @@ void EpwingDictionary::getHeadwordPos( wstring const & word_, QList< int > & pg,
} }
} }
sptr< Dictionary::DataRequest > EpwingDictionary::getArticle( wstring const & word, sptr< Dictionary::DataRequest > EpwingDictionary::getArticle( u32string const & word,
vector< wstring > const & alts, vector< u32string > const & alts,
wstring const &, u32string const &,
bool ignoreDiacritics ) bool ignoreDiacritics )
{ {
@ -882,7 +882,7 @@ sptr< Dictionary::DataRequest > EpwingDictionary::getSearchResults( QString cons
ignoreDiacritics ); ignoreDiacritics );
} }
int EpwingDictionary::japaneseWriting( gd::wchar ch ) int EpwingDictionary::japaneseWriting( char32_t ch )
{ {
if ( ( ch >= 0x30A0 && ch <= 0x30FF ) || ( ch >= 0x31F0 && ch <= 0x31FF ) || ( ch >= 0x3200 && ch <= 0x32FF ) if ( ( ch >= 0x30A0 && ch <= 0x30FF ) || ( ch >= 0x31F0 && ch <= 0x31FF ) || ( ch >= 0x3200 && ch <= 0x32FF )
|| ( ch >= 0xFF00 && ch <= 0xFFEF ) || ( ch == 0x1B000 ) ) || ( ch >= 0xFF00 && ch <= 0xFFEF ) || ( ch == 0x1B000 ) )
@ -895,7 +895,7 @@ int EpwingDictionary::japaneseWriting( gd::wchar ch )
return 0; return 0;
} }
bool EpwingDictionary::isSign( gd::wchar ch ) bool EpwingDictionary::isSign( char32_t ch )
{ {
switch ( ch ) { switch ( ch ) {
case 0x002B: // PLUS SIGN case 0x002B: // PLUS SIGN
@ -915,7 +915,7 @@ bool EpwingDictionary::isSign( gd::wchar ch )
} }
} }
bool EpwingDictionary::isJapanesePunctiation( gd::wchar ch ) bool EpwingDictionary::isJapanesePunctiation( char32_t ch )
{ {
return ch >= 0x3000 && ch <= 0x303F; return ch >= 0x3000 && ch <= 0x303F;
} }
@ -929,7 +929,7 @@ class EpwingWordSearchRequest: public BtreeIndexing::BtreeWordSearchRequest
public: public:
EpwingWordSearchRequest( EpwingDictionary & dict_, EpwingWordSearchRequest( EpwingDictionary & dict_,
wstring const & str_, u32string const & str_,
unsigned minLength_, unsigned minLength_,
int maxSuffixVariation_, int maxSuffixVariation_,
bool allowMiddleMatches_, bool allowMiddleMatches_,
@ -976,13 +976,13 @@ void EpwingWordSearchRequest::findMatches()
finish(); finish();
} }
sptr< Dictionary::WordSearchRequest > EpwingDictionary::prefixMatch( wstring const & str, unsigned long maxResults ) sptr< Dictionary::WordSearchRequest > EpwingDictionary::prefixMatch( u32string const & str, unsigned long maxResults )
{ {
return std::make_shared< EpwingWordSearchRequest >( *this, str, 0, -1, true, maxResults ); return std::make_shared< EpwingWordSearchRequest >( *this, str, 0, -1, true, maxResults );
} }
sptr< Dictionary::WordSearchRequest > EpwingDictionary::stemmedMatch( wstring const & str, sptr< Dictionary::WordSearchRequest > EpwingDictionary::stemmedMatch( u32string const & str,
unsigned minLength, unsigned minLength,
unsigned maxSuffixVariation, unsigned maxSuffixVariation,
unsigned long maxResults ) unsigned long maxResults )
@ -1021,20 +1021,20 @@ void addWordToChunks( Epwing::Book::EpwingHeadword & head,
chunks.addToBlock( &head.page, sizeof( head.page ) ); chunks.addToBlock( &head.page, sizeof( head.page ) );
chunks.addToBlock( &head.offset, sizeof( head.offset ) ); chunks.addToBlock( &head.offset, sizeof( head.offset ) );
wstring hw = head.headword.toStdU32String(); u32string hw = head.headword.toStdU32String();
indexedWords.addWord( hw, offset ); indexedWords.addWord( hw, offset );
wordCount++; wordCount++;
articleCount++; articleCount++;
vector< wstring > words; vector< u32string > words;
// Parse combined kanji/katakana/hiragana headwords // Parse combined kanji/katakana/hiragana headwords
int w_prev = 0; int w_prev = 0;
wstring word; u32string word;
for ( wstring::size_type n = 0; n < hw.size(); n++ ) { for ( u32string::size_type n = 0; n < hw.size(); n++ ) {
gd::wchar ch = hw[ n ]; char32_t ch = hw[ n ];
if ( Folding::isPunct( ch ) || Folding::isWhitespace( ch ) || EpwingDictionary::isSign( ch ) if ( Folding::isPunct( ch ) || Folding::isWhitespace( ch ) || EpwingDictionary::isSign( ch )
|| EpwingDictionary::isJapanesePunctiation( ch ) ) || EpwingDictionary::isJapanesePunctiation( ch ) )
@ -1044,7 +1044,7 @@ void addWordToChunks( Epwing::Book::EpwingHeadword & head,
if ( w > 0 ) { if ( w > 0 ) {
// Store only separated words // Store only separated words
gd::wchar ch_prev = 0; char32_t ch_prev = 0;
if ( n ) if ( n )
ch_prev = hw[ n - 1 ]; ch_prev = hw[ n - 1 ];
bool needStore = ( n == 0 || Folding::isPunct( ch_prev ) || Folding::isWhitespace( ch_prev ) bool needStore = ( n == 0 || Folding::isPunct( ch_prev ) || Folding::isWhitespace( ch_prev )
@ -1052,7 +1052,7 @@ void addWordToChunks( Epwing::Book::EpwingHeadword & head,
word.push_back( ch ); word.push_back( ch );
w_prev = w; w_prev = w;
wstring::size_type i; u32string::size_type i;
for ( i = n + 1; i < hw.size(); i++ ) { for ( i = n + 1; i < hw.size(); i++ ) {
ch = hw[ i ]; ch = hw[ i ];
if ( Folding::isPunct( ch ) || Folding::isWhitespace( ch ) || EpwingDictionary::isJapanesePunctiation( ch ) ) if ( Folding::isPunct( ch ) || Folding::isWhitespace( ch ) || EpwingDictionary::isJapanesePunctiation( ch ) )

View file

@ -10,8 +10,7 @@
#include <QTextDocumentFragment> #include <QTextDocumentFragment>
#include <QHash> #include <QHash>
#include "audiolink.hh" #include "audiolink.hh"
#include "wstring.hh" #include "text.hh"
#include "wstring_qt.hh"
#include "folding.hh" #include "folding.hh"
#include "epwing_charmap.hh" #include "epwing_charmap.hh"
#include "htmlescape.hh" #include "htmlescape.hh"
@ -1135,7 +1134,7 @@ void EpwingBook::fixHeadword( QString & headword )
// return; // return;
//} //}
gd::wstring folded = Folding::applyPunctOnly( fixed.toStdU32String() ); std::u32string folded = Folding::applyPunctOnly( fixed.toStdU32String() );
//fixed = QString::fromStdU32String( folded ); //fixed = QString::fromStdU32String( folded );
//if( isHeadwordCorrect( fixed ) ) //if( isHeadwordCorrect( fixed ) )
@ -1993,4 +1992,4 @@ QMutex EpwingBook::libMutex;
} // namespace Epwing } // namespace Epwing
#endif #endif

View file

@ -2,14 +2,13 @@
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "forvo.hh" #include "forvo.hh"
#include "wstring_qt.hh"
#include <QNetworkAccessManager> #include <QNetworkAccessManager>
#include <QNetworkReply> #include <QNetworkReply>
#include <QtXml> #include <QtXml>
#include <list> #include <list>
#include "audiolink.hh" #include "audiolink.hh"
#include "htmlescape.hh" #include "htmlescape.hh"
#include "utf8.hh" #include "text.hh"
namespace Forvo { namespace Forvo {
@ -48,7 +47,7 @@ public:
return 0; return 0;
} }
sptr< WordSearchRequest > prefixMatch( wstring const & /*word*/, unsigned long /*maxResults*/ ) override sptr< WordSearchRequest > prefixMatch( std::u32string const & /*word*/, unsigned long /*maxResults*/ ) override
{ {
sptr< WordSearchRequestInstant > sr = std::make_shared< WordSearchRequestInstant >(); sptr< WordSearchRequestInstant > sr = std::make_shared< WordSearchRequestInstant >();
@ -57,7 +56,8 @@ public:
return sr; return sr;
} }
sptr< DataRequest > getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ) override; sptr< DataRequest >
getArticle( std::u32string const &, vector< std::u32string > const & alts, std::u32string const &, bool ) override;
protected: protected:
@ -89,8 +89,8 @@ class ForvoArticleRequest: public Dictionary::DataRequest
public: public:
ForvoArticleRequest( wstring const & word, ForvoArticleRequest( std::u32string const & word,
vector< wstring > const & alts, vector< std::u32string > const & alts,
QString const & apiKey_, QString const & apiKey_,
QString const & languageCode_, QString const & languageCode_,
string const & dictionaryId_, string const & dictionaryId_,
@ -100,14 +100,16 @@ public:
private: private:
void addQuery( QNetworkAccessManager & mgr, wstring const & word ); void addQuery( QNetworkAccessManager & mgr, std::u32string const & word );
private slots: private slots:
virtual void requestFinished( QNetworkReply * ); virtual void requestFinished( QNetworkReply * );
}; };
sptr< DataRequest > sptr< DataRequest > ForvoDictionary::getArticle( std::u32string const & word,
ForvoDictionary::getArticle( wstring const & word, vector< wstring > const & alts, wstring const &, bool ) vector< std::u32string > const & alts,
std::u32string const &,
bool )
{ {
if ( word.size() > 80 || apiKey.isEmpty() ) { if ( word.size() > 80 || apiKey.isEmpty() ) {
@ -137,8 +139,8 @@ void ForvoArticleRequest::cancel()
finish(); finish();
} }
ForvoArticleRequest::ForvoArticleRequest( wstring const & str, ForvoArticleRequest::ForvoArticleRequest( std::u32string const & str,
vector< wstring > const & alts, vector< std::u32string > const & alts,
QString const & apiKey_, QString const & apiKey_,
QString const & languageCode_, QString const & languageCode_,
string const & dictionaryId_, string const & dictionaryId_,
@ -156,7 +158,7 @@ ForvoArticleRequest::ForvoArticleRequest( wstring const & str,
} }
} }
void ForvoArticleRequest::addQuery( QNetworkAccessManager & mgr, wstring const & str ) void ForvoArticleRequest::addQuery( QNetworkAccessManager & mgr, std::u32string const & str )
{ {
qDebug( "Forvo: requesting article %s", QString::fromStdU32String( str ).toUtf8().data() ); qDebug( "Forvo: requesting article %s", QString::fromStdU32String( str ).toUtf8().data() );
@ -177,7 +179,7 @@ void ForvoArticleRequest::addQuery( QNetworkAccessManager & mgr, wstring const &
sptr< QNetworkReply > netReply = std::shared_ptr< QNetworkReply >( mgr.get( QNetworkRequest( reqUrl ) ) ); sptr< QNetworkReply > netReply = std::shared_ptr< QNetworkReply >( mgr.get( QNetworkRequest( reqUrl ) ) );
netReplies.push_back( NetReply( netReply, Utf8::encode( str ) ) ); netReplies.push_back( NetReply( netReply, Text::toUtf8( str ) ) );
} }
void ForvoArticleRequest::requestFinished( QNetworkReply * r ) void ForvoArticleRequest::requestFinished( QNetworkReply * r )

View file

@ -8,8 +8,7 @@
#include "ufile.hh" #include "ufile.hh"
#include "btreeidx.hh" #include "btreeidx.hh"
#include "folding.hh" #include "folding.hh"
#include "utf8.hh" #include "text.hh"
#include "wstring_qt.hh"
#include "chunkedstorage.hh" #include "chunkedstorage.hh"
#include "langcoder.hh" #include "langcoder.hh"
#include "dictzip.hh" #include "dictzip.hh"
@ -39,14 +38,12 @@ using std::set;
using std::multimap; using std::multimap;
using std::pair; using std::pair;
using gd::wstring;
using gd::wchar;
using BtreeIndexing::WordArticleLink; using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords; using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo; using BtreeIndexing::IndexInfo;
using Utf8::Encoding; using Text::Encoding;
using Utf8::LineFeed; using Text::LineFeed;
/////////////// GlsScanner /////////////// GlsScanner
@ -55,9 +52,9 @@ class GlsScanner
gzFile f; gzFile f;
Encoding encoding; Encoding encoding;
QTextCodec * codec; QTextCodec * codec;
wstring dictionaryName; std::u32string dictionaryName;
wstring dictionaryDecription, dictionaryAuthor; std::u32string dictionaryDecription, dictionaryAuthor;
wstring langFrom, langTo; std::u32string langFrom, langTo;
char readBuffer[ 10000 ]; char readBuffer[ 10000 ];
char * readBufferPtr; char * readBufferPtr;
size_t readBufferLeft; size_t readBufferLeft;
@ -82,31 +79,31 @@ public:
} }
/// Returns the dictionary's name, as was read from file's headers. /// Returns the dictionary's name, as was read from file's headers.
wstring const & getDictionaryName() const std::u32string const & getDictionaryName() const
{ {
return dictionaryName; return dictionaryName;
} }
/// Returns the dictionary's author, as was read from file's headers. /// Returns the dictionary's author, as was read from file's headers.
wstring const & getDictionaryAuthor() const std::u32string const & getDictionaryAuthor() const
{ {
return dictionaryAuthor; return dictionaryAuthor;
} }
/// Returns the dictionary's description, as was read from file's headers. /// Returns the dictionary's description, as was read from file's headers.
wstring const & getDictionaryDescription() const std::u32string const & getDictionaryDescription() const
{ {
return dictionaryDecription; return dictionaryDecription;
} }
/// Returns the dictionary's source language, as was read from file's headers. /// Returns the dictionary's source language, as was read from file's headers.
wstring const & getLangFrom() const std::u32string const & getLangFrom() const
{ {
return langFrom; return langFrom;
} }
/// Returns the dictionary's target language, as was read from file's headers. /// Returns the dictionary's target language, as was read from file's headers.
wstring const & getLangTo() const std::u32string const & getLangTo() const
{ {
return langTo; return langTo;
} }
@ -117,7 +114,7 @@ public:
/// If end of file is reached, false is returned. /// If end of file is reached, false is returned.
/// Reading begins from the first line after the headers (ones which end /// Reading begins from the first line after the headers (ones which end
/// by the "### Glossary section:" line). /// by the "### Glossary section:" line).
bool readNextLine( wstring &, size_t & offset ); bool readNextLine( std::u32string &, size_t & offset );
/// Returns the number of lines read so far from the file. /// Returns the number of lines read so far from the file.
unsigned getLinesRead() const unsigned getLinesRead() const
{ {
@ -126,7 +123,7 @@ public:
}; };
GlsScanner::GlsScanner( string const & fileName ): GlsScanner::GlsScanner( string const & fileName ):
encoding( Utf8::Utf8 ), encoding( Text::Utf8 ),
readBufferPtr( readBuffer ), readBufferPtr( readBuffer ),
readBufferLeft( 0 ), readBufferLeft( 0 ),
linesRead( 0 ) linesRead( 0 )
@ -152,10 +149,10 @@ GlsScanner::GlsScanner( string const & fileName ):
// If the file begins with the dedicated Unicode marker, we just consume // If the file begins with the dedicated Unicode marker, we just consume
// it. If, on the other hand, it's not, we return the bytes back // it. If, on the other hand, it's not, we return the bytes back
if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE ) { if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE ) {
encoding = Utf8::Utf16LE; encoding = Text::Utf16LE;
} }
else if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF ) { else if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF ) {
encoding = Utf8::Utf16BE; encoding = Text::Utf16BE;
} }
else if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB ) { else if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB ) {
// Looks like Utf8, read one more byte // Looks like Utf8, read one more byte
@ -164,29 +161,29 @@ GlsScanner::GlsScanner( string const & fileName ):
gzclose( f ); gzclose( f );
throw exMalformedGlsFile( fileName ); throw exMalformedGlsFile( fileName );
} }
encoding = Utf8::Utf8; encoding = Text::Utf8;
} }
else { else {
if ( gzrewind( f ) ) { if ( gzrewind( f ) ) {
gzclose( f ); gzclose( f );
throw exCantOpen( fileName ); throw exCantOpen( fileName );
} }
encoding = Utf8::Utf8; encoding = Text::Utf8;
} }
codec = QTextCodec::codecForName( Utf8::getEncodingNameFor( encoding ) ); codec = QTextCodec::codecForName( Text::getEncodingNameFor( encoding ) );
// We now can use our own readNextLine() function // We now can use our own readNextLine() function
lineFeed = Utf8::initLineFeed( encoding ); lineFeed = Text::initLineFeed( encoding );
wstring str; std::u32string str;
wstring * currentField = 0; std::u32string * currentField = 0;
wstring mark = U"###"; std::u32string mark = U"###";
wstring titleMark = U"### Glossary title:"; std::u32string titleMark = U"### Glossary title:";
wstring authorMark = U"### Author:"; std::u32string authorMark = U"### Author:";
wstring descriptionMark = U"### Description:"; std::u32string descriptionMark = U"### Description:";
wstring langFromMark = U"### Source language:"; std::u32string langFromMark = U"### Source language:";
wstring langToMark = U"### Target language:"; std::u32string langToMark = U"### Target language:";
wstring endOfHeaderMark = U"### Glossary section:"; std::u32string endOfHeaderMark = U"### Glossary section:";
size_t offset; size_t offset;
for ( ;; ) { for ( ;; ) {
@ -199,22 +196,22 @@ GlsScanner::GlsScanner( string const & fileName ):
currentField = 0; currentField = 0;
if ( str.compare( 0, titleMark.size(), titleMark ) == 0 ) { if ( str.compare( 0, titleMark.size(), titleMark ) == 0 ) {
dictionaryName = wstring( str, titleMark.size(), str.size() - titleMark.size() ); dictionaryName = std::u32string( str, titleMark.size(), str.size() - titleMark.size() );
currentField = &dictionaryName; currentField = &dictionaryName;
} }
else if ( str.compare( 0, authorMark.size(), authorMark ) == 0 ) { else if ( str.compare( 0, authorMark.size(), authorMark ) == 0 ) {
dictionaryAuthor = wstring( str, authorMark.size(), str.size() - authorMark.size() ); dictionaryAuthor = std::u32string( str, authorMark.size(), str.size() - authorMark.size() );
currentField = &dictionaryAuthor; currentField = &dictionaryAuthor;
} }
else if ( str.compare( 0, descriptionMark.size(), descriptionMark ) == 0 ) { else if ( str.compare( 0, descriptionMark.size(), descriptionMark ) == 0 ) {
dictionaryDecription = wstring( str, descriptionMark.size(), str.size() - descriptionMark.size() ); dictionaryDecription = std::u32string( str, descriptionMark.size(), str.size() - descriptionMark.size() );
currentField = &dictionaryDecription; currentField = &dictionaryDecription;
} }
else if ( str.compare( 0, langFromMark.size(), langFromMark ) == 0 ) { else if ( str.compare( 0, langFromMark.size(), langFromMark ) == 0 ) {
langFrom = wstring( str, langFromMark.size(), str.size() - langFromMark.size() ); langFrom = std::u32string( str, langFromMark.size(), str.size() - langFromMark.size() );
} }
else if ( str.compare( 0, langToMark.size(), langToMark ) == 0 ) { else if ( str.compare( 0, langToMark.size(), langToMark ) == 0 ) {
langTo = wstring( str, langToMark.size(), str.size() - langToMark.size() ); langTo = std::u32string( str, langToMark.size(), str.size() - langToMark.size() );
} }
else if ( str.compare( 0, endOfHeaderMark.size(), endOfHeaderMark ) == 0 ) { else if ( str.compare( 0, endOfHeaderMark.size(), endOfHeaderMark ) == 0 ) {
break; break;
@ -229,7 +226,7 @@ GlsScanner::GlsScanner( string const & fileName ):
} }
} }
bool GlsScanner::readNextLine( wstring & out, size_t & offset ) bool GlsScanner::readNextLine( std::u32string & out, size_t & offset )
{ {
offset = (size_t)( gztell( f ) - readBufferLeft ); offset = (size_t)( gztell( f ) - readBufferLeft );
@ -256,7 +253,7 @@ bool GlsScanner::readNextLine( wstring & out, size_t & offset )
return false; return false;
} }
int pos = Utf8::findFirstLinePosition( readBufferPtr, readBufferLeft, lineFeed.lineFeed, lineFeed.length ); int pos = Text::findFirstLinePosition( readBufferPtr, readBufferLeft, lineFeed.lineFeed, lineFeed.length );
if ( pos == -1 ) { if ( pos == -1 ) {
return false; return false;
} }
@ -369,10 +366,12 @@ public:
return idxHeader.langTo; return idxHeader.langTo;
} }
sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & ) override; sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( std::u32string const & ) override;
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest > getArticle( std::u32string const &,
getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; vector< std::u32string > const & alts,
std::u32string const &,
bool ignoreDiacritics ) override;
sptr< Dictionary::DataRequest > getResource( string const & name ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override;
@ -503,11 +502,11 @@ QString const & GlsDictionary::getDescription()
try { try {
GlsScanner scanner( getDictionaryFilenames()[ 0 ] ); GlsScanner scanner( getDictionaryFilenames()[ 0 ] );
string str = Utf8::encode( scanner.getDictionaryAuthor() ); string str = Text::toUtf8( scanner.getDictionaryAuthor() );
if ( !str.empty() ) { if ( !str.empty() ) {
dictionaryDescription = QObject::tr( "Author: %1%2" ).arg( QString::fromUtf8( str.c_str() ) ).arg( "\n\n" ); dictionaryDescription = QObject::tr( "Author: %1%2" ).arg( QString::fromUtf8( str.c_str() ) ).arg( "\n\n" );
} }
str = Utf8::encode( scanner.getDictionaryDescription() ); str = Text::toUtf8( scanner.getDictionaryDescription() );
if ( !str.empty() ) { if ( !str.empty() ) {
QString desc = QString::fromUtf8( str.c_str() ); QString desc = QString::fromUtf8( str.c_str() );
desc.replace( "\t", "<br/>" ); desc.replace( "\t", "<br/>" );
@ -592,7 +591,7 @@ void GlsDictionary::loadArticleText( uint32_t address, vector< string > & headwo
} }
else { else {
string articleData = string articleData =
Iconv::toUtf8( Utf8::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize ); Iconv::toUtf8( Text::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize );
string::size_type start_pos = 0, end_pos = 0; string::size_type start_pos = 0, end_pos = 0;
for ( ;; ) { for ( ;; ) {
@ -621,7 +620,7 @@ void GlsDictionary::loadArticleText( uint32_t address, vector< string > & headwo
end_pos = 0; end_pos = 0;
for ( ;; ) { for ( ;; ) {
end_pos = headword.find( '|', start_pos ); end_pos = headword.find( '|', start_pos );
if ( end_pos == wstring::npos ) { if ( end_pos == std::u32string::npos ) {
string hw = headword.substr( start_pos ); string hw = headword.substr( start_pos );
if ( !hw.empty() ) { if ( !hw.empty() ) {
headwords.push_back( hw ); headwords.push_back( hw );
@ -804,7 +803,7 @@ void GlsDictionary::getArticleText( uint32_t articleAddress, QString & headword,
class GlsHeadwordsRequest: public Dictionary::WordSearchRequest class GlsHeadwordsRequest: public Dictionary::WordSearchRequest
{ {
wstring word; std::u32string word;
GlsDictionary & dict; GlsDictionary & dict;
QAtomicInt isCancelled; QAtomicInt isCancelled;
@ -812,7 +811,7 @@ class GlsHeadwordsRequest: public Dictionary::WordSearchRequest
public: public:
GlsHeadwordsRequest( wstring const & word_, GlsDictionary & dict_ ): GlsHeadwordsRequest( std::u32string const & word_, GlsDictionary & dict_ ):
word( word_ ), word( word_ ),
dict( dict_ ) dict( dict_ )
{ {
@ -845,7 +844,7 @@ void GlsHeadwordsRequest::run()
try { try {
vector< WordArticleLink > chain = dict.findArticles( word ); vector< WordArticleLink > chain = dict.findArticles( word );
wstring caseFolded = Folding::applySimpleCaseOnly( word ); std::u32string caseFolded = Folding::applySimpleCaseOnly( word );
for ( auto & x : chain ) { for ( auto & x : chain ) {
if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) { if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) {
@ -858,7 +857,7 @@ void GlsHeadwordsRequest::run()
dict.loadArticleText( x.articleOffset, headwords, articleText ); dict.loadArticleText( x.articleOffset, headwords, articleText );
wstring headwordDecoded = Utf8::decode( headwords.front() ); std::u32string headwordDecoded = Text::toUtf32( headwords.front() );
if ( caseFolded != Folding::applySimpleCaseOnly( headwordDecoded ) ) { if ( caseFolded != Folding::applySimpleCaseOnly( headwordDecoded ) ) {
// The headword seems to differ from the input word, which makes the // The headword seems to differ from the input word, which makes the
@ -876,7 +875,7 @@ void GlsHeadwordsRequest::run()
finish(); finish();
} }
sptr< Dictionary::WordSearchRequest > GlsDictionary::findHeadwordsForSynonym( wstring const & word ) sptr< Dictionary::WordSearchRequest > GlsDictionary::findHeadwordsForSynonym( std::u32string const & word )
{ {
return synonymSearchEnabled ? std::make_shared< GlsHeadwordsRequest >( word, *this ) : return synonymSearchEnabled ? std::make_shared< GlsHeadwordsRequest >( word, *this ) :
@ -889,8 +888,8 @@ sptr< Dictionary::WordSearchRequest > GlsDictionary::findHeadwordsForSynonym( ws
class GlsArticleRequest: public Dictionary::DataRequest class GlsArticleRequest: public Dictionary::DataRequest
{ {
wstring word; std::u32string word;
vector< wstring > alts; vector< std::u32string > alts;
GlsDictionary & dict; GlsDictionary & dict;
bool ignoreDiacritics; bool ignoreDiacritics;
@ -899,8 +898,8 @@ class GlsArticleRequest: public Dictionary::DataRequest
public: public:
GlsArticleRequest( wstring const & word_, GlsArticleRequest( std::u32string const & word_,
vector< wstring > const & alts_, vector< std::u32string > const & alts_,
GlsDictionary & dict_, GlsDictionary & dict_,
bool ignoreDiacritics_ ): bool ignoreDiacritics_ ):
word( word_ ), word( word_ ),
@ -944,13 +943,13 @@ void GlsArticleRequest::run()
chain.insert( chain.end(), altChain.begin(), altChain.end() ); chain.insert( chain.end(), altChain.begin(), altChain.end() );
} }
multimap< wstring, pair< string, string > > mainArticles, alternateArticles; multimap< std::u32string, pair< string, string > > mainArticles, alternateArticles;
set< uint32_t > articlesIncluded; // Some synonims make it that the articles set< uint32_t > articlesIncluded; // Some synonims make it that the articles
// appear several times. We combat this // appear several times. We combat this
// by only allowing them to appear once. // by only allowing them to appear once.
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
} }
@ -976,16 +975,16 @@ void GlsArticleRequest::run()
// We do the case-folded comparison here. // We do the case-folded comparison here.
wstring headwordStripped = Folding::applySimpleCaseOnly( Utf8::decode( headword ) ); std::u32string headwordStripped = Folding::applySimpleCaseOnly( Text::toUtf32( headword ) );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
} }
multimap< wstring, pair< string, string > > & mapToUse = multimap< std::u32string, pair< string, string > > & mapToUse =
( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles;
mapToUse.insert( mapToUse.insert(
pair( Folding::applySimpleCaseOnly( Utf8::decode( headword ) ), pair( headword, articleText ) ) ); pair( Folding::applySimpleCaseOnly( Text::toUtf32( headword ) ), pair( headword, articleText ) ) );
articlesIncluded.insert( x.articleOffset ); articlesIncluded.insert( x.articleOffset );
} }
@ -998,7 +997,7 @@ void GlsArticleRequest::run()
string result; string result;
multimap< wstring, pair< string, string > >::const_iterator i; multimap< std::u32string, pair< string, string > >::const_iterator i;
for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) {
result += i->second.second; result += i->second.second;
@ -1019,9 +1018,9 @@ void GlsArticleRequest::run()
finish(); finish();
} }
sptr< Dictionary::DataRequest > GlsDictionary::getArticle( wstring const & word, sptr< Dictionary::DataRequest > GlsDictionary::getArticle( std::u32string const & word,
vector< wstring > const & alts, vector< std::u32string > const & alts,
wstring const &, std::u32string const &,
bool ignoreDiacritics ) bool ignoreDiacritics )
{ {
@ -1097,7 +1096,7 @@ void GlsResourceRequest::run()
if ( dict.resourceZip.isOpen() ) { if ( dict.resourceZip.isOpen() ) {
QMutexLocker _( &dataMutex ); QMutexLocker _( &dataMutex );
if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) ) { if ( !dict.resourceZip.loadFile( Text::toUtf32( resourceName ), data ) ) {
throw; // Make it fail since we couldn't read the archive throw; // Make it fail since we couldn't read the archive
} }
} }
@ -1239,7 +1238,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
// which the incident happened. We need alive scanner for that. // which the incident happened. We need alive scanner for that.
// Building the index // Building the index
initializing.indexingDictionary( Utf8::encode( scanner.getDictionaryName() ) ); initializing.indexingDictionary( Text::toUtf8( scanner.getDictionaryName() ) );
qDebug( "Gls: Building the index for dictionary: %s", qDebug( "Gls: Building the index for dictionary: %s",
QString::fromStdU32String( scanner.getDictionaryName() ).toUtf8().data() ); QString::fromStdU32String( scanner.getDictionaryName() ).toUtf8().data() );
@ -1255,7 +1254,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
idx.write( idxHeader ); idx.write( idxHeader );
string dictionaryName = Utf8::encode( scanner.getDictionaryName() ); string dictionaryName = Text::toUtf8( scanner.getDictionaryName() );
idx.write( (uint32_t)dictionaryName.size() ); idx.write( (uint32_t)dictionaryName.size() );
idx.write( dictionaryName.data(), dictionaryName.size() ); idx.write( dictionaryName.data(), dictionaryName.size() );
@ -1266,7 +1265,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
ChunkedStorage::Writer chunks( idx ); ChunkedStorage::Writer chunks( idx );
wstring curString; std::u32string curString;
size_t curOffset; size_t curOffset;
uint32_t articleCount = 0, wordCount = 0; uint32_t articleCount = 0, wordCount = 0;
@ -1286,12 +1285,12 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
// Parse headwords // Parse headwords
list< wstring > allEntryWords; list< std::u32string > allEntryWords;
wstring::size_type start_pos = 0, end_pos = 0; std::u32string::size_type start_pos = 0, end_pos = 0;
for ( ;; ) { for ( ;; ) {
end_pos = curString.find( '|', start_pos ); end_pos = curString.find( '|', start_pos );
if ( end_pos == wstring::npos ) { if ( end_pos == std::u32string::npos ) {
wstring headword = curString.substr( start_pos ); std::u32string headword = curString.substr( start_pos );
if ( !headword.empty() ) { if ( !headword.empty() ) {
allEntryWords.push_back( headword ); allEntryWords.push_back( headword );
} }

View file

@ -2,7 +2,7 @@
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "hunspell.hh" #include "hunspell.hh"
#include "utf8.hh" #include "text.hh"
#include "htmlescape.hh" #include "htmlescape.hh"
#include "iconv.hh" #include "iconv.hh"
#include "folding.hh" #include "folding.hh"
@ -21,7 +21,6 @@ namespace HunspellMorpho {
using namespace Dictionary; using namespace Dictionary;
using gd::wchar;
namespace { namespace {
@ -60,18 +59,19 @@ public:
return 0; return 0;
} }
sptr< WordSearchRequest > prefixMatch( wstring const &, unsigned long maxResults ) override; sptr< WordSearchRequest > prefixMatch( std::u32string const &, unsigned long maxResults ) override;
sptr< WordSearchRequest > findHeadwordsForSynonym( wstring const & ) override; sptr< WordSearchRequest > findHeadwordsForSynonym( std::u32string const & ) override;
sptr< DataRequest > getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ) override; sptr< DataRequest >
getArticle( std::u32string const &, vector< std::u32string > const & alts, std::u32string const &, bool ) override;
bool isLocalDictionary() override bool isLocalDictionary() override
{ {
return true; return true;
} }
vector< wstring > getAlternateWritings( const wstring & word ) noexcept override; vector< std::u32string > getAlternateWritings( const std::u32string & word ) noexcept override;
protected: protected:
@ -94,25 +94,25 @@ private:
/// Encodes the given string to be passed to the hunspell object. May throw /// Encodes the given string to be passed to the hunspell object. May throw
/// Iconv::Ex /// Iconv::Ex
string encodeToHunspell( Hunspell &, wstring const & ); string encodeToHunspell( Hunspell &, std::u32string const & );
/// Decodes the given string returned by the hunspell object. May throw /// Decodes the given string returned by the hunspell object. May throw
/// Iconv::Ex /// Iconv::Ex
wstring decodeFromHunspell( Hunspell &, char const * ); std::u32string decodeFromHunspell( Hunspell &, char const * );
/// Generates suggestions via hunspell /// Generates suggestions via hunspell
QList< wstring > suggest( wstring & word, QMutex & hunspellMutex, Hunspell & hunspell ); QList< std::u32string > suggest( std::u32string & word, QMutex & hunspellMutex, Hunspell & hunspell );
/// Generates suggestions for compound expression /// Generates suggestions for compound expression
void getSuggestionsForExpression( wstring const & expression, void getSuggestionsForExpression( std::u32string const & expression,
vector< wstring > & suggestions, vector< std::u32string > & suggestions,
QMutex & hunspellMutex, QMutex & hunspellMutex,
Hunspell & hunspell ); Hunspell & hunspell );
/// Returns true if the string contains whitespace, false otherwise /// Returns true if the string contains whitespace, false otherwise
bool containsWhitespace( wstring const & str ) bool containsWhitespace( std::u32string const & str )
{ {
wchar const * next = str.c_str(); char32_t const * next = str.c_str();
for ( ; *next; ++next ) { for ( ; *next; ++next ) {
if ( Folding::isWhitespace( *next ) ) { if ( Folding::isWhitespace( *next ) ) {
@ -142,9 +142,9 @@ void HunspellDictionary::loadIcon() noexcept
dictionaryIconLoaded = true; dictionaryIconLoaded = true;
} }
vector< wstring > HunspellDictionary::getAlternateWritings( wstring const & word ) noexcept vector< std::u32string > HunspellDictionary::getAlternateWritings( std::u32string const & word ) noexcept
{ {
vector< wstring > results; vector< std::u32string > results;
if ( containsWhitespace( word ) ) { if ( containsWhitespace( word ) ) {
getSuggestionsForExpression( word, results, getHunspellMutex(), hunspell ); getSuggestionsForExpression( word, results, getHunspellMutex(), hunspell );
@ -160,14 +160,14 @@ class HunspellArticleRequest: public Dictionary::DataRequest
QMutex & hunspellMutex; QMutex & hunspellMutex;
Hunspell & hunspell; Hunspell & hunspell;
wstring word; std::u32string word;
QAtomicInt isCancelled; QAtomicInt isCancelled;
QFuture< void > f; QFuture< void > f;
public: public:
HunspellArticleRequest( wstring const & word_, QMutex & hunspellMutex_, Hunspell & hunspell_ ): HunspellArticleRequest( std::u32string const & word_, QMutex & hunspellMutex_, Hunspell & hunspell_ ):
hunspellMutex( hunspellMutex_ ), hunspellMutex( hunspellMutex_ ),
hunspell( hunspell_ ), hunspell( hunspell_ ),
word( word_ ) word( word_ )
@ -201,7 +201,7 @@ void HunspellArticleRequest::run()
vector< string > suggestions; vector< string > suggestions;
try { try {
wstring trimmedWord = Folding::trimWhitespaceOrPunct( word ); std::u32string trimmedWord = Folding::trimWhitespaceOrPunct( word );
if ( containsWhitespace( trimmedWord ) ) { if ( containsWhitespace( trimmedWord ) ) {
// For now we don't analyze whitespace-containing phrases // For now we don't analyze whitespace-containing phrases
@ -226,10 +226,10 @@ void HunspellArticleRequest::run()
string result = "<div class=\"gdspellsuggestion\">" string result = "<div class=\"gdspellsuggestion\">"
+ Html::escape( QCoreApplication::translate( "Hunspell", "Spelling suggestions: " ).toUtf8().data() ); + Html::escape( QCoreApplication::translate( "Hunspell", "Spelling suggestions: " ).toUtf8().data() );
wstring lowercasedWord = Folding::applySimpleCaseOnly( word ); std::u32string lowercasedWord = Folding::applySimpleCaseOnly( word );
for ( vector< string >::size_type x = 0; x < suggestions.size(); ++x ) { for ( vector< string >::size_type x = 0; x < suggestions.size(); ++x ) {
wstring suggestion = decodeFromHunspell( hunspell, suggestions[ x ].c_str() ); std::u32string suggestion = decodeFromHunspell( hunspell, suggestions[ x ].c_str() );
if ( Folding::applySimpleCaseOnly( suggestion ) == lowercasedWord ) { if ( Folding::applySimpleCaseOnly( suggestion ) == lowercasedWord ) {
// If among suggestions we see the same word just with the different // If among suggestions we see the same word just with the different
@ -240,7 +240,7 @@ void HunspellArticleRequest::run()
return; return;
} }
string suggestionUtf8 = Utf8::encode( suggestion ); string suggestionUtf8 = Text::toUtf8( suggestion );
result += "<a href=\"bword:"; result += "<a href=\"bword:";
result += Html::escape( suggestionUtf8 ) + "\">"; result += Html::escape( suggestionUtf8 ) + "\">";
@ -268,8 +268,10 @@ void HunspellArticleRequest::run()
finish(); finish();
} }
sptr< DataRequest > sptr< DataRequest > HunspellDictionary::getArticle( std::u32string const & word,
HunspellDictionary::getArticle( wstring const & word, vector< wstring > const &, wstring const &, bool ) vector< std::u32string > const &,
std::u32string const &,
bool )
{ {
return std::make_shared< HunspellArticleRequest >( word, getHunspellMutex(), hunspell ); return std::make_shared< HunspellArticleRequest >( word, getHunspellMutex(), hunspell );
@ -282,7 +284,7 @@ class HunspellHeadwordsRequest: public Dictionary::WordSearchRequest
QMutex & hunspellMutex; QMutex & hunspellMutex;
Hunspell & hunspell; Hunspell & hunspell;
wstring word; std::u32string word;
QAtomicInt isCancelled; QAtomicInt isCancelled;
QFuture< void > f; QFuture< void > f;
@ -290,7 +292,7 @@ class HunspellHeadwordsRequest: public Dictionary::WordSearchRequest
public: public:
HunspellHeadwordsRequest( wstring const & word_, QMutex & hunspellMutex_, Hunspell & hunspell_ ): HunspellHeadwordsRequest( std::u32string const & word_, QMutex & hunspellMutex_, Hunspell & hunspell_ ):
hunspellMutex( hunspellMutex_ ), hunspellMutex( hunspellMutex_ ),
hunspell( hunspell_ ), hunspell( hunspell_ ),
word( word_ ) word( word_ )
@ -322,7 +324,7 @@ void HunspellHeadwordsRequest::run()
return; return;
} }
wstring trimmedWord = Folding::trimWhitespaceOrPunct( word ); std::u32string trimmedWord = Folding::trimWhitespaceOrPunct( word );
if ( trimmedWord.size() > 80 ) { if ( trimmedWord.size() > 80 ) {
// We won't do anything for overly long sentences since that would probably // We won't do anything for overly long sentences since that would probably
@ -332,7 +334,7 @@ void HunspellHeadwordsRequest::run()
} }
if ( containsWhitespace( trimmedWord ) ) { if ( containsWhitespace( trimmedWord ) ) {
vector< wstring > results; vector< std::u32string > results;
getSuggestionsForExpression( trimmedWord, results, hunspellMutex, hunspell ); getSuggestionsForExpression( trimmedWord, results, hunspellMutex, hunspell );
@ -342,7 +344,7 @@ void HunspellHeadwordsRequest::run()
} }
} }
else { else {
QList< wstring > suggestions = suggest( trimmedWord, hunspellMutex, hunspell ); QList< std::u32string > suggestions = suggest( trimmedWord, hunspellMutex, hunspell );
if ( !suggestions.empty() ) { if ( !suggestions.empty() ) {
QMutexLocker _( &dataMutex ); QMutexLocker _( &dataMutex );
@ -356,9 +358,9 @@ void HunspellHeadwordsRequest::run()
finish(); finish();
} }
QList< wstring > suggest( wstring & word, QMutex & hunspellMutex, Hunspell & hunspell ) QList< std::u32string > suggest( std::u32string & word, QMutex & hunspellMutex, Hunspell & hunspell )
{ {
QList< wstring > result; QList< std::u32string > result;
vector< string > suggestions; vector< string > suggestions;
@ -371,7 +373,7 @@ QList< wstring > suggest( wstring & word, QMutex & hunspellMutex, Hunspell & hun
if ( !suggestions.empty() ) { if ( !suggestions.empty() ) {
// There were some suggestions made for us. Make an appropriate output. // There were some suggestions made for us. Make an appropriate output.
wstring lowercasedWord = Folding::applySimpleCaseOnly( word ); std::u32string lowercasedWord = Folding::applySimpleCaseOnly( word );
static QRegularExpression cutStem( R"(^\s*st:(((\s+(?!\w{2}:)(?!-)(?!\+))|\S+)+))" ); static QRegularExpression cutStem( R"(^\s*st:(((\s+(?!\w{2}:)(?!-)(?!\+))|\S+)+))" );
@ -388,7 +390,7 @@ QList< wstring > suggest( wstring & word, QMutex & hunspellMutex, Hunspell & hun
auto match = cutStem.match( suggestion.trimmed() ); auto match = cutStem.match( suggestion.trimmed() );
if ( match.hasMatch() ) { if ( match.hasMatch() ) {
wstring alt = match.captured( 1 ).toStdU32String(); std::u32string alt = match.captured( 1 ).toStdU32String();
if ( Folding::applySimpleCaseOnly( alt ) != lowercasedWord ) // No point in providing same word if ( Folding::applySimpleCaseOnly( alt ) != lowercasedWord ) // No point in providing same word
{ {
@ -406,7 +408,7 @@ QList< wstring > suggest( wstring & word, QMutex & hunspellMutex, Hunspell & hun
} }
sptr< WordSearchRequest > HunspellDictionary::findHeadwordsForSynonym( wstring const & word ) sptr< WordSearchRequest > HunspellDictionary::findHeadwordsForSynonym( std::u32string const & word )
{ {
return std::make_shared< HunspellHeadwordsRequest >( word, getHunspellMutex(), hunspell ); return std::make_shared< HunspellHeadwordsRequest >( word, getHunspellMutex(), hunspell );
@ -420,14 +422,14 @@ class HunspellPrefixMatchRequest: public Dictionary::WordSearchRequest
QMutex & hunspellMutex; QMutex & hunspellMutex;
Hunspell & hunspell; Hunspell & hunspell;
wstring word; std::u32string word;
QAtomicInt isCancelled; QAtomicInt isCancelled;
QFuture< void > f; QFuture< void > f;
public: public:
HunspellPrefixMatchRequest( wstring const & word_, QMutex & hunspellMutex_, Hunspell & hunspell_ ): HunspellPrefixMatchRequest( std::u32string const & word_, QMutex & hunspellMutex_, Hunspell & hunspell_ ):
hunspellMutex( hunspellMutex_ ), hunspellMutex( hunspellMutex_ ),
hunspell( hunspell_ ), hunspell( hunspell_ ),
word( word_ ) word( word_ )
@ -460,7 +462,7 @@ void HunspellPrefixMatchRequest::run()
} }
try { try {
wstring trimmedWord = Folding::trimWhitespaceOrPunct( word ); std::u32string trimmedWord = Folding::trimWhitespaceOrPunct( word );
if ( trimmedWord.empty() || containsWhitespace( trimmedWord ) ) { if ( trimmedWord.empty() || containsWhitespace( trimmedWord ) ) {
// For now we don't analyze whitespace-containing phrases // For now we don't analyze whitespace-containing phrases
@ -487,14 +489,14 @@ void HunspellPrefixMatchRequest::run()
finish(); finish();
} }
sptr< WordSearchRequest > HunspellDictionary::prefixMatch( wstring const & word, unsigned long /*maxResults*/ ) sptr< WordSearchRequest > HunspellDictionary::prefixMatch( std::u32string const & word, unsigned long /*maxResults*/ )
{ {
return std::make_shared< HunspellPrefixMatchRequest >( word, getHunspellMutex(), hunspell ); return std::make_shared< HunspellPrefixMatchRequest >( word, getHunspellMutex(), hunspell );
} }
void getSuggestionsForExpression( wstring const & expression, void getSuggestionsForExpression( std::u32string const & expression,
vector< wstring > & suggestions, vector< std::u32string > & suggestions,
QMutex & hunspellMutex, QMutex & hunspellMutex,
Hunspell & hunspell ) Hunspell & hunspell )
{ {
@ -502,15 +504,15 @@ void getSuggestionsForExpression( wstring const & expression,
// This is useful for compound expressions where some words is // This is useful for compound expressions where some words is
// in different form, e.g. "dozing off" -> "doze off". // in different form, e.g. "dozing off" -> "doze off".
wstring trimmedWord = Folding::trimWhitespaceOrPunct( expression ); std::u32string trimmedWord = Folding::trimWhitespaceOrPunct( expression );
wstring word, punct; std::u32string word, punct;
QList< wstring > words; QList< std::u32string > words;
suggestions.clear(); suggestions.clear();
// Parse string to separate words // Parse string to separate words
for ( wchar const * c = trimmedWord.c_str();; ++c ) { for ( char32_t const * c = trimmedWord.c_str();; ++c ) {
if ( !*c || Folding::isPunct( *c ) || Folding::isWhitespace( *c ) ) { if ( !*c || Folding::isPunct( *c ) || Folding::isWhitespace( *c ) ) {
if ( word.size() ) { if ( word.size() ) {
words.push_back( word ); words.push_back( word );
@ -541,7 +543,7 @@ void getSuggestionsForExpression( wstring const & expression,
// Combine result strings from suggestions // Combine result strings from suggestions
QList< wstring > results; QList< std::u32string > results;
for ( const auto & i : words ) { for ( const auto & i : words ) {
word = i; word = i;
@ -551,13 +553,13 @@ void getSuggestionsForExpression( wstring const & expression,
} }
} }
else { else {
QList< wstring > sugg = suggest( word, hunspellMutex, hunspell ); QList< std::u32string > sugg = suggest( word, hunspellMutex, hunspell );
int suggNum = sugg.size() + 1; int suggNum = sugg.size() + 1;
if ( suggNum > 3 ) { if ( suggNum > 3 ) {
suggNum = 3; suggNum = 3;
} }
int resNum = results.size(); int resNum = results.size();
wstring resultStr; std::u32string resultStr;
if ( resNum == 0 ) { if ( resNum == 0 ) {
for ( int k = 0; k < suggNum; k++ ) { for ( int k = 0; k < suggNum; k++ ) {
@ -587,12 +589,12 @@ void getSuggestionsForExpression( wstring const & expression,
} }
} }
string encodeToHunspell( Hunspell & hunspell, wstring const & str ) string encodeToHunspell( Hunspell & hunspell, std::u32string const & str )
{ {
Iconv conv( Iconv::GdWchar ); Iconv conv( Iconv::GdWchar );
void const * in = str.data(); void const * in = str.data();
size_t inLeft = str.size() * sizeof( wchar ); size_t inLeft = str.size() * sizeof( char32_t );
vector< char > result( str.size() * 4 + 1 ); // +1 isn't actually needed, vector< char > result( str.size() * 4 + 1 ); // +1 isn't actually needed,
// but then iconv complains on empty // but then iconv complains on empty
@ -605,17 +607,17 @@ string encodeToHunspell( Hunspell & hunspell, wstring const & str )
return convStr.toStdString(); return convStr.toStdString();
} }
wstring decodeFromHunspell( Hunspell & hunspell, char const * str ) std::u32string decodeFromHunspell( Hunspell & hunspell, char const * str )
{ {
Iconv conv( hunspell.get_dic_encoding() ); Iconv conv( hunspell.get_dic_encoding() );
void const * in = str; void const * in = str;
size_t inLeft = strlen( str ); size_t inLeft = strlen( str );
vector< wchar > result( inLeft + 1 ); // +1 isn't needed, but see above vector< char32_t > result( inLeft + 1 ); // +1 isn't needed, but see above
void * out = &result.front(); void * out = &result.front();
size_t outLeft = result.size() * sizeof( wchar ); size_t outLeft = result.size() * sizeof( char32_t );
QString convStr = conv.convert( in, inLeft ); QString convStr = conv.convert( in, inLeft );
return convStr.toStdU32String(); return convStr.toStdU32String();

View file

@ -1,5 +1,5 @@
#include "lingualibre.hh" #include "lingualibre.hh"
#include "utf8.hh" #include "text.hh"
#include "audiolink.hh" #include "audiolink.hh"
#include <QJsonArray> #include <QJsonArray>
@ -40,8 +40,8 @@ class LinguaArticleRequest: public Dictionary::DataRequest
public: public:
LinguaArticleRequest( wstring const & word, LinguaArticleRequest( std::u32string const & word,
vector< wstring > const & alts, vector< std::u32string > const & alts,
QString const & languageCode_, QString const & languageCode_,
QString const & langWikipediaID_, QString const & langWikipediaID_,
string const & dictionaryId_, string const & dictionaryId_,
@ -51,7 +51,7 @@ public:
private: private:
void addQuery( QNetworkAccessManager & mgr, wstring const & word ); void addQuery( QNetworkAccessManager & mgr, std::u32string const & word );
private slots: private slots:
virtual void requestFinished( QNetworkReply * ); virtual void requestFinished( QNetworkReply * );
@ -175,7 +175,7 @@ WHERE {
return 0; return 0;
} }
sptr< WordSearchRequest > prefixMatch( wstring const & /*word*/, unsigned long /*maxResults*/ ) override sptr< WordSearchRequest > prefixMatch( std::u32string const & /*word*/, unsigned long /*maxResults*/ ) override
{ {
sptr< WordSearchRequestInstant > sr = std::make_shared< WordSearchRequestInstant >(); sptr< WordSearchRequestInstant > sr = std::make_shared< WordSearchRequestInstant >();
@ -184,7 +184,10 @@ WHERE {
return sr; return sr;
} }
sptr< DataRequest > getArticle( wstring const & word, vector< wstring > const & alts, wstring const &, bool ) override sptr< DataRequest > getArticle( std::u32string const & word,
vector< std::u32string > const & alts,
std::u32string const &,
bool ) override
{ {
if ( word.size() < 50 ) { if ( word.size() < 50 ) {
return std::make_shared< LinguaArticleRequest >( word, alts, languageCode, langWikipediaID, getId(), netMgr ); return std::make_shared< LinguaArticleRequest >( word, alts, languageCode, langWikipediaID, getId(), netMgr );
@ -231,8 +234,8 @@ void LinguaArticleRequest::cancel()
finish(); finish();
} }
LinguaArticleRequest::LinguaArticleRequest( const wstring & str, LinguaArticleRequest::LinguaArticleRequest( const std::u32string & str,
const vector< wstring > & alts, const vector< std::u32string > & alts,
const QString & languageCode_, const QString & languageCode_,
const QString & langWikipediaID, const QString & langWikipediaID,
const string & dictionaryId_, const string & dictionaryId_,
@ -245,7 +248,7 @@ LinguaArticleRequest::LinguaArticleRequest( const wstring & str,
addQuery( mgr, str ); addQuery( mgr, str );
} }
void LinguaArticleRequest::addQuery( QNetworkAccessManager & mgr, const wstring & word ) void LinguaArticleRequest::addQuery( QNetworkAccessManager & mgr, const std::u32string & word )
{ {
// Doc of the <https://www.mediawiki.org/wiki/API:Query> // Doc of the <https://www.mediawiki.org/wiki/API:Query>
@ -273,7 +276,7 @@ void LinguaArticleRequest::addQuery( QNetworkAccessManager & mgr, const wstring
auto netReply = std::shared_ptr< QNetworkReply >( mgr.get( netRequest ) ); auto netReply = std::shared_ptr< QNetworkReply >( mgr.get( netRequest ) );
netReplies.emplace_back( netReply, Utf8::encode( word ) ); netReplies.emplace_back( netReply, Text::toUtf8( word ) );
} }

View file

@ -5,7 +5,7 @@
#include "dictfile.hh" #include "dictfile.hh"
#include "iconv.hh" #include "iconv.hh"
#include "folding.hh" #include "folding.hh"
#include "utf8.hh" #include "text.hh"
#include "btreeidx.hh" #include "btreeidx.hh"
#include "audiolink.hh" #include "audiolink.hh"
@ -24,7 +24,6 @@
namespace Lsa { namespace Lsa {
using std::string; using std::string;
using gd::wstring;
using std::map; using std::map;
using std::multimap; using std::multimap;
using std::set; using std::set;
@ -169,8 +168,10 @@ public:
return getArticleCount(); return getArticleCount();
} }
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest > getArticle( std::u32string const &,
getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; vector< std::u32string > const & alts,
std::u32string const &,
bool ignoreDiacritics ) override;
sptr< Dictionary::DataRequest > getResource( string const & name ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override;
@ -199,9 +200,9 @@ LsaDictionary::LsaDictionary( string const & id, string const & indexFile, vecto
openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, idxHeader.indexRootOffset ), idx, idxMutex ); openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, idxHeader.indexRootOffset ), idx, idxMutex );
} }
sptr< Dictionary::DataRequest > LsaDictionary::getArticle( wstring const & word, sptr< Dictionary::DataRequest > LsaDictionary::getArticle( std::u32string const & word,
vector< wstring > const & alts, vector< std::u32string > const & alts,
wstring const &, std::u32string const &,
bool ignoreDiacritics ) bool ignoreDiacritics )
{ {
@ -215,13 +216,13 @@ sptr< Dictionary::DataRequest > LsaDictionary::getArticle( wstring const & word,
chain.insert( chain.end(), altChain.begin(), altChain.end() ); chain.insert( chain.end(), altChain.begin(), altChain.end() );
} }
multimap< wstring, string > mainArticles, alternateArticles; multimap< std::u32string, string > mainArticles, alternateArticles;
set< uint32_t > articlesIncluded; // Some synonims make it that the articles set< uint32_t > articlesIncluded; // Some synonims make it that the articles
// appear several times. We combat this // appear several times. We combat this
// by only allowing them to appear once. // by only allowing them to appear once.
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
} }
@ -236,12 +237,13 @@ sptr< Dictionary::DataRequest > LsaDictionary::getArticle( wstring const & word,
// We do the case-folded comparison here. // We do the case-folded comparison here.
wstring headwordStripped = Folding::applySimpleCaseOnly( x.word ); std::u32string headwordStripped = Folding::applySimpleCaseOnly( x.word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
} }
multimap< wstring, string > & mapToUse = ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; multimap< std::u32string, string > & mapToUse =
( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles;
mapToUse.insert( std::pair( Folding::applySimpleCaseOnly( x.word ), x.word ) ); mapToUse.insert( std::pair( Folding::applySimpleCaseOnly( x.word ), x.word ) );
@ -254,7 +256,7 @@ sptr< Dictionary::DataRequest > LsaDictionary::getArticle( wstring const & word,
string result; string result;
multimap< wstring, string >::const_iterator i; multimap< std::u32string, string >::const_iterator i;
result += "<table class=\"lsa_play\">"; result += "<table class=\"lsa_play\">";
for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) {
@ -389,7 +391,7 @@ sptr< Dictionary::DataRequest > LsaDictionary::getResource( string const & name
string strippedName = Utils::endsWithIgnoreCase( name, ".wav" ) ? string( name, 0, name.size() - 4 ) : name; string strippedName = Utils::endsWithIgnoreCase( name, ".wav" ) ? string( name, 0, name.size() - 4 ) : name;
vector< WordArticleLink > chain = findArticles( Utf8::decode( strippedName ) ); vector< WordArticleLink > chain = findArticles( Text::toUtf32( strippedName ) );
if ( chain.empty() ) { if ( chain.empty() ) {
return std::make_shared< Dictionary::DataRequestInstant >( false ); // No such resource return std::make_shared< Dictionary::DataRequestInstant >( false ); // No such resource
@ -572,7 +574,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
// Insert new entry into an index // Insert new entry into an index
indexedWords.addWord( Utf8::decode( e.name ), offset ); indexedWords.addWord( Text::toUtf32( e.name ), offset );
} }
idxHeader.vorbisOffset = f.tell(); idxHeader.vorbisOffset = f.tell();

View file

@ -4,10 +4,9 @@
#include "mdx.hh" #include "mdx.hh"
#include "btreeidx.hh" #include "btreeidx.hh"
#include "folding.hh" #include "folding.hh"
#include "utf8.hh" #include "text.hh"
#include "dictfile.hh" #include "dictfile.hh"
#include "wstring.hh" #include "text.hh"
#include "wstring_qt.hh"
#include "chunkedstorage.hh" #include "chunkedstorage.hh"
#include "langcoder.hh" #include "langcoder.hh"
#include "audiolink.hh" #include "audiolink.hh"
@ -37,8 +36,6 @@ namespace Mdx {
using std::map; using std::map;
using std::multimap; using std::multimap;
using std::set; using std::set;
using gd::wstring;
using gd::wchar;
using std::list; using std::list;
using std::pair; using std::pair;
using std::string; using std::string;
@ -129,7 +126,7 @@ public:
/// Checks whether the given file exists in the mdd file or not. /// Checks whether the given file exists in the mdd file or not.
/// Note that this function is thread-safe, since it does not access mdd file. /// Note that this function is thread-safe, since it does not access mdd file.
bool hasFile( gd::wstring const & name ) bool hasFile( std::u32string const & name )
{ {
if ( !isFileOpen ) { if ( !isFileOpen ) {
return false; return false;
@ -140,7 +137,7 @@ public:
/// Attempts loading the given file into the given vector. Returns true on /// Attempts loading the given file into the given vector. Returns true on
/// success, false otherwise. /// success, false otherwise.
bool loadFile( gd::wstring const & name, std::vector< char > & result ) bool loadFile( std::u32string const & name, std::vector< char > & result )
{ {
if ( !isFileOpen ) { if ( !isFileOpen ) {
return false; return false;
@ -232,8 +229,10 @@ public:
return idxHeader.langTo; return idxHeader.langTo;
} }
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest > getArticle( std::u32string const & word,
getArticle( wstring const & word, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; vector< std::u32string > const & alts,
std::u32string const &,
bool ignoreDiacritics ) override;
sptr< Dictionary::DataRequest > getResource( string const & name ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override;
QString const & getDescription() override; QString const & getDescription() override;
@ -281,7 +280,7 @@ private:
friend class MdxArticleRequest; friend class MdxArticleRequest;
friend class MddResourceRequest; friend class MddResourceRequest;
void loadResourceFile( const wstring & resourceName, vector< char > & data ); void loadResourceFile( const std::u32string & resourceName, vector< char > & data );
}; };
MdxDictionary::MdxDictionary( string const & id, string const & indexFile, vector< string > const & dictionaryFiles ): MdxDictionary::MdxDictionary( string const & id, string const & indexFile, vector< string > const & dictionaryFiles ):
@ -488,8 +487,8 @@ sptr< Dictionary::DataRequest > MdxDictionary::getSearchResults( QString const &
class MdxArticleRequest: public Dictionary::DataRequest class MdxArticleRequest: public Dictionary::DataRequest
{ {
wstring word; std::u32string word;
vector< wstring > alts; vector< std::u32string > alts;
MdxDictionary & dict; MdxDictionary & dict;
bool ignoreDiacritics; bool ignoreDiacritics;
@ -498,8 +497,8 @@ class MdxArticleRequest: public Dictionary::DataRequest
public: public:
MdxArticleRequest( wstring const & word_, MdxArticleRequest( std::u32string const & word_,
vector< wstring > const & alts_, vector< std::u32string > const & alts_,
MdxDictionary & dict_, MdxDictionary & dict_,
bool ignoreDiacritics_ ): bool ignoreDiacritics_ ):
word( word_ ), word( word_ ),
@ -602,8 +601,8 @@ void MdxArticleRequest::run()
// Handle internal redirects // Handle internal redirects
if ( strncmp( articleBody.c_str(), "@@@LINK=", 8 ) == 0 ) { if ( strncmp( articleBody.c_str(), "@@@LINK=", 8 ) == 0 ) {
wstring target = Utf8::decode( articleBody.c_str() + 8 ); std::u32string target = Text::toUtf32( articleBody.c_str() + 8 );
target = Folding::trimWhitespace( target ); target = Folding::trimWhitespace( target );
// Make an additional query for this redirection // Make an additional query for this redirection
vector< WordArticleLink > altChain = dict.findArticles( target ); vector< WordArticleLink > altChain = dict.findArticles( target );
chain.insert( chain.end(), altChain.begin(), altChain.end() ); chain.insert( chain.end(), altChain.begin(), altChain.end() );
@ -626,9 +625,9 @@ void MdxArticleRequest::run()
finish(); finish();
} }
sptr< Dictionary::DataRequest > MdxDictionary::getArticle( const wstring & word, sptr< Dictionary::DataRequest > MdxDictionary::getArticle( const std::u32string & word,
const vector< wstring > & alts, const vector< std::u32string > & alts,
const wstring &, const std::u32string &,
bool ignoreDiacritics ) bool ignoreDiacritics )
{ {
return std::make_shared< MdxArticleRequest >( word, alts, *this, ignoreDiacritics ); return std::make_shared< MdxArticleRequest >( word, alts, *this, ignoreDiacritics );
@ -638,7 +637,7 @@ sptr< Dictionary::DataRequest > MdxDictionary::getArticle( const wstring & word,
class MddResourceRequest: public Dictionary::DataRequest class MddResourceRequest: public Dictionary::DataRequest
{ {
MdxDictionary & dict; MdxDictionary & dict;
wstring resourceName; std::u32string resourceName;
QAtomicInt isCancelled; QAtomicInt isCancelled;
QFuture< void > f; QFuture< void > f;
@ -647,7 +646,7 @@ public:
MddResourceRequest( MdxDictionary & dict_, string const & resourceName_ ): MddResourceRequest( MdxDictionary & dict_, string const & resourceName_ ):
Dictionary::DataRequest( &dict_ ), Dictionary::DataRequest( &dict_ ),
dict( dict_ ), dict( dict_ ),
resourceName( Utf8::decode( resourceName_ ) ) resourceName( Text::toUtf32( resourceName_ ) )
{ {
f = QtConcurrent::run( [ this ]() { f = QtConcurrent::run( [ this ]() {
this->run(); this->run();
@ -722,7 +721,7 @@ void MddResourceRequest::run()
} }
// In order to prevent recursive internal redirection... // In order to prevent recursive internal redirection...
set< wstring, std::less<> > resourceIncluded; set< std::u32string, std::less<> > resourceIncluded;
for ( ;; ) { for ( ;; ) {
// Some runnables linger enough that they are cancelled before they start // Some runnables linger enough that they are cancelled before they start
@ -730,7 +729,7 @@ void MddResourceRequest::run()
finish(); finish();
return; return;
} }
string u8ResourceName = Utf8::encode( resourceName ); string u8ResourceName = Text::toUtf8( resourceName );
if ( !resourceIncluded.insert( resourceName ).second ) { if ( !resourceIncluded.insert( resourceName ).second ) {
finish(); finish();
return; return;
@ -1151,11 +1150,11 @@ QString MdxDictionary::getCachedFileName( QString filename )
qWarning( R"(Mdx: file "%s" creating error: "%s")", fullName.toUtf8().data(), f.errorString().toUtf8().data() ); qWarning( R"(Mdx: file "%s" creating error: "%s")", fullName.toUtf8().data(), f.errorString().toUtf8().data() );
return QString(); return QString();
} }
gd::wstring resourceName = filename.toStdU32String(); std::u32string resourceName = filename.toStdU32String();
vector< char > data; vector< char > data;
// In order to prevent recursive internal redirection... // In order to prevent recursive internal redirection...
set< wstring, std::less<> > resourceIncluded; set< std::u32string, std::less<> > resourceIncluded;
for ( ;; ) { for ( ;; ) {
if ( !resourceIncluded.insert( resourceName ).second ) { if ( !resourceIncluded.insert( resourceName ).second ) {
@ -1194,10 +1193,10 @@ QString MdxDictionary::getCachedFileName( QString filename )
return fullName; return fullName;
} }
void MdxDictionary::loadResourceFile( const wstring & resourceName, vector< char > & data ) void MdxDictionary::loadResourceFile( const std::u32string & resourceName, vector< char > & data )
{ {
wstring newResourceName = resourceName; std::u32string newResourceName = resourceName;
string u8ResourceName = Utf8::encode( resourceName ); string u8ResourceName = Text::toUtf8( resourceName );
// Convert to the Windows separator // Convert to the Windows separator
std::replace( newResourceName.begin(), newResourceName.end(), '/', '\\' ); std::replace( newResourceName.begin(), newResourceName.end(), '/', '\\' );

View file

@ -2,7 +2,6 @@
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "mediawiki.hh" #include "mediawiki.hh"
#include "wstring_qt.hh"
#include <QNetworkAccessManager> #include <QNetworkAccessManager>
#include <QNetworkReply> #include <QNetworkReply>
#include <QUrl> #include <QUrl>
@ -66,9 +65,10 @@ public:
return 0; return 0;
} }
sptr< WordSearchRequest > prefixMatch( wstring const &, unsigned long maxResults ) override; sptr< WordSearchRequest > prefixMatch( std::u32string const &, unsigned long maxResults ) override;
sptr< DataRequest > getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ) override; sptr< DataRequest >
getArticle( std::u32string const &, vector< std::u32string > const & alts, std::u32string const &, bool ) override;
quint32 getLangFrom() const override quint32 getLangFrom() const override
{ {
@ -133,7 +133,10 @@ class MediaWikiWordSearchRequest: public MediaWikiWordSearchRequestSlots
public: public:
MediaWikiWordSearchRequest( wstring const &, QString const & url, QString const & lang, QNetworkAccessManager & mgr ); MediaWikiWordSearchRequest( std::u32string const &,
QString const & url,
QString const & lang,
QNetworkAccessManager & mgr );
~MediaWikiWordSearchRequest(); ~MediaWikiWordSearchRequest();
@ -144,7 +147,7 @@ private:
void downloadFinished() override; void downloadFinished() override;
}; };
MediaWikiWordSearchRequest::MediaWikiWordSearchRequest( wstring const & str, MediaWikiWordSearchRequest::MediaWikiWordSearchRequest( std::u32string const & str,
QString const & url, QString const & url,
QString const & lang, QString const & lang,
QNetworkAccessManager & mgr ): QNetworkAccessManager & mgr ):
@ -390,8 +393,8 @@ class MediaWikiArticleRequest: public MediaWikiDataRequestSlots
public: public:
MediaWikiArticleRequest( wstring const & word, MediaWikiArticleRequest( std::u32string const & word,
vector< wstring > const & alts, vector< std::u32string > const & alts,
QString const & url, QString const & url,
QString const & lang, QString const & lang,
QNetworkAccessManager & mgr, QNetworkAccessManager & mgr,
@ -401,7 +404,7 @@ public:
private: private:
void addQuery( QNetworkAccessManager & mgr, wstring const & word ); void addQuery( QNetworkAccessManager & mgr, std::u32string const & word );
void requestFinished( QNetworkReply * ) override; void requestFinished( QNetworkReply * ) override;
@ -435,8 +438,8 @@ void MediaWikiArticleRequest::cancel()
finish(); finish();
} }
MediaWikiArticleRequest::MediaWikiArticleRequest( wstring const & str, MediaWikiArticleRequest::MediaWikiArticleRequest( std::u32string const & str,
vector< wstring > const & alts, vector< std::u32string > const & alts,
QString const & url_, QString const & url_,
QString const & lang_, QString const & lang_,
QNetworkAccessManager & mgr, QNetworkAccessManager & mgr,
@ -458,7 +461,7 @@ MediaWikiArticleRequest::MediaWikiArticleRequest( wstring const & str,
} }
} }
void MediaWikiArticleRequest::addQuery( QNetworkAccessManager & mgr, wstring const & str ) void MediaWikiArticleRequest::addQuery( QNetworkAccessManager & mgr, std::u32string const & str )
{ {
qDebug( "MediaWiki: requesting article %s", QString::fromStdU32String( str ).toUtf8().data() ); qDebug( "MediaWiki: requesting article %s", QString::fromStdU32String( str ).toUtf8().data() );
@ -705,7 +708,7 @@ void MediaWikiArticleRequest::requestFinished( QNetworkReply * r )
} }
} }
sptr< WordSearchRequest > MediaWikiDictionary::prefixMatch( wstring const & word, unsigned long maxResults ) sptr< WordSearchRequest > MediaWikiDictionary::prefixMatch( std::u32string const & word, unsigned long maxResults )
{ {
(void)maxResults; (void)maxResults;
@ -719,8 +722,10 @@ sptr< WordSearchRequest > MediaWikiDictionary::prefixMatch( wstring const & word
} }
} }
sptr< DataRequest > sptr< DataRequest > MediaWikiDictionary::getArticle( std::u32string const & word,
MediaWikiDictionary::getArticle( wstring const & word, vector< wstring > const & alts, wstring const &, bool ) vector< std::u32string > const & alts,
std::u32string const &,
bool )
{ {
if ( word.size() > 80 ) { if ( word.size() > 80 ) {

View file

@ -4,8 +4,7 @@
#include "programs.hh" #include "programs.hh"
#include "audiolink.hh" #include "audiolink.hh"
#include "htmlescape.hh" #include "htmlescape.hh"
#include "utf8.hh" #include "text.hh"
#include "wstring_qt.hh"
#include "iconv.hh" #include "iconv.hh"
#include "utils.hh" #include "utils.hh"
#include "globalbroadcaster.hh" #include "globalbroadcaster.hh"
@ -46,16 +45,17 @@ public:
return 0; return 0;
} }
sptr< WordSearchRequest > prefixMatch( wstring const & word, unsigned long maxResults ) override; sptr< WordSearchRequest > prefixMatch( std::u32string const & word, unsigned long maxResults ) override;
sptr< DataRequest > getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ) override; sptr< DataRequest >
getArticle( std::u32string const &, vector< std::u32string > const & alts, std::u32string const &, bool ) override;
protected: protected:
void loadIcon() noexcept override; void loadIcon() noexcept override;
}; };
sptr< WordSearchRequest > ProgramsDictionary::prefixMatch( wstring const & word, unsigned long /*maxResults*/ ) sptr< WordSearchRequest > ProgramsDictionary::prefixMatch( std::u32string const & word, unsigned long /*maxResults*/ )
{ {
if ( prg.type == Config::Program::PrefixMatch ) { if ( prg.type == Config::Program::PrefixMatch ) {
@ -70,8 +70,10 @@ sptr< WordSearchRequest > ProgramsDictionary::prefixMatch( wstring const & word,
} }
} }
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest > ProgramsDictionary::getArticle( std::u32string const & word,
ProgramsDictionary::getArticle( wstring const & word, vector< wstring > const &, wstring const &, bool ) vector< std::u32string > const &,
std::u32string const &,
bool )
{ {
switch ( prg.type ) { switch ( prg.type ) {
@ -79,7 +81,7 @@ ProgramsDictionary::getArticle( wstring const & word, vector< wstring > const &,
// Audio results are instantaneous // Audio results are instantaneous
string result; string result;
string wordUtf8( Utf8::encode( word ) ); string wordUtf8( Text::toUtf8( word ) );
result += "<table class=\"programs_play\"><tr>"; result += "<table class=\"programs_play\"><tr>";

View file

@ -6,14 +6,13 @@
#include <QProcess> #include <QProcess>
#include "dictionary.hh" #include "dictionary.hh"
#include "config.hh" #include "config.hh"
#include "wstring.hh" #include "text.hh"
/// Support for arbitrary programs. /// Support for arbitrary programs.
namespace Programs { namespace Programs {
using std::vector; using std::vector;
using std::string; using std::string;
using gd::wstring;
vector< sptr< Dictionary::Class > > makeDictionaries( Config::Programs const & ); vector< sptr< Dictionary::Class > > makeDictionaries( Config::Programs const & );

View file

@ -9,7 +9,7 @@
#include "htmlescape.hh" #include "htmlescape.hh"
#include "langcoder.hh" #include "langcoder.hh"
#include "sdict.hh" #include "sdict.hh"
#include "utf8.hh" #include "text.hh"
#include <map> #include <map>
#include <QAtomicInt> #include <QAtomicInt>
#include <QDir> #include <QDir>
@ -26,7 +26,6 @@ using std::multimap;
using std::pair; using std::pair;
using std::set; using std::set;
using std::string; using std::string;
using gd::wstring;
using BtreeIndexing::WordArticleLink; using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords; using BtreeIndexing::IndexedWords;
@ -133,8 +132,10 @@ public:
return idxHeader.langTo; return idxHeader.langTo;
} }
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest > getArticle( std::u32string const &,
getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; vector< std::u32string > const & alts,
std::u32string const &,
bool ignoreDiacritics ) override;
QString const & getDescription() override; QString const & getDescription() override;
@ -416,8 +417,8 @@ SdictDictionary::getSearchResults( QString const & searchString, int searchMode,
class SdictArticleRequest: public Dictionary::DataRequest class SdictArticleRequest: public Dictionary::DataRequest
{ {
wstring word; std::u32string word;
vector< wstring > alts; vector< std::u32string > alts;
SdictDictionary & dict; SdictDictionary & dict;
bool ignoreDiacritics; bool ignoreDiacritics;
@ -427,8 +428,8 @@ class SdictArticleRequest: public Dictionary::DataRequest
public: public:
SdictArticleRequest( wstring const & word_, SdictArticleRequest( std::u32string const & word_,
vector< wstring > const & alts_, vector< std::u32string > const & alts_,
SdictDictionary & dict_, SdictDictionary & dict_,
bool ignoreDiacritics_ ): bool ignoreDiacritics_ ):
word( word_ ), word( word_ ),
@ -472,13 +473,13 @@ void SdictArticleRequest::run()
chain.insert( chain.end(), altChain.begin(), altChain.end() ); chain.insert( chain.end(), altChain.begin(), altChain.end() );
} }
multimap< wstring, pair< string, string > > mainArticles, alternateArticles; multimap< std::u32string, pair< string, string > > mainArticles, alternateArticles;
set< uint32_t > articlesIncluded; // Some synonims make it that the articles set< uint32_t > articlesIncluded; // Some synonims make it that the articles
// appear several times. We combat this // appear several times. We combat this
// by only allowing them to appear once. // by only allowing them to appear once.
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
} }
@ -507,12 +508,12 @@ void SdictArticleRequest::run()
// We do the case-folded comparison here. // We do the case-folded comparison here.
wstring headwordStripped = Folding::applySimpleCaseOnly( headword ); std::u32string headwordStripped = Folding::applySimpleCaseOnly( headword );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
} }
multimap< wstring, pair< string, string > > & mapToUse = multimap< std::u32string, pair< string, string > > & mapToUse =
( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles;
mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) ); mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) );
@ -532,7 +533,7 @@ void SdictArticleRequest::run()
string result; string result;
multimap< wstring, pair< string, string > >::const_iterator i; multimap< std::u32string, pair< string, string > >::const_iterator i;
for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) {
result += dict.isFromLanguageRTL() ? "<h3 dir=\"rtl\">" : "<h3>"; result += dict.isFromLanguageRTL() ? "<h3 dir=\"rtl\">" : "<h3>";
@ -561,9 +562,9 @@ void SdictArticleRequest::run()
finish(); finish();
} }
sptr< Dictionary::DataRequest > SdictDictionary::getArticle( wstring const & word, sptr< Dictionary::DataRequest > SdictDictionary::getArticle( std::u32string const & word,
vector< wstring > const & alts, vector< std::u32string > const & alts,
wstring const &, std::u32string const &,
bool ignoreDiacritics ) bool ignoreDiacritics )
{ {
@ -741,7 +742,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
// Insert new entry // Insert new entry
indexedWords.addWord( Utf8::decode( string( data.data(), size ) ), articleOffset ); indexedWords.addWord( Text::toUtf32( string( data.data(), size ) ), articleOffset );
pos += el.nextWord; pos += el.nextWord;
} }

View file

@ -6,7 +6,7 @@
#include "btreeidx.hh" #include "btreeidx.hh"
#include "folding.hh" #include "folding.hh"
#include "utf8.hh" #include "text.hh"
#include "decompress.hh" #include "decompress.hh"
#include "langcoder.hh" #include "langcoder.hh"
#include "ftshelpers.hh" #include "ftshelpers.hh"
@ -40,7 +40,6 @@ using std::vector;
using std::multimap; using std::multimap;
using std::pair; using std::pair;
using std::set; using std::set;
using gd::wstring;
using BtreeIndexing::WordArticleLink; using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords; using BtreeIndexing::IndexedWords;
@ -630,8 +629,10 @@ public:
return idxHeader.langTo; return idxHeader.langTo;
} }
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest > getArticle( std::u32string const &,
getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; vector< std::u32string > const & alts,
std::u32string const &,
bool ignoreDiacritics ) override;
sptr< Dictionary::DataRequest > getResource( string const & name ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override;
@ -853,7 +854,7 @@ void SlobDictionary::loadResource( std::string & resourceName, string & data )
vector< WordArticleLink > link; vector< WordArticleLink > link;
RefEntry entry; RefEntry entry;
link = resourceIndex.findArticles( Utf8::decode( resourceName ) ); link = resourceIndex.findArticles( Text::toUtf32( resourceName ) );
if ( link.empty() ) { if ( link.empty() ) {
return; return;
@ -989,8 +990,8 @@ SlobDictionary::getSearchResults( QString const & searchString, int searchMode,
class SlobArticleRequest: public Dictionary::DataRequest class SlobArticleRequest: public Dictionary::DataRequest
{ {
wstring word; std::u32string word;
vector< wstring > alts; vector< std::u32string > alts;
SlobDictionary & dict; SlobDictionary & dict;
bool ignoreDiacritics; bool ignoreDiacritics;
@ -999,8 +1000,8 @@ class SlobArticleRequest: public Dictionary::DataRequest
public: public:
SlobArticleRequest( wstring const & word_, SlobArticleRequest( std::u32string const & word_,
vector< wstring > const & alts_, vector< std::u32string > const & alts_,
SlobDictionary & dict_, SlobDictionary & dict_,
bool ignoreDiacritics_ ): bool ignoreDiacritics_ ):
word( word_ ), word( word_ ),
@ -1045,13 +1046,13 @@ void SlobArticleRequest::run()
chain.insert( chain.end(), altChain.begin(), altChain.end() ); chain.insert( chain.end(), altChain.begin(), altChain.end() );
} }
multimap< wstring, pair< string, string > > mainArticles, alternateArticles; multimap< std::u32string, pair< string, string > > mainArticles, alternateArticles;
set< quint64 > articlesIncluded; // Some synonims make it that the articles set< quint64 > articlesIncluded; // Some synonims make it that the articles
// appear several times. We combat this // appear several times. We combat this
// by only allowing them to appear once. // by only allowing them to appear once.
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
} }
@ -1084,12 +1085,12 @@ void SlobArticleRequest::run()
// We do the case-folded comparison here. // We do the case-folded comparison here.
wstring headwordStripped = Folding::applySimpleCaseOnly( headword ); std::u32string headwordStripped = Folding::applySimpleCaseOnly( headword );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
} }
multimap< wstring, pair< string, string > > & mapToUse = multimap< std::u32string, pair< string, string > > & mapToUse =
( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles;
mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) ); mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) );
@ -1105,7 +1106,7 @@ void SlobArticleRequest::run()
string result; string result;
multimap< wstring, pair< string, string > >::const_iterator i; multimap< std::u32string, pair< string, string > >::const_iterator i;
for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) {
result += R"(<div class="slobdict"><h3 class="slobdict_headword">)"; result += R"(<div class="slobdict"><h3 class="slobdict_headword">)";
@ -1128,9 +1129,9 @@ void SlobArticleRequest::run()
finish(); finish();
} }
sptr< Dictionary::DataRequest > SlobDictionary::getArticle( wstring const & word, sptr< Dictionary::DataRequest > SlobDictionary::getArticle( std::u32string const & word,
vector< wstring > const & alts, vector< std::u32string > const & alts,
wstring const &, std::u32string const &,
bool ignoreDiacritics ) bool ignoreDiacritics )
{ {

View file

@ -3,13 +3,12 @@
#include "sounddir.hh" #include "sounddir.hh"
#include "folding.hh" #include "folding.hh"
#include "utf8.hh" #include "text.hh"
#include "btreeidx.hh" #include "btreeidx.hh"
#include "chunkedstorage.hh" #include "chunkedstorage.hh"
#include "filetype.hh" #include "filetype.hh"
#include "htmlescape.hh" #include "htmlescape.hh"
#include "audiolink.hh" #include "audiolink.hh"
#include "wstring_qt.hh"
#include "utils.hh" #include "utils.hh"
@ -21,7 +20,6 @@
namespace SoundDir { namespace SoundDir {
using std::string; using std::string;
using gd::wstring;
using std::map; using std::map;
using std::multimap; using std::multimap;
using std::set; using std::set;
@ -85,8 +83,10 @@ public:
return getArticleCount(); return getArticleCount();
} }
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest > getArticle( std::u32string const &,
getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; vector< std::u32string > const & alts,
std::u32string const &,
bool ignoreDiacritics ) override;
sptr< Dictionary::DataRequest > getResource( string const & name ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override;
@ -114,9 +114,9 @@ SoundDirDictionary::SoundDirDictionary( string const & id,
openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, idxHeader.indexRootOffset ), idx, idxMutex ); openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, idxHeader.indexRootOffset ), idx, idxMutex );
} }
sptr< Dictionary::DataRequest > SoundDirDictionary::getArticle( wstring const & word, sptr< Dictionary::DataRequest > SoundDirDictionary::getArticle( std::u32string const & word,
vector< wstring > const & alts, vector< std::u32string > const & alts,
wstring const &, std::u32string const &,
bool ignoreDiacritics ) bool ignoreDiacritics )
{ {
vector< WordArticleLink > chain = findArticles( word, ignoreDiacritics ); vector< WordArticleLink > chain = findArticles( word, ignoreDiacritics );
@ -130,13 +130,13 @@ sptr< Dictionary::DataRequest > SoundDirDictionary::getArticle( wstring const &
} }
// maps to the chain number // maps to the chain number
multimap< wstring, unsigned > mainArticles, alternateArticles; multimap< std::u32string, unsigned > mainArticles, alternateArticles;
set< uint32_t > articlesIncluded; // Some synonims make it that the articles set< uint32_t > articlesIncluded; // Some synonims make it that the articles
// appear several times. We combat this // appear several times. We combat this
// by only allowing them to appear once. // by only allowing them to appear once.
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
} }
@ -151,12 +151,12 @@ sptr< Dictionary::DataRequest > SoundDirDictionary::getArticle( wstring const &
// We do the case-folded comparison here. // We do the case-folded comparison here.
wstring headwordStripped = Folding::applySimpleCaseOnly( chain[ x ].word ); std::u32string headwordStripped = Folding::applySimpleCaseOnly( chain[ x ].word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
} }
multimap< wstring, unsigned > & mapToUse = multimap< std::u32string, unsigned > & mapToUse =
( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles;
mapToUse.insert( std::pair( Folding::applySimpleCaseOnly( chain[ x ].word ), x ) ); mapToUse.insert( std::pair( Folding::applySimpleCaseOnly( chain[ x ].word ), x ) );
@ -170,7 +170,7 @@ sptr< Dictionary::DataRequest > SoundDirDictionary::getArticle( wstring const &
string result; string result;
multimap< wstring, uint32_t >::const_iterator i; multimap< std::u32string, uint32_t >::const_iterator i;
string displayedName; string displayedName;
vector< char > chunk; vector< char > chunk;
@ -399,11 +399,11 @@ void addDir( QDir const & baseDir,
const uint32_t articleOffset = chunks.startNewBlock(); const uint32_t articleOffset = chunks.startNewBlock();
chunks.addToBlock( fileName.c_str(), fileName.size() + 1 ); chunks.addToBlock( fileName.c_str(), fileName.size() + 1 );
wstring name = i->fileName().toStdU32String(); std::u32string name = i->fileName().toStdU32String();
const wstring::size_type pos = name.rfind( L'.' ); const std::u32string::size_type pos = name.rfind( L'.' );
if ( pos != wstring::npos ) { if ( pos != std::u32string::npos ) {
name.erase( pos ); name.erase( pos );
} }

View file

@ -4,7 +4,7 @@
#include "stardict.hh" #include "stardict.hh"
#include "btreeidx.hh" #include "btreeidx.hh"
#include "folding.hh" #include "folding.hh"
#include "utf8.hh" #include "text.hh"
#include "chunkedstorage.hh" #include "chunkedstorage.hh"
#include "dictzip.hh" #include "dictzip.hh"
#include "xdxf2html.hh" #include "xdxf2html.hh"
@ -42,7 +42,6 @@ using std::multimap;
using std::pair; using std::pair;
using std::set; using std::set;
using std::string; using std::string;
using gd::wstring;
using BtreeIndexing::WordArticleLink; using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords; using BtreeIndexing::IndexedWords;
@ -154,10 +153,12 @@ public:
return idxHeader.langTo; return idxHeader.langTo;
} }
sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & ) override; sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( std::u32string const & ) override;
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest > getArticle( std::u32string const &,
getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; vector< std::u32string > const & alts,
std::u32string const &,
bool ignoreDiacritics ) override;
sptr< Dictionary::DataRequest > getResource( string const & name ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override;
@ -1164,7 +1165,7 @@ sptr< Dictionary::DataRequest > StardictDictionary::getSearchResults( QString co
class StardictHeadwordsRequest: public Dictionary::WordSearchRequest class StardictHeadwordsRequest: public Dictionary::WordSearchRequest
{ {
wstring word; std::u32string word;
StardictDictionary & dict; StardictDictionary & dict;
QAtomicInt isCancelled; QAtomicInt isCancelled;
@ -1172,7 +1173,7 @@ class StardictHeadwordsRequest: public Dictionary::WordSearchRequest
public: public:
StardictHeadwordsRequest( wstring const & word_, StardictDictionary & dict_ ): StardictHeadwordsRequest( std::u32string const & word_, StardictDictionary & dict_ ):
word( word_ ), word( word_ ),
dict( dict_ ) dict( dict_ )
{ {
@ -1207,7 +1208,7 @@ void StardictHeadwordsRequest::run()
//limited the synomys to at most 10 entries //limited the synomys to at most 10 entries
vector< WordArticleLink > chain = dict.findArticles( word, false, 10 ); vector< WordArticleLink > chain = dict.findArticles( word, false, 10 );
wstring caseFolded = Folding::applySimpleCaseOnly( word ); std::u32string caseFolded = Folding::applySimpleCaseOnly( word );
for ( auto & x : chain ) { for ( auto & x : chain ) {
if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) { if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) {
@ -1219,7 +1220,7 @@ void StardictHeadwordsRequest::run()
dict.loadArticle( x.articleOffset, headword, articleText ); dict.loadArticle( x.articleOffset, headword, articleText );
wstring headwordDecoded = Utf8::decode( headword ); std::u32string headwordDecoded = Text::toUtf32( headword );
if ( caseFolded != Folding::applySimpleCaseOnly( headwordDecoded ) ) { if ( caseFolded != Folding::applySimpleCaseOnly( headwordDecoded ) ) {
// The headword seems to differ from the input word, which makes the // The headword seems to differ from the input word, which makes the
@ -1237,7 +1238,7 @@ void StardictHeadwordsRequest::run()
finish(); finish();
} }
sptr< Dictionary::WordSearchRequest > StardictDictionary::findHeadwordsForSynonym( wstring const & word ) sptr< Dictionary::WordSearchRequest > StardictDictionary::findHeadwordsForSynonym( std::u32string const & word )
{ {
return synonymSearchEnabled ? std::make_shared< StardictHeadwordsRequest >( word, *this ) : return synonymSearchEnabled ? std::make_shared< StardictHeadwordsRequest >( word, *this ) :
Class::findHeadwordsForSynonym( word ); Class::findHeadwordsForSynonym( word );
@ -1250,8 +1251,8 @@ sptr< Dictionary::WordSearchRequest > StardictDictionary::findHeadwordsForSynony
class StardictArticleRequest: public Dictionary::DataRequest class StardictArticleRequest: public Dictionary::DataRequest
{ {
wstring word; std::u32string word;
vector< wstring > alts; vector< std::u32string > alts;
StardictDictionary & dict; StardictDictionary & dict;
bool ignoreDiacritics; bool ignoreDiacritics;
@ -1261,8 +1262,8 @@ class StardictArticleRequest: public Dictionary::DataRequest
public: public:
StardictArticleRequest( wstring const & word_, StardictArticleRequest( std::u32string const & word_,
vector< wstring > const & alts_, vector< std::u32string > const & alts_,
StardictDictionary & dict_, StardictDictionary & dict_,
bool ignoreDiacritics_ ): bool ignoreDiacritics_ ):
word( word_ ), word( word_ ),
@ -1312,13 +1313,13 @@ void StardictArticleRequest::run()
} }
} }
multimap< wstring, pair< string, string > > mainArticles, alternateArticles; multimap< std::u32string, pair< string, string > > mainArticles, alternateArticles;
set< uint32_t > articlesIncluded; // Some synonyms make it that the articles set< uint32_t > articlesIncluded; // Some synonyms make it that the articles
// appear several times. We combat this // appear several times. We combat this
// by only allowing them to appear once. // by only allowing them to appear once.
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
} }
@ -1345,12 +1346,12 @@ void StardictArticleRequest::run()
// We do the case-folded comparison here. // We do the case-folded comparison here.
wstring headwordStripped = Folding::applySimpleCaseOnly( headword ); std::u32string headwordStripped = Folding::applySimpleCaseOnly( headword );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
} }
multimap< wstring, pair< string, string > > & mapToUse = multimap< std::u32string, pair< string, string > > & mapToUse =
( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles;
mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) ); mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) );
@ -1366,7 +1367,7 @@ void StardictArticleRequest::run()
string result; string result;
multimap< wstring, pair< string, string > >::const_iterator i; multimap< std::u32string, pair< string, string > >::const_iterator i;
string cleaner = Utils::Html::getHtmlCleaner(); string cleaner = Utils::Html::getHtmlCleaner();
@ -1409,9 +1410,9 @@ void StardictArticleRequest::run()
finish(); finish();
} }
sptr< Dictionary::DataRequest > StardictDictionary::getArticle( wstring const & word, sptr< Dictionary::DataRequest > StardictDictionary::getArticle( std::u32string const & word,
vector< wstring > const & alts, vector< std::u32string > const & alts,
wstring const &, std::u32string const &,
bool ignoreDiacritics ) bool ignoreDiacritics )
{ {
@ -1569,7 +1570,7 @@ void StardictResourceRequest::run()
if ( dict.resourceZip.isOpen() ) { if ( dict.resourceZip.isOpen() ) {
QMutexLocker _( &dataMutex ); QMutexLocker _( &dataMutex );
if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) ) { if ( !dict.resourceZip.loadFile( Text::toUtf32( resourceName ), data ) ) {
throw; // Make it fail since we couldn't read the archive throw; // Make it fail since we couldn't read the archive
} }
} }
@ -1801,10 +1802,10 @@ static void handleIdxSynFile( string const & fileName,
// Insert new entry into an index // Insert new entry into an index
if ( parseHeadwords ) { if ( parseHeadwords ) {
indexedWords.addWord( Utf8::decode( word ), offset ); indexedWords.addWord( Text::toUtf32( word ), offset );
} }
else { else {
indexedWords.addSingleWord( Utf8::decode( word ), offset ); indexedWords.addSingleWord( Text::toUtf32( word ), offset );
} }
} }

View file

@ -7,7 +7,7 @@
#include <opencc/opencc.h> #include <opencc/opencc.h>
#include "folding.hh" #include "folding.hh"
#include "transliteration.hh" #include "transliteration.hh"
#include "utf8.hh" #include "text.hh"
namespace ChineseTranslit { namespace ChineseTranslit {
@ -27,7 +27,7 @@ public:
QString const & openccConfig ); QString const & openccConfig );
~CharacterConversionDictionary(); ~CharacterConversionDictionary();
std::vector< gd::wstring > getAlternateWritings( gd::wstring const & ) noexcept override; std::vector< std::u32string > getAlternateWritings( std::u32string const & ) noexcept override;
}; };
CharacterConversionDictionary::CharacterConversionDictionary( std::string const & id, CharacterConversionDictionary::CharacterConversionDictionary( std::string const & id,
@ -68,15 +68,15 @@ CharacterConversionDictionary::~CharacterConversionDictionary()
// #endif // #endif
} }
std::vector< gd::wstring > CharacterConversionDictionary::getAlternateWritings( gd::wstring const & str ) noexcept std::vector< std::u32string > CharacterConversionDictionary::getAlternateWritings( std::u32string const & str ) noexcept
{ {
std::vector< gd::wstring > results; std::vector< std::u32string > results;
if ( converter != NULL ) { if ( converter != NULL ) {
gd::wstring folded = Folding::applySimpleCaseOnly( str ); std::u32string folded = Folding::applySimpleCaseOnly( str );
std::string input = Utf8::encode( folded ); std::string input = Text::toUtf8( folded );
std::string output; std::string output;
gd::wstring result; std::u32string result;
try { try {
// #ifdef Q_OS_MAC // #ifdef Q_OS_MAC
@ -93,7 +93,7 @@ std::vector< gd::wstring > CharacterConversionDictionary::getAlternateWritings(
// #else // #else
// output = converter->Convert( input ); // output = converter->Convert( input );
// #endif // #endif
result = Utf8::decode( output ); result = Text::toUtf32( output );
} }
catch ( std::exception & ex ) { catch ( std::exception & ex ) {
qWarning( "OpenCC: conversion failed %s", ex.what() ); qWarning( "OpenCC: conversion failed %s", ex.what() );

View file

@ -2,12 +2,11 @@
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "transliteration.hh" #include "transliteration.hh"
#include "utf8.hh" #include "text.hh"
#include "folding.hh" #include "folding.hh"
namespace Transliteration { namespace Transliteration {
using gd::wchar;
BaseTransliterationDictionary::BaseTransliterationDictionary( string const & id, BaseTransliterationDictionary::BaseTransliterationDictionary( string const & id,
string const & name_, string const & name_,
@ -36,24 +35,28 @@ unsigned long BaseTransliterationDictionary::getWordCount() noexcept
return 0; return 0;
} }
sptr< Dictionary::WordSearchRequest > BaseTransliterationDictionary::prefixMatch( wstring const &, unsigned long ) sptr< Dictionary::WordSearchRequest > BaseTransliterationDictionary::prefixMatch( std::u32string const &,
unsigned long )
{ {
return std::make_shared< Dictionary::WordSearchRequestInstant >(); return std::make_shared< Dictionary::WordSearchRequestInstant >();
} }
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest > BaseTransliterationDictionary::getArticle( std::u32string const &,
BaseTransliterationDictionary::getArticle( wstring const &, vector< wstring > const &, wstring const &, bool ) vector< std::u32string > const &,
std::u32string const &,
bool )
{ {
return std::make_shared< Dictionary::DataRequestInstant >( false ); return std::make_shared< Dictionary::DataRequestInstant >( false );
} }
sptr< Dictionary::WordSearchRequest > BaseTransliterationDictionary::findHeadwordsForSynonym( wstring const & str ) sptr< Dictionary::WordSearchRequest >
BaseTransliterationDictionary::findHeadwordsForSynonym( std::u32string const & str )
{ {
sptr< Dictionary::WordSearchRequestInstant > result = std::make_shared< Dictionary::WordSearchRequestInstant >(); sptr< Dictionary::WordSearchRequestInstant > result = std::make_shared< Dictionary::WordSearchRequestInstant >();
vector< wstring > alts = getAlternateWritings( str ); vector< std::u32string > alts = getAlternateWritings( str );
qDebug( "alts = %u", (unsigned)alts.size() ); qDebug( "alts = %u", (unsigned)alts.size() );
@ -67,13 +70,13 @@ sptr< Dictionary::WordSearchRequest > BaseTransliterationDictionary::findHeadwor
void Table::ins( char const * from, char const * to ) void Table::ins( char const * from, char const * to )
{ {
wstring fr = Utf8::decode( std::string( from ) ); std::u32string fr = Text::toUtf32( std::string( from ) );
if ( fr.size() > maxEntrySize ) { if ( fr.size() > maxEntrySize ) {
maxEntrySize = fr.size(); maxEntrySize = fr.size();
} }
insert( std::pair< wstring, wstring >( fr, Utf8::decode( std::string( to ) ) ) ); insert( std::pair< std::u32string, std::u32string >( fr, Text::toUtf32( std::string( to ) ) ) );
} }
@ -84,12 +87,12 @@ TransliterationDictionary::TransliterationDictionary(
{ {
} }
vector< wstring > TransliterationDictionary::getAlternateWritings( wstring const & str ) noexcept vector< std::u32string > TransliterationDictionary::getAlternateWritings( std::u32string const & str ) noexcept
{ {
vector< wstring > results; vector< std::u32string > results;
wstring result, folded; std::u32string result, folded;
wstring const * target; std::u32string const * target;
if ( caseSensitive ) { if ( caseSensitive ) {
// Don't do any transform -- the transliteration is case-sensitive // Don't do any transform -- the transliteration is case-sensitive
@ -100,8 +103,8 @@ vector< wstring > TransliterationDictionary::getAlternateWritings( wstring const
target = &folded; target = &folded;
} }
wchar const * ptr = target->c_str(); char32_t const * ptr = target->c_str();
size_t left = target->size(); size_t left = target->size();
Table::const_iterator i; Table::const_iterator i;
@ -110,7 +113,7 @@ vector< wstring > TransliterationDictionary::getAlternateWritings( wstring const
for ( x = table.getMaxEntrySize(); x >= 1; --x ) { for ( x = table.getMaxEntrySize(); x >= 1; --x ) {
if ( left >= x ) { if ( left >= x ) {
i = table.find( wstring( ptr, x ) ); i = table.find( std::u32string( ptr, x ) );
if ( i != table.end() ) { if ( i != table.end() ) {
result.append( i->second ); result.append( i->second );

View file

@ -9,7 +9,6 @@
namespace Transliteration { namespace Transliteration {
using std::map; using std::map;
using gd::wstring;
using std::string; using std::string;
using std::vector; using std::vector;
@ -32,18 +31,18 @@ public:
virtual unsigned long getWordCount() noexcept; virtual unsigned long getWordCount() noexcept;
virtual vector< wstring > getAlternateWritings( wstring const & ) noexcept = 0; virtual vector< std::u32string > getAlternateWritings( std::u32string const & ) noexcept = 0;
virtual sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & ); virtual sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( std::u32string const & );
virtual sptr< Dictionary::WordSearchRequest > prefixMatch( wstring const &, unsigned long ); virtual sptr< Dictionary::WordSearchRequest > prefixMatch( std::u32string const &, unsigned long );
virtual sptr< Dictionary::DataRequest > virtual sptr< Dictionary::DataRequest >
getArticle( wstring const &, vector< wstring > const &, wstring const &, bool ); getArticle( std::u32string const &, vector< std::u32string > const &, std::u32string const &, bool );
}; };
class Table: public map< wstring, wstring > class Table: public map< std::u32string, std::u32string >
{ {
unsigned maxEntrySize; unsigned maxEntrySize;
@ -77,7 +76,7 @@ public:
TransliterationDictionary( TransliterationDictionary(
string const & id, string const & name, QIcon icon, Table const & table, bool caseSensitive = true ); string const & id, string const & name, QIcon icon, Table const & table, bool caseSensitive = true );
virtual vector< wstring > getAlternateWritings( wstring const & ) noexcept; virtual vector< std::u32string > getAlternateWritings( std::u32string const & ) noexcept;
}; };
} // namespace Transliteration } // namespace Transliteration

View file

@ -4,9 +4,8 @@
#include "indexedzip.hh" #include "indexedzip.hh"
#include "zipfile.hh" #include "zipfile.hh"
#include <zlib.h> #include <zlib.h>
#include "utf8.hh" #include "text.hh"
#include "iconv.hh" #include "iconv.hh"
#include "wstring_qt.hh"
#include <QtCore5Compat/QTextCodec> #include <QtCore5Compat/QTextCodec>
#include <QMutexLocker> #include <QMutexLocker>
@ -23,7 +22,7 @@ bool IndexedZip::openZipFile( QString const & name )
return zipIsOpen; return zipIsOpen;
} }
bool IndexedZip::hasFile( gd::wstring const & name ) bool IndexedZip::hasFile( std::u32string const & name )
{ {
if ( !zipIsOpen ) { if ( !zipIsOpen ) {
return false; return false;
@ -34,7 +33,7 @@ bool IndexedZip::hasFile( gd::wstring const & name )
return !links.empty(); return !links.empty();
} }
bool IndexedZip::loadFile( gd::wstring const & name, vector< char > & data ) bool IndexedZip::loadFile( std::u32string const & name, vector< char > & data )
{ {
if ( !zipIsOpen ) { if ( !zipIsOpen ) {
return false; return false;
@ -180,7 +179,7 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32
if ( !hasNonAscii ) { if ( !hasNonAscii ) {
// Add entry as is // Add entry as is
zipFileNames.addSingleWord( Utf8::decode( entry.fileName.data() ), entry.localHeaderOffset ); zipFileNames.addSingleWord( Text::toUtf32( entry.fileName.data() ), entry.localHeaderOffset );
if ( filesCount ) { if ( filesCount ) {
*filesCount += 1; *filesCount += 1;
} }
@ -192,7 +191,7 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32
// Utf8 // Utf8
try { try {
wstring decoded = Utf8::decode( entry.fileName.constData() ); std::u32string decoded = Text::toUtf32( entry.fileName.constData() );
zipFileNames.addSingleWord( decoded, entry.localHeaderOffset ); zipFileNames.addSingleWord( decoded, entry.localHeaderOffset );
if ( filesCount != 0 && !alreadyCounted ) { if ( filesCount != 0 && !alreadyCounted ) {
@ -200,12 +199,12 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32
alreadyCounted = true; alreadyCounted = true;
} }
} }
catch ( Utf8::exCantDecode & ) { catch ( Text::exCantDecode & ) {
// Failed to decode // Failed to decode
} }
if ( !entry.fileNameInUTF8 ) { if ( !entry.fileNameInUTF8 ) {
wstring nameInSystemLocale; std::u32string nameInSystemLocale;
// System locale // System locale
if ( localeCodec ) { if ( localeCodec ) {
@ -224,7 +223,7 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32
// CP866 // CP866
try { try {
wstring decoded = Iconv::toWstring( "CP866", entry.fileName.constData(), entry.fileName.size() ); std::u32string decoded = Iconv::toWstring( "CP866", entry.fileName.constData(), entry.fileName.size() );
if ( nameInSystemLocale != decoded ) { if ( nameInSystemLocale != decoded ) {
zipFileNames.addSingleWord( decoded, entry.localHeaderOffset ); zipFileNames.addSingleWord( decoded, entry.localHeaderOffset );
@ -241,7 +240,7 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32
// CP1251 // CP1251
try { try {
wstring decoded = Iconv::toWstring( "CP1251", entry.fileName.constData(), entry.fileName.size() ); std::u32string decoded = Iconv::toWstring( "CP1251", entry.fileName.constData(), entry.fileName.size() );
if ( nameInSystemLocale != decoded ) { if ( nameInSystemLocale != decoded ) {
zipFileNames.addSingleWord( decoded, entry.localHeaderOffset ); zipFileNames.addSingleWord( decoded, entry.localHeaderOffset );

View file

@ -37,11 +37,11 @@ public:
/// Checks whether the given file exists in the zip file or not. /// Checks whether the given file exists in the zip file or not.
/// Note that this function is thread-safe, since it does not access zip file. /// Note that this function is thread-safe, since it does not access zip file.
bool hasFile( gd::wstring const & name ); bool hasFile( std::u32string const & name );
/// Attempts loading the given file into the given vector. Returns true on /// Attempts loading the given file into the given vector. Returns true on
/// success, false otherwise. /// success, false otherwise.
bool loadFile( gd::wstring const & name, std::vector< char > & ); bool loadFile( std::u32string const & name, std::vector< char > & );
bool loadFile( uint32_t offset, std::vector< char > & ); bool loadFile( uint32_t offset, std::vector< char > & );
/// Index compressed files in zip file /// Index compressed files in zip file

View file

@ -5,8 +5,7 @@
#include "voiceengines.hh" #include "voiceengines.hh"
#include "audiolink.hh" #include "audiolink.hh"
#include "htmlescape.hh" #include "htmlescape.hh"
#include "utf8.hh" #include "text.hh"
#include "wstring_qt.hh"
#include <string> #include <string>
#include <map> #include <map>
@ -21,6 +20,7 @@ namespace VoiceEngines {
using namespace Dictionary; using namespace Dictionary;
using std::string; using std::string;
using std::u32string;
using std::map; using std::map;
inline string toMd5( QByteArray const & b ) inline string toMd5( QByteArray const & b )
@ -58,16 +58,18 @@ public:
return 0; return 0;
} }
sptr< WordSearchRequest > prefixMatch( wstring const & word, unsigned long maxResults ) override; sptr< WordSearchRequest > prefixMatch( u32string const & word, unsigned long maxResults ) override;
sptr< DataRequest > getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ) override; sptr< DataRequest >
getArticle( u32string const &, vector< u32string > const & alts, u32string const &, bool ) override;
protected: protected:
void loadIcon() noexcept override; void loadIcon() noexcept override;
}; };
sptr< WordSearchRequest > VoiceEnginesDictionary::prefixMatch( wstring const & /*word*/, unsigned long /*maxResults*/ ) sptr< WordSearchRequest > VoiceEnginesDictionary::prefixMatch( u32string const & /*word*/,
unsigned long /*maxResults*/ )
{ {
WordSearchRequestInstant * sr = new WordSearchRequestInstant(); WordSearchRequestInstant * sr = new WordSearchRequestInstant();
@ -76,11 +78,11 @@ sptr< WordSearchRequest > VoiceEnginesDictionary::prefixMatch( wstring const & /
} }
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest >
VoiceEnginesDictionary::getArticle( wstring const & word, vector< wstring > const &, wstring const &, bool ) VoiceEnginesDictionary::getArticle( u32string const & word, vector< u32string > const &, u32string const &, bool )
{ {
string result; string result;
string wordUtf8( Utf8::encode( word ) ); string wordUtf8( Text::toUtf8( word ) );
result += "<table class=\"voiceengines_play\"><tr>"; result += "<table class=\"voiceengines_play\"><tr>";
@ -135,4 +137,4 @@ vector< sptr< Dictionary::Class > > makeDictionaries( Config::VoiceEngines const
} // namespace VoiceEngines } // namespace VoiceEngines
#endif #endif

View file

@ -5,16 +5,13 @@
#include "dictionary.hh" #include "dictionary.hh"
#include "config.hh" #include "config.hh"
#include "wstring.hh" #include "text.hh"
#include <QCryptographicHash> #include <QCryptographicHash>
namespace VoiceEngines { namespace VoiceEngines {
using std::vector; using std::vector;
using std::string; using std::string;
using gd::wstring;
vector< sptr< Dictionary::Class > > makeDictionaries( Config::VoiceEngines const & voiceEngines ); vector< sptr< Dictionary::Class > > makeDictionaries( Config::VoiceEngines const & voiceEngines );

View file

@ -2,8 +2,7 @@
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "website.hh" #include "website.hh"
#include "wstring_qt.hh" #include "text.hh"
#include "utf8.hh"
#include <QUrl> #include <QUrl>
#include <QTextCodec> #include <QTextCodec>
#include <QDir> #include <QDir>
@ -62,10 +61,12 @@ public:
return 0; return 0;
} }
sptr< WordSearchRequest > prefixMatch( wstring const & word, unsigned long ) override; sptr< WordSearchRequest > prefixMatch( std::u32string const & word, unsigned long ) override;
sptr< DataRequest > sptr< DataRequest > getArticle( std::u32string const &,
getArticle( wstring const &, vector< wstring > const & alts, wstring const & context, bool ) override; vector< std::u32string > const & alts,
std::u32string const & context,
bool ) override;
sptr< Dictionary::DataRequest > getResource( string const & name ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override;
@ -90,7 +91,7 @@ protected slots:
virtual void requestFinished( QNetworkReply * ) {} virtual void requestFinished( QNetworkReply * ) {}
}; };
sptr< WordSearchRequest > WebSiteDictionary::prefixMatch( wstring const & /*word*/, unsigned long ) sptr< WordSearchRequest > WebSiteDictionary::prefixMatch( std::u32string const & /*word*/, unsigned long )
{ {
sptr< WordSearchRequestInstant > sr = std::make_shared< WordSearchRequestInstant >(); sptr< WordSearchRequestInstant > sr = std::make_shared< WordSearchRequestInstant >();
@ -308,9 +309,9 @@ void WebSiteArticleRequest::requestFinished( QNetworkReply * r )
finish(); finish();
} }
sptr< DataRequest > WebSiteDictionary::getArticle( wstring const & str, sptr< DataRequest > WebSiteDictionary::getArticle( std::u32string const & str,
vector< wstring > const & /*alts*/, vector< std::u32string > const & /*alts*/,
wstring const & context, std::u32string const & context,
bool /*ignoreDiacritics*/ ) bool /*ignoreDiacritics*/ )
{ {
QString urlString = Utils::WebSite::urlReplaceWord( QString( urlTemplate ), QString::fromStdU32String( str ) ); QString urlString = Utils::WebSite::urlReplaceWord( QString( urlTemplate ), QString::fromStdU32String( str ) );

View file

@ -4,7 +4,7 @@
#include "xdxf.hh" #include "xdxf.hh"
#include "btreeidx.hh" #include "btreeidx.hh"
#include "folding.hh" #include "folding.hh"
#include "utf8.hh" #include "text.hh"
#include "chunkedstorage.hh" #include "chunkedstorage.hh"
#include "dictzip.hh" #include "dictzip.hh"
#include "htmlescape.hh" #include "htmlescape.hh"
@ -39,7 +39,6 @@ using std::multimap;
using std::pair; using std::pair;
using std::set; using std::set;
using std::string; using std::string;
using gd::wstring;
using std::vector; using std::vector;
using std::list; using std::list;
@ -160,8 +159,10 @@ public:
return idxHeader.langTo; return idxHeader.langTo;
} }
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest > getArticle( std::u32string const &,
getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; vector< std::u32string > const & alts,
std::u32string const &,
bool ignoreDiacritics ) override;
sptr< Dictionary::DataRequest > getResource( string const & name ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override;
@ -411,8 +412,8 @@ XdxfDictionary::getSearchResults( QString const & searchString, int searchMode,
class XdxfArticleRequest: public Dictionary::DataRequest class XdxfArticleRequest: public Dictionary::DataRequest
{ {
wstring word; std::u32string word;
vector< wstring > alts; vector< std::u32string > alts;
XdxfDictionary & dict; XdxfDictionary & dict;
bool ignoreDiacritics; bool ignoreDiacritics;
@ -421,8 +422,8 @@ class XdxfArticleRequest: public Dictionary::DataRequest
public: public:
XdxfArticleRequest( wstring const & word_, XdxfArticleRequest( std::u32string const & word_,
vector< wstring > const & alts_, vector< std::u32string > const & alts_,
XdxfDictionary & dict_, XdxfDictionary & dict_,
bool ignoreDiacritics_ ): bool ignoreDiacritics_ ):
word( word_ ), word( word_ ),
@ -467,13 +468,13 @@ void XdxfArticleRequest::run()
chain.insert( chain.end(), altChain.begin(), altChain.end() ); chain.insert( chain.end(), altChain.begin(), altChain.end() );
} }
multimap< wstring, pair< string, string > > mainArticles, alternateArticles; multimap< std::u32string, pair< string, string > > mainArticles, alternateArticles;
set< uint32_t > articlesIncluded; // Some synonims make it that the articles set< uint32_t > articlesIncluded; // Some synonims make it that the articles
// appear several times. We combat this // appear several times. We combat this
// by only allowing them to appear once. // by only allowing them to appear once.
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
} }
@ -502,12 +503,12 @@ void XdxfArticleRequest::run()
// We do the case-folded comparison here. // We do the case-folded comparison here.
wstring headwordStripped = Folding::applySimpleCaseOnly( headword ); std::u32string headwordStripped = Folding::applySimpleCaseOnly( headword );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
} }
multimap< wstring, pair< string, string > > & mapToUse = multimap< std::u32string, pair< string, string > > & mapToUse =
( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles;
mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) ); mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) );
@ -527,7 +528,7 @@ void XdxfArticleRequest::run()
string result; string result;
multimap< wstring, pair< string, string > >::const_iterator i; multimap< std::u32string, pair< string, string > >::const_iterator i;
string cleaner = Utils::Html::getHtmlCleaner(); string cleaner = Utils::Html::getHtmlCleaner();
@ -554,9 +555,9 @@ void XdxfArticleRequest::run()
finish(); finish();
} }
sptr< Dictionary::DataRequest > XdxfDictionary::getArticle( wstring const & word, sptr< Dictionary::DataRequest > XdxfDictionary::getArticle( std::u32string const & word,
vector< wstring > const & alts, vector< std::u32string > const & alts,
wstring const &, std::u32string const &,
bool ignoreDiacritics ) bool ignoreDiacritics )
{ {
@ -973,7 +974,7 @@ void XdxfResourceRequest::run()
if ( dict.resourceZip.isOpen() ) { if ( dict.resourceZip.isOpen() ) {
QMutexLocker _( &dataMutex ); QMutexLocker _( &dataMutex );
if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) ) { if ( !dict.resourceZip.loadFile( Text::toUtf32( resourceName ), data ) ) {
throw; // Make it fail since we couldn't read the archive throw; // Make it fail since we couldn't read the archive
} }
} }
@ -1194,7 +1195,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
else if ( stream.name() == u"abbreviations" ) { else if ( stream.name() == u"abbreviations" ) {
QString s; QString s;
string value; string value;
list< wstring > keys; list< std::u32string > keys;
while ( !( stream.isEndElement() && stream.name() == u"abbreviations" ) && !stream.atEnd() ) { while ( !( stream.isEndElement() && stream.name() == u"abbreviations" ) && !stream.atEnd() ) {
if ( !stream.readNextStartElement() ) { if ( !stream.readNextStartElement() ) {
break; break;
@ -1210,7 +1211,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
s = readElementText( stream ); s = readElementText( stream );
value = Folding::trimWhitespace( s ).toStdString(); value = Folding::trimWhitespace( s ).toStdString();
for ( const auto & key : keys ) { for ( const auto & key : keys ) {
abrv[ Utf8::encode( Folding::trimWhitespace( key ) ) ] = value; abrv[ Text::toUtf8( Folding::trimWhitespace( key ) ) ] = value;
} }
keys.clear(); keys.clear();
} }
@ -1230,7 +1231,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
s = readElementText( stream ); s = readElementText( stream );
value = Folding::trimWhitespace( s ).toStdString(); value = Folding::trimWhitespace( s ).toStdString();
for ( const auto & key : keys ) { for ( const auto & key : keys ) {
abrv[ Utf8::encode( Folding::trimWhitespace( key ) ) ] = value; abrv[ Text::toUtf8( Folding::trimWhitespace( key ) ) ] = value;
} }
keys.clear(); keys.clear();
} }

View file

@ -3,8 +3,7 @@
#include "xdxf2html.hh" #include "xdxf2html.hh"
#include <QtXml> #include <QtXml>
#include "utf8.hh" #include "text.hh"
#include "wstring_qt.hh"
#include "folding.hh" #include "folding.hh"
#include "audiolink.hh" #include "audiolink.hh"
@ -442,7 +441,7 @@ string convert( string const & in,
if ( i != pAbrv->end() ) { if ( i != pAbrv->end() ) {
string title; string title;
if ( Utf8::decode( i->second ).size() < 70 ) { if ( Text::toUtf32( i->second ).size() < 70 ) {
// Replace all spaces with non-breakable ones, since that's how Lingvo shows tooltips // Replace all spaces with non-breakable ones, since that's how Lingvo shows tooltips
title.reserve( i->second.size() ); title.reserve( i->second.size() );
@ -466,7 +465,7 @@ string convert( string const & in,
else { else {
title = i->second; title = i->second;
} }
el.setAttribute( "title", QString::fromStdU32String( Utf8::decode( title ) ) ); el.setAttribute( "title", QString::fromStdU32String( Text::toUtf32( title ) ) );
} }
} }
} }
@ -628,7 +627,7 @@ string convert( string const & in,
// if( type == XDXF && dictPtr != NULL && !el.hasAttribute( "start" ) ) // if( type == XDXF && dictPtr != NULL && !el.hasAttribute( "start" ) )
if ( dictPtr != NULL && !el.hasAttribute( "start" ) ) { if ( dictPtr != NULL && !el.hasAttribute( "start" ) ) {
string filename = Utf8::encode( el.text().toStdU32String() ); string filename = Text::toUtf8( el.text().toStdU32String() );
if ( Filetype::isNameOfPicture( filename ) ) { if ( Filetype::isNameOfPicture( filename ) ) {
QUrl url; QUrl url;

View file

@ -6,7 +6,7 @@
#include "zim.hh" #include "zim.hh"
#include "btreeidx.hh" #include "btreeidx.hh"
#include "folding.hh" #include "folding.hh"
#include "utf8.hh" #include "text.hh"
#include "langcoder.hh" #include "langcoder.hh"
#include "filetype.hh" #include "filetype.hh"
#include "dictfile.hh" #include "dictfile.hh"
@ -38,12 +38,12 @@
namespace Zim { namespace Zim {
using std::string; using std::string;
using std::u32string;
using std::map; using std::map;
using std::vector; using std::vector;
using std::multimap; using std::multimap;
using std::pair; using std::pair;
using std::set; using std::set;
using gd::wstring;
using BtreeIndexing::WordArticleLink; using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords; using BtreeIndexing::IndexedWords;
@ -182,7 +182,7 @@ public:
} }
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest >
getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; getArticle( u32string const &, vector< u32string > const & alts, u32string const &, bool ignoreDiacritics ) override;
sptr< Dictionary::DataRequest > getResource( string const & name ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override;
@ -519,8 +519,8 @@ ZimDictionary::getSearchResults( QString const & searchString, int searchMode, b
class ZimArticleRequest: public Dictionary::DataRequest class ZimArticleRequest: public Dictionary::DataRequest
{ {
wstring word; u32string word;
vector< wstring > alts; vector< u32string > alts;
ZimDictionary & dict; ZimDictionary & dict;
bool ignoreDiacritics; bool ignoreDiacritics;
@ -529,7 +529,10 @@ class ZimArticleRequest: public Dictionary::DataRequest
public: public:
ZimArticleRequest( wstring word_, vector< wstring > const & alts_, ZimDictionary & dict_, bool ignoreDiacritics_ ): ZimArticleRequest( u32string word_,
vector< u32string > const & alts_,
ZimDictionary & dict_,
bool ignoreDiacritics_ ):
word( std::move( word_ ) ), word( std::move( word_ ) ),
alts( alts_ ), alts( alts_ ),
dict( dict_ ), dict( dict_ ),
@ -571,13 +574,13 @@ void ZimArticleRequest::run()
chain.insert( chain.end(), altChain.begin(), altChain.end() ); chain.insert( chain.end(), altChain.begin(), altChain.end() );
} }
multimap< wstring, pair< string, string > > mainArticles, alternateArticles; multimap< u32string, pair< string, string > > mainArticles, alternateArticles;
set< quint32 > articlesIncluded; // Some synonyms make it that the articles set< quint32 > articlesIncluded; // Some synonyms make it that the articles
// appear several times. We combat this // appear several times. We combat this
// by only allowing them to appear once. // by only allowing them to appear once.
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); u32string wordCaseFolded = Folding::applySimpleCaseOnly( word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
} }
@ -614,12 +617,12 @@ void ZimArticleRequest::run()
// We do the case-folded comparison here. // We do the case-folded comparison here.
wstring headwordStripped = Folding::applySimpleCaseOnly( headword ); u32string headwordStripped = Folding::applySimpleCaseOnly( headword );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
} }
multimap< wstring, pair< string, string > > & mapToUse = multimap< u32string, pair< string, string > > & mapToUse =
( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles;
mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) ); mapToUse.insert( pair( Folding::applySimpleCaseOnly( headword ), pair( headword, articleText ) ) );
@ -638,7 +641,7 @@ void ZimArticleRequest::run()
// See Issue #271: A mechanism to clean-up invalid HTML cards. // See Issue #271: A mechanism to clean-up invalid HTML cards.
string cleaner = Utils::Html::getHtmlCleaner(); string cleaner = Utils::Html::getHtmlCleaner();
multimap< wstring, pair< string, string > >::const_iterator i; multimap< u32string, pair< string, string > >::const_iterator i;
for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) { for ( i = mainArticles.begin(); i != mainArticles.end(); ++i ) {
@ -666,9 +669,9 @@ void ZimArticleRequest::run()
finish(); finish();
} }
sptr< Dictionary::DataRequest > ZimDictionary::getArticle( wstring const & word, sptr< Dictionary::DataRequest > ZimDictionary::getArticle( u32string const & word,
vector< wstring > const & alts, vector< u32string > const & alts,
wstring const &, u32string const &,
bool ignoreDiacritics ) bool ignoreDiacritics )
{ {
@ -766,7 +769,7 @@ sptr< Dictionary::DataRequest > ZimDictionary::getResource( string const & name
return std::make_shared< ZimResourceRequest >( *this, noLeadingDot.toStdString() ); return std::make_shared< ZimResourceRequest >( *this, noLeadingDot.toStdString() );
} }
wstring normalizeWord( const std::string & url ); u32string normalizeWord( const std::string & url );
vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames,
string const & indicesDir, string const & indicesDir,
Dictionary::Initializing & initializing, Dictionary::Initializing & initializing,
@ -849,7 +852,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
if ( maxHeadwordsToExpand > 0 && ( articleCount >= maxHeadwordsToExpand ) ) { if ( maxHeadwordsToExpand > 0 && ( articleCount >= maxHeadwordsToExpand ) ) {
if ( !title.empty() ) { if ( !title.empty() ) {
wstring word = Utf8::decode( title ); u32string word = Text::toUtf32( title );
indexedWords.addSingleWord( word, index ); indexedWords.addSingleWord( word, index );
} }
else if ( !url.empty() ) { else if ( !url.empty() ) {
@ -858,7 +861,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
} }
else { else {
if ( !title.empty() ) { if ( !title.empty() ) {
auto word = Utf8::decode( title ); auto word = Text::toUtf32( title );
indexedWords.addWord( word, index ); indexedWords.addWord( word, index );
wordCount++; wordCount++;
} }
@ -903,7 +906,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
} }
return dictionaries; return dictionaries;
} }
wstring normalizeWord( const std::string & url ) u32string normalizeWord( const std::string & url )
{ {
auto formattedUrl = QString::fromStdString( url ).remove( RX::Zim::leadingDotSlash ); auto formattedUrl = QString::fromStdString( url ).remove( RX::Zim::leadingDotSlash );
return formattedUrl.toStdU32String(); return formattedUrl.toStdU32String();

View file

@ -4,7 +4,7 @@
#include "zipsounds.hh" #include "zipsounds.hh"
#include "dictfile.hh" #include "dictfile.hh"
#include "folding.hh" #include "folding.hh"
#include "utf8.hh" #include "text.hh"
#include "btreeidx.hh" #include "btreeidx.hh"
#include "audiolink.hh" #include "audiolink.hh"
@ -24,7 +24,6 @@
namespace ZipSounds { namespace ZipSounds {
using std::string; using std::string;
using gd::wstring;
using std::map; using std::map;
using std::multimap; using std::multimap;
using std::set; using std::set;
@ -64,19 +63,19 @@ bool indexIsOldOrBad( string const & indexFile )
|| header.formatVersion != CurrentFormatVersion; || header.formatVersion != CurrentFormatVersion;
} }
wstring stripExtension( string const & str ) std::u32string stripExtension( string const & str )
{ {
wstring name; std::u32string name;
try { try {
name = Utf8::decode( str ); name = Text::toUtf32( str );
} }
catch ( Utf8::exCantDecode & ) { catch ( Text::exCantDecode & ) {
return name; return name;
} }
if ( Filetype::isNameOfSound( str ) ) { if ( Filetype::isNameOfSound( str ) ) {
wstring::size_type pos = name.rfind( L'.' ); std::u32string::size_type pos = name.rfind( L'.' );
if ( pos != wstring::npos ) { if ( pos != std::u32string::npos ) {
name.erase( pos ); name.erase( pos );
} }
@ -118,8 +117,10 @@ public:
return getArticleCount(); return getArticleCount();
} }
sptr< Dictionary::DataRequest > sptr< Dictionary::DataRequest > getArticle( std::u32string const &,
getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ignoreDiacritics ) override; vector< std::u32string > const & alts,
std::u32string const &,
bool ignoreDiacritics ) override;
sptr< Dictionary::DataRequest > getResource( string const & name ) override; sptr< Dictionary::DataRequest > getResource( string const & name ) override;
@ -157,9 +158,9 @@ string ZipSoundsDictionary::getName() noexcept
return result; return result;
} }
sptr< Dictionary::DataRequest > ZipSoundsDictionary::getArticle( wstring const & word, sptr< Dictionary::DataRequest > ZipSoundsDictionary::getArticle( std::u32string const & word,
vector< wstring > const & alts, vector< std::u32string > const & alts,
wstring const &, std::u32string const &,
bool ignoreDiacritics ) bool ignoreDiacritics )
{ {
@ -173,13 +174,13 @@ sptr< Dictionary::DataRequest > ZipSoundsDictionary::getArticle( wstring const &
chain.insert( chain.end(), altChain.begin(), altChain.end() ); chain.insert( chain.end(), altChain.begin(), altChain.end() );
} }
multimap< wstring, uint32_t > mainArticles, alternateArticles; multimap< std::u32string, uint32_t > mainArticles, alternateArticles;
set< uint32_t > articlesIncluded; // Some synonims make it that the articles set< uint32_t > articlesIncluded; // Some synonims make it that the articles
// appear several times. We combat this // appear several times. We combat this
// by only allowing them to appear once. // by only allowing them to appear once.
wstring wordCaseFolded = Folding::applySimpleCaseOnly( word ); std::u32string wordCaseFolded = Folding::applySimpleCaseOnly( word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
} }
@ -194,12 +195,12 @@ sptr< Dictionary::DataRequest > ZipSoundsDictionary::getArticle( wstring const &
// We do the case-folded comparison here. // We do the case-folded comparison here.
wstring headwordStripped = Folding::applySimpleCaseOnly( x.word ); std::u32string headwordStripped = Folding::applySimpleCaseOnly( x.word );
if ( ignoreDiacritics ) { if ( ignoreDiacritics ) {
headwordStripped = Folding::applyDiacriticsOnly( headwordStripped ); headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
} }
multimap< wstring, uint32_t > & mapToUse = multimap< std::u32string, uint32_t > & mapToUse =
( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles; ( wordCaseFolded == headwordStripped ) ? mainArticles : alternateArticles;
mapToUse.insert( std::pair( Folding::applySimpleCaseOnly( x.word ), x.articleOffset ) ); mapToUse.insert( std::pair( Folding::applySimpleCaseOnly( x.word ), x.articleOffset ) );
@ -213,7 +214,7 @@ sptr< Dictionary::DataRequest > ZipSoundsDictionary::getArticle( wstring const &
string result; string result;
multimap< wstring, uint32_t >::const_iterator i; multimap< std::u32string, uint32_t >::const_iterator i;
result += "<table class=\"lsa_play\">"; result += "<table class=\"lsa_play\">";
@ -244,7 +245,7 @@ sptr< Dictionary::DataRequest > ZipSoundsDictionary::getArticle( wstring const &
nameBlock += sz; nameBlock += sz;
string displayedName = string displayedName =
mainArticles.size() + alternateArticles.size() > 1 ? name : Utf8::encode( stripExtension( name ) ); mainArticles.size() + alternateArticles.size() > 1 ? name : Text::toUtf8( stripExtension( name ) );
result += "<tr>"; result += "<tr>";
@ -286,7 +287,7 @@ sptr< Dictionary::DataRequest > ZipSoundsDictionary::getArticle( wstring const &
nameBlock += sz; nameBlock += sz;
string displayedName = string displayedName =
mainArticles.size() + alternateArticles.size() > 1 ? name : Utf8::encode( stripExtension( name ) ); mainArticles.size() + alternateArticles.size() > 1 ? name : Text::toUtf8( stripExtension( name ) );
result += "<tr>"; result += "<tr>";
@ -316,7 +317,7 @@ sptr< Dictionary::DataRequest > ZipSoundsDictionary::getResource( string const &
{ {
// Remove extension for sound files (like in sound dirs) // Remove extension for sound files (like in sound dirs)
wstring strippedName = stripExtension( name ); std::u32string strippedName = stripExtension( name );
vector< WordArticleLink > chain = findArticles( strippedName ); vector< WordArticleLink > chain = findArticles( strippedName );
@ -430,7 +431,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
// Remove extension for sound files (like in sound dirs) // Remove extension for sound files (like in sound dirs)
wstring word = stripExtension( link.word ); std::u32string word = stripExtension( link.word );
if ( !word.empty() ) { if ( !word.empty() ) {
names.addWord( word, offset ); names.addWord( word, offset );
} }

View file

@ -5,7 +5,6 @@
#include <cstdlib> #include <cstdlib>
#include "fulltextsearch.hh" #include "fulltextsearch.hh"
#include "ftshelpers.hh" #include "ftshelpers.hh"
#include "wstring_qt.hh"
#include "dictfile.hh" #include "dictfile.hh"
#include "folding.hh" #include "folding.hh"
#include "utils.hh" #include "utils.hh"

View file

@ -7,7 +7,6 @@
#include "btreeidx.hh" #include "btreeidx.hh"
#include "fulltextsearch.hh" #include "fulltextsearch.hh"
#include "folding.hh" #include "folding.hh"
#include "wstring_qt.hh"
namespace FtsHelpers { namespace FtsHelpers {
@ -44,7 +43,7 @@ public:
{ {
if ( ignoreDiacritics_ ) if ( ignoreDiacritics_ )
searchString = searchString =
QString::fromStdU32String( Folding::applyDiacriticsOnly( gd::removeTrailingZero( searchString_ ) ) ); QString::fromStdU32String( Folding::applyDiacriticsOnly( Text::removeTrailingZero( searchString_ ) ) );
foundHeadwords = new QList< FTS::FtsHeadword >; foundHeadwords = new QList< FTS::FtsHeadword >;
results = 0; results = 0;

View file

@ -1,5 +1,4 @@
#include "headwordsmodel.hh" #include "headwordsmodel.hh"
#include "wstring_qt.hh"
HeadwordListModel::HeadwordListModel( QObject * parent ): HeadwordListModel::HeadwordListModel( QObject * parent ):
QAbstractListModel( parent ), QAbstractListModel( parent ),
@ -67,7 +66,7 @@ void HeadwordListModel::setFilter( const QRegularExpression & reg )
} }
} }
filterWords.clear(); filterWords.clear();
auto sr = _dict->prefixMatch( gd::removeTrailingZero( reg.pattern() ), maxFilterResults ); auto sr = _dict->prefixMatch( Text::removeTrailingZero( reg.pattern() ), maxFilterResults );
connect( sr.get(), &Dictionary::Request::finished, this, &HeadwordListModel::requestFinished, Qt::QueuedConnection ); connect( sr.get(), &Dictionary::Request::finished, this, &HeadwordListModel::requestFinished, Qt::QueuedConnection );
queuedRequests.push_back( sr ); queuedRequests.push_back( sr );
} }

View file

@ -3,7 +3,7 @@
#include "langcoder.hh" #include "langcoder.hh"
#include "language.hh" #include "language.hh"
#include "utf8.hh" #include "text.hh"
#include <QFileInfo> #include <QFileInfo>
#include <QLocale> #include <QLocale>
@ -226,9 +226,9 @@ QString LangCoder::intToCode2( quint32 val )
return QString::fromLatin1( ba ); return QString::fromLatin1( ba );
} }
quint32 LangCoder::findIdForLanguage( gd::wstring const & lang ) quint32 LangCoder::findIdForLanguage( std::u32string const & lang )
{ {
const auto langFolded = QByteArrayView( Utf8::encode( lang ) ); const auto langFolded = QByteArrayView( Text::toUtf8( lang ) );
for ( auto const & lc : LANG_CODE_MAP ) { for ( auto const & lc : LANG_CODE_MAP ) {
if ( langFolded.compare( lc.lang, Qt::CaseInsensitive ) == 0 ) { if ( langFolded.compare( lc.lang, Qt::CaseInsensitive ) == 0 ) {

View file

@ -2,7 +2,7 @@
#include <QString> #include <QString>
#include <QIcon> #include <QIcon>
#include "wstring.hh" #include "text.hh"
struct GDLangCode struct GDLangCode
{ {
@ -34,7 +34,7 @@ public:
/// Finds the id for the given language name, written in english. The search /// Finds the id for the given language name, written in english. The search
/// is case- and punctuation insensitive. /// is case- and punctuation insensitive.
static quint32 findIdForLanguage( gd::wstring const & ); static quint32 findIdForLanguage( std::u32string const & );
static quint32 findIdForLanguageCode3( std::string const & ); static quint32 findIdForLanguageCode3( std::string const & );

View file

@ -465,7 +465,7 @@ BabylonLang getBabylonLangByIndex( int index )
return BabylonDb[ index ]; return BabylonDb[ index ];
} }
quint32 findBlgLangIDByEnglishName( gd::wstring const & lang ) quint32 findBlgLangIDByEnglishName( std::u32string const & lang )
{ {
QString enName = QString::fromStdU32String( lang ); QString enName = QString::fromStdU32String( lang );
for ( const auto & idx : BabylonDb ) { for ( const auto & idx : BabylonDb ) {

View file

@ -4,7 +4,6 @@
#pragma once #pragma once
#include <QString> #include <QString>
#include "wstring_qt.hh"
/// Language-specific stuff - codes, names, ids etc. /// Language-specific stuff - codes, names, ids etc.
namespace Language { namespace Language {
@ -47,5 +46,5 @@ struct BabylonLang
const char * localizedName; const char * localizedName;
}; };
BabylonLang getBabylonLangByIndex( int index ); BabylonLang getBabylonLangByIndex( int index );
quint32 findBlgLangIDByEnglishName( gd::wstring const & lang ); quint32 findBlgLangIDByEnglishName( std::u32string const & lang );
} // namespace Language } // namespace Language

View file

@ -10,7 +10,6 @@
#include "utils.hh" #include "utils.hh"
#include "webmultimediadownload.hh" #include "webmultimediadownload.hh"
#include "wildcard.hh" #include "wildcard.hh"
#include "wstring_qt.hh"
#include <QBuffer> #include <QBuffer>
#include <QClipboard> #include <QClipboard>
#include <QCryptographicHash> #include <QCryptographicHash>

View file

@ -3,14 +3,11 @@
#include "wordfinder.hh" #include "wordfinder.hh"
#include "folding.hh" #include "folding.hh"
#include "wstring_qt.hh"
#include <map> #include <map>
using std::vector; using std::vector;
using std::list; using std::list;
using gd::wstring;
using gd::wchar;
using std::map; using std::map;
using std::pair; using std::pair;
@ -134,7 +131,7 @@ void WordFinder::startSearch()
allWordWritings[ 0 ] = inputWord.toStdU32String(); allWordWritings[ 0 ] = inputWord.toStdU32String();
for ( const auto & inputDict : *inputDicts ) { for ( const auto & inputDict : *inputDicts ) {
vector< wstring > writings = inputDict->getAlternateWritings( allWordWritings[ 0 ] ); vector< std::u32string > writings = inputDict->getAlternateWritings( allWordWritings[ 0 ] );
allWordWritings.insert( allWordWritings.end(), writings.begin(), writings.end() ); allWordWritings.insert( allWordWritings.end(), writings.begin(), writings.end() );
} }
@ -255,7 +252,9 @@ unsigned saturated( unsigned x )
/// both sides by either whitespace, punctuation or begin/end of string. /// both sides by either whitespace, punctuation or begin/end of string.
/// If true is returned, pos holds the offset in the haystack. If the offset /// If true is returned, pos holds the offset in the haystack. If the offset
/// is larger than 255, it is set to 255. /// is larger than 255, it is set to 255.
bool hasSurroundedWithWs( wstring const & haystack, wstring const & needle, wstring::size_type & pos ) bool hasSurroundedWithWs( std::u32string const & haystack,
std::u32string const & needle,
std::u32string::size_type & pos )
{ {
if ( haystack.size() < needle.size() ) { if ( haystack.size() < needle.size() ) {
return false; // Needle won't even fit into a haystack return false; // Needle won't even fit into a haystack
@ -264,7 +263,7 @@ bool hasSurroundedWithWs( wstring const & haystack, wstring const & needle, wstr
for ( pos = 0;; ++pos ) { for ( pos = 0;; ++pos ) {
pos = haystack.find( needle, pos ); pos = haystack.find( needle, pos );
if ( pos == wstring::npos ) { if ( pos == std::u32string::npos ) {
return false; // Not found return false; // Not found
} }
@ -290,13 +289,13 @@ void WordFinder::updateResults()
updateResultsTimer.stop(); // Can happen when we were done before it'd expire updateResultsTimer.stop(); // Can happen when we were done before it'd expire
} }
wstring original = Folding::applySimpleCaseOnly( allWordWritings[ 0 ] ); std::u32string original = Folding::applySimpleCaseOnly( allWordWritings[ 0 ] );
for ( auto i = finishedRequests.begin(); i != finishedRequests.end(); ) { for ( auto i = finishedRequests.begin(); i != finishedRequests.end(); ) {
for ( size_t count = ( *i )->matchesCount(), x = 0; x < count; ++x ) { for ( size_t count = ( *i )->matchesCount(), x = 0; x < count; ++x ) {
wstring match = ( **i )[ x ].word; std::u32string match = ( **i )[ x ].word;
int weight = ( **i )[ x ].weight; int weight = ( **i )[ x ].weight;
wstring lowerCased = Folding::applySimpleCaseOnly( match ); std::u32string lowerCased = Folding::applySimpleCaseOnly( match );
if ( searchType == ExpressionMatch ) { if ( searchType == ExpressionMatch ) {
unsigned ws; unsigned ws;
@ -320,7 +319,7 @@ void WordFinder::updateResults()
weight = ws; weight = ws;
} }
auto insertResult = auto insertResult =
resultsIndex.insert( pair< wstring, ResultsArray::iterator >( lowerCased, resultsArray.end() ) ); resultsIndex.insert( pair< std::u32string, ResultsArray::iterator >( lowerCased, resultsArray.end() ) );
if ( !insertResult.second ) { if ( !insertResult.second ) {
// Wasn't inserted since there was already an item -- check the case // Wasn't inserted since there was already an item -- check the case
@ -369,16 +368,16 @@ void WordFinder::updateResults()
}; };
for ( const auto & allWordWriting : allWordWritings ) { for ( const auto & allWordWriting : allWordWritings ) {
wstring target = Folding::applySimpleCaseOnly( allWordWriting ); std::u32string target = Folding::applySimpleCaseOnly( allWordWriting );
wstring targetNoFullCase = Folding::applyFullCaseOnly( target ); std::u32string targetNoFullCase = Folding::applyFullCaseOnly( target );
wstring targetNoDia = Folding::applyDiacriticsOnly( targetNoFullCase ); std::u32string targetNoDia = Folding::applyDiacriticsOnly( targetNoFullCase );
wstring targetNoPunct = Folding::applyPunctOnly( targetNoDia ); std::u32string targetNoPunct = Folding::applyPunctOnly( targetNoDia );
wstring targetNoWs = Folding::applyWhitespaceOnly( targetNoPunct ); std::u32string targetNoWs = Folding::applyWhitespaceOnly( targetNoPunct );
wstring::size_type matchPos = 0; std::u32string::size_type matchPos = 0;
for ( const auto & i : resultsIndex ) { for ( const auto & i : resultsIndex ) {
wstring resultNoFullCase, resultNoDia, resultNoPunct, resultNoWs; std::u32string resultNoFullCase, resultNoDia, resultNoPunct, resultNoWs;
int rank; int rank;
@ -441,14 +440,14 @@ void WordFinder::updateResults()
// only the first one, storing it in rank. Then we sort the results using // only the first one, storing it in rank. Then we sort the results using
// SortByRankAndLength. // SortByRankAndLength.
for ( const auto & allWordWriting : allWordWritings ) { for ( const auto & allWordWriting : allWordWritings ) {
wstring target = Folding::apply( allWordWriting ); std::u32string target = Folding::apply( allWordWriting );
for ( const auto & i : resultsIndex ) { for ( const auto & i : resultsIndex ) {
wstring resultFolded = Folding::apply( i.first ); std::u32string resultFolded = Folding::apply( i.first );
int charsInCommon = 0; int charsInCommon = 0;
for ( wchar const *t = target.c_str(), *r = resultFolded.c_str(); *t && *t == *r; for ( char32_t const *t = target.c_str(), *r = resultFolded.c_str(); *t && *t == *r;
++t, ++r, ++charsInCommon ) { ++t, ++r, ++charsInCommon ) {
; ;
} }

View file

@ -48,11 +48,11 @@ private:
std::vector< sptr< Dictionary::Class > > const * inputDicts; std::vector< sptr< Dictionary::Class > > const * inputDicts;
std::vector< gd::wstring > allWordWritings; // All writings of the inputWord std::vector< std::u32string > allWordWritings; // All writings of the inputWord
struct OneResult struct OneResult
{ {
gd::wstring word; std::u32string word;
int rank; int rank;
bool wasSuggested; bool wasSuggested;
}; };
@ -60,7 +60,7 @@ private:
// Maps lowercased string to the original one. This catches all duplicates // Maps lowercased string to the original one. This catches all duplicates
// without case sensitivity. Made as an array and a map indexing that array. // without case sensitivity. Made as an array and a map indexing that array.
using ResultsArray = std::list< OneResult >; using ResultsArray = std::list< OneResult >;
using ResultsIndex = std::map< gd::wstring, ResultsArray::iterator >; using ResultsIndex = std::map< std::u32string, ResultsArray::iterator >;
ResultsArray resultsArray; ResultsArray resultsArray;
ResultsIndex resultsIndex; ResultsIndex resultsIndex;