mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-27 15:24:05 +00:00
opt: treat invisible character as whitespace (#1696)
* opt: remove invisible character
* opt: remove invisible character
* opt: whitespace and punct character
* [autofix.ci] apply automated fixes
* opt: whitespace and punct character

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
521c359b24
commit
5fb4526158
|
@ -154,7 +154,7 @@ wstring applyWhitespaceAndPunctOnly( wstring const & in )
|
||||||
out.reserve( in.size() );
|
out.reserve( in.size() );
|
||||||
|
|
||||||
for ( size_t left = in.size(); left--; ++nextChar ) {
|
for ( size_t left = in.size(); left--; ++nextChar ) {
|
||||||
if ( !isWhitespace( *nextChar ) && !isPunct( *nextChar ) )
|
if ( !isWhitespaceOrPunct( *nextChar ) )
|
||||||
out.push_back( *nextChar );
|
out.push_back( *nextChar );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -163,12 +163,13 @@ wstring applyWhitespaceAndPunctOnly( wstring const & in )
|
||||||
|
|
||||||
bool isWhitespace( wchar ch )
|
bool isWhitespace( wchar ch )
|
||||||
{
|
{
|
||||||
return QChar::isSpace( ch );
|
//invisible character should be treated as whitespace as well.
|
||||||
|
return QChar::isSpace( ch ) || !QChar::isPrint( ch );
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isWhitespaceOrPunct( wchar ch )
|
bool isWhitespaceOrPunct( wchar ch )
|
||||||
{
|
{
|
||||||
return QChar::isSpace( ch ) || QChar::isPunct( ch );
|
return isWhitespace( ch ) || QChar::isPunct( ch );
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isPunct( wchar ch )
|
bool isPunct( wchar ch )
|
||||||
|
@ -182,14 +183,13 @@ wstring trimWhitespaceOrPunct( wstring const & in )
|
||||||
wstring::size_type wordSize = in.size();
|
wstring::size_type wordSize = in.size();
|
||||||
|
|
||||||
// Skip any leading whitespace
|
// Skip any leading whitespace
|
||||||
while ( *wordBegin && ( Folding::isWhitespace( *wordBegin ) || Folding::isPunct( *wordBegin ) ) ) {
|
while ( *wordBegin && Folding::isWhitespaceOrPunct( *wordBegin ) ) {
|
||||||
++wordBegin;
|
++wordBegin;
|
||||||
--wordSize;
|
--wordSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip any trailing whitespace
|
// Skip any trailing whitespace
|
||||||
while ( wordSize
|
while ( wordSize && Folding::isWhitespaceOrPunct( wordBegin[ wordSize - 1 ] ) )
|
||||||
&& ( Folding::isWhitespace( wordBegin[ wordSize - 1 ] ) || Folding::isPunct( wordBegin[ wordSize - 1 ] ) ) )
|
|
||||||
--wordSize;
|
--wordSize;
|
||||||
|
|
||||||
return wstring( wordBegin, wordSize );
|
return wstring( wordBegin, wordSize );
|
||||||
|
|
|
@ -69,10 +69,11 @@ bool containHtmlEntity( std::string const & text );
|
||||||
} // namespace Html
|
} // namespace Html
|
||||||
|
|
||||||
const static QRegularExpression accentMark( R"(\p{M})", QRegularExpression::UseUnicodePropertiesOption );
|
const static QRegularExpression accentMark( R"(\p{M})", QRegularExpression::UseUnicodePropertiesOption );
|
||||||
//contain unicode space mark and punctuation
|
//contain unicode space mark,invisible, and punctuation
|
||||||
const static QRegularExpression markPuncSpace( R"([\p{M}\p{Z}\p{P}])", QRegularExpression::UseUnicodePropertiesOption );
|
const static QRegularExpression markPuncSpace( R"([\p{M}\p{Z}\p{C}\p{P}])",
|
||||||
//contain unicode space and mark.
|
QRegularExpression::UseUnicodePropertiesOption );
|
||||||
const static QRegularExpression markSpace( R"([\p{M}\p{Z}])", QRegularExpression::UseUnicodePropertiesOption );
|
//contain unicode space and mark.invisible
|
||||||
|
const static QRegularExpression markSpace( R"([\p{M}\p{Z}\p{C}])", QRegularExpression::UseUnicodePropertiesOption );
|
||||||
|
|
||||||
const static QRegularExpression whiteSpace( "\\s+" );
|
const static QRegularExpression whiteSpace( "\\s+" );
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue