opt: treat invisible character as whitespace (#1696)

* opt: remove invisible character

* opt: remove invisible character

* opt: whitespace and punct character

* [autofix.ci] apply automated fixes

* opt: whitespace and punct character

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
xiaoyifang 2024-07-17 10:44:50 +08:00 committed by GitHub
parent 521c359b24
commit 5fb4526158
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 11 additions and 10 deletions

View file

@ -154,7 +154,7 @@ wstring applyWhitespaceAndPunctOnly( wstring const & in )
out.reserve( in.size() ); out.reserve( in.size() );
for ( size_t left = in.size(); left--; ++nextChar ) { for ( size_t left = in.size(); left--; ++nextChar ) {
if ( !isWhitespace( *nextChar ) && !isPunct( *nextChar ) ) if ( !isWhitespaceOrPunct( *nextChar ) )
out.push_back( *nextChar ); out.push_back( *nextChar );
} }
@ -163,12 +163,13 @@ wstring applyWhitespaceAndPunctOnly( wstring const & in )
bool isWhitespace( wchar ch ) bool isWhitespace( wchar ch )
{ {
return QChar::isSpace( ch ); // Invisible (non-printable) characters are treated as whitespace as well.
return QChar::isSpace( ch ) || !QChar::isPrint( ch );
} }
bool isWhitespaceOrPunct( wchar ch ) bool isWhitespaceOrPunct( wchar ch )
{ {
return QChar::isSpace( ch ) || QChar::isPunct( ch ); return isWhitespace( ch ) || QChar::isPunct( ch );
} }
bool isPunct( wchar ch ) bool isPunct( wchar ch )
@ -182,14 +183,13 @@ wstring trimWhitespaceOrPunct( wstring const & in )
wstring::size_type wordSize = in.size(); wstring::size_type wordSize = in.size();
// Skip any leading whitespace // Skip any leading whitespace
while ( *wordBegin && ( Folding::isWhitespace( *wordBegin ) || Folding::isPunct( *wordBegin ) ) ) { while ( *wordBegin && Folding::isWhitespaceOrPunct( *wordBegin ) ) {
++wordBegin; ++wordBegin;
--wordSize; --wordSize;
} }
// Skip any trailing whitespace // Skip any trailing whitespace
while ( wordSize while ( wordSize && Folding::isWhitespaceOrPunct( wordBegin[ wordSize - 1 ] ) )
&& ( Folding::isWhitespace( wordBegin[ wordSize - 1 ] ) || Folding::isPunct( wordBegin[ wordSize - 1 ] ) ) )
--wordSize; --wordSize;
return wstring( wordBegin, wordSize ); return wstring( wordBegin, wordSize );

View file

@ -69,10 +69,11 @@ bool containHtmlEntity( std::string const & text );
} // namespace Html } // namespace Html
const static QRegularExpression accentMark( R"(\p{M})", QRegularExpression::UseUnicodePropertiesOption ); const static QRegularExpression accentMark( R"(\p{M})", QRegularExpression::UseUnicodePropertiesOption );
//contain unicode space mark and punctuation // Matches Unicode mark, space, invisible, and punctuation characters.
const static QRegularExpression markPuncSpace( R"([\p{M}\p{Z}\p{P}])", QRegularExpression::UseUnicodePropertiesOption ); const static QRegularExpression markPuncSpace( R"([\p{M}\p{Z}\p{C}\p{P}])",
//contain unicode space and mark. QRegularExpression::UseUnicodePropertiesOption );
const static QRegularExpression markSpace( R"([\p{M}\p{Z}])", QRegularExpression::UseUnicodePropertiesOption ); // Matches Unicode mark, space, and invisible characters.
const static QRegularExpression markSpace( R"([\p{M}\p{Z}\p{C}])", QRegularExpression::UseUnicodePropertiesOption );
const static QRegularExpression whiteSpace( "\\s+" ); const static QRegularExpression whiteSpace( "\\s+" );