opt: treat invisible character as whitespace (#1696)

* opt: remove invisible character

* opt: remove invisible character

* opt: whitespace and punct character

* [autofix.ci] apply automated fixes

* opt: whitespace and punct character

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
xiaoyifang 2024-07-17 10:44:50 +08:00 committed by GitHub
parent 521c359b24
commit 5fb4526158
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 11 additions and 10 deletions

View file

@ -154,7 +154,7 @@ wstring applyWhitespaceAndPunctOnly( wstring const & in )
out.reserve( in.size() );
for ( size_t left = in.size(); left--; ++nextChar ) {
if ( !isWhitespace( *nextChar ) && !isPunct( *nextChar ) )
if ( !isWhitespaceOrPunct( *nextChar ) )
out.push_back( *nextChar );
}
@ -163,12 +163,13 @@ wstring applyWhitespaceAndPunctOnly( wstring const & in )
/// Tells whether `ch` counts as whitespace.
///
/// A character qualifies when Qt classifies it as a space, or when Qt reports
/// it as non-printable: invisible characters are deliberately treated as
/// whitespace as well.
///
/// @param ch the character to classify.
/// @return true if `ch` is a space or is not printable, false otherwise.
bool isWhitespace( wchar ch )
{
  // Single exit point: an earlier unconditional `return QChar::isSpace( ch )`
  // must not precede this, or the non-printable check is never evaluated.
  return QChar::isSpace( ch ) || !QChar::isPrint( ch );
}
/// Tells whether `ch` is either whitespace or punctuation.
///
/// The whitespace half is delegated to isWhitespace() so that both predicates
/// always agree on what counts as whitespace; the punctuation half uses
/// QChar::isPunct().
///
/// @param ch the character to classify.
/// @return true if isWhitespace( ch ) holds or Qt classifies `ch` as punctuation.
bool isWhitespaceOrPunct( wchar ch )
{
  return isWhitespace( ch ) || QChar::isPunct( ch );
}
bool isPunct( wchar ch )
@ -182,14 +183,13 @@ wstring trimWhitespaceOrPunct( wstring const & in )
wstring::size_type wordSize = in.size();
// Skip any leading whitespace or punctuation
while ( *wordBegin && ( Folding::isWhitespace( *wordBegin ) || Folding::isPunct( *wordBegin ) ) ) {
while ( *wordBegin && Folding::isWhitespaceOrPunct( *wordBegin ) ) {
++wordBegin;
--wordSize;
}
// Skip any trailing whitespace or punctuation
while ( wordSize
&& ( Folding::isWhitespace( wordBegin[ wordSize - 1 ] ) || Folding::isPunct( wordBegin[ wordSize - 1 ] ) ) )
while ( wordSize && Folding::isWhitespaceOrPunct( wordBegin[ wordSize - 1 ] ) )
--wordSize;
return wstring( wordBegin, wordSize );

View file

@ -69,10 +69,11 @@ bool containHtmlEntity( std::string const & text );
} // namespace Html
// Matches a single Unicode mark character (\p{M}).
const static QRegularExpression accentMark( R"(\p{M})", QRegularExpression::UseUnicodePropertiesOption );
// Matches a single character that is a Unicode mark (\p{M}), separator (\p{Z}),
// "other" i.e. invisible character (\p{C}), or punctuation (\p{P}).
const static QRegularExpression markPuncSpace( R"([\p{M}\p{Z}\p{C}\p{P}])",
                                               QRegularExpression::UseUnicodePropertiesOption );
// Matches a single character that is a Unicode mark (\p{M}), separator (\p{Z}),
// or "other" i.e. invisible character (\p{C}). Punctuation is not included.
const static QRegularExpression markSpace( R"([\p{M}\p{Z}\p{C}])", QRegularExpression::UseUnicodePropertiesOption );
// Matches a run of one or more characters matched by \s.
const static QRegularExpression whiteSpace( "\\s+" );