opt: remove DiacriticFolding.txt (#713)

* opt: remove DiacriticFolding.txt

* 🎨 apply clang-format changes

* fix: remove foldedDiacritic method

---------

Co-authored-by: xiaoyifang <xiaoyifang@users.noreply.github.com>
This commit is contained in:
xiaoyifang 2023-05-20 10:28:43 +08:00 committed by GitHub
parent d08d5fe576
commit 69ff9b1177
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 27 additions and 6723 deletions

View file

@ -301,7 +301,6 @@ HEADERS += \
src/common/htmlescape.hh \
src/common/iconv.hh \
src/common/inc_case_folding.hh \
src/common/inc_diacritic_folding.hh \
src/common/mutex.hh \
src/common/sptr.hh \
src/common/ufile.hh \

View file

@ -5,12 +5,11 @@
#include <QRegularExpression>
#include "utf8.hh"
#include "wstring_qt.hh"
#include "globalregex.hh"
namespace Folding {
#include "inc_case_folding.hh"
#include "inc_diacritic_folding.hh"
/// Tests if the given char is one of the Unicode combining marks. Some are
/// caught by the diacritics folding table, but they are only handled there
@ -23,43 +22,39 @@ bool isCombiningMark( wchar ch )
wstring apply( wstring const & in, bool preserveWildcards )
{
//remove space and accent;
auto withPunc = QString::fromStdU32String( in )
.normalized( QString::NormalizationForm_KD )
.remove( RX::markSpace )
.toStdU32String();
//First, strip diacritics and apply ws/punctuation removal
wstring withoutDiacritics;
withoutDiacritics.reserve( in.size() );
withoutDiacritics.reserve( withPunc.size() );
wchar const * nextChar = in.data();
size_t consumed;
for ( auto const & ch : withPunc ) {
for(int left=in.size() ; left; )
{
wchar ch = foldDiacritic( nextChar, left, consumed );
if ( !isCombiningMark( ch ) && !isWhitespace( ch )
&& ( !isPunct( ch )
|| ( preserveWildcards &&
( ch == '\\' || ch == '?' || ch == '*' || ch == '[' || ch == ']' ) )
)
)
if ( !isPunct( ch )
|| ( preserveWildcards && ( ch == '\\' || ch == '?' || ch == '*' || ch == '[' || ch == ']' ) ) ) {
withoutDiacritics.push_back( ch );
nextChar += consumed;
left -= consumed;
}
}
// Now, fold the case
wstring caseFolded;
caseFolded.reserve( withoutDiacritics.size() * foldCaseMaxOut );
nextChar = withoutDiacritics.data();
wchar const * nextChar = withoutDiacritics.data();
wchar buf[ foldCaseMaxOut ];
for( size_t left = withoutDiacritics.size(); left--; )
caseFolded.append( buf, foldCase( *nextChar++, buf ) );
for ( size_t left = withoutDiacritics.size(); left--; )
caseFolded.append( buf, foldCase( *nextChar++, buf ) );
return caseFolded;
}
@ -108,26 +103,8 @@ wstring applyFullCaseOnly( wstring const & in )
/// Strips diacritics from `in` and returns the resulting string.
///
/// NOTE(review): this span holds two implementations back to back: a loop
/// driven by foldDiacritic(), then a Qt normalization path. As written,
/// everything after the first `return` is unreachable. This looks like the
/// before/after sides of a change shown together; confirm which body is
/// meant to remain.
wstring applyDiacriticsOnly( wstring const & in )
{
wstring withoutDiacritics;
withoutDiacritics.reserve( in.size() );
wchar const * nextChar = in.data();
size_t consumed;
// Walk the input; foldDiacritic() reports through `consumed` how many input
// characters it used, and the cursor/remaining count advance by that amount.
for( size_t left = in.size(); left; )
{
wchar ch = foldDiacritic( nextChar, left, consumed );
// Keep the folded character unless isCombiningMark() flags it.
if ( !isCombiningMark( ch ) )
withoutDiacritics.push_back( ch );
nextChar += consumed;
left -= consumed;
}
return withoutDiacritics;
// Second path: normalize to form KD, then delete every match of
// RX::accentMark from the normalized text.
auto noAccent = QString::fromStdU32String( in ).normalized( QString::NormalizationForm_KD ).remove( RX::accentMark );
return noAccent.toStdU32String();
}
wstring applyPunctOnly( wstring const & in )
@ -277,11 +254,4 @@ QString unescapeWildcardSymbols( const QString & str )
return unescaped;
}
/// Forwards all three arguments to foldDiacritic() and returns its result.
/// `consumed` is passed through by reference, so whatever foldDiacritic()
/// stores in it is what the caller sees.
wchar foldedDiacritic( wchar const * in, size_t size, size_t & consumed )
{
return foldDiacritic( in, size, consumed );
}
}

View file

@ -86,9 +86,6 @@ QString unescapeWildcardSymbols( QString const & );
/// Escape all wildcard symbols (for place word to input line)
QString escapeWildcardSymbols( QString const & );
/// Return result of foldDiacritic() from "inc_diacritic_folding.hh"
wchar foldedDiacritic( wchar const * in, size_t size, size_t & consumed );
/// Tests if the given char is one of the Unicode combining marks.
bool isCombiningMark( wchar ch );

View file

@ -65,6 +65,12 @@ const static QRegularExpression emptyXmlTag(R"(<(?!(br|hr)\b)([^/ >]*)\s*/>)");
bool containHtmlEntity( std::string const & text );
}
// Matches a single character in the Unicode Mark category (\p{M}).
const static QRegularExpression accentMark( R"(\p{M})" );
// Matches a single character that is a Unicode mark (\p{M}), separator
// (\p{Z}) or punctuation (\p{P}).
const static QRegularExpression markPuncSpace( R"([\p{M}\p{Z}\p{P}])" );
// Matches a single character that is a Unicode mark (\p{M}) or separator
// (\p{Z}); punctuation is not included.
const static QRegularExpression markSpace( R"([\p{M}\p{Z}])" );
} // namespace RX
#endif // GLOBALREGEX_HH

File diff suppressed because it is too large Load diff

View file

@ -142,7 +142,7 @@ public:
continue;
}
gd::wchar ch = Folding::foldedDiacritic( nextChar, left, consumed );
gd::wchar ch = *nextChar;
if( Folding::isCombiningMark( ch ) )
{
@ -151,16 +151,10 @@ public:
continue;
}
if( consumed > 1 )
{
for( size_t i = 1; i < consumed; i++ )
accentMarkPos.append( pos );
}
normText.push_back( ch );
pos += 1;
nextChar += consumed;
left -= consumed;
nextChar += 1;
left -= 1;
}
normalizedString = QString::fromStdU32String( normText );
}

File diff suppressed because it is too large Load diff

View file

@ -194,93 +194,5 @@ int main()
fclose( outf );
}
// Diacritic folding
{
FILE * inf = fopen( "DiacriticFolding.txt", "r" );
if ( !inf ) {
fprintf( stderr, "Failed to open DiacriticFolding.txt\n" );
return 1;
}
char buf[ 4096 ];
map< char32_t, Node > forest;
while ( fgets( buf, sizeof( buf ), inf ) ) {
if ( *buf == '#' || *buf == '\n' )
continue; // A comment or a whiteline
unsigned long in[ 4 ], out;
unsigned totalIn;
if ( sscanf( buf, "%lx %lx %lx %lx; %lx", in, in + 1, in + 2, in + 3, &out ) == 5 ) {
fprintf( stderr,
"Four input chars ecountered in DiacriticFolding.txt, which we expected"
"the file didn't have, make changes into the program.\n" );
return 1;
}
else if ( sscanf( buf, "%lx %lx %lx; %lx", in, in + 1, in + 2, &out ) == 4 )
totalIn = 3;
else if ( sscanf( buf, "%lx %lx; %lx", in, in + 1, &out ) == 3 )
totalIn = 2;
else if ( sscanf( buf, "%lx; %lx", in, &out ) == 2 )
totalIn = 1;
else {
fprintf( stderr, "Erroneous input in DiacriticFolding.txt: %s\n", buf );
return 1;
}
map< char32_t, Node > * cur = &forest;
for ( unsigned x = 0; x < totalIn - 1; ++x ) {
//printf( "%x ", in[ x ] );
cur = &( ( *cur )[ in[ x ] ].nodes );
}
//printf( "%x\n", in[ totalIn - 1 ] );
if ( ( *cur )[ in[ totalIn - 1 ] ].tail ) {
fprintf( stderr, "Ambiguity in DiacriticFolding.txt\n" );
return 1;
}
( *cur )[ in[ totalIn - 1 ] ].tail = out;
}
fclose( inf );
// Create an outfile
FILE * outf = fopen( "../inc_diacritic_folding.hh", "w" );
if ( !outf ) {
fprintf( stderr, "Failed to create outfile\n" );
return 1;
}
fprintf( outf, "// This file was generated automatically. Do not edit directly.\n\n" );
fprintf( outf, "enum { foldDiacriticMaxIn = 3 };\n\n" );
fprintf( outf, "wchar foldDiacritic( wchar const * in, size_t size, size_t & consumed )\n{\n" );
handleForest( outf, forest, 0, 1 );
fprintf( outf,
" if ( size )\n"
" {\n"
" consumed = 1; return *in;\n"
" }\n"
" consumed = 0; return 0;\n"
"}\n" );
fclose( outf );
}
return 0;
}