mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-23 20:14:05 +00:00
opt: remove DiacriticFolding.txt (#713)
* opt: remove DiacriticFolding.txt
* 🎨 apply clang-format changes
* fix: remove foldedDiacritic method
---------
Co-authored-by: xiaoyifang <xiaoyifang@users.noreply.github.com>
This commit is contained in:
parent
d08d5fe576
commit
69ff9b1177
|
@ -301,7 +301,6 @@ HEADERS += \
|
|||
src/common/htmlescape.hh \
|
||||
src/common/iconv.hh \
|
||||
src/common/inc_case_folding.hh \
|
||||
src/common/inc_diacritic_folding.hh \
|
||||
src/common/mutex.hh \
|
||||
src/common/sptr.hh \
|
||||
src/common/ufile.hh \
|
||||
|
|
|
@ -5,12 +5,11 @@
|
|||
#include <QRegularExpression>
|
||||
|
||||
#include "utf8.hh"
|
||||
#include "wstring_qt.hh"
|
||||
#include "globalregex.hh"
|
||||
|
||||
namespace Folding {
|
||||
|
||||
#include "inc_case_folding.hh"
|
||||
#include "inc_diacritic_folding.hh"
|
||||
|
||||
/// Tests if the given char is one of the Unicode combining marks. Some are
|
||||
/// caught by the diacritics folding table, but they are only handled there
|
||||
|
@ -23,43 +22,39 @@ bool isCombiningMark( wchar ch )
|
|||
|
||||
wstring apply( wstring const & in, bool preserveWildcards )
|
||||
{
|
||||
// Normalize to NFKD, then remove combining marks (accents) and space separators.
|
||||
auto withPunc = QString::fromStdU32String( in )
|
||||
.normalized( QString::NormalizationForm_KD )
|
||||
.remove( RX::markSpace )
|
||||
.toStdU32String();
|
||||
|
||||
//First, strip diacritics and apply ws/punctuation removal
|
||||
wstring withoutDiacritics;
|
||||
|
||||
withoutDiacritics.reserve( in.size() );
|
||||
withoutDiacritics.reserve( withPunc.size() );
|
||||
|
||||
wchar const * nextChar = in.data();
|
||||
|
||||
size_t consumed;
|
||||
for ( auto const & ch : withPunc ) {
|
||||
|
||||
for(int left=in.size() ; left; )
|
||||
{
|
||||
wchar ch = foldDiacritic( nextChar, left, consumed );
|
||||
|
||||
if ( !isCombiningMark( ch ) && !isWhitespace( ch )
|
||||
&& ( !isPunct( ch )
|
||||
|| ( preserveWildcards &&
|
||||
( ch == '\\' || ch == '?' || ch == '*' || ch == '[' || ch == ']' ) )
|
||||
)
|
||||
)
|
||||
if ( !isPunct( ch )
|
||||
|| ( preserveWildcards && ( ch == '\\' || ch == '?' || ch == '*' || ch == '[' || ch == ']' ) ) ) {
|
||||
withoutDiacritics.push_back( ch );
|
||||
|
||||
nextChar += consumed;
|
||||
left -= consumed;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Now, fold the case
|
||||
|
||||
wstring caseFolded;
|
||||
|
||||
caseFolded.reserve( withoutDiacritics.size() * foldCaseMaxOut );
|
||||
|
||||
nextChar = withoutDiacritics.data();
|
||||
wchar const * nextChar = withoutDiacritics.data();
|
||||
|
||||
wchar buf[ foldCaseMaxOut ];
|
||||
|
||||
for( size_t left = withoutDiacritics.size(); left--; )
|
||||
caseFolded.append( buf, foldCase( *nextChar++, buf ) );
|
||||
for ( size_t left = withoutDiacritics.size(); left--; )
|
||||
caseFolded.append( buf, foldCase( *nextChar++, buf ) );
|
||||
|
||||
return caseFolded;
|
||||
}
|
||||
|
@ -108,26 +103,8 @@ wstring applyFullCaseOnly( wstring const & in )
|
|||
|
||||
wstring applyDiacriticsOnly( wstring const & in )
|
||||
{
|
||||
wstring withoutDiacritics;
|
||||
|
||||
withoutDiacritics.reserve( in.size() );
|
||||
|
||||
wchar const * nextChar = in.data();
|
||||
|
||||
size_t consumed;
|
||||
|
||||
for( size_t left = in.size(); left; )
|
||||
{
|
||||
wchar ch = foldDiacritic( nextChar, left, consumed );
|
||||
|
||||
if ( !isCombiningMark( ch ) )
|
||||
withoutDiacritics.push_back( ch );
|
||||
|
||||
nextChar += consumed;
|
||||
left -= consumed;
|
||||
}
|
||||
|
||||
return withoutDiacritics;
|
||||
auto noAccent = QString::fromStdU32String( in ).normalized( QString::NormalizationForm_KD ).remove( RX::accentMark );
|
||||
return noAccent.toStdU32String();
|
||||
}
|
||||
|
||||
wstring applyPunctOnly( wstring const & in )
|
||||
|
@ -277,11 +254,4 @@ QString unescapeWildcardSymbols( const QString & str )
|
|||
|
||||
return unescaped;
|
||||
}
|
||||
|
||||
wchar foldedDiacritic( wchar const * in, size_t size, size_t & consumed )
|
||||
{
|
||||
return foldDiacritic( in, size, consumed );
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -86,9 +86,6 @@ QString unescapeWildcardSymbols( QString const & );
|
|||
/// Escape all wildcard symbols (for place word to input line)
|
||||
QString escapeWildcardSymbols( QString const & );
|
||||
|
||||
/// Return result of foldDiacritic() from "inc_diacritic_folding.hh"
|
||||
wchar foldedDiacritic( wchar const * in, size_t size, size_t & consumed );
|
||||
|
||||
/// Tests if the given char is one of the Unicode combining marks.
|
||||
bool isCombiningMark( wchar ch );
|
||||
|
||||
|
|
|
@ -65,6 +65,12 @@ const static QRegularExpression emptyXmlTag(R"(<(?!(br|hr)\b)([^/ >]*)\s*/>)");
|
|||
bool containHtmlEntity( std::string const & text );
|
||||
}
|
||||
|
||||
const static QRegularExpression accentMark( R"(\p{M})" );
|
||||
// Matches Unicode marks (\p{M}), separators/spaces (\p{Z}) and punctuation (\p{P}).
|
||||
const static QRegularExpression markPuncSpace( R"([\p{M}\p{Z}\p{P}])" );
|
||||
// Matches Unicode marks (\p{M}) and separators/spaces (\p{Z}).
|
||||
const static QRegularExpression markSpace( R"([\p{M}\p{Z}])" );
|
||||
|
||||
} // namespace RX
|
||||
|
||||
#endif // GLOBALREGEX_HH
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -142,7 +142,7 @@ public:
|
|||
continue;
|
||||
}
|
||||
|
||||
gd::wchar ch = Folding::foldedDiacritic( nextChar, left, consumed );
|
||||
gd::wchar ch = *nextChar;
|
||||
|
||||
if( Folding::isCombiningMark( ch ) )
|
||||
{
|
||||
|
@ -151,16 +151,10 @@ public:
|
|||
continue;
|
||||
}
|
||||
|
||||
if( consumed > 1 )
|
||||
{
|
||||
for( size_t i = 1; i < consumed; i++ )
|
||||
accentMarkPos.append( pos );
|
||||
}
|
||||
|
||||
normText.push_back( ch );
|
||||
pos += 1;
|
||||
nextChar += consumed;
|
||||
left -= consumed;
|
||||
nextChar += 1;
|
||||
left -= 1;
|
||||
}
|
||||
normalizedString = QString::fromStdU32String( normText );
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -194,93 +194,5 @@ int main()
|
|||
fclose( outf );
|
||||
}
|
||||
|
||||
// Diacritic folding
|
||||
{
|
||||
FILE * inf = fopen( "DiacriticFolding.txt", "r" );
|
||||
|
||||
if ( !inf ) {
|
||||
fprintf( stderr, "Failed to open DiacriticFolding.txt\n" );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
char buf[ 4096 ];
|
||||
|
||||
map< char32_t, Node > forest;
|
||||
|
||||
while ( fgets( buf, sizeof( buf ), inf ) ) {
|
||||
if ( *buf == '#' || *buf == '\n' )
|
||||
continue; // A comment or a whiteline
|
||||
|
||||
unsigned long in[ 4 ], out;
|
||||
|
||||
unsigned totalIn;
|
||||
|
||||
if ( sscanf( buf, "%lx %lx %lx %lx; %lx", in, in + 1, in + 2, in + 3, &out ) == 5 ) {
|
||||
fprintf( stderr,
|
||||
"Four input chars ecountered in DiacriticFolding.txt, which we expected"
|
||||
"the file didn't have, make changes into the program.\n" );
|
||||
|
||||
return 1;
|
||||
}
|
||||
else if ( sscanf( buf, "%lx %lx %lx; %lx", in, in + 1, in + 2, &out ) == 4 )
|
||||
totalIn = 3;
|
||||
else if ( sscanf( buf, "%lx %lx; %lx", in, in + 1, &out ) == 3 )
|
||||
totalIn = 2;
|
||||
else if ( sscanf( buf, "%lx; %lx", in, &out ) == 2 )
|
||||
totalIn = 1;
|
||||
else {
|
||||
fprintf( stderr, "Erroneous input in DiacriticFolding.txt: %s\n", buf );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
map< char32_t, Node > * cur = &forest;
|
||||
|
||||
for ( unsigned x = 0; x < totalIn - 1; ++x ) {
|
||||
//printf( "%x ", in[ x ] );
|
||||
|
||||
cur = &( ( *cur )[ in[ x ] ].nodes );
|
||||
}
|
||||
|
||||
//printf( "%x\n", in[ totalIn - 1 ] );
|
||||
|
||||
if ( ( *cur )[ in[ totalIn - 1 ] ].tail ) {
|
||||
fprintf( stderr, "Ambiguity in DiacriticFolding.txt\n" );
|
||||
return 1;
|
||||
}
|
||||
|
||||
( *cur )[ in[ totalIn - 1 ] ].tail = out;
|
||||
}
|
||||
fclose( inf );
|
||||
|
||||
// Create an outfile
|
||||
|
||||
FILE * outf = fopen( "../inc_diacritic_folding.hh", "w" );
|
||||
|
||||
if ( !outf ) {
|
||||
fprintf( stderr, "Failed to create outfile\n" );
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf( outf, "// This file was generated automatically. Do not edit directly.\n\n" );
|
||||
|
||||
fprintf( outf, "enum { foldDiacriticMaxIn = 3 };\n\n" );
|
||||
fprintf( outf, "wchar foldDiacritic( wchar const * in, size_t size, size_t & consumed )\n{\n" );
|
||||
|
||||
handleForest( outf, forest, 0, 1 );
|
||||
|
||||
fprintf( outf,
|
||||
" if ( size )\n"
|
||||
" {\n"
|
||||
" consumed = 1; return *in;\n"
|
||||
" }\n"
|
||||
" consumed = 0; return 0;\n"
|
||||
"}\n" );
|
||||
|
||||
fclose( outf );
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue