Properly handle non-normalized unicode headwords

With this change, users should be able to search for headwords in
any Unicode normalization form. For example:

U+03B5 GREEK SMALL LETTER EPSILON and U+0301 COMBINING ACUTE ACCENT

is considered equal to

U+03AD GREEK SMALL LETTER EPSILON WITH TONOS

No matter which form a headword takes in the dictionary, users will be able to find it,
even when they search using a different form.
This commit is contained in:
Tvangeste 2013-07-06 20:18:43 +02:00
parent 89755f8c09
commit 27c4bf7d30
4 changed files with 11 additions and 3 deletions

View file

@ -11,6 +11,7 @@
#include <string.h> #include <string.h>
#include <stdlib.h> #include <stdlib.h>
#include "dprintf.hh" #include "dprintf.hh"
#include "wstring_qt.hh"
//#define __BTREE_USE_LZO //#define __BTREE_USE_LZO
// LZO mode is experimental and unsupported. Tests didn't show any substantial // LZO mode is experimental and unsupported. Tests didn't show any substantial
@ -710,13 +711,13 @@ vector< WordArticleLink > BtreeIndex::readChain( char const * & ptr )
void BtreeIndex::antialias( wstring const & str, void BtreeIndex::antialias( wstring const & str,
vector< WordArticleLink > & chain ) vector< WordArticleLink > & chain )
{ {
wstring caseFolded = Folding::applySimpleCaseOnly( str ); wstring caseFolded = Folding::applySimpleCaseOnly( gd::normalize( str ) );
for( unsigned x = chain.size(); x--; ) for( unsigned x = chain.size(); x--; )
{ {
// If after applying case folding to each word they wouldn't match, we // If after applying case folding to each word they wouldn't match, we
// drop the entry. // drop the entry.
if ( Folding::applySimpleCaseOnly( Utf8::decode( chain[ x ].prefix + chain[ x ].word ) ) != if ( Folding::applySimpleCaseOnly( gd::normalize( Utf8::decode( chain[ x ].prefix + chain[ x ].word ) ) ) !=
caseFolded ) caseFolded )
chain.erase( chain.begin() + x ); chain.erase( chain.begin() + x );
else else

2
dsl.cc
View file

@ -668,7 +668,7 @@ void DslDictionary::loadArticle( uint32_t address,
string DslDictionary::dslToHtml( wstring const & str ) string DslDictionary::dslToHtml( wstring const & str )
{ {
// Normalize the string // Normalize the string
wstring normalizedStr = gd::toWString( gd::toQString( str ).normalized( QString::NormalizationForm_C ) ); wstring normalizedStr = gd::normalize( str );
ArticleDom dom( normalizedStr ); ArticleDom dom( normalizedStr );

View file

@ -33,4 +33,10 @@ namespace gd
return wstring( ( const wchar * ) v.constData(), v.size() ); return wstring( ( const wchar * ) v.constData(), v.size() );
} }
// Returns a copy of str converted to Unicode Normalization Form C.
// The conversion is delegated to QString::normalized(), so the text is
// round-tripped through QString using the helpers in this namespace.
wstring normalize( const wstring & str )
{
QString const composed = gd::toQString( str ).normalized( QString::NormalizationForm_C );
return gd::toWString( composed );
}
} }

View file

@ -14,6 +14,7 @@ namespace gd
{ {
QString toQString( wstring const & ); QString toQString( wstring const & );
wstring toWString( QString const & ); wstring toWString( QString const & );
wstring normalize( wstring const & );
} }
#endif #endif