From 27c4bf7d30650f55cde09a9713d9e42e07e7343c Mon Sep 17 00:00:00 2001 From: Tvangeste Date: Sat, 6 Jul 2013 20:18:43 +0200 Subject: [PATCH] Properly handle non-normalized Unicode headwords With this change users should be able to search headwords in any form. For example: U+03B5 GREEK SMALL LETTER EPSILON followed by U+0301 COMBINING ACUTE ACCENT are considered equal to U+03AD GREEK SMALL LETTER EPSILON WITH TONOS. No matter in which form the headword is provided in the dictionary, users will be able to find it, even when searching with a different form. --- btreeidx.cc | 5 +++-- dsl.cc | 2 +- wstring_qt.cc | 6 ++++++ wstring_qt.hh | 1 + 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/btreeidx.cc b/btreeidx.cc index 5806ba20..25e9ebde 100644 --- a/btreeidx.cc +++ b/btreeidx.cc @@ -11,6 +11,7 @@ #include #include #include "dprintf.hh" +#include "wstring_qt.hh" //#define __BTREE_USE_LZO // LZO mode is experimental and unsupported. Tests didn't show any substantial @@ -710,13 +711,13 @@ vector< WordArticleLink > BtreeIndex::readChain( char const * & ptr ) void BtreeIndex::antialias( wstring const & str, vector< WordArticleLink > & chain ) { - wstring caseFolded = Folding::applySimpleCaseOnly( str ); + wstring caseFolded = Folding::applySimpleCaseOnly( gd::normalize( str ) ); for( unsigned x = chain.size(); x--; ) { // If after applying case folding to each word they wouldn't match, we // drop the entry. 
- if ( Folding::applySimpleCaseOnly( Utf8::decode( chain[ x ].prefix + chain[ x ].word ) ) != + if ( Folding::applySimpleCaseOnly( gd::normalize( Utf8::decode( chain[ x ].prefix + chain[ x ].word ) ) ) != caseFolded ) chain.erase( chain.begin() + x ); else diff --git a/dsl.cc b/dsl.cc index de1d5227..4c3e5ff8 100644 --- a/dsl.cc +++ b/dsl.cc @@ -668,7 +668,7 @@ void DslDictionary::loadArticle( uint32_t address, string DslDictionary::dslToHtml( wstring const & str ) { // Normalize the string - wstring normalizedStr = gd::toWString( gd::toQString( str ).normalized( QString::NormalizationForm_C ) ); + wstring normalizedStr = gd::normalize( str ); ArticleDom dom( normalizedStr ); diff --git a/wstring_qt.cc b/wstring_qt.cc index 75daf08d..2dfdb9c5 100644 --- a/wstring_qt.cc +++ b/wstring_qt.cc @@ -33,4 +33,10 @@ namespace gd return wstring( ( const wchar * ) v.constData(), v.size() ); } + + wstring normalize( const wstring & str ) + { + return gd::toWString( gd::toQString( str ).normalized( QString::NormalizationForm_C ) ); + } + } diff --git a/wstring_qt.hh b/wstring_qt.hh index 537351cc..96b6160d 100644 --- a/wstring_qt.hh +++ b/wstring_qt.hh @@ -14,6 +14,7 @@ namespace gd { QString toQString( wstring const & ); wstring toWString( QString const & ); + wstring normalize( wstring const & ); } #endif