2010-12-09 12:31:50 +00:00
|
|
|
/* This file is (c) 2008-2011 Konstantin Isakov <ikm@goldendict.org>
|
2009-01-28 20:55:45 +00:00
|
|
|
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
|
|
|
|
|
|
|
#ifndef __FOLDING_HH_INCLUDED__
|
|
|
|
#define __FOLDING_HH_INCLUDED__
|
|
|
|
|
2009-04-18 17:20:12 +00:00
|
|
|
#include "wstring.hh"
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
/// Folding provides means to translate several possible ways to write a
|
|
|
|
/// symbol into one. This facilitates searching. Here we currently perform
|
|
|
|
/// full case folding (everything gets translated to lowercase, ligatures
|
2009-04-08 16:02:12 +00:00
|
|
|
/// and complex letters are decomposed), diacritics folding (all diacritic
|
|
|
|
/// marks get removed) and whitespace/punctuation marks removal. These
|
|
|
|
/// transforms are done according to the Unicode standard and/or drafts. The
|
|
|
|
/// exact algorithms, lists and types of folding performed might get changed
|
|
|
|
/// in the future -- in this case, the Version field will be bumped up.
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
namespace Folding {
|
|
|
|
|
2009-04-18 17:20:12 +00:00
|
|
|
using gd::wstring;
|
|
|
|
using gd::wchar;
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
/// Version of the folding algorithm. As the namespace overview states, this
/// value is bumped up whenever the exact algorithms, lists or types of folding
/// performed get changed.
/// NOTE(review): presumably callers compare it against a stored value to detect
/// data folded by an older version -- confirm against the users of Version.
|
|
|
|
enum
|
|
|
|
{
|
2010-03-29 19:01:48 +00:00
|
|
|
Version = 5
|
2009-01-28 20:55:45 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/// Applies the folding algorithm to each character in the given string and
|
|
|
|
/// returns the folded text as a new string; the argument is taken by const
/// reference.
|
|
|
|
wstring apply( wstring const & );
|
|
|
|
|
|
|
|
/// Applies only the simple case folding algorithm. Since many dictionaries use
|
2009-01-29 16:37:29 +00:00
|
|
|
/// different case styles, words differing only by case are interpreted as
/// synonyms. Returns the folded text as a new string.
|
2009-01-28 20:55:45 +00:00
|
|
|
wstring applySimpleCaseOnly( wstring const & );
|
|
|
|
|
2009-04-08 21:22:50 +00:00
|
|
|
/// Applies only the full case folding algorithm. This includes simple case
|
|
|
|
/// folding, but also decomposes ligatures and complex letters. Returns the
/// folded text as a new string.
|
|
|
|
wstring applyFullCaseOnly( wstring const & );
|
|
|
|
|
2009-04-08 16:02:12 +00:00
|
|
|
/// Applies only the diacritics folding algorithm (described in the namespace
/// overview as removing all diacritic marks) and returns the result as a new
/// string.
|
|
|
|
wstring applyDiacriticsOnly( wstring const & );
|
|
|
|
|
|
|
|
/// Applies only the punctuation folding algorithm and returns the result as a
/// new string.
/// NOTE(review): presumably this strips punctuation marks, matching the
/// "whitespace/punctuation marks removal" of the namespace overview -- confirm
/// against the implementation.
|
|
|
|
wstring applyPunctOnly( wstring const & );
|
|
|
|
|
|
|
|
/// Applies only the whitespace folding algorithm and returns the result as a
/// new string.
/// NOTE(review): presumably this strips whitespace, matching the
/// "whitespace/punctuation marks removal" of the namespace overview -- confirm
/// against the implementation.
|
|
|
|
wstring applyWhitespaceOnly( wstring const & );
|
|
|
|
|
|
|
|
/// Applies only the combined whitespace and punctuation folding algorithm and
/// returns the result as a new string.
|
|
|
|
wstring applyWhitespaceAndPunctOnly( wstring const & );
|
|
|
|
|
|
|
|
/// Returns true if the given character is any form of whitespace, false
|
|
|
|
/// otherwise. Whitespace corresponds to the Zl/Zp/Zs Unicode classes, and
|
|
|
|
/// also includes \n, \r and \t.
/// ch is the single character to classify.
|
2009-04-18 17:20:12 +00:00
|
|
|
bool isWhitespace( wchar ch );
|
2009-04-08 16:02:12 +00:00
|
|
|
|
|
|
|
/// Returns true if the given character is any form of punctuation, false
|
|
|
|
/// otherwise. Punctuation corresponds to the Pc/Pd/Pe/Pf/Pi/Po/Ps Unicode
/// classes. ch is the single character to classify.
|
2009-04-18 17:20:12 +00:00
|
|
|
bool isPunct( wchar ch );
|
2009-04-08 16:02:12 +00:00
|
|
|
|
2009-04-08 21:22:50 +00:00
|
|
|
/// Removes any whitespace or punctuation from the beginning and the end of
|
|
|
|
/// the word. The trimmed word is returned as a new string; the argument is
/// taken by const reference.
|
|
|
|
wstring trimWhitespaceOrPunct( wstring const & );
|
|
|
|
|
2009-05-10 15:44:21 +00:00
|
|
|
/// Removes any whitespace from the beginning and the end of
|
|
|
|
/// the word. The trimmed word is returned as a new string; the argument is
/// taken by const reference.
|
|
|
|
wstring trimWhitespace( wstring const & );
|
|
|
|
|
2010-03-30 13:41:14 +00:00
|
|
|
/// Turns any sequence of consecutive whitespace into a single basic space.
/// The string is passed by non-const reference and nothing is returned, so the
/// change is made to the caller's string in place.
|
|
|
|
void normalizeWhitespace( wstring & );
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
/// Same as apply( wstring ), but without any heap operations, therefore
|
|
|
|
/// preferable when there are many strings to process. Returns -1 if the
|
|
|
|
/// operation succeeded, or otherwise the minimum value of outSize required
|
|
|
|
/// to succeed.
|
|
|
|
/// Currently commented out, consider implementing it in case indices'
|
|
|
|
/// generation would be too slow.
|
2009-04-18 17:20:12 +00:00
|
|
|
//ssize_t apply( wchar const * in, wchar * out, size_t outSize );
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|