goldendict-ng/folding.cc
Konstantin Isakov 953d9237a3 Update copyright to span to 2011.
Better early than late, this year is ending anyway.
2010-11-14 18:47:03 +03:00

663 lines
23 KiB
C++

/* This file is (c) 2008-2011 Konstantin Isakov <ikm@users.berlios.de>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "folding.hh"
namespace Folding {
namespace
{
#include "inc_case_folding.hh"
#include "inc_diacritic_folding.hh"
/// Reports whether the given char falls into one of the Unicode combining
/// mark ranges checked below. The diacritics folding table catches some of
/// these marks, but only when they accompany their main characters; marks
/// that appear by themselves are caught here instead.
bool isCombiningMark( wchar ch )
{
  // U+0300..U+036F
  if ( ch >= 0x300 && ch <= 0x36F )
    return true;

  // U+1DC0..U+1DFF
  if ( ch >= 0x1DC0 && ch <= 0x1DFF )
    return true;

  // U+20D0..U+20FF
  if ( ch >= 0x20D0 && ch <= 0x20FF )
    return true;

  // U+FE20..U+FE2F
  return ch >= 0xFE20 && ch <= 0xFE2F;
}
}
/// Full folding: folds diacritics, drops combining marks, whitespace and
/// punctuation, and finally applies case folding to what is left.
wstring apply( wstring const & in )
{
  // Pass 1: fold diacritics and filter out marks, whitespace and punctuation
  wstring stripped;
  stripped.reserve( in.size() );

  wchar const * cursor = in.data();
  size_t remaining = in.size();

  while( remaining )
  {
    // foldDiacritic() reports through 'used' how many input chars it took
    size_t used;
    wchar const folded = foldDiacritic( cursor, remaining, used );

    cursor += used;
    remaining -= used;

    if ( isCombiningMark( folded ) || isWhitespace( folded ) || isPunct( folded ) )
      continue;

    stripped.push_back( folded );
  }

  // Pass 2: case folding. A single char may expand to up to foldCaseMaxOut
  // chars, hence the scratch buffer and the generous reserve.
  wstring result;
  result.reserve( stripped.size() * foldCaseMaxOut );

  wchar scratch[ foldCaseMaxOut ];

  for( wstring::const_iterator it = stripped.begin(); it != stripped.end(); ++it )
    result.append( scratch, foldCase( *it, scratch ) );

  return result;
}
/// Maps every char of the input through foldCaseSimple(), producing an output
/// of exactly the same length.
wstring applySimpleCaseOnly( wstring const & in )
{
  wstring folded;
  folded.reserve( in.size() );

  for( wstring::const_iterator it = in.begin(); it != in.end(); ++it )
    folded.push_back( foldCaseSimple( *it ) );

  return folded;
}
/// Applies full case folding only. Each input char is passed to foldCase(),
/// which fills a scratch buffer of foldCaseMaxOut chars and returns how many
/// of them to append to the result.
wstring applyFullCaseOnly( wstring const & in )
{
  wstring result;
  result.reserve( in.size() * foldCaseMaxOut );

  wchar scratch[ foldCaseMaxOut ];

  for( wstring::const_iterator it = in.begin(); it != in.end(); ++it )
    result.append( scratch, foldCase( *it, scratch ) );

  return result;
}
/// Folds diacritics and drops combining marks, leaving case, whitespace and
/// punctuation untouched.
wstring applyDiacriticsOnly( wstring const & in )
{
  wstring stripped;
  stripped.reserve( in.size() );

  wchar const * cursor = in.data();
  size_t remaining = in.size();

  while( remaining )
  {
    // foldDiacritic() reports through 'used' how many input chars it took
    size_t used;
    wchar const folded = foldDiacritic( cursor, remaining, used );

    cursor += used;
    remaining -= used;

    if ( isCombiningMark( folded ) )
      continue;

    stripped.push_back( folded );
  }

  return stripped;
}
/// Returns a copy of the input with every char for which isPunct() holds
/// removed.
wstring applyPunctOnly( wstring const & in )
{
  wstring kept;
  kept.reserve( in.size() );

  for( wstring::size_type i = 0; i < in.size(); ++i )
  {
    wchar const current = in[ i ];

    if ( isPunct( current ) )
      continue;

    kept.push_back( current );
  }

  return kept;
}
/// Returns a copy of the input with every char for which isWhitespace() holds
/// removed.
wstring applyWhitespaceOnly( wstring const & in )
{
  wstring kept;
  kept.reserve( in.size() );

  for( wstring::size_type i = 0; i < in.size(); ++i )
  {
    wchar const current = in[ i ];

    if ( isWhitespace( current ) )
      continue;

    kept.push_back( current );
  }

  return kept;
}
/// Returns a copy of the input with every whitespace and punctuation char
/// (as judged by isWhitespace() and isPunct()) removed.
wstring applyWhitespaceAndPunctOnly( wstring const & in )
{
  wstring kept;
  kept.reserve( in.size() );

  for( wstring::size_type i = 0; i < in.size(); ++i )
  {
    wchar const current = in[ i ];

    if ( isWhitespace( current ) || isPunct( current ) )
      continue;

    kept.push_back( current );
  }

  return kept;
}
/// Tells whether the given char counts as whitespace: line feed, carriage
/// return, tab, or one of the Unicode separator chars enumerated below.
bool isWhitespace( wchar ch )
{
  // ASCII line feed, carriage return and tab
  if ( ch == '\n' || ch == '\r' || ch == '\t' )
    return true;

  // Zs, EN QUAD through HAIR SPACE: EN QUAD, EM QUAD, EN SPACE, EM SPACE,
  // THREE-PER-EM SPACE, FOUR-PER-EM SPACE, SIX-PER-EM SPACE, FIGURE SPACE,
  // PUNCTUATION SPACE, THIN SPACE, HAIR SPACE
  if ( ch >= 0x2000 && ch <= 0x200A )
    return true;

  // Zl, LINE SEPARATOR and Zp, PARAGRAPH SEPARATOR
  if ( ch == 0x2028 || ch == 0x2029 )
    return true;

  return ch == 0x0020 || // Zs, SPACE
         ch == 0x00A0 || // Zs, NO-BREAK SPACE
         ch == 0x1680 || // Zs, OGHAM SPACE MARK
         ch == 0x180E || // Zs, MONGOLIAN VOWEL SEPARATOR
         ch == 0x202F || // Zs, NARROW NO-BREAK SPACE
         ch == 0x205F || // Zs, MEDIUM MATHEMATICAL SPACE
         ch == 0x3000;   // Zs, IDEOGRAPHIC SPACE
}
/// Tells whether the given char is one of the punctuation chars enumerated
/// below. The list is grouped by Unicode general category: Pc (connector),
/// Pd (dash), Ps (open), Pe (close), Pf (final quote), Pi (initial quote) and
/// Po (other).
///
/// The Pe group includes the closing partners of the Ps brackets U+27C5,
/// U+27EC, U+27EE, U+2E22, U+2E24, U+2E26, U+2E28 and U+FE17, so that both
/// halves of such a bracket pair are treated alike.
bool isPunct( wchar ch )
{
  switch( ch )
  {
    // Pc
    case 0x005F: // LOW LINE
    case 0x203F: // UNDERTIE
    case 0x2040: // CHARACTER TIE
    case 0x2054: // INVERTED UNDERTIE
    case 0x30FB: // KATAKANA MIDDLE DOT
    case 0xFE33: // PRESENTATION FORM FOR VERTICAL LOW LINE
    case 0xFE34: // PRESENTATION FORM FOR VERTICAL WAVY LOW LINE
    case 0xFE4D: // DASHED LOW LINE
    case 0xFE4E: // CENTRELINE LOW LINE
    case 0xFE4F: // WAVY LOW LINE
    case 0xFF3F: // FULLWIDTH LOW LINE
    case 0xFF65: // HALFWIDTH KATAKANA MIDDLE DOT
    // Pd
    case 0x002D: // HYPHEN-MINUS
    case 0x058A: // ARMENIAN HYPHEN
    case 0x1806: // MONGOLIAN TODO SOFT HYPHEN
    case 0x2010: // HYPHEN
    case 0x2011: // NON-BREAKING HYPHEN
    case 0x2012: // FIGURE DASH
    case 0x2013: // EN DASH
    case 0x2014: // EM DASH
    case 0x2015: // HORIZONTAL BAR
    case 0x301C: // WAVE DASH
    case 0x3030: // WAVY DASH
    case 0x30A0: // KATAKANA-HIRAGANA DOUBLE HYPHEN
    case 0xFE31: // PRESENTATION FORM FOR VERTICAL EM DASH
    case 0xFE32: // PRESENTATION FORM FOR VERTICAL EN DASH
    case 0xFE58: // SMALL EM DASH
    case 0xFE63: // SMALL HYPHEN-MINUS
    case 0xFF0D: // FULLWIDTH HYPHEN-MINUS
    // Ps
    case 0x0028: // LEFT PARENTHESIS
    case 0x005B: // LEFT SQUARE BRACKET
    case 0x007B: // LEFT CURLY BRACKET
    case 0x0F3A: // TIBETAN MARK GUG RTAGS GYON
    case 0x0F3C: // TIBETAN MARK ANG KHANG GYON
    case 0x169B: // OGHAM FEATHER MARK
    case 0x201A: // SINGLE LOW-9 QUOTATION MARK
    case 0x201E: // DOUBLE LOW-9 QUOTATION MARK
    case 0x2045: // LEFT SQUARE BRACKET WITH QUILL
    case 0x207D: // SUPERSCRIPT LEFT PARENTHESIS
    case 0x208D: // SUBSCRIPT LEFT PARENTHESIS
    case 0x2329: // LEFT-POINTING ANGLE BRACKET
    case 0x2768: // MEDIUM LEFT PARENTHESIS ORNAMENT
    case 0x276A: // MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
    case 0x276C: // MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT
    case 0x276E: // HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT
    case 0x2770: // HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT
    case 0x2772: // LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT
    case 0x2774: // MEDIUM LEFT CURLY BRACKET ORNAMENT
    case 0x27C5: // LEFT S-SHAPED BAG DELIMITER
    case 0x27E6: // MATHEMATICAL LEFT WHITE SQUARE BRACKET
    case 0x27E8: // MATHEMATICAL LEFT ANGLE BRACKET
    case 0x27EA: // MATHEMATICAL LEFT DOUBLE ANGLE BRACKET
    case 0x27EC: // MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET
    case 0x27EE: // MATHEMATICAL LEFT FLATTENED PARENTHESIS
    case 0x2983: // LEFT WHITE CURLY BRACKET
    case 0x2985: // LEFT WHITE PARENTHESIS
    case 0x2987: // Z NOTATION LEFT IMAGE BRACKET
    case 0x2989: // Z NOTATION LEFT BINDING BRACKET
    case 0x298B: // LEFT SQUARE BRACKET WITH UNDERBAR
    case 0x298D: // LEFT SQUARE BRACKET WITH TICK IN TOP CORNER
    case 0x298F: // LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
    case 0x2991: // LEFT ANGLE BRACKET WITH DOT
    case 0x2993: // LEFT ARC LESS-THAN BRACKET
    case 0x2995: // DOUBLE LEFT ARC GREATER-THAN BRACKET
    case 0x2997: // LEFT BLACK TORTOISE SHELL BRACKET
    case 0x29D8: // LEFT WIGGLY FENCE
    case 0x29DA: // LEFT DOUBLE WIGGLY FENCE
    case 0x29FC: // LEFT-POINTING CURVED ANGLE BRACKET
    case 0x2E22: // TOP LEFT HALF BRACKET
    case 0x2E24: // BOTTOM LEFT HALF BRACKET
    case 0x2E26: // LEFT SIDEWAYS U BRACKET
    case 0x2E28: // LEFT DOUBLE PARENTHESIS
    case 0x3008: // LEFT ANGLE BRACKET
    case 0x300A: // LEFT DOUBLE ANGLE BRACKET
    case 0x300C: // LEFT CORNER BRACKET
    case 0x300E: // LEFT WHITE CORNER BRACKET
    case 0x3010: // LEFT BLACK LENTICULAR BRACKET
    case 0x3014: // LEFT TORTOISE SHELL BRACKET
    case 0x3016: // LEFT WHITE LENTICULAR BRACKET
    case 0x3018: // LEFT WHITE TORTOISE SHELL BRACKET
    case 0x301A: // LEFT WHITE SQUARE BRACKET
    case 0x301D: // REVERSED DOUBLE PRIME QUOTATION MARK
    case 0xFD3E: // ORNATE LEFT PARENTHESIS
    case 0xFE17: // PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET
    case 0xFE35: // PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
    case 0xFE37: // PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET
    case 0xFE39: // PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET
    case 0xFE3B: // PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET
    case 0xFE3D: // PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET
    case 0xFE3F: // PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET
    case 0xFE41: // PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
    case 0xFE43: // PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
    case 0xFE47: // PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET
    case 0xFE59: // SMALL LEFT PARENTHESIS
    case 0xFE5B: // SMALL LEFT CURLY BRACKET
    case 0xFE5D: // SMALL LEFT TORTOISE SHELL BRACKET
    case 0xFF08: // FULLWIDTH LEFT PARENTHESIS
    case 0xFF3B: // FULLWIDTH LEFT SQUARE BRACKET
    case 0xFF5B: // FULLWIDTH LEFT CURLY BRACKET
    case 0xFF5F: // FULLWIDTH LEFT WHITE PARENTHESIS
    case 0xFF62: // HALFWIDTH LEFT CORNER BRACKET
    // Pe
    case 0x0029: // RIGHT PARENTHESIS
    case 0x005D: // RIGHT SQUARE BRACKET
    case 0x007D: // RIGHT CURLY BRACKET
    case 0x0F3B: // TIBETAN MARK GUG RTAGS GYAS
    case 0x0F3D: // TIBETAN MARK ANG KHANG GYAS
    case 0x169C: // OGHAM REVERSED FEATHER MARK
    case 0x2046: // RIGHT SQUARE BRACKET WITH QUILL
    case 0x207E: // SUPERSCRIPT RIGHT PARENTHESIS
    case 0x208E: // SUBSCRIPT RIGHT PARENTHESIS
    case 0x232A: // RIGHT-POINTING ANGLE BRACKET
    case 0x23B5: // BOTTOM SQUARE BRACKET
    case 0x2769: // MEDIUM RIGHT PARENTHESIS ORNAMENT
    case 0x276B: // MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT
    case 0x276D: // MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT
    case 0x276F: // HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT
    case 0x2771: // HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT
    case 0x2773: // LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT
    case 0x2775: // MEDIUM RIGHT CURLY BRACKET ORNAMENT
    case 0x27C6: // RIGHT S-SHAPED BAG DELIMITER (partner of 0x27C5)
    case 0x27E7: // MATHEMATICAL RIGHT WHITE SQUARE BRACKET
    case 0x27E9: // MATHEMATICAL RIGHT ANGLE BRACKET
    case 0x27EB: // MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET
    case 0x27ED: // MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET (partner of 0x27EC)
    case 0x27EF: // MATHEMATICAL RIGHT FLATTENED PARENTHESIS (partner of 0x27EE)
    case 0x2984: // RIGHT WHITE CURLY BRACKET
    case 0x2986: // RIGHT WHITE PARENTHESIS
    case 0x2988: // Z NOTATION RIGHT IMAGE BRACKET
    case 0x298A: // Z NOTATION RIGHT BINDING BRACKET
    case 0x298C: // RIGHT SQUARE BRACKET WITH UNDERBAR
    case 0x298E: // RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER
    case 0x2990: // RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER
    case 0x2992: // RIGHT ANGLE BRACKET WITH DOT
    case 0x2994: // RIGHT ARC GREATER-THAN BRACKET
    case 0x2996: // DOUBLE RIGHT ARC LESS-THAN BRACKET
    case 0x2998: // RIGHT BLACK TORTOISE SHELL BRACKET
    case 0x29D9: // RIGHT WIGGLY FENCE
    case 0x29DB: // RIGHT DOUBLE WIGGLY FENCE
    case 0x29FD: // RIGHT-POINTING CURVED ANGLE BRACKET
    case 0x2E23: // TOP RIGHT HALF BRACKET (partner of 0x2E22)
    case 0x2E25: // BOTTOM RIGHT HALF BRACKET (partner of 0x2E24)
    case 0x2E27: // RIGHT SIDEWAYS U BRACKET (partner of 0x2E26)
    case 0x2E29: // RIGHT DOUBLE PARENTHESIS (partner of 0x2E28)
    case 0x3009: // RIGHT ANGLE BRACKET
    case 0x300B: // RIGHT DOUBLE ANGLE BRACKET
    case 0x300D: // RIGHT CORNER BRACKET
    case 0x300F: // RIGHT WHITE CORNER BRACKET
    case 0x3011: // RIGHT BLACK LENTICULAR BRACKET
    case 0x3015: // RIGHT TORTOISE SHELL BRACKET
    case 0x3017: // RIGHT WHITE LENTICULAR BRACKET
    case 0x3019: // RIGHT WHITE TORTOISE SHELL BRACKET
    case 0x301B: // RIGHT WHITE SQUARE BRACKET
    case 0x301E: // DOUBLE PRIME QUOTATION MARK
    case 0x301F: // LOW DOUBLE PRIME QUOTATION MARK
    case 0xFD3F: // ORNATE RIGHT PARENTHESIS
    case 0xFE18: // PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET (partner of 0xFE17)
    case 0xFE36: // PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS
    case 0xFE38: // PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET
    case 0xFE3A: // PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET
    case 0xFE3C: // PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET
    case 0xFE3E: // PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET
    case 0xFE40: // PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET
    case 0xFE42: // PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
    case 0xFE44: // PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
    case 0xFE48: // PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET
    case 0xFE5A: // SMALL RIGHT PARENTHESIS
    case 0xFE5C: // SMALL RIGHT CURLY BRACKET
    case 0xFE5E: // SMALL RIGHT TORTOISE SHELL BRACKET
    case 0xFF09: // FULLWIDTH RIGHT PARENTHESIS
    case 0xFF3D: // FULLWIDTH RIGHT SQUARE BRACKET
    case 0xFF5D: // FULLWIDTH RIGHT CURLY BRACKET
    case 0xFF60: // FULLWIDTH RIGHT WHITE PARENTHESIS
    case 0xFF63: // HALFWIDTH RIGHT CORNER BRACKET
    // Pf
    case 0x00BB: // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    case 0x2019: // RIGHT SINGLE QUOTATION MARK
    case 0x201D: // RIGHT DOUBLE QUOTATION MARK
    case 0x203A: // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    // Pi
    case 0x00AB: // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    case 0x2018: // LEFT SINGLE QUOTATION MARK
    case 0x201C: // LEFT DOUBLE QUOTATION MARK
    case 0x2039: // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    // Po
    case 0x0021: // EXCLAMATION MARK
    case 0x0022: // QUOTATION MARK
    case 0x0023: // NUMBER SIGN
    case 0x0025: // PERCENT SIGN
    case 0x0026: // AMPERSAND
    case 0x0027: // APOSTROPHE
    case 0x002A: // ASTERISK
    case 0x002C: // COMMA
    case 0x002E: // FULL STOP
    case 0x002F: // SOLIDUS
    case 0x003A: // COLON
    case 0x003B: // SEMICOLON
    case 0x003F: // QUESTION MARK
    case 0x0040: // COMMERCIAL AT
    case 0x005C: // REVERSE SOLIDUS
    case 0x00A1: // INVERTED EXCLAMATION MARK
    case 0x00B7: // MIDDLE DOT
    case 0x00BF: // INVERTED QUESTION MARK
    case 0x037E: // GREEK QUESTION MARK
    case 0x0387: // GREEK ANO TELEIA
    case 0x055A: // ARMENIAN APOSTROPHE
    case 0x055B: // ARMENIAN EMPHASIS MARK
    case 0x055C: // ARMENIAN EXCLAMATION MARK
    case 0x055D: // ARMENIAN COMMA
    case 0x055E: // ARMENIAN QUESTION MARK
    case 0x055F: // ARMENIAN ABBREVIATION MARK
    case 0x0589: // ARMENIAN FULL STOP
    case 0x05BE: // HEBREW PUNCTUATION MAQAF
    case 0x05C0: // HEBREW PUNCTUATION PASEQ
    case 0x05C3: // HEBREW PUNCTUATION SOF PASUQ
    case 0x05F3: // HEBREW PUNCTUATION GERESH
    case 0x05F4: // HEBREW PUNCTUATION GERSHAYIM
    case 0x060C: // ARABIC COMMA
    case 0x060D: // ARABIC DATE SEPARATOR
    case 0x061B: // ARABIC SEMICOLON
    case 0x061F: // ARABIC QUESTION MARK
    case 0x066A: // ARABIC PERCENT SIGN
    case 0x066B: // ARABIC DECIMAL SEPARATOR
    case 0x066C: // ARABIC THOUSANDS SEPARATOR
    case 0x066D: // ARABIC FIVE POINTED STAR
    case 0x06D4: // ARABIC FULL STOP
    case 0x0700: // SYRIAC END OF PARAGRAPH
    case 0x0701: // SYRIAC SUPRALINEAR FULL STOP
    case 0x0702: // SYRIAC SUBLINEAR FULL STOP
    case 0x0703: // SYRIAC SUPRALINEAR COLON
    case 0x0704: // SYRIAC SUBLINEAR COLON
    case 0x0705: // SYRIAC HORIZONTAL COLON
    case 0x0706: // SYRIAC COLON SKEWED LEFT
    case 0x0707: // SYRIAC COLON SKEWED RIGHT
    case 0x0708: // SYRIAC SUPRALINEAR COLON SKEWED LEFT
    case 0x0709: // SYRIAC SUBLINEAR COLON SKEWED RIGHT
    case 0x070A: // SYRIAC CONTRACTION
    case 0x070B: // SYRIAC HARKLEAN OBELUS
    case 0x070C: // SYRIAC HARKLEAN METOBELUS
    case 0x070D: // SYRIAC HARKLEAN ASTERISCUS
    case 0x0964: // DEVANAGARI DANDA
    case 0x0965: // DEVANAGARI DOUBLE DANDA
    case 0x0970: // DEVANAGARI ABBREVIATION SIGN
    case 0x0DF4: // SINHALA PUNCTUATION KUNDDALIYA
    case 0x0E4F: // THAI CHARACTER FONGMAN
    case 0x0E5A: // THAI CHARACTER ANGKHANKHU
    case 0x0E5B: // THAI CHARACTER KHOMUT
    case 0x0F04: // TIBETAN MARK INITIAL YIG MGO MDUN MA
    case 0x0F05: // TIBETAN MARK CLOSING YIG MGO SGAB MA
    case 0x0F06: // TIBETAN MARK CARET YIG MGO PHUR SHAD MA
    case 0x0F07: // TIBETAN MARK YIG MGO TSHEG SHAD MA
    case 0x0F08: // TIBETAN MARK SBRUL SHAD
    case 0x0F09: // TIBETAN MARK BSKUR YIG MGO
    case 0x0F0A: // TIBETAN MARK BKA- SHOG YIG MGO
    case 0x0F0B: // TIBETAN MARK INTERSYLLABIC TSHEG
    case 0x0F0C: // TIBETAN MARK DELIMITER TSHEG BSTAR
    case 0x0F0D: // TIBETAN MARK SHAD
    case 0x0F0E: // TIBETAN MARK NYIS SHAD
    case 0x0F0F: // TIBETAN MARK TSHEG SHAD
    case 0x0F10: // TIBETAN MARK NYIS TSHEG SHAD
    case 0x0F11: // TIBETAN MARK RIN CHEN SPUNGS SHAD
    case 0x0F12: // TIBETAN MARK RGYA GRAM SHAD
    case 0x0F85: // TIBETAN MARK PALUTA
    case 0x104A: // MYANMAR SIGN LITTLE SECTION
    case 0x104B: // MYANMAR SIGN SECTION
    case 0x104C: // MYANMAR SYMBOL LOCATIVE
    case 0x104D: // MYANMAR SYMBOL COMPLETED
    case 0x104E: // MYANMAR SYMBOL AFOREMENTIONED
    case 0x104F: // MYANMAR SYMBOL GENITIVE
    case 0x10FB: // GEORGIAN PARAGRAPH SEPARATOR
    case 0x1361: // ETHIOPIC WORDSPACE
    case 0x1362: // ETHIOPIC FULL STOP
    case 0x1363: // ETHIOPIC COMMA
    case 0x1364: // ETHIOPIC SEMICOLON
    case 0x1365: // ETHIOPIC COLON
    case 0x1366: // ETHIOPIC PREFACE COLON
    case 0x1367: // ETHIOPIC QUESTION MARK
    case 0x1368: // ETHIOPIC PARAGRAPH SEPARATOR
    case 0x166D: // CANADIAN SYLLABICS CHI SIGN
    case 0x166E: // CANADIAN SYLLABICS FULL STOP
    case 0x16EB: // RUNIC SINGLE PUNCTUATION
    case 0x16EC: // RUNIC MULTIPLE PUNCTUATION
    case 0x16ED: // RUNIC CROSS PUNCTUATION
    case 0x1735: // PHILIPPINE SINGLE PUNCTUATION
    case 0x1736: // PHILIPPINE DOUBLE PUNCTUATION
    case 0x17D4: // KHMER SIGN KHAN
    case 0x17D5: // KHMER SIGN BARIYOOSAN
    case 0x17D6: // KHMER SIGN CAMNUC PII KUUH
    case 0x17D8: // KHMER SIGN BEYYAL
    case 0x17D9: // KHMER SIGN PHNAEK MUAN
    case 0x17DA: // KHMER SIGN KOOMUUT
    case 0x1800: // MONGOLIAN BIRGA
    case 0x1801: // MONGOLIAN ELLIPSIS
    case 0x1802: // MONGOLIAN COMMA
    case 0x1803: // MONGOLIAN FULL STOP
    case 0x1804: // MONGOLIAN COLON
    case 0x1805: // MONGOLIAN FOUR DOTS
    case 0x1807: // MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER
    case 0x1808: // MONGOLIAN MANCHU COMMA
    case 0x1809: // MONGOLIAN MANCHU FULL STOP
    case 0x180A: // MONGOLIAN NIRUGU
    case 0x1944: // LIMBU EXCLAMATION MARK
    case 0x1945: // LIMBU QUESTION MARK
    case 0x2016: // DOUBLE VERTICAL LINE
    case 0x2017: // DOUBLE LOW LINE
    case 0x2020: // DAGGER
    case 0x2021: // DOUBLE DAGGER
    case 0x2022: // BULLET
    case 0x2023: // TRIANGULAR BULLET
    case 0x2024: // ONE DOT LEADER
    case 0x2025: // TWO DOT LEADER
    case 0x2026: // HORIZONTAL ELLIPSIS
    case 0x2027: // HYPHENATION POINT
    case 0x2030: // PER MILLE SIGN
    case 0x2031: // PER TEN THOUSAND SIGN
    case 0x2032: // PRIME
    case 0x2033: // DOUBLE PRIME
    case 0x2034: // TRIPLE PRIME
    case 0x2035: // REVERSED PRIME
    case 0x2036: // REVERSED DOUBLE PRIME
    case 0x2037: // REVERSED TRIPLE PRIME
    case 0x2038: // CARET
    case 0x203B: // REFERENCE MARK
    case 0x203C: // DOUBLE EXCLAMATION MARK
    case 0x203D: // INTERROBANG
    case 0x203E: // OVERLINE
    case 0x2041: // CARET INSERTION POINT
    case 0x2042: // ASTERISM
    case 0x2043: // HYPHEN BULLET
    case 0x2047: // DOUBLE QUESTION MARK
    case 0x2048: // QUESTION EXCLAMATION MARK
    case 0x2049: // EXCLAMATION QUESTION MARK
    case 0x204A: // TIRONIAN SIGN ET
    case 0x204B: // REVERSED PILCROW SIGN
    case 0x204C: // BLACK LEFTWARDS BULLET
    case 0x204D: // BLACK RIGHTWARDS BULLET
    case 0x204E: // LOW ASTERISK
    case 0x204F: // REVERSED SEMICOLON
    case 0x2050: // CLOSE UP
    case 0x2051: // TWO ASTERISKS ALIGNED VERTICALLY
    case 0x2053: // SWUNG DASH
    case 0x2057: // QUADRUPLE PRIME
    case 0x23B6: // BOTTOM SQUARE BRACKET OVER TOP SQUARE BRACKET
    case 0x3001: // IDEOGRAPHIC COMMA
    case 0x3002: // IDEOGRAPHIC FULL STOP
    case 0x3003: // DITTO MARK
    case 0x303D: // PART ALTERNATION MARK
    case 0xFE30: // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
    case 0xFE45: // SESAME DOT
    case 0xFE46: // WHITE SESAME DOT
    case 0xFE49: // DASHED OVERLINE
    case 0xFE4A: // CENTRELINE OVERLINE
    case 0xFE4B: // WAVY OVERLINE
    case 0xFE4C: // DOUBLE WAVY OVERLINE
    case 0xFE50: // SMALL COMMA
    case 0xFE51: // SMALL IDEOGRAPHIC COMMA
    case 0xFE52: // SMALL FULL STOP
    case 0xFE54: // SMALL SEMICOLON
    case 0xFE55: // SMALL COLON
    case 0xFE56: // SMALL QUESTION MARK
    case 0xFE57: // SMALL EXCLAMATION MARK
    case 0xFE5F: // SMALL NUMBER SIGN
    case 0xFE60: // SMALL AMPERSAND
    case 0xFE61: // SMALL ASTERISK
    case 0xFE68: // SMALL REVERSE SOLIDUS
    case 0xFE6A: // SMALL PERCENT SIGN
    case 0xFE6B: // SMALL COMMERCIAL AT
    case 0xFF01: // FULLWIDTH EXCLAMATION MARK
    case 0xFF02: // FULLWIDTH QUOTATION MARK
    case 0xFF03: // FULLWIDTH NUMBER SIGN
    case 0xFF05: // FULLWIDTH PERCENT SIGN
    case 0xFF06: // FULLWIDTH AMPERSAND
    case 0xFF07: // FULLWIDTH APOSTROPHE
    case 0xFF0A: // FULLWIDTH ASTERISK
    case 0xFF0C: // FULLWIDTH COMMA
    case 0xFF0E: // FULLWIDTH FULL STOP
    case 0xFF0F: // FULLWIDTH SOLIDUS
    case 0xFF1A: // FULLWIDTH COLON
    case 0xFF1B: // FULLWIDTH SEMICOLON
    case 0xFF1F: // FULLWIDTH QUESTION MARK
    case 0xFF20: // FULLWIDTH COMMERCIAL AT
    case 0xFF3C: // FULLWIDTH REVERSE SOLIDUS
    case 0xFF61: // HALFWIDTH IDEOGRAPHIC FULL STOP
    case 0xFF64: // HALFWIDTH IDEOGRAPHIC COMMA
      return true;

    default:
      return false;
  }
}
/// Returns a copy of the input with any leading and trailing run of
/// whitespace and/or punctuation chars cut off. Chars in the middle are kept.
wstring trimWhitespaceOrPunct( wstring const & in )
{
  wchar const * first = in.c_str();
  wchar const * last = first + in.size();

  // Advance past leading whitespace/punctuation; the terminating zero of
  // c_str() stops the scan
  while( *first && ( isWhitespace( *first ) || isPunct( *first ) ) )
    ++first;

  // Pull the end back over trailing whitespace/punctuation
  while( last != first && ( isWhitespace( last[ -1 ] ) || isPunct( last[ -1 ] ) ) )
    --last;

  return wstring( first, last );
}
/// Returns a copy of the input with any leading and trailing run of
/// whitespace chars cut off. Whitespace in the middle is kept.
wstring trimWhitespace( wstring const & in )
{
  wchar const * first = in.c_str();
  wchar const * last = first + in.size();

  // Advance past leading whitespace; the terminating zero of c_str() stops
  // the scan
  while( *first && isWhitespace( *first ) )
    ++first;

  // Pull the end back over trailing whitespace
  while( last != first && isWhitespace( last[ -1 ] ) )
    --last;

  return wstring( first, last );
}
/// Collapses, in place, every run of two or more consecutive whitespace chars
/// into a single plain space (' '). A whitespace char standing alone is left
/// exactly as it is, even when it is not a plain space.
void normalizeWhitespace( wstring & str )
{
  // Walk from the end towards the front. Index 0 is never used as a starting
  // point: a run cannot end there unless it is a single char, which would be
  // left alone anyway.
  size_t pos = str.size();

  while( pos > 1 )
  {
    --pos;

    if ( !isWhitespace( str[ pos ] ) )
      continue;

    // 'pos' is the last char of a whitespace run; locate where the run begins
    size_t runStart = pos;

    while( runStart && isWhitespace( str[ runStart - 1 ] ) )
      --runStart;

    if ( runStart == pos )
      continue; // A run of one char -- nothing to collapse

    // Drop all but the last char of the run and turn the survivor into ' '
    str.erase( runStart, pos - runStart );
    pos = runStart;
    str[ pos ] = ' ';
  }
}
}