2017-07-05 15:12:41 +00:00
|
|
|
/* ***** BEGIN LICENSE BLOCK *****
|
|
|
|
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
|
|
*
|
|
|
|
* Copyright (C) 2002-2017 Németh László
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the Mozilla Public License Version
|
|
|
|
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
|
|
* the License. You may obtain a copy of the License at
|
|
|
|
* http://www.mozilla.org/MPL/
|
|
|
|
*
|
|
|
|
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
|
|
* for the specific language governing rights and limitations under the
|
|
|
|
* License.
|
|
|
|
*
|
|
|
|
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
|
|
|
|
*
|
|
|
|
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
|
|
|
|
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
|
|
|
|
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
|
|
|
|
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
|
|
|
|
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
|
|
|
|
*
|
|
|
|
* Alternatively, the contents of this file may be used under the terms of
|
|
|
|
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
|
|
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
|
|
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
|
|
* of those above. If you wish to allow use of your version of this file only
|
|
|
|
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
|
|
* use your version of this file under the terms of the MPL, indicate your
|
|
|
|
* decision by deleting the provisions above and replace them with the notice
|
|
|
|
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
|
|
* the provisions above, a recipient may use your version of this file under
|
|
|
|
* the terms of any one of the MPL, the GPL or the LGPL.
|
|
|
|
*
|
|
|
|
* ***** END LICENSE BLOCK ***** */
|
|
|
|
/*
|
|
|
|
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
|
|
|
|
* And Contributors. All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
*
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
*
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* 3. All modifications to the source code must be clearly marked as
|
|
|
|
* such. Binary redistributions based on modified source code
|
|
|
|
* must be clearly marked as modified versions in the documentation
|
|
|
|
* and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
|
|
|
|
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
|
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
|
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
|
|
|
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
|
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
|
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
|
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef CSUTIL_HXX_
|
|
|
|
#define CSUTIL_HXX_
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2011-07-19 18:27:47 +00:00
|
|
|
#include "hunvisapi.h"
|
|
|
|
|
2009-04-17 12:02:50 +00:00
|
|
|
// First some base level utility routines
|
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
#include <fstream>
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
2011-07-19 18:27:47 +00:00
|
|
|
#include <string.h>
|
2009-04-17 12:02:50 +00:00
|
|
|
#include "w_char.hxx"
|
2011-07-19 18:27:47 +00:00
|
|
|
#include "htypes.hxx"
|
|
|
|
|
|
|
|
#ifdef MOZILLA_CLIENT
|
2017-07-05 15:12:41 +00:00
|
|
|
#include "nscore.h" // for mozalloc headers
|
2011-07-19 18:27:47 +00:00
|
|
|
#endif
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// casing
|
2017-07-05 15:12:41 +00:00
|
|
|
#define NOCAP 0
|
2009-04-17 12:02:50 +00:00
|
|
|
#define INITCAP 1
|
2017-07-05 15:12:41 +00:00
|
|
|
#define ALLCAP 2
|
|
|
|
#define HUHCAP 3
|
|
|
|
#define HUHINITCAP 4
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// default encoding and keystring
|
2017-07-05 15:12:41 +00:00
|
|
|
#define SPELL_ENCODING "ISO8859-1"
|
|
|
|
#define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm"
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// default morphological fields
|
2017-07-05 15:12:41 +00:00
|
|
|
#define MORPH_STEM "st:"
|
|
|
|
#define MORPH_ALLOMORPH "al:"
|
|
|
|
#define MORPH_POS "po:"
|
|
|
|
#define MORPH_DERI_PFX "dp:"
|
|
|
|
#define MORPH_INFL_PFX "ip:"
|
|
|
|
#define MORPH_TERM_PFX "tp:"
|
|
|
|
#define MORPH_DERI_SFX "ds:"
|
|
|
|
#define MORPH_INFL_SFX "is:"
|
|
|
|
#define MORPH_TERM_SFX "ts:"
|
|
|
|
#define MORPH_SURF_PFX "sp:"
|
|
|
|
#define MORPH_FREQ "fr:"
|
|
|
|
#define MORPH_PHON "ph:"
|
|
|
|
#define MORPH_HYPH "hy:"
|
|
|
|
#define MORPH_PART "pa:"
|
|
|
|
#define MORPH_FLAG "fl:"
|
|
|
|
#define MORPH_HENTRY "_H:"
|
|
|
|
#define MORPH_TAG_LEN strlen(MORPH_STEM)
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
#define MSEP_FLD ' '
|
|
|
|
#define MSEP_REC '\n'
|
|
|
|
#define MSEP_ALT '\v'
|
|
|
|
|
|
|
|
// default flags
|
2017-07-05 15:12:41 +00:00
|
|
|
#define DEFAULTFLAGS 65510
|
|
|
|
#define FORBIDDENWORD 65510
|
2009-04-17 12:02:50 +00:00
|
|
|
#define ONLYUPCASEFLAG 65511
|
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
// fix long pathname problem of WIN32 by using w_char std::fstream::open override
|
|
|
|
LIBHUNSPELL_DLL_EXPORTED void myopen(std::ifstream& stream, const char* path,
|
|
|
|
std::ios_base::openmode mode);
|
|
|
|
|
2009-04-17 12:02:50 +00:00
|
|
|
// convert UTF-16 characters to UTF-8
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED std::string& u16_u8(std::string& dest,
|
|
|
|
const std::vector<w_char>& src);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// convert UTF-8 characters to UTF-16
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED int u8_u16(std::vector<w_char>& dest,
|
|
|
|
const std::string& src);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// remove end of line char(s)
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED void mychomp(std::string& s);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// duplicate string
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED char* mystrdup(const char* s);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// parse into tokens with char delimiter
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED std::string::const_iterator mystrsep(const std::string &str,
|
|
|
|
std::string::const_iterator& start);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
// replace pat by rep in word and return word
|
|
|
|
LIBHUNSPELL_DLL_EXPORTED std::string& mystrrep(std::string& str,
|
|
|
|
const std::string& search,
|
|
|
|
const std::string& replace);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// append s to ends of every lines in text
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED std::string& strlinecat(std::string& str,
|
|
|
|
const std::string& apd);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// tokenize into lines with new line
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED std::vector<std::string> line_tok(const std::string& text,
|
|
|
|
char breakchar);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// tokenize into lines with new line and uniq in place
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED void line_uniq(std::string& text, char breakchar);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED void line_uniq_app(std::string& text, char breakchar);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// reverse word
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED size_t reverseword(std::string& word);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// reverse word
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED size_t reverseword_utf(std::string&);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// remove duplicates
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED void uniqlist(std::vector<std::string>& list);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// character encoding information
|
|
|
|
struct cs_info {
|
|
|
|
unsigned char ccase;
|
|
|
|
unsigned char clower;
|
|
|
|
unsigned char cupper;
|
|
|
|
};
|
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED void initialize_utf_tbl();
|
2011-07-19 18:27:47 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED void free_utf_tbl();
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetoupper(unsigned short c,
|
|
|
|
int langnum);
|
|
|
|
LIBHUNSPELL_DLL_EXPORTED w_char upper_utf(w_char u, int langnum);
|
|
|
|
LIBHUNSPELL_DLL_EXPORTED w_char lower_utf(w_char u, int langnum);
|
|
|
|
LIBHUNSPELL_DLL_EXPORTED unsigned short unicodetolower(unsigned short c,
|
|
|
|
int langnum);
|
2011-07-19 18:27:47 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED int unicodeisalpha(unsigned short c);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED struct cs_info* get_current_cs(const std::string& es);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// get language identifiers of language codes
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED int get_lang_num(const std::string& lang);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// get characters of the given 8bit encoding with lower- and uppercase forms
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED std::string get_casechars(const char* enc);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
// convert std::string to all caps
|
|
|
|
LIBHUNSPELL_DLL_EXPORTED std::string& mkallcap(std::string& s,
|
|
|
|
const struct cs_info* csconv);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
// convert null terminated string to all little
|
|
|
|
LIBHUNSPELL_DLL_EXPORTED std::string& mkallsmall(std::string& s,
|
|
|
|
const struct cs_info* csconv);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
// convert first letter of string to little
|
|
|
|
LIBHUNSPELL_DLL_EXPORTED std::string& mkinitsmall(std::string& s,
|
|
|
|
const struct cs_info* csconv);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
// convert first letter of string to capital
|
|
|
|
LIBHUNSPELL_DLL_EXPORTED std::string& mkinitcap(std::string& s,
|
|
|
|
const struct cs_info* csconv);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
// convert first letter of UTF-8 string to capital
|
|
|
|
LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
|
|
|
|
mkinitcap_utf(std::vector<w_char>& u, int langnum);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
// convert UTF-8 string to little
|
|
|
|
LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
|
|
|
|
mkallsmall_utf(std::vector<w_char>& u, int langnum);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
// convert first letter of UTF-8 string to little
|
|
|
|
LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
|
|
|
|
mkinitsmall_utf(std::vector<w_char>& u, int langnum);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
// convert UTF-8 string to capital
|
|
|
|
LIBHUNSPELL_DLL_EXPORTED std::vector<w_char>&
|
|
|
|
mkallcap_utf(std::vector<w_char>& u, int langnum);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// get type of capitalization
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED int get_captype(const std::string& q, cs_info*);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// get type of capitalization (UTF-8)
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED int get_captype_utf8(const std::vector<w_char>& q, int langnum);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// strip all ignored characters in the string
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars_utf(
|
|
|
|
std::string& word,
|
|
|
|
const std::vector<w_char>& ignored_chars);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// strip all ignored characters in the string
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED size_t remove_ignored_chars(
|
|
|
|
std::string& word,
|
|
|
|
const std::string& ignored_chars);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED bool parse_string(const std::string& line,
|
|
|
|
std::string& out,
|
|
|
|
int ln);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED bool parse_array(const std::string& line,
|
|
|
|
std::string& out,
|
|
|
|
std::vector<w_char>& out_utf16,
|
|
|
|
int utf8,
|
|
|
|
int ln);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED int fieldlen(const char* r);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED bool copy_field(std::string& dest,
|
|
|
|
const std::string& morph,
|
|
|
|
const std::string& var);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// conversion function for protected memory
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED void store_pointer(char* dest, char* source);
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
// conversion function for protected memory
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED char* get_stored_pointer(const char* s);
|
2011-07-19 18:27:47 +00:00
|
|
|
|
|
|
|
// hash entry macros
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_DATA(struct hentry* h) {
|
|
|
|
char* ret;
|
|
|
|
if (!h->var)
|
|
|
|
ret = NULL;
|
|
|
|
else if (h->var & H_OPT_ALIASM)
|
|
|
|
ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
|
|
|
|
else
|
|
|
|
ret = HENTRY_WORD(h) + h->blen + 1;
|
|
|
|
return ret;
|
2011-07-19 18:27:47 +00:00
|
|
|
}
|
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA(
|
|
|
|
const struct hentry* h) {
|
|
|
|
const char* ret;
|
|
|
|
if (!h->var)
|
|
|
|
ret = NULL;
|
|
|
|
else if (h->var & H_OPT_ALIASM)
|
|
|
|
ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
|
|
|
|
else
|
|
|
|
ret = HENTRY_WORD(h) + h->blen + 1;
|
|
|
|
return ret;
|
2011-07-19 18:27:47 +00:00
|
|
|
}
|
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
// NULL-free version for warning-free OOo build
|
|
|
|
LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA2(
|
|
|
|
const struct hentry* h) {
|
|
|
|
const char* ret;
|
|
|
|
if (!h->var)
|
|
|
|
ret = "";
|
|
|
|
else if (h->var & H_OPT_ALIASM)
|
|
|
|
ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
|
|
|
|
else
|
|
|
|
ret = HENTRY_WORD(h) + h->blen + 1;
|
|
|
|
return ret;
|
2011-07-19 18:27:47 +00:00
|
|
|
}
|
|
|
|
|
2017-07-05 15:12:41 +00:00
|
|
|
LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_FIND(struct hentry* h,
|
|
|
|
const char* p) {
|
|
|
|
return (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL);
|
|
|
|
}
|
2009-04-17 12:02:50 +00:00
|
|
|
|
|
|
|
#endif
|