diff --git a/opencc/HKVariants.ocd b/opencc/HKVariants.ocd new file mode 100644 index 00000000..4c03d3b4 Binary files /dev/null and b/opencc/HKVariants.ocd differ diff --git a/opencc/HKVariantsPhrases.ocd b/opencc/HKVariantsPhrases.ocd new file mode 100644 index 00000000..4206c343 Binary files /dev/null and b/opencc/HKVariantsPhrases.ocd differ diff --git a/opencc/STCharacters.ocd b/opencc/STCharacters.ocd new file mode 100644 index 00000000..93f023e3 Binary files /dev/null and b/opencc/STCharacters.ocd differ diff --git a/opencc/STPhrases.ocd b/opencc/STPhrases.ocd new file mode 100644 index 00000000..691c51f7 Binary files /dev/null and b/opencc/STPhrases.ocd differ diff --git a/opencc/TSCharacters.ocd b/opencc/TSCharacters.ocd new file mode 100644 index 00000000..26b3fc49 Binary files /dev/null and b/opencc/TSCharacters.ocd differ diff --git a/opencc/TSPhrases.ocd b/opencc/TSPhrases.ocd new file mode 100644 index 00000000..59223349 Binary files /dev/null and b/opencc/TSPhrases.ocd differ diff --git a/opencc/TWVariants.ocd b/opencc/TWVariants.ocd new file mode 100644 index 00000000..faca6c17 Binary files /dev/null and b/opencc/TWVariants.ocd differ diff --git a/opencc/s2hk.json b/opencc/s2hk.json new file mode 100644 index 00000000..94854409 --- /dev/null +++ b/opencc/s2hk.json @@ -0,0 +1,33 @@ +{ + "name": "Simplified Chinese to Traditional Chinese (Hong Kong standard)", + "segmentation": { + "type": "mmseg", + "dict": { + "type": "ocd", + "file": "STPhrases.ocd" + } + }, + "conversion_chain": [{ + "dict": { + "type": "group", + "dicts": [{ + "type": "ocd", + "file": "STPhrases.ocd" + }, { + "type": "ocd", + "file": "STCharacters.ocd" + }] + } + }, { + "dict": { + "type": "group", + "dicts": [{ + "type": "ocd", + "file": "HKVariantsPhrases.ocd" + }, { + "type": "ocd", + "file": "HKVariants.ocd" + }] + } + }] +} diff --git a/opencc/s2tw.json b/opencc/s2tw.json new file mode 100644 index 00000000..5fc6afe1 --- /dev/null +++ b/opencc/s2tw.json @@ -0,0 +1,27 @@ +{ + 
"name": "Simplified Chinese to Traditional Chinese (Taiwan standard)", + "segmentation": { + "type": "mmseg", + "dict": { + "type": "ocd", + "file": "STPhrases.ocd" + } + }, + "conversion_chain": [{ + "dict": { + "type": "group", + "dicts": [{ + "type": "ocd", + "file": "STPhrases.ocd" + }, { + "type": "ocd", + "file": "STCharacters.ocd" + }] + } + }, { + "dict": { + "type": "ocd", + "file": "TWVariants.ocd" + } + }] +} diff --git a/opencc/t2s.json b/opencc/t2s.json new file mode 100644 index 00000000..21ba6e40 --- /dev/null +++ b/opencc/t2s.json @@ -0,0 +1,22 @@ +{ + "name": "Traditional Chinese to Simplified Chinese", + "segmentation": { + "type": "mmseg", + "dict": { + "type": "ocd", + "file": "TSPhrases.ocd" + } + }, + "conversion_chain": [{ + "dict": { + "type": "group", + "dicts": [{ + "type": "ocd", + "file": "TSPhrases.ocd" + }, { + "type": "ocd", + "file": "TSCharacters.ocd" + }] + } + }] +} diff --git a/winlibs/include/opencc/BinaryDict.hpp b/winlibs/include/opencc/BinaryDict.hpp new file mode 100644 index 00000000..9d900ced --- /dev/null +++ b/winlibs/include/opencc/BinaryDict.hpp @@ -0,0 +1,53 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "Common.hpp" +#include "SerializableDict.hpp" + +namespace opencc { +/** +* Binary dictionary for faster deserialization +* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT BinaryDict : public SerializableDict { +public: + BinaryDict(const LexiconPtr& _lexicon) : lexicon(_lexicon) {} + + virtual ~BinaryDict() {} + + virtual void SerializeToFile(FILE* fp) const; + + static BinaryDictPtr NewFromFile(FILE* fp); + + const LexiconPtr& GetLexicon() const { return lexicon; } + + size_t KeyMaxLength() const; + +private: + LexiconPtr lexicon; + string keyBuffer; + string valueBuffer; + + void ConstructBuffer(string& keyBuffer, vector& keyOffset, + size_t& keyTotalLength, string& valueBuffer, + vector& valueOffset, + size_t& valueTotalLength) const; +}; +} diff --git a/winlibs/include/opencc/Common.hpp b/winlibs/include/opencc/Common.hpp new file mode 100644 index 00000000..9d8d69a0 --- /dev/null +++ b/winlibs/include/opencc/Common.hpp @@ -0,0 +1,92 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +// Microsoft Visual C++ specific +#if defined(_MSC_VER) && (_MSC_VER >= 1020) +#pragma warning(disable : 4251 4266 4350 4503 4512 4514 4710 4820) +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "Exception.hpp" +#include "Export.hpp" +#include "Optional.hpp" + +using std::list; +using std::string; +using std::vector; + +// Forward decalarations and alias +namespace opencc { +class BinaryDict; +class Config; +class Conversion; +class ConversionChain; +class Converter; +class DartsDict; +class Dict; +class DictEntry; +class DictGroup; +class Lexicon; +class MultiValueDictEntry; +class NoValueDictEntry; +class Segmentation; +class Segments; +class SerializableDict; +class SingleValueDictEntry; +class TextDict; +typedef std::shared_ptr BinaryDictPtr; +typedef std::shared_ptr ConversionPtr; +typedef std::shared_ptr ConversionChainPtr; +typedef std::shared_ptr ConverterPtr; +typedef std::shared_ptr DartsDictPtr; +typedef std::shared_ptr DictPtr; +typedef std::shared_ptr DictGroupPtr; +typedef std::shared_ptr LexiconPtr; +typedef std::shared_ptr SegmentationPtr; +typedef std::shared_ptr SegmentsPtr; +typedef std::shared_ptr SerializableDictPtr; +typedef std::shared_ptr TextDictPtr; +} + +#ifndef PKGDATADIR +const string PACKAGE_DATA_DIRECTORY = ""; +#else // ifndef PKGDATADIR +const string PACKAGE_DATA_DIRECTORY = PKGDATADIR "/"; +#endif // ifndef PKGDATADIR + +#ifndef VERSION +#define VERSION "1.0.*" +#endif // ifndef VERSION diff --git a/winlibs/include/opencc/Config.hpp b/winlibs/include/opencc/Config.hpp new file mode 100644 index 00000000..a71a00d0 --- /dev/null +++ b/winlibs/include/opencc/Config.hpp @@ -0,0 +1,41 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "Common.hpp" + +namespace opencc { +/** +* Configuration loader +* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT Config { +public: + Config(); + + virtual ~Config(); + + ConverterPtr NewFromString(const string& json, const string& configDirectory); + + ConverterPtr NewFromFile(const string& fileName); + +private: + void* internal; +}; +} diff --git a/winlibs/include/opencc/Conversion.hpp b/winlibs/include/opencc/Conversion.hpp new file mode 100644 index 00000000..6bfc455f --- /dev/null +++ b/winlibs/include/opencc/Conversion.hpp @@ -0,0 +1,47 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "Common.hpp" +#include "Segmentation.hpp" + +namespace opencc { +/** +* Conversion interface +* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT Conversion { +public: + Conversion(DictPtr _dict) : dict(_dict) {} + + // Convert single phrase + string Convert(const string& phrase) const; + + // Convert single phrase + string Convert(const char* phrase) const; + + // Convert segmented text + SegmentsPtr Convert(const SegmentsPtr& input) const; + + const DictPtr GetDict() const { return dict; } + +private: + const DictPtr dict; +}; +} diff --git a/winlibs/include/opencc/ConversionChain.hpp b/winlibs/include/opencc/ConversionChain.hpp new file mode 100644 index 00000000..db9e6251 --- /dev/null +++ b/winlibs/include/opencc/ConversionChain.hpp @@ -0,0 +1,41 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "Common.hpp" +#include "Conversion.hpp" + +namespace opencc { +/** +* Chain of conversions +* Consists of a list of conversions. Converts input in sequence. 
+* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT ConversionChain { +public: + ConversionChain(const list _conversions); + + SegmentsPtr Convert(const SegmentsPtr& input) const; + + const list GetConversions() const { return conversions; } + +private: + const list conversions; +}; +} diff --git a/winlibs/include/opencc/Converter.hpp b/winlibs/include/opencc/Converter.hpp new file mode 100644 index 00000000..65cdda2a --- /dev/null +++ b/winlibs/include/opencc/Converter.hpp @@ -0,0 +1,51 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "Common.hpp" +#include "Segmentation.hpp" + +namespace opencc { +/** +* Controller of segmentation and conversion +* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT Converter { +public: + Converter(const string& _name, SegmentationPtr _segmentation, + ConversionChainPtr _conversionChain) + : name(_name), segmentation(_segmentation), + conversionChain(_conversionChain) {} + + string Convert(const string& text) const; + + size_t Convert(const char* input, char* output) const; + + const SegmentationPtr GetSegmentation() const { return segmentation; } + + const ConversionChainPtr GetConversionChain() const { + return conversionChain; + } + +private: + const string name; + const SegmentationPtr segmentation; + const ConversionChainPtr conversionChain; +}; +} diff --git a/winlibs/include/opencc/DartsDict.hpp b/winlibs/include/opencc/DartsDict.hpp new file mode 100644 index 00000000..609486db --- /dev/null +++ b/winlibs/include/opencc/DartsDict.hpp @@ -0,0 +1,59 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "Common.hpp" +#include "SerializableDict.hpp" + +namespace opencc { +/** +* Darts dictionary +* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT DartsDict : public Dict, public SerializableDict { +public: + virtual ~DartsDict(); + + virtual size_t KeyMaxLength() const; + + virtual Optional Match(const char* word) const; + + virtual Optional MatchPrefix(const char* word) const; + + virtual LexiconPtr GetLexicon() const; + + virtual void SerializeToFile(FILE* fp) const; + + /** + * Constructs a DartsDict from another dictionary. + */ + static DartsDictPtr NewFromDict(const Dict& thatDict); + + static DartsDictPtr NewFromFile(FILE* fp); + +private: + DartsDict(); + + size_t maxLength; + LexiconPtr lexicon; + + class DartsInternal; + DartsInternal* internal; +}; +} diff --git a/winlibs/include/opencc/Dict.hpp b/winlibs/include/opencc/Dict.hpp new file mode 100644 index 00000000..f923f378 --- /dev/null +++ b/winlibs/include/opencc/Dict.hpp @@ -0,0 +1,81 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "Common.hpp" +#include "DictEntry.hpp" + +namespace opencc { +/** +* Abstract class of dictionary +* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT Dict { +public: + /** + * Matches a word exactly and returns the DictEntry or Optional::Null(). 
+ */ + virtual Optional Match(const char* word) const = 0; + + /** + * Matches a word exactly and returns the DictEntry or Optional::Null(). + */ + Optional Match(const string& word) const { + return Match(word.c_str()); + } + + /** + * Matches the longest matched prefix of a word. + * For example given a dictionary having "a", "an", "b", "ba", "ban", "bana", + * the longest prefix of "banana" matched is "bana". + */ + virtual Optional MatchPrefix(const char* word) const; + + /** + * Matches the longest matched prefix of a word. + */ + Optional MatchPrefix(const string& word) const { + return MatchPrefix(word.c_str()); + } + + /** + * Returns all matched prefixes of a word, sorted by the length (desc). + * For example given a dictionary having "a", "an", "b", "ba", "ban", "bana", + * all the matched prefixes of "banana" are "bana", "ban", "ba", "b". + */ + virtual vector MatchAllPrefixes(const char* word) const; + + /** + * Returns all matched prefixes of a word, sorted by the length (desc). + */ + vector MatchAllPrefixes(const string& word) const { + return MatchAllPrefixes(word.c_str()); + } + + /** + * Returns the length of the longest key in the dictionary. + */ + virtual size_t KeyMaxLength() const = 0; + + /** + * Returns all entries in the dictionary. + */ + virtual LexiconPtr GetLexicon() const = 0; +}; +} diff --git a/winlibs/include/opencc/DictEntry.hpp b/winlibs/include/opencc/DictEntry.hpp new file mode 100644 index 00000000..ecef4891 --- /dev/null +++ b/winlibs/include/opencc/DictEntry.hpp @@ -0,0 +1,197 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "Common.hpp" +#include "UTF8Util.hpp" +#include "Segments.hpp" + +namespace opencc { +/** +* Key-values pair entry +* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT DictEntry { +public: + virtual ~DictEntry() {} + + virtual const char* Key() const = 0; + + virtual vector Values() const = 0; + + virtual const char* GetDefault() const = 0; + + virtual size_t NumValues() const = 0; + + virtual string ToString() const = 0; + + size_t KeyLength() const { return strlen(Key()); } + + bool operator<(const DictEntry& that) const { + return strcmp(Key(), that.Key()) < 0; + } + + bool operator==(const DictEntry& that) const { + return strcmp(Key(), that.Key()) == 0; + } + + static bool PtrLessThan(const DictEntry* a, const DictEntry* b) { + return *a < *b; + } +}; + +class OPENCC_EXPORT NoValueDictEntry : public DictEntry { +public: + NoValueDictEntry(const string& _key) : key(_key) {} + + virtual ~NoValueDictEntry() {} + + virtual const char* Key() const { return key.c_str(); } + + virtual vector Values() const { return vector(); } + + virtual const char* GetDefault() const { return Key(); } + + virtual size_t NumValues() const { return 0; } + + virtual string ToString() const { return key; } + +private: + string key; +}; + +class OPENCC_EXPORT SingleValueDictEntry : public DictEntry { +public: + virtual const char* Value() const = 0; + + virtual vector Values() const { + return vector{Value()}; + } + + virtual const char* GetDefault() const { return Value(); } + + virtual size_t NumValues() const { return 1; } + + virtual 
string ToString() const { return string(Key()) + "\t" + Value(); } +}; + +class OPENCC_EXPORT StrSingleValueDictEntry : public SingleValueDictEntry { +public: + StrSingleValueDictEntry(const string& _key, const string& _value) + : key(_key), value(_value) {} + + virtual ~StrSingleValueDictEntry() {} + + virtual const char* Key() const { return key.c_str(); } + + virtual const char* Value() const { return value.c_str(); } + +private: + string key; + string value; +}; + +class OPENCC_EXPORT MultiValueDictEntry : public DictEntry { +public: + virtual const char* GetDefault() const { + if (NumValues() > 0) { + return Values().at(0); + } else { + return Key(); + } + } + + virtual string ToString() const; +}; + +class OPENCC_EXPORT StrMultiValueDictEntry : public MultiValueDictEntry { +public: + StrMultiValueDictEntry(const string& _key, const vector& _values) + : key(_key), values(_values) {} + + StrMultiValueDictEntry(const string& _key, const vector& _values) + : key(_key) { + values.reserve(_values.size()); + for (const char* str : _values) { + values.push_back(str); + } + } + + virtual ~StrMultiValueDictEntry() {} + + virtual const char* Key() const { return key.c_str(); } + + size_t NumValues() const { return values.size(); } + + vector Values() const { + vector values; + for (const string& value : this->values) { + values.push_back(value.c_str()); + } + return values; + } + +private: + string key; + vector values; +}; + +class OPENCC_EXPORT PtrDictEntry : public MultiValueDictEntry { +public: + PtrDictEntry(const char* _key, const vector& _values) + : key(_key), values(_values) {} + + virtual ~PtrDictEntry() {} + + virtual const char* Key() const { return key; } + + size_t NumValues() const { return values.size(); } + + vector Values() const { return values; } + +private: + const char* key; + vector values; +}; + +class OPENCC_EXPORT DictEntryFactory { +public: + static DictEntry* New(const string& key) { return new NoValueDictEntry(key); } + + static DictEntry* 
New(const string& key, const string& value) { + return new StrSingleValueDictEntry(key, value); + } + + static DictEntry* New(const string& key, const vector& values) { + return new StrMultiValueDictEntry(key, values); + } + + static DictEntry* New(const DictEntry* entry) { + if (entry->NumValues() == 0) { + return new NoValueDictEntry(entry->Key()); + } else if (entry->NumValues() == 1) { + const auto svEntry = static_cast(entry); + return new StrSingleValueDictEntry(svEntry->Key(), svEntry->Value()); + } else { + const auto mvEntry = static_cast(entry); + return new StrMultiValueDictEntry(mvEntry->Key(), mvEntry->Values()); + } + } +}; +} diff --git a/winlibs/include/opencc/DictGroup.hpp b/winlibs/include/opencc/DictGroup.hpp new file mode 100644 index 00000000..8098addd --- /dev/null +++ b/winlibs/include/opencc/DictGroup.hpp @@ -0,0 +1,53 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "Common.hpp" +#include "Dict.hpp" + +namespace opencc { +/** +* Group of dictionaries +* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT DictGroup : public Dict { +public: + DictGroup(const list& dicts); + + static DictGroupPtr NewFromDict(const Dict& dict); + + virtual ~DictGroup(); + + virtual size_t KeyMaxLength() const; + + virtual Optional Match(const char* word) const; + + virtual Optional MatchPrefix(const char* word) const; + + virtual vector MatchAllPrefixes(const char* word) const; + + virtual LexiconPtr GetLexicon() const; + + const list GetDicts() const { return dicts; } + +private: + const size_t keyMaxLength; + const list dicts; +}; +} diff --git a/winlibs/include/opencc/Exception.hpp b/winlibs/include/opencc/Exception.hpp new file mode 100644 index 00000000..a1134e85 --- /dev/null +++ b/winlibs/include/opencc/Exception.hpp @@ -0,0 +1,88 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include "Export.hpp" + +#ifdef _MSC_VER + +// Until Visual Studio 2013 (12.0), C++ 11 "noexcept" qualifier is not supported +#define noexcept +#endif // ifdef _MSC_VER + +namespace opencc { + +class OPENCC_EXPORT Exception : public std::exception { +public: + Exception() {} + + virtual ~Exception() throw() {} + + Exception(const std::string& _message) : message(_message) {} + + virtual const char* what() const noexcept { return message.c_str(); } + +protected: + std::string message; +}; + +class OPENCC_EXPORT FileNotFound : public Exception { +public: + FileNotFound(const std::string& fileName) + : Exception(fileName + " not found or not accessible.") {} +}; + +class OPENCC_EXPORT FileNotWritable : public Exception { +public: + FileNotWritable(const std::string& fileName) + : Exception(fileName + " not writable.") {} +}; + +class OPENCC_EXPORT InvalidFormat : public Exception { +public: + InvalidFormat(const std::string& message) + : Exception("Invalid format: " + message) {} +}; + +class OPENCC_EXPORT InvalidTextDictionary : public InvalidFormat { +public: + InvalidTextDictionary(const std::string& _message, size_t lineNum) + : InvalidFormat("") { + std::ostringstream buffer; + buffer << "Invalid text dictionary at line " << lineNum << ": " << _message; + message = buffer.str(); + } +}; + +class OPENCC_EXPORT InvalidUTF8 : public Exception { +public: + InvalidUTF8(const std::string& _message) + : Exception("Invalid UTF8: " + _message) {} +}; + +class OPENCC_EXPORT ShouldNotBeHere : public Exception { +public: + ShouldNotBeHere() : Exception("ShouldNotBeHere! 
This must be a bug.") {} +}; + +} // namespace opencc diff --git a/winlibs/include/opencc/Export.hpp b/winlibs/include/opencc/Export.hpp new file mode 100644 index 00000000..e511b8a4 --- /dev/null +++ b/winlibs/include/opencc/Export.hpp @@ -0,0 +1,40 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#if defined(Opencc_BUILT_AS_STATIC) || !defined(_WIN32) +#define OPENCC_EXPORT +#define OPENCC_NO_EXPORT +#else // if defined(Opencc_BUILT_AS_STATIC) || !defined(_WIN32) +#ifndef OPENCC_EXPORT +#ifdef libopencc_EXPORTS + +/* We are building this library */ +#define OPENCC_EXPORT __declspec(dllexport) +#else // ifdef libopencc_EXPORTS + +/* We are using this library */ +#define OPENCC_EXPORT __declspec(dllimport) +#endif // ifdef libopencc_EXPORTS +#endif // ifndef OPENCC_EXPORT + +#ifndef OPENCC_NO_EXPORT +#define OPENCC_NO_EXPORT +#endif // ifndef OPENCC_NO_EXPORT +#endif // if defined(Opencc_BUILT_AS_STATIC) || !defined(_WIN32) diff --git a/winlibs/include/opencc/Lexicon.hpp b/winlibs/include/opencc/Lexicon.hpp new file mode 100644 index 00000000..1bb6ab6a --- /dev/null +++ b/winlibs/include/opencc/Lexicon.hpp @@ -0,0 +1,56 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "Common.hpp" +#include "DictEntry.hpp" + +namespace opencc { +/** +* Storage of all entries +* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT Lexicon { +public: + Lexicon() {} + + ~Lexicon() { + for (DictEntry* entry : entries) { + delete entry; + } + } + + void Add(DictEntry* entry) { entries.push_back(entry); } + + void Sort() { + std::sort(entries.begin(), entries.end(), DictEntry::PtrLessThan); + } + + const DictEntry* At(size_t index) const { return entries.at(index); } + + size_t Length() const { return entries.size(); } + + vector::const_iterator begin() const { return entries.begin(); } + + vector::const_iterator end() const { return entries.end(); } + +private: + vector entries; +}; +} diff --git a/winlibs/include/opencc/MaxMatchSegmentation.hpp b/winlibs/include/opencc/MaxMatchSegmentation.hpp new file mode 100644 index 00000000..ace021e6 --- /dev/null +++ b/winlibs/include/opencc/MaxMatchSegmentation.hpp @@ -0,0 +1,43 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "Common.hpp" +#include "DictGroup.hpp" +#include "Segmentation.hpp" + +namespace opencc { +/** +* Implementation of maximal match segmentation +* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT MaxMatchSegmentation : public Segmentation { +public: + MaxMatchSegmentation(const DictPtr _dict) : dict(_dict) {} + + virtual ~MaxMatchSegmentation() {} + + virtual SegmentsPtr Segment(const string& text) const; + + const DictPtr GetDict() const { return dict; } + +private: + const DictPtr dict; +}; +} diff --git a/winlibs/include/opencc/Optional.hpp b/winlibs/include/opencc/Optional.hpp new file mode 100644 index 00000000..9279101b --- /dev/null +++ b/winlibs/include/opencc/Optional.hpp @@ -0,0 +1,76 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace opencc { +/** +* A class that wraps type T into a nullable type. +* @ingroup opencc_cpp_api +*/ +template class Optional { +public: + /** + * The constructor of Optional. + */ + Optional(T actual) : isNull(false), data(actual) {} + + /** + * Returns true if the instance is null. + */ + bool IsNull() const { return isNull; } + + /** + * Returns the containing data of the instance. + */ + const T& Get() const { return data; } + + /** + * Constructs a null instance. 
+ */ + static Optional Null() { return Optional(); } + +private: + Optional() : isNull(true) {} + + bool isNull; + T data; +}; + +/** +* Specialization of Optional for pointers. +* +* Reduce a bool. +*/ +template class Optional { +private: + Optional() : data(nullptr) {} + + typedef T* TPtr; + TPtr data; + +public: + Optional(TPtr actual) : data(actual) {} + + bool IsNull() const { return data == nullptr; } + + const TPtr& Get() const { return data; } + + static Optional Null() { return Optional(); } +}; +} \ No newline at end of file diff --git a/winlibs/include/opencc/Segmentation.hpp b/winlibs/include/opencc/Segmentation.hpp new file mode 100644 index 00000000..84e7f216 --- /dev/null +++ b/winlibs/include/opencc/Segmentation.hpp @@ -0,0 +1,32 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "Common.hpp" + +namespace opencc { +/** +* Abstract segmentation +* +* Interface with a single pure virtual method, Segment(). +* NOTE(review): no virtual destructor is declared here; confirm that objects +* are never deleted through a Segmentation pointer. Declaring one would +* presumably change the virtual table of this exported class, so check it +* against the prebuilt library before changing the header. +* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT Segmentation { +public: + /** + * Segments the given text. Pure virtual; implemented by derived classes. + */ + virtual SegmentsPtr Segment(const string& text) const = 0; +}; +} diff --git a/winlibs/include/opencc/Segments.hpp b/winlibs/include/opencc/Segments.hpp new file mode 100644 index 00000000..6225e1a4 --- /dev/null +++ b/winlibs/include/opencc/Segments.hpp @@ -0,0 +1,112 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "Common.hpp" + +namespace opencc { +/** +* Segmented text +* +* Keeps an ordered list of segments. A segment is either a copy owned by +* this object ("managed") or a borrowed C string ("unmanaged"). +* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT Segments { +public: + Segments() {} + + /** + * Builds the segments from C strings; every item is copied. + */ + Segments(std::initializer_list<const char*> initList) { + for (const string& item : initList) { + AddSegment(item); + } + } + + /** + * Builds the segments from strings; every item is copied. + */ + Segments(std::initializer_list<string> initList) { + for (const string& item : initList) { + AddSegment(item); + } + } + + /** + * Appends a segment without copying it. Only the pointer is stored, so the + * pointed-to string has to stay valid for as long as it is read through + * this object. + */ + void AddSegment(const char* unmanagedString) { + indexes.push_back(std::make_pair(unmanaged.size(), false)); + unmanaged.push_back(unmanagedString); + } + + /** + * Appends a segment by storing a copy of str. + */ + void AddSegment(const string& str) { + indexes.push_back(std::make_pair(managed.size(), true)); + managed.push_back(str); + } + + /** + * Read-only iterator over the segments in insertion order. + */ + class iterator : public std::iterator<std::input_iterator_tag, const char*> { + public: + iterator(const Segments* const _segments, size_t _cursor) + : segments(_segments), cursor(_cursor) {} + + iterator& operator++() { + cursor++; + return *this; + } + + bool operator==(const iterator& that) const { + return cursor == that.cursor && segments == that.segments; + } + + bool operator!=(const iterator& that) const { + return !this->operator==(that); + } + + const char* operator*() const { return segments->At(cursor); } + + private: + const Segments* const segments; + size_t cursor; + }; + + /** + * Returns the segment at the given position as a C string. + * The position is not range-checked. + */ + const char* At(size_t cursor) const { + const auto& index = indexes[cursor]; + if (index.second) { + return managed[index.first].c_str(); + } else { + return unmanaged[index.first]; + } + } + + /** + * Returns the number of segments. + */ + size_t Length() const { return indexes.size(); } + + iterator begin() const { return iterator(this, 0); } + + iterator end() const { return iterator(this, indexes.size()); } + + /** + * Concatenates all segments into one string. + */ + string ToString() const { + // TODO implement a nested structure to reduce concatenation, + // like a purely functional differential list + std::ostringstream buffer; + for (const char* segment : *this) { + buffer << segment; + } + return buffer.str(); + } + +private: + /** + * Private, so code outside the class cannot copy a Segments. + */ + Segments(const Segments&) {} + + vector<const char*> unmanaged; + vector<string> managed; + // index, managed + vector<std::pair<size_t, bool>> 
indexes; +}; +} diff --git a/winlibs/include/opencc/SerializableDict.hpp b/winlibs/include/opencc/SerializableDict.hpp new file mode 100644 index 00000000..f0d52b0f --- /dev/null +++ b/winlibs/include/opencc/SerializableDict.hpp @@ -0,0 +1,69 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "Dict.hpp" + +namespace opencc { +/** +* Serializable dictionary interface +* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT SerializableDict { +public: + /** + * Serializes the dictionary and writes it to an already opened file. + */ + virtual void SerializeToFile(FILE* fp) const = 0; + + /** + * Serializes the dictionary and writes it to the file with the given name. 
+ */ + virtual void SerializeToFile(const string& fileName) const { + FILE* fp = fopen(fileName.c_str(), "wb"); + if (fp == NULL) { + throw FileNotWritable(fileName); + } + /* Close the file even when serialization throws. */ + try { + SerializeToFile(fp); + } catch (...) { + fclose(fp); + throw; + } + fclose(fp); + } + + /** + * Tries to load a dictionary of type DICT from a file. + * @param fileName Name of the file; it is opened in binary read mode. + * @param dict Receives the loaded dictionary on success. + * @return false if the file cannot be opened, true otherwise. + */ + template <typename DICT> + static bool TryLoadFromFile(const string& fileName, + std::shared_ptr<DICT>* dict) { + FILE* fp = fopen(fileName.c_str(), "rb"); + if (fp == NULL) { + return false; + } + std::shared_ptr<DICT> loadedDict; + /* Close the file even when loading throws. */ + try { + loadedDict = DICT::NewFromFile(fp); + } catch (...) { + fclose(fp); + throw; + } + fclose(fp); + *dict = loadedDict; + return true; + } + + /** + * Loads a dictionary of type DICT from a file. + * Throws FileNotFound if the file cannot be opened. + */ + template <typename DICT> + static std::shared_ptr<DICT> NewFromFile(const string& fileName) { + std::shared_ptr<DICT> dict; + if (!TryLoadFromFile<DICT>(fileName, &dict)) { + throw FileNotFound(fileName); + } + return dict; + } +}; +} diff --git a/winlibs/include/opencc/SimpleConverter.hpp b/winlibs/include/opencc/SimpleConverter.hpp new file mode 100644 index 00000000..0dabefbb --- /dev/null +++ b/winlibs/include/opencc/SimpleConverter.hpp @@ -0,0 +1,88 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __OPENCC_SIMPLECONVERTER_HPP_ +#define __OPENCC_SIMPLECONVERTER_HPP_ + +/** +* @defgroup opencc_simple_api OpenCC C++ Simple API +* +* Simple API in C++ language +*/ + +namespace opencc { +/** +* A high level converter +* This interface does not require C++11 to compile. 
+* @ingroup opencc_simple_api +*/ +class OPENCC_EXPORT SimpleConverter { +public: + /** + * Constructor of SimpleConverter + * @param configFileName File name of configuration. + */ + SimpleConverter(const std::string& configFileName); + + ~SimpleConverter(); + + /** + * Converts a text + * @param input Text to be converted. + */ + std::string Convert(const std::string& input) const; + + /** + * Converts a text + * @param input A C-Style string (terminated by '\0') to be converted. + */ + std::string Convert(const char* input) const; + + /** + * Converts a text + * @param input A C-Style string limited by a given length to be converted. + * @param length Maximal length in bytes of the input string. + */ + std::string Convert(const char* input, size_t length) const; + + /** + * Converts a text and writes to an allocated buffer + * Please make sure the buffer has sufficient space. + * NOTE(review): the required buffer size is not stated in this header; + * confirm it with the library documentation. + * @param input A C-Style string (terminated by '\0') to be converted. + * @param output Buffer to write the converted text. + * @return Length of converted text. + */ + size_t Convert(const char* input, char* output) const; + + /** + * Converts a text and writes to an allocated buffer + * Please make sure the buffer has sufficient space. + * NOTE(review): the required buffer size is not stated in this header; + * confirm it with the library documentation. + * @param input A C-Style string limited by a given length to be converted. + * @param length Maximal length in bytes of the input string. + * @param output Buffer to write the converted text. + * @return Length of converted text. 
+ */ + size_t Convert(const char* input, size_t length, char* output) const; + +private: + const void* internalData; +}; + +} // namespace opencc + +#endif diff --git a/winlibs/include/opencc/TextDict.hpp b/winlibs/include/opencc/TextDict.hpp new file mode 100644 index 00000000..5000b27c --- /dev/null +++ b/winlibs/include/opencc/TextDict.hpp @@ -0,0 +1,60 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "Common.hpp" +#include "SerializableDict.hpp" + +namespace opencc { +/** +* Text dictionary +* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT TextDict : public Dict, public SerializableDict { +public: + /** + * Constructor of TextDict. + * _lexicon must be sorted. + */ + TextDict(const LexiconPtr& _lexicon); + + virtual ~TextDict(); + + virtual size_t KeyMaxLength() const; + + /** + * Looks up a word in the dictionary. + * NOTE(review): the template argument of the return type was lost in this + * copy of the header and has been restored as const DictEntry*; confirm + * that it matches the declaration in Dict.hpp. + */ + virtual Optional<const DictEntry*> Match(const char* word) const; + + virtual LexiconPtr GetLexicon() const; + + virtual void SerializeToFile(FILE* fp) const; + + /** + * Constructs a TextDict from another dictionary. 
+ */ + static TextDictPtr NewFromDict(const Dict& dict); + + static TextDictPtr NewFromFile(FILE* fp); + + static TextDictPtr NewFromSortedFile(FILE* fp); + +private: + const size_t maxLength; + const LexiconPtr lexicon; +}; +} diff --git a/winlibs/include/opencc/UTF8Util.hpp b/winlibs/include/opencc/UTF8Util.hpp new file mode 100644 index 00000000..a662290d --- /dev/null +++ b/winlibs/include/opencc/UTF8Util.hpp @@ -0,0 +1,245 @@ +/* + * Open Chinese Convert + * + * Copyright 2013 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "Common.hpp" + +namespace opencc { +/** +* UTF8 string utilities +* @ingroup opencc_cpp_api +*/ +class OPENCC_EXPORT UTF8Util { +public: + /** + * Detect UTF8 BOM and skip it. + */ + static void SkipUtf8Bom(FILE* fp); + + /** + * Returns the length in bytes of the UTF8 character that starts at str, + * judged from its first byte only. + * Returns 0 if the first byte cannot start a character. + */ + static size_t NextCharLengthNoException(const char* str) { + const char lead = *str; + /* The three-byte pattern is tested first, as in the original order. */ + if ((lead & 0xF0) == 0xE0) { + return 3; + } + if ((lead & 0x80) == 0x00) { + return 1; + } + if ((lead & 0xE0) == 0xC0) { + return 2; + } + if ((lead & 0xF8) == 0xF0) { + return 4; + } + if ((lead & 0xFC) == 0xF8) { + return 5; + } + if ((lead & 0xFE) == 0xFC) { + return 6; + } + return 0; + } + + /** + * Returns the length in bytes of the next UTF8 character. 
+ */ + static size_t NextCharLength(const char* str) { + size_t length = NextCharLengthNoException(str); + if (length == 0) { + throw InvalidUTF8(str); + } + return length; + } + + /** + * Returns the length in bytes of the UTF8 character that ends right before + * str. Throws InvalidUTF8 if no leading byte is found within 6 bytes. + * + * Candidates are probed from the nearest byte outwards. For well-formed + * UTF8 the search stops at the leading byte of the previous character, so + * no byte in front of that character is read (probing str - 3 first could + * read before the start of the buffer). + */ + static size_t PrevCharLength(const char* str) { + for (size_t i = 1; i <= 6; i++) { + if (NextCharLengthNoException(str - i) == i) { + return i; + } + } + throw InvalidUTF8(str); + } + + /** + * Returns the char* pointer over the next UTF8 character. + */ + static const char* NextChar(const char* str) { + return str + NextCharLength(str); + } + + /** + * Move the char* pointer before the previous UTF8 character. + */ + static const char* PrevChar(const char* str) { + return str - PrevCharLength(str); + } + + /** + * Returns the UTF8 length of a valid UTF8 string. + */ + static size_t Length(const char* str) { + size_t length = 0; + while (*str != '\0') { + str = NextChar(str); + length++; + } + return length; + } + + /** + * Finds a character in the same line. + * @param str The text to be searched in. + * @param ch The character to find. + * @return The pointer that points to the found character in str or EOL/EOF. + */ + static const char* FindNextInline(const char* str, const char ch) { + while (!IsLineEndingOrFileEnding(*str) && *str != ch) { + str = NextChar(str); + } + return str; + } + + /** + * Returns true if the character is a line ending or end of file. + */ + static bool IsLineEndingOrFileEnding(const char ch) { + return ch == '\0' || ch == '\n' || ch == '\r'; + } + + /** + * Copies a substring with given length to a new std::string. 
+ */ + static string FromSubstr(const char* str, size_t length) { + /* Zero-filled buffer of the requested size: strncpy stops at a '\0' in + str and the remaining bytes stay zero. The copy goes through + &newStr[0] because the pointer returned by c_str() must not be + written to. */ + string newStr(length, '\0'); + if (length > 0) { + strncpy(&newStr[0], str, length); + } + return newStr; + } + + /** + * Returns true if the given string is longer or as long as the given length. + */ + static bool NotShorterThan(const char* str, size_t byteLength) { + while (byteLength > 0) { + if (*str == '\0') { + return false; + } + byteLength--; + str++; + } + return true; + } + + /** + * Truncates a string with a maximal length in bytes. + * No UTF8 character will be broken. + */ + static string TruncateUTF8(const char* str, size_t maxByteLength) { + string wordTrunc; + if (NotShorterThan(str, maxByteLength)) { + size_t len = 0; + const char* pStr = str; + for (;;) { + const size_t charLength = NextCharLength(pStr); + if (len + charLength > maxByteLength) { + break; + } + pStr += charLength; + len += charLength; + } + wordTrunc = FromSubstr(str, len); + } else { + wordTrunc = str; + } + return wordTrunc; + } + + /** + * Replaces all patterns in a string in place. + * An empty pattern leaves the string unchanged. + */ + static void ReplaceAll(string& str, const char* from, const char* to) { + const string::size_type fromLen = strlen(from); + const string::size_type toLen = strlen(to); + if (fromLen == 0) { + /* An empty pattern matches at every position, so the loop below would + never terminate. */ + return; + } + string::size_type pos = 0; + while ((pos = str.find(from, pos)) != string::npos) { + str.replace(pos, fromLen, to); + pos += toLen; + } + } + + /** + * Joins a string vector into a string with a separator. + */ + static string Join(const vector<string>& strings, const string& separator) { + std::ostringstream buffer; + bool first = true; + for (const auto& str : strings) { + if (!first) { + buffer << separator; + } + buffer << str; + first = false; + } + return buffer.str(); + } + + /** + * Joins a string vector into a string. 
+ */ + static string Join(const vector<string>& strings) { + std::ostringstream buffer; + for (const auto& str : strings) { + buffer << str; + } + return buffer.str(); + } + + /** + * Records the byte offset of each of the first utf8Length characters. + * @param str The UTF8 text. + * @param utf8Length Number of UTF8 characters to map. + * @param byteMap Receives the offsets: element i is the byte offset of the + * i-th character from str. It is grown to utf8Length + * elements if it is smaller and is never shrunk. + */ + static void GetByteMap(const char* str, const size_t utf8Length, + vector<size_t>* byteMap) { + if (byteMap->size() < utf8Length) { + byteMap->resize(utf8Length); + } + const char* pstr = str; + for (size_t i = 0; i < utf8Length; i++) { + (*byteMap)[i] = pstr - str; + pstr = NextChar(pstr); + } + } +}; +} diff --git a/winlibs/include/opencc/opencc.h b/winlibs/include/opencc/opencc.h new file mode 100644 index 00000000..f180ce25 --- /dev/null +++ b/winlibs/include/opencc/opencc.h @@ -0,0 +1,150 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2014 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __OPENCC_H_ +#define __OPENCC_H_ + +#ifdef __cplusplus + +#include +#include "Export.hpp" +#include "SimpleConverter.hpp" + +extern "C" { +#else +#include +#endif + +#ifndef OPENCC_EXPORT +#define OPENCC_EXPORT +#endif + +/** +* @defgroup opencc_c_api OpenCC C API +* +* API in C language +*/ + +/** +* Filename of default Simplified to Traditional configuration +* +* @ingroup opencc_c_api +*/ +#define OPENCC_DEFAULT_CONFIG_SIMP_TO_TRAD "s2t.json" + +/** +* Filename of default Traditional to Simplified configuration +* +* @ingroup opencc_c_api +*/ +#define OPENCC_DEFAULT_CONFIG_TRAD_TO_SIMP "t2s.json" + +/** +* Type of opencc descriptor +* +* @ingroup opencc_c_api +*/ +typedef void* opencc_t; + +/** +* Makes an instance of opencc +* +* @param configFileName Location of configuration file. If this is set to NULL, +* OPENCC_DEFAULT_CONFIG_SIMP_TO_TRAD will be loaded. +* @return A description pointer of the newly allocated instance of +* opencc. On error the return value will be (opencc_t) -1. +* @ingroup opencc_c_api +*/ +OPENCC_EXPORT opencc_t opencc_open(const char* configFileName); + +/** +* Destroys an instance of opencc +* +* @param opencc The description pointer. +* @return 0 on success or non-zero number on failure. +* @ingroup opencc_c_api +*/ +OPENCC_EXPORT int opencc_close(opencc_t opencc); + +/** +* Converts UTF-8 string +* +* @param opencc The opencc description pointer. +* @param input The UTF-8 encoded string. +* @param length The maximum length in byte to convert. If length is (size_t)-1, +* the whole string (terminated by '\0') will be converted. +* @param output The buffer to store converted text. You MUST make sure this +* buffer has sufficient space. +* +* @return The length of converted string or (size_t)-1 on error. 
+* +* @ingroup opencc_c_api +*/ +OPENCC_EXPORT size_t opencc_convert_utf8_to_buffer(opencc_t opencc, + const char* input, + size_t length, + char* output); + +/** +* Converts UTF-8 string +* This function returns an allocated C-Style string, which stores +* the converted string. +* You MUST call opencc_convert_utf8_free() to release allocated memory. +* +* @param opencc The opencc description pointer. +* @param input The UTF-8 encoded string. +* @param length The maximum length in bytes to convert. If length is (size_t)-1, +* the whole string (terminated by '\0') will be converted. +* +* @return The newly allocated UTF-8 string that stores the converted text, +* or NULL on error. +* @ingroup opencc_c_api +*/ +OPENCC_EXPORT char* opencc_convert_utf8(opencc_t opencc, + const char* input, + size_t length); + +/** +* Releases the buffer allocated by opencc_convert_utf8 +* +* @param str Pointer to the string buffer allocated by opencc_convert_utf8. +* +* @ingroup opencc_c_api +*/ +OPENCC_EXPORT void opencc_convert_utf8_free(char* str); + +/** +* Returns the last error message +* +* Note that this function is the only one which is NOT thread-safe. +* +* @return The last error message. +* @ingroup opencc_c_api +*/ +OPENCC_EXPORT const char* opencc_error(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +/** +* @defgroup opencc_cpp_api OpenCC C++ Comprehensive API +* +* Comprehensive API in C++ language +*/ + +#endif diff --git a/winlibs/lib/libopencc.dll.a b/winlibs/lib/libopencc.dll.a new file mode 100644 index 00000000..5c4e7ca0 Binary files /dev/null and b/winlibs/lib/libopencc.dll.a differ diff --git a/winlibs/lib/opencc.dll b/winlibs/lib/opencc.dll new file mode 100644 index 00000000..551882a9 Binary files /dev/null and b/winlibs/lib/opencc.dll differ