Add OpenCC libraries and data for Windows

This commit is contained in:
Abs62 2015-10-26 21:20:12 +03:00
parent b5ac249062
commit 9f32149a30
34 changed files with 1856 additions and 0 deletions

BIN
opencc/HKVariants.ocd Normal file

Binary file not shown.

Binary file not shown.

BIN
opencc/STCharacters.ocd Normal file

Binary file not shown.

BIN
opencc/STPhrases.ocd Normal file

Binary file not shown.

BIN
opencc/TSCharacters.ocd Normal file

Binary file not shown.

BIN
opencc/TSPhrases.ocd Normal file

Binary file not shown.

BIN
opencc/TWVariants.ocd Normal file

Binary file not shown.

33
opencc/s2hk.json Normal file
View file

@ -0,0 +1,33 @@
{
"name": "Simplified Chinese to Traditional Chinese (Hong Kong standard)",
"segmentation": {
"type": "mmseg",
"dict": {
"type": "ocd",
"file": "STPhrases.ocd"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "ocd",
"file": "STPhrases.ocd"
}, {
"type": "ocd",
"file": "STCharacters.ocd"
}]
}
}, {
"dict": {
"type": "group",
"dicts": [{
"type": "ocd",
"file": "HKVariantsPhrases.ocd"
}, {
"type": "ocd",
"file": "HKVariants.ocd"
}]
}
}]
}

27
opencc/s2tw.json Normal file
View file

@ -0,0 +1,27 @@
{
"name": "Simplified Chinese to Traditional Chinese (Taiwan standard)",
"segmentation": {
"type": "mmseg",
"dict": {
"type": "ocd",
"file": "STPhrases.ocd"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "ocd",
"file": "STPhrases.ocd"
}, {
"type": "ocd",
"file": "STCharacters.ocd"
}]
}
}, {
"dict": {
"type": "ocd",
"file": "TWVariants.ocd"
}
}]
}

22
opencc/t2s.json Normal file
View file

@ -0,0 +1,22 @@
{
"name": "Traditional Chinese to Simplified Chinese",
"segmentation": {
"type": "mmseg",
"dict": {
"type": "ocd",
"file": "TSPhrases.ocd"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "ocd",
"file": "TSPhrases.ocd"
}, {
"type": "ocd",
"file": "TSCharacters.ocd"
}]
}
}]
}

View file

@ -0,0 +1,53 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Common.hpp"
#include "SerializableDict.hpp"
namespace opencc {
/**
* Binary dictionary for faster deserialization
*
* A SerializableDict built around a shared Lexicon. It can be written to and
* created from an already opened C stream.
* @ingroup opencc_cpp_api
*/
class OPENCC_EXPORT BinaryDict : public SerializableDict {
public:
// Wraps an existing lexicon; only the shared pointer is copied here.
BinaryDict(const LexiconPtr& _lexicon) : lexicon(_lexicon) {}
virtual ~BinaryDict() {}
// Implements the pure virtual SerializableDict::SerializeToFile(FILE*).
// The body is defined out of line.
// NOTE(review): declaring this overload hides the inherited
// SerializeToFile(const string&) for calls made through a BinaryDict.
virtual void SerializeToFile(FILE* fp) const;
// Factory taking an open stream; this is the entry point that
// SerializableDict::TryLoadFromFile<DICT>() invokes as DICT::NewFromFile(fp).
static BinaryDictPtr NewFromFile(FILE* fp);
// Returns the lexicon passed to the constructor (by reference, no copy).
const LexiconPtr& GetLexicon() const { return lexicon; }
// Defined out of line; presumably the length of the longest key in the
// lexicon -- TODO confirm against the implementation.
size_t KeyMaxLength() const;
private:
LexiconPtr lexicon;
string keyBuffer;
string valueBuffer;
// Defined out of line. All parameters are non-const references, so they look
// like outputs filled in by this call -- TODO confirm. Note that the
// parameters keyBuffer and valueBuffer shadow the members of the same name.
void ConstructBuffer(string& keyBuffer, vector<size_t>& keyOffset,
size_t& keyTotalLength, string& valueBuffer,
vector<size_t>& valueOffset,
size_t& valueTotalLength) const;
};
}

View file

@ -0,0 +1,92 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
// Microsoft Visual C++ specific
#if defined(_MSC_VER) && (_MSC_VER >= 1020)
#pragma warning(disable : 4251 4266 4350 4503 4512 4514 4710 4820)
#endif
#include <algorithm>
#include <fstream>
#include <functional>
#include <iostream>
#include <list>
#include <map>
#include <memory>
#include <sstream>
#include <string>
#include <vector>
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <ctime>
#include "Exception.hpp"
#include "Export.hpp"
#include "Optional.hpp"
using std::list;
using std::string;
using std::vector;
// Forward declarations and aliases
namespace opencc {

// Forward declarations of the library's classes, so that headers can refer to
// them (and to the aliases below) without including each other.
class BinaryDict;
class Config;
class Conversion;
class ConversionChain;
class Converter;
class DartsDict;
class Dict;
class DictEntry;
class DictGroup;
class Lexicon;
class MultiValueDictEntry;
class NoValueDictEntry;
class Segmentation;
class Segments;
class SerializableDict;
class SingleValueDictEntry;
class TextDict;

// Shared-ownership handles used throughout the public interface.
using BinaryDictPtr = std::shared_ptr<BinaryDict>;
using ConversionPtr = std::shared_ptr<Conversion>;
using ConversionChainPtr = std::shared_ptr<ConversionChain>;
using ConverterPtr = std::shared_ptr<Converter>;
using DartsDictPtr = std::shared_ptr<DartsDict>;
using DictPtr = std::shared_ptr<Dict>;
using DictGroupPtr = std::shared_ptr<DictGroup>;
using LexiconPtr = std::shared_ptr<Lexicon>;
using SegmentationPtr = std::shared_ptr<Segmentation>;
using SegmentsPtr = std::shared_ptr<Segments>;
using SerializableDictPtr = std::shared_ptr<SerializableDict>;
using TextDictPtr = std::shared_ptr<TextDict>;

} // namespace opencc
// Directory prefix for the packaged data files. When the build defines
// PKGDATADIR the prefix is that path followed by '/'; otherwise it is empty.
#ifdef PKGDATADIR
const string PACKAGE_DATA_DIRECTORY = PKGDATADIR "/";
#else  // PKGDATADIR is not defined
const string PACKAGE_DATA_DIRECTORY = "";
#endif // ifdef PKGDATADIR

// Fallback version string, used only when the build does not provide VERSION.
#if !defined(VERSION)
#define VERSION "1.0.*"
#endif // if !defined(VERSION)

View file

@ -0,0 +1,41 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Common.hpp"
namespace opencc {
/**
* Configuration loader
*
* Creates Converter instances from a JSON configuration given either as text
* or as a file name. All member functions are defined out of line.
* @ingroup opencc_cpp_api
*/
class OPENCC_EXPORT Config {
public:
Config();
virtual ~Config();
/**
* Creates a converter from JSON text.
* @param json the configuration document
* @param configDirectory presumably the directory against which dictionary
* files named in the configuration are resolved -- TODO confirm against
* the implementation
* @return shared pointer to the created Converter
*/
ConverterPtr NewFromString(const string& json, const string& configDirectory);
/**
* Creates a converter from the configuration file called fileName.
* @return shared pointer to the created Converter
*/
ConverterPtr NewFromFile(const string& fileName);
private:
// Opaque implementation state; its concrete type is known only to the
// implementation file.
// NOTE(review): the class is implicitly copyable while holding this raw
// pointer. If the destructor releases it, copying a Config would release it
// twice -- confirm against the implementation.
void* internal;
};
}

View file

@ -0,0 +1,47 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Common.hpp"
#include "Segmentation.hpp"
namespace opencc {
/**
* Conversion interface
*
* Holds one dictionary and exposes conversion of phrases and of segmented
* text. The Convert overloads are defined out of line.
* @ingroup opencc_cpp_api
*/
class OPENCC_EXPORT Conversion {
public:
// Stores a shared handle to the dictionary used by this conversion.
Conversion(DictPtr _dict) : dict(_dict) {}
// Converts a single phrase given as a string and returns the result.
string Convert(const string& phrase) const;
// Converts a single phrase given as a C string and returns the result.
string Convert(const char* phrase) const;
// Converts segmented text and returns the result as segments.
SegmentsPtr Convert(const SegmentsPtr& input) const;
// Returns a copy of the shared dictionary handle.
const DictPtr GetDict() const { return dict; }
private:
// Set once in the constructor and never re-seated (const member).
const DictPtr dict;
};
}

View file

@ -0,0 +1,41 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Common.hpp"
#include "Conversion.hpp"
namespace opencc {
/**
* Chain of conversions
* Consists of a list of conversions. Converts input in sequence.
* @ingroup opencc_cpp_api
*/
class OPENCC_EXPORT ConversionChain {
public:
// Takes the list of conversions by value; defined out of line.
ConversionChain(const list<ConversionPtr> _conversions);
// Converts segmented text and returns the resulting segments; defined out of
// line.
SegmentsPtr Convert(const SegmentsPtr& input) const;
// Returns a copy of the list of conversions (the list is returned by value).
const list<ConversionPtr> GetConversions() const { return conversions; }
private:
// Fixed at construction (const member).
const list<ConversionPtr> conversions;
};
}

View file

@ -0,0 +1,51 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Common.hpp"
#include "Segmentation.hpp"
namespace opencc {
/**
* Controller of segmentation and conversion
*
* Bundles a name, a segmentation and a conversion chain. The Convert
* overloads are defined out of line.
* @ingroup opencc_cpp_api
*/
class OPENCC_EXPORT Converter {
public:
// Stores the name and shared handles to the segmentation and the chain.
Converter(const string& _name, SegmentationPtr _segmentation,
ConversionChainPtr _conversionChain)
: name(_name), segmentation(_segmentation),
conversionChain(_conversionChain) {}
// Converts text and returns the result as a new string.
string Convert(const string& text) const;
// C-string variant writing into a caller-provided buffer.
// NOTE(review): no capacity is passed for output, so the caller must size the
// buffer; the meaning of the returned size_t (presumably the number of bytes
// written) is not visible here -- confirm against the implementation.
size_t Convert(const char* input, char* output) const;
// Returns a copy of the shared segmentation handle.
const SegmentationPtr GetSegmentation() const { return segmentation; }
// Returns a copy of the shared conversion chain handle.
const ConversionChainPtr GetConversionChain() const {
return conversionChain;
}
private:
// All members are const and therefore fixed at construction.
const string name;
const SegmentationPtr segmentation;
const ConversionChainPtr conversionChain;
};
}

View file

@ -0,0 +1,59 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Common.hpp"
#include "SerializableDict.hpp"
namespace opencc {
/**
* Darts dictionary
*
* A Dict that is also a SerializableDict. Instances are created only through
* the static factories because the default constructor is private.
* @ingroup opencc_cpp_api
*/
class OPENCC_EXPORT DartsDict : public Dict, public SerializableDict {
public:
virtual ~DartsDict();
// Dict interface; all defined out of line.
virtual size_t KeyMaxLength() const;
virtual Optional<const DictEntry*> Match(const char* word) const;
virtual Optional<const DictEntry*> MatchPrefix(const char* word) const;
virtual LexiconPtr GetLexicon() const;
// SerializableDict interface; defined out of line.
virtual void SerializeToFile(FILE* fp) const;
/**
* Constructs a DartsDict from another dictionary.
*/
static DartsDictPtr NewFromDict(const Dict& thatDict);
// Factory taking an open stream; this is the entry point that
// SerializableDict::TryLoadFromFile<DICT>() invokes as DICT::NewFromFile(fp).
static DartsDictPtr NewFromFile(FILE* fp);
private:
// Private: use NewFromDict() or NewFromFile().
DartsDict();
size_t maxLength;
LexiconPtr lexicon;
// Implementation detail type, declared here and defined elsewhere.
class DartsInternal;
// Raw pointer to the implementation object.
// NOTE(review): presumably released in ~DartsDict() -- confirm against the
// implementation.
DartsInternal* internal;
};
}

View file

@ -0,0 +1,81 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Common.hpp"
#include "DictEntry.hpp"
namespace opencc {
/**
* Abstract class of dictionary
*
* Match(), KeyMaxLength() and GetLexicon() are pure virtual. MatchPrefix() and
* MatchAllPrefixes() are virtual with a default implementation defined out of
* line. The std::string overloads simply forward to the C-string versions.
*
* NOTE(review): no virtual destructor is declared although the class is used
* polymorphically (DictPtr is std::shared_ptr<Dict>). Deleting a derived
* object through a raw Dict* would be undefined behaviour -- confirm that
* ownership always goes through a shared_ptr created from the concrete type.
* @ingroup opencc_cpp_api
*/
class OPENCC_EXPORT Dict {
public:
/**
* Matches a word exactly and returns the DictEntry or Optional::Null().
*/
virtual Optional<const DictEntry*> Match(const char* word) const = 0;
/**
* Matches a word exactly and returns the DictEntry or Optional::Null().
* Convenience overload forwarding word.c_str() to the C-string version.
*/
Optional<const DictEntry*> Match(const string& word) const {
return Match(word.c_str());
}
/**
* Matches the longest matched prefix of a word.
* For example given a dictionary having "a", "an", "b", "ba", "ban", "bana",
* the longest prefix of "banana" matched is "bana".
*/
virtual Optional<const DictEntry*> MatchPrefix(const char* word) const;
/**
* Matches the longest matched prefix of a word.
* Convenience overload forwarding word.c_str() to the C-string version.
*/
Optional<const DictEntry*> MatchPrefix(const string& word) const {
return MatchPrefix(word.c_str());
}
/**
* Returns all matched prefixes of a word, sorted by the length (desc).
* For example given a dictionary having "a", "an", "b", "ba", "ban", "bana",
* all the matched prefixes of "banana" are "bana", "ban", "ba", "b".
*/
virtual vector<const DictEntry*> MatchAllPrefixes(const char* word) const;
/**
* Returns all matched prefixes of a word, sorted by the length (desc).
* Convenience overload forwarding word.c_str() to the C-string version.
*/
vector<const DictEntry*> MatchAllPrefixes(const string& word) const {
return MatchAllPrefixes(word.c_str());
}
/**
* Returns the length of the longest key in the dictionary.
*/
virtual size_t KeyMaxLength() const = 0;
/**
* Returns all entries in the dictionary.
*/
virtual LexiconPtr GetLexicon() const = 0;
};
}

View file

@ -0,0 +1,197 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Common.hpp"
#include "UTF8Util.hpp"
#include "Segments.hpp"
namespace opencc {
/**
 * Key-values pair entry
 *
 * Abstract base of all dictionary entries. Keys and values are exposed as C
 * strings; entries are ordered and compared by key only.
 * @ingroup opencc_cpp_api
 */
class OPENCC_EXPORT DictEntry {
public:
  virtual ~DictEntry() {}

  // Accessors every concrete entry has to provide.
  virtual const char* Key() const = 0;
  virtual vector<const char*> Values() const = 0;
  virtual const char* GetDefault() const = 0;
  virtual size_t NumValues() const = 0;
  virtual string ToString() const = 0;

  // Length of the key in bytes, not counting the terminating NUL.
  size_t KeyLength() const {
    const char* const keyText = Key();
    return strlen(keyText);
  }

  // Byte-wise ordering of the keys; values do not take part.
  bool operator<(const DictEntry& that) const {
    const int order = strcmp(Key(), that.Key());
    return order < 0;
  }

  // Two entries are equal when their keys are equal; values do not take part.
  bool operator==(const DictEntry& that) const {
    const int order = strcmp(Key(), that.Key());
    return order == 0;
  }

  // Comparator for containers of entry pointers; both pointers must be valid.
  static bool PtrLessThan(const DictEntry* a, const DictEntry* b) {
    return a->operator<(*b);
  }
};
/**
 * Entry that consists of a key only. It owns a copy of the key, reports zero
 * values and uses the key itself as its default.
 */
class OPENCC_EXPORT NoValueDictEntry : public DictEntry {
public:
  NoValueDictEntry(const string& _key) : key(_key) {}
  virtual ~NoValueDictEntry() {}

  // The returned pointer refers to the string stored in this entry.
  virtual const char* Key() const { return key.c_str(); }

  // Always an empty list.
  virtual vector<const char*> Values() const {
    vector<const char*> none;
    return none;
  }

  // With no value available the key doubles as the default.
  virtual const char* GetDefault() const { return Key(); }

  virtual size_t NumValues() const { return 0; }

  // Text form is just the key.
  virtual string ToString() const { return key; }

private:
  string key;
};
/**
 * Abstract entry with exactly one value. Derived classes supply Key() and
 * Value(); everything else is expressed in terms of those two.
 */
class OPENCC_EXPORT SingleValueDictEntry : public DictEntry {
public:
  virtual const char* Value() const = 0;

  // One-element list holding Value().
  virtual vector<const char*> Values() const {
    return vector<const char*>(1, Value());
  }

  virtual const char* GetDefault() const { return Value(); }

  virtual size_t NumValues() const { return 1; }

  // Text form: key, a tab character, then the value.
  virtual string ToString() const {
    string line(Key());
    line += "\t";
    line += Value();
    return line;
  }
};
/**
 * Single-value entry that owns copies of its key and its value.
 */
class OPENCC_EXPORT StrSingleValueDictEntry : public SingleValueDictEntry {
public:
  // Copies both strings into the entry.
  StrSingleValueDictEntry(const string& _key, const string& _value)
      : key(_key), value(_value) {}

  virtual ~StrSingleValueDictEntry() {}

  // The returned pointers refer to the strings stored in this entry.
  virtual const char* Key() const {
    return key.c_str();
  }
  virtual const char* Value() const {
    return value.c_str();
  }

private:
  string key;
  string value;
};
/**
 * Abstract entry that may carry any number of values. Derived classes supply
 * Key(), Values() and NumValues().
 */
class OPENCC_EXPORT MultiValueDictEntry : public DictEntry {
public:
  // The first value is the default; an entry without values falls back to
  // its key.
  virtual const char* GetDefault() const {
    if (NumValues() == 0) {
      return Key();
    }
    // Values() returns a temporary list, but the pointer taken from it refers
    // to storage that is not owned by that list.
    return Values().at(0);
  }

  // Defined out of line.
  virtual string ToString() const;
};
/**
 * Multi-value entry that owns copies of its key and of all its values.
 */
class OPENCC_EXPORT StrMultiValueDictEntry : public MultiValueDictEntry {
public:
  // Copies the key and the given value strings.
  StrMultiValueDictEntry(const string& _key, const vector<string>& _values)
      : key(_key), values(_values) {}

  // Copies the key and builds one owned string per C string in _values.
  // Every pointer in _values must be a valid NUL-terminated string.
  StrMultiValueDictEntry(const string& _key, const vector<const char*>& _values)
      : key(_key), values(_values.begin(), _values.end()) {}

  virtual ~StrMultiValueDictEntry() {}

  // The returned pointer refers to the string stored in this entry.
  virtual const char* Key() const { return key.c_str(); }

  virtual size_t NumValues() const { return values.size(); }

  // Returns one pointer per stored value, in insertion order. The pointers
  // refer to the strings stored in this entry.
  virtual vector<const char*> Values() const {
    // A distinct local name (the original shadowed the member) and a single
    // up-front allocation instead of repeated growth.
    vector<const char*> pointers;
    pointers.reserve(values.size());
    for (const string& value : values) {
      pointers.push_back(value.c_str());
    }
    return pointers;
  }

private:
  string key;
  vector<string> values;
};
/**
 * Multi-value entry that stores the key and value pointers exactly as given.
 * No string data is copied, so the pointed-to strings must stay valid for as
 * long as the entry is used.
 */
class OPENCC_EXPORT PtrDictEntry : public MultiValueDictEntry {
public:
  // Keeps the key pointer and a copy of the list of value pointers.
  PtrDictEntry(const char* _key, const vector<const char*>& _values)
      : key(_key), values(_values) {}

  virtual ~PtrDictEntry() {}

  virtual const char* Key() const {
    return key;
  }

  virtual size_t NumValues() const {
    return values.size();
  }

  // Returns a copy of the stored pointer list.
  virtual vector<const char*> Values() const {
    return values;
  }

private:
  const char* key;
  vector<const char*> values;
};
/**
 * Creates heap-allocated entries that own their strings. Every New() returns
 * a raw pointer obtained with new; the caller becomes responsible for it.
 */
class OPENCC_EXPORT DictEntryFactory {
public:
  // Entry with a key and no value.
  static DictEntry* New(const string& key) { return new NoValueDictEntry(key); }

  // Entry with a key and exactly one value.
  static DictEntry* New(const string& key, const string& value) {
    return new StrSingleValueDictEntry(key, value);
  }

  // Entry with a key and a list of values.
  static DictEntry* New(const string& key, const vector<string>& values) {
    return new StrMultiValueDictEntry(key, values);
  }

  /**
   * Creates an owning copy of an arbitrary entry.
   *
   * The concrete type of the copy is chosen from the number of values only.
   * The source entry is read exclusively through the virtual DictEntry
   * interface: its dynamic type is unrelated to its value count (a
   * PtrDictEntry or StrMultiValueDictEntry may hold exactly one value), so
   * down-casting it based on NumValues() would be undefined behaviour.
   *
   * @param entry entry to copy; must not be null
   * @return newly allocated entry owned by the caller
   */
  static DictEntry* New(const DictEntry* entry) {
    const size_t numValues = entry->NumValues();
    if (numValues == 0) {
      return new NoValueDictEntry(entry->Key());
    }
    const vector<const char*> values = entry->Values();
    if (numValues == 1) {
      return new StrSingleValueDictEntry(entry->Key(), values.at(0));
    }
    return new StrMultiValueDictEntry(entry->Key(), values);
  }
};
}

View file

@ -0,0 +1,53 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Common.hpp"
#include "Dict.hpp"
namespace opencc {
/**
* Group of dictionaries
*
* A Dict that wraps a list of other dictionaries. All member functions except
* GetDicts() are defined out of line.
* @ingroup opencc_cpp_api
*/
class OPENCC_EXPORT DictGroup : public Dict {
public:
// Builds a group from the given list of dictionaries.
DictGroup(const list<DictPtr>& dicts);
// Builds a group from an existing dictionary; defined out of line.
static DictGroupPtr NewFromDict(const Dict& dict);
virtual ~DictGroup();
// Dict interface.
virtual size_t KeyMaxLength() const;
virtual Optional<const DictEntry*> Match(const char* word) const;
virtual Optional<const DictEntry*> MatchPrefix(const char* word) const;
virtual vector<const DictEntry*> MatchAllPrefixes(const char* word) const;
virtual LexiconPtr GetLexicon() const;
// Returns a copy of the list of member dictionaries (returned by value).
const list<DictPtr> GetDicts() const { return dicts; }
private:
// Members are initialized in declaration order, so keyMaxLength is
// initialized before dicts; its initializer must therefore be computed from
// the constructor argument, not from the dicts member.
const size_t keyMaxLength;
const list<DictPtr> dicts;
};
}

View file

@ -0,0 +1,88 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <sstream>
#include <stdexcept>
#include <string>
#include "Export.hpp"
#if defined(_MSC_VER) && (_MSC_VER < 1900)
// Visual Studio 2013 (12.0, _MSC_VER 1800) and earlier do not support the
// C++11 "noexcept" specifier, so it is compiled away for those compilers only.
// Visual Studio 2015 (_MSC_VER 1900) and later implement it; defining a
// keyword as a macro there would change the meaning of every later use of
// noexcept in the translation unit.
#define noexcept
#endif // if defined(_MSC_VER) && (_MSC_VER < 1900)
namespace opencc {
/**
 * Base class of all exceptions thrown by the library. Carries a message that
 * is returned by what().
 */
class OPENCC_EXPORT Exception : public std::exception {
public:
  // Exception with an empty message.
  Exception() {}
  // Exception carrying a copy of _message.
  Exception(const std::string& _message) : message(_message) {}
  virtual ~Exception() throw() {}

  // The returned pointer refers to the message stored in this object.
  virtual const char* what() const noexcept { return message.c_str(); }

protected:
  std::string message;
};

/** A file could not be opened for reading. */
class OPENCC_EXPORT FileNotFound : public Exception {
public:
  FileNotFound(const std::string& fileName)
      : Exception(std::string(fileName).append(" not found or not accessible.")) {}
};

/** A file could not be opened for writing. */
class OPENCC_EXPORT FileNotWritable : public Exception {
public:
  FileNotWritable(const std::string& fileName)
      : Exception(std::string(fileName).append(" not writable.")) {}
};

/** Input does not have the expected format. */
class OPENCC_EXPORT InvalidFormat : public Exception {
public:
  InvalidFormat(const std::string& message)
      : Exception(std::string("Invalid format: ").append(message)) {}
};

/**
 * A text dictionary contains an invalid line. The message replaces the one
 * built by InvalidFormat and names the offending line number.
 */
class OPENCC_EXPORT InvalidTextDictionary : public InvalidFormat {
public:
  InvalidTextDictionary(const std::string& _message, size_t lineNum)
      : InvalidFormat("") {
    std::ostringstream text;
    text << "Invalid text dictionary at line ";
    text << lineNum;
    text << ": ";
    text << _message;
    // Overwrites the "Invalid format: " text set by the base constructor.
    message = text.str();
  }
};

/** Input is not valid UTF-8. */
class OPENCC_EXPORT InvalidUTF8 : public Exception {
public:
  InvalidUTF8(const std::string& _message)
      : Exception(std::string("Invalid UTF8: ").append(_message)) {}
};

/** Signals that supposedly unreachable code was executed. */
class OPENCC_EXPORT ShouldNotBeHere : public Exception {
public:
  ShouldNotBeHere() : Exception("ShouldNotBeHere! This must be a bug.") {}
};
} // namespace opencc

View file

@ -0,0 +1,40 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
// OPENCC_EXPORT decorates the classes that make up the library's public
// interface. OPENCC_NO_EXPORT is defined empty in every branch of this header.
#if defined(Opencc_BUILT_AS_STATIC) || !defined(_WIN32)
// Static build or non-Windows target: no import/export decoration is applied.
#define OPENCC_EXPORT
#define OPENCC_NO_EXPORT
#else // if defined(Opencc_BUILT_AS_STATIC) || !defined(_WIN32)
// Windows DLL: a definition of OPENCC_EXPORT supplied from outside is kept.
#ifndef OPENCC_EXPORT
#ifdef libopencc_EXPORTS
/* We are building this library */
#define OPENCC_EXPORT __declspec(dllexport)
#else // ifdef libopencc_EXPORTS
/* We are using this library */
#define OPENCC_EXPORT __declspec(dllimport)
#endif // ifdef libopencc_EXPORTS
#endif // ifndef OPENCC_EXPORT
#ifndef OPENCC_NO_EXPORT
#define OPENCC_NO_EXPORT
#endif // ifndef OPENCC_NO_EXPORT
#endif // if defined(Opencc_BUILT_AS_STATIC) || !defined(_WIN32)

View file

@ -0,0 +1,56 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Common.hpp"
#include "DictEntry.hpp"
namespace opencc {
/**
* Storage of all entries
* @ingroup opencc_cpp_api
*/
class OPENCC_EXPORT Lexicon {
public:
Lexicon() {}
~Lexicon() {
for (DictEntry* entry : entries) {
delete entry;
}
}
void Add(DictEntry* entry) { entries.push_back(entry); }
void Sort() {
std::sort(entries.begin(), entries.end(), DictEntry::PtrLessThan);
}
const DictEntry* At(size_t index) const { return entries.at(index); }
size_t Length() const { return entries.size(); }
vector<DictEntry*>::const_iterator begin() const { return entries.begin(); }
vector<DictEntry*>::const_iterator end() const { return entries.end(); }
private:
vector<DictEntry*> entries;
};
}

View file

@ -0,0 +1,43 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Common.hpp"
#include "DictGroup.hpp"
#include "Segmentation.hpp"
namespace opencc {
/**
* Implementation of maximal match segmentation
*
* A Segmentation that works on a single dictionary handed over at
* construction. Segment() is defined out of line.
* @ingroup opencc_cpp_api
*/
class OPENCC_EXPORT MaxMatchSegmentation : public Segmentation {
public:
// Stores a shared handle to the dictionary used for matching.
MaxMatchSegmentation(const DictPtr _dict) : dict(_dict) {}
virtual ~MaxMatchSegmentation() {}
// Implements Segmentation::Segment(); returns the segments of text.
virtual SegmentsPtr Segment(const string& text) const;
// Returns a copy of the shared dictionary handle.
const DictPtr GetDict() const { return dict; }
private:
// Fixed at construction (const member).
const DictPtr dict;
};
}

View file

@ -0,0 +1,76 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
namespace opencc {
/**
 * A class that wraps type T into a nullable type.
 *
 * A value-holding instance is created (implicitly) from a T; a null instance
 * is obtained from Null().
 * @ingroup opencc_cpp_api
 */
template <typename T> class Optional {
public:
  /**
   * The constructor of Optional. Wraps a value; the instance is not null.
   */
  Optional(T actual) : isNull(false), data(actual) {}

  /**
   * Returns true if the instance is null.
   */
  bool IsNull() const { return isNull; }

  /**
   * Returns the containing data of the instance. For a null instance this is
   * a value-initialized T (zero for arithmetic types).
   */
  const T& Get() const { return data; }

  /**
   * Constructs a null instance.
   */
  static Optional<T> Null() { return Optional(); }

private:
  // data is value-initialized so that a null instance never carries an
  // indeterminate value, which would make copying it or calling Get()
  // undefined behaviour for scalar T.
  Optional() : isNull(true), data() {}

  bool isNull;
  T data;
};

/**
 * Specialization of Optional for pointers.
 *
 * Saves the separate bool: a null pointer represents the null state.
 */
template <typename T> class Optional<T*> {
private:
  Optional() : data(nullptr) {}

  typedef T* TPtr;
  TPtr data;

public:
  // Wraps a pointer; wrapping nullptr yields a null instance.
  Optional(TPtr actual) : data(actual) {}

  // True when the wrapped pointer is nullptr.
  bool IsNull() const { return data == nullptr; }

  // Returns the wrapped pointer (nullptr for a null instance).
  const TPtr& Get() const { return data; }

  // Constructs a null instance.
  static Optional<TPtr> Null() { return Optional(); }
};
}

View file

@ -0,0 +1,32 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Common.hpp"
namespace opencc {
/**
* Abstract segmentation
*
* Interface with a single operation that splits text into segments.
*
* NOTE(review): no virtual destructor is declared although the class is used
* polymorphically (SegmentationPtr is std::shared_ptr<Segmentation>). Deleting
* a derived object through a raw Segmentation* would be undefined behaviour --
* confirm that ownership always goes through a shared_ptr created from the
* concrete type.
* @ingroup opencc_cpp_api
*/
class OPENCC_EXPORT Segmentation {
public:
// Splits text and returns the resulting segments.
virtual SegmentsPtr Segment(const string& text) const = 0;
};
}

View file

@ -0,0 +1,112 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Common.hpp"
namespace opencc {
/**
 * Segmented text
 *
 * An ordered sequence of C strings. A segment is either "managed" (the text
 * is copied into this object) or "unmanaged" (only the pointer is stored and
 * the caller keeps the text alive).
 * @ingroup opencc_cpp_api
 */
class OPENCC_EXPORT Segments {
public:
  Segments() {}

  // Every literal is converted to a string first, so the segments created
  // here are managed copies.
  Segments(std::initializer_list<const char*> initList) {
    for (const string& item : initList) {
      AddSegment(item);
    }
  }

  // Adds a managed copy of every string in the list.
  Segments(std::initializer_list<string> initList) {
    for (const string& item : initList) {
      AddSegment(item);
    }
  }

  // Appends an unmanaged segment: only the pointer is stored, the text must
  // outlive this object.
  void AddSegment(const char* unmanagedString) {
    indexes.push_back(std::make_pair(unmanaged.size(), false));
    unmanaged.push_back(unmanagedString);
  }

  // Appends a managed segment: the string is copied.
  void AddSegment(const string& str) {
    indexes.push_back(std::make_pair(managed.size(), true));
    managed.push_back(str);
  }

  /**
   * Input iterator over the segments, yielding each one as a C string.
   *
   * The iterator traits are spelled out as member typedefs instead of being
   * inherited from std::iterator, which is deprecated since C++17. They are
   * the same types std::iterator<std::input_iterator_tag, const char*>
   * provided. The segments pointer is not const-qualified, so iterators are
   * copy-assignable as the iterator requirements demand.
   */
  class iterator {
  public:
    typedef std::input_iterator_tag iterator_category;
    typedef const char* value_type;
    typedef std::ptrdiff_t difference_type;
    typedef const char** pointer;
    typedef const char*& reference;

    iterator(const Segments* const _segments, size_t _cursor)
        : segments(_segments), cursor(_cursor) {}

    // Pre-increment: advance to the next segment.
    iterator& operator++() {
      ++cursor;
      return *this;
    }

    // Post-increment: advance and return the previous position.
    iterator operator++(int) {
      const iterator previous(*this);
      ++cursor;
      return previous;
    }

    // Iterators are equal when they point at the same position of the same
    // Segments object.
    bool operator==(const iterator& that) const {
      return cursor == that.cursor && segments == that.segments;
    }

    bool operator!=(const iterator& that) const {
      return !this->operator==(that);
    }

    // The segment at the current position; must not be used on end().
    const char* operator*() const { return segments->At(cursor); }

  private:
    const Segments* segments;
    size_t cursor;
  };

  // Returns the segment at position cursor. The position is not range
  // checked; it must be less than Length().
  const char* At(size_t cursor) const {
    const auto& index = indexes[cursor];
    if (index.second) {
      return managed[index.first].c_str();
    } else {
      return unmanaged[index.first];
    }
  }

  // Number of segments.
  size_t Length() const { return indexes.size(); }

  iterator begin() const { return iterator(this, 0); }

  iterator end() const { return iterator(this, indexes.size()); }

  // Concatenates all segments in order.
  string ToString() const {
    // TODO implement a nested structure to reduce concatenation,
    // like a purely functional differential list
    std::ostringstream buffer;
    for (const char* segment : *this) {
      buffer << segment;
    }
    return buffer.str();
  }

private:
  // Private so that Segments objects cannot be copy-constructed from outside.
  Segments(const Segments&) {}

  vector<const char*> unmanaged;
  vector<string> managed;
  // index, managed
  vector<std::pair<size_t, bool>> indexes;
};
}

View file

@ -0,0 +1,69 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Dict.hpp"
namespace opencc {
/**
 * Serializable dictionary interface
 * @ingroup opencc_cpp_api
 */
class OPENCC_EXPORT SerializableDict {
public:
  /**
   * Serializes the dictionary and writes it to an already opened file.
   * @param fp File to write to. This interface does not close it.
   */
  virtual void SerializeToFile(FILE* fp) const = 0;

  /**
   * Serializes the dictionary and writes it to the file named fileName.
   * The file is opened in "wb" mode and is closed again on every exit path,
   * including when the serialization throws.
   * @param fileName Path of the file to write.
   * @throws FileNotWritable if the file cannot be opened for writing.
   */
  virtual void SerializeToFile(const string& fileName) const {
    FILE* fp = fopen(fileName.c_str(), "wb");
    if (fp == nullptr) {
      throw FileNotWritable(fileName);
    }
    // Owns fp from here on so that an exception cannot leak the handle.
    const std::unique_ptr<FILE, int (*)(FILE*)> closer(fp, fclose);
    SerializeToFile(fp);
  }

  /**
   * Tries to load a dictionary of type DICT from the file named fileName.
   * The file is opened in "rb" mode and handed to DICT::NewFromFile(FILE*);
   * it is closed again even if that factory throws.
   * @param fileName Path of the file to read.
   * @param dict Receives the loaded dictionary on success; left untouched
   *             otherwise.
   * @return false if the file cannot be opened, true once loaded.
   */
  template <typename DICT>
  static bool TryLoadFromFile(const string& fileName,
                              std::shared_ptr<DICT>* dict) {
    FILE* fp = fopen(fileName.c_str(), "rb");
    if (fp == nullptr) {
      return false;
    }
    // Owns fp from here on so that an exception cannot leak the handle.
    const std::unique_ptr<FILE, int (*)(FILE*)> closer(fp, fclose);
    *dict = DICT::NewFromFile(fp);
    return true;
  }

  /**
   * Loads a dictionary of type DICT from the file named fileName.
   * @throws FileNotFound if the file cannot be opened.
   */
  template <typename DICT>
  static std::shared_ptr<DICT> NewFromFile(const string& fileName) {
    std::shared_ptr<DICT> dict;
    if (!TryLoadFromFile<DICT>(fileName, &dict)) {
      throw FileNotFound(fileName);
    }
    return dict;
  }
};
}

View file

@ -0,0 +1,88 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __OPENCC_SIMPLECONVERTER_HPP_
#define __OPENCC_SIMPLECONVERTER_HPP_
/**
* @defgroup opencc_simple_api OpenCC C++ Simple API
*
* Simple API in C++ language
*/
namespace opencc {
/**
* A high level converter
* This interface does not require C++11 to compile.
* @ingroup opencc_simple_api
*/
class OPENCC_EXPORT SimpleConverter {
public:
/**
* Constructor of SimpleConverter
* @param configFileName File name of configuration.
*/
SimpleConverter(const std::string& configFileName);
/**
* Destructor of SimpleConverter
*/
~SimpleConverter();
/**
* Converts a text
* @param input Text to be converted.
* @return The converted text.
*/
std::string Convert(const std::string& input) const;
/**
* Converts a text
* @param input A C-Style string (terminated by '\0') to be converted.
* @return The converted text.
*/
std::string Convert(const char* input) const;
/**
* Converts a text
* @param input A C-Style string limited by a given length to be converted.
* @param length Maximum length in bytes of the input string.
* @return The converted text.
*/
std::string Convert(const char* input, size_t length) const;
/**
* Converts a text and writes to an allocated buffer
* Please make sure the buffer has sufficient space.
* @param input A C-Style string (terminated by '\0') to be converted.
* @param output Buffer to write the converted text.
* @return Length of converted text.
*/
size_t Convert(const char* input, char* output) const;
/**
* Converts a text and writes to an allocated buffer
* Please make sure the buffer has sufficient space.
* @param input A C-Style string limited by a given length to be converted.
* @param length Maximum length in bytes of the input string.
* @param output Buffer to write the converted text.
* @return Length of converted text.
*/
size_t Convert(const char* input, size_t length, char* output) const;
private:
// Opaque pointer to the implementation data; its concrete type is not
// visible in this header.
const void* internalData;
};
} // namespace opencc
#endif

View file

@ -0,0 +1,60 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Common.hpp"
#include "SerializableDict.hpp"
namespace opencc {
/**
* Text dictionary
* Derives from both Dict and SerializableDict.
* @ingroup opencc_cpp_api
*/
class OPENCC_EXPORT TextDict : public Dict, public SerializableDict {
public:
/**
* Constructor of TextDict.
* _lexicon must be sorted.
*/
TextDict(const LexiconPtr& _lexicon);
virtual ~TextDict();
// NOTE(review): the next three virtuals presumably implement the Dict
// interface; Dict is not visible in this header -- confirm.
virtual size_t KeyMaxLength() const;
virtual Optional<const DictEntry*> Match(const char* word) const;
virtual LexiconPtr GetLexicon() const;
/**
* Serializes the dictionary to an already opened file. Has the same
* signature as the pure virtual SerializableDict::SerializeToFile(FILE*).
* Declaring it here hides the SerializeToFile(const string&) overload of
* SerializableDict for calls made through a TextDict.
*/
virtual void SerializeToFile(FILE* fp) const;
/**
* Constructs a TextDict from another dictionary.
*/
static TextDictPtr NewFromDict(const Dict& dict);
/**
* Constructs a TextDict from an opened file. This is the factory that
* SerializableDict::TryLoadFromFile<TextDict>() calls.
*/
static TextDictPtr NewFromFile(FILE* fp);
/**
* NOTE(review): presumably like NewFromFile() but for a file whose entries
* are already sorted -- confirm against the implementation.
*/
static TextDictPtr NewFromSortedFile(FILE* fp);
private:
// NOTE(review): presumably the value reported by KeyMaxLength() -- confirm.
const size_t maxLength;
// NOTE(review): presumably the sorted lexicon passed to the constructor and
// returned by GetLexicon() -- confirm.
const LexiconPtr lexicon;
};
}

View file

@ -0,0 +1,245 @@
/*
* Open Chinese Convert
*
* Copyright 2013 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Common.hpp"
namespace opencc {
/**
* UTF8 string utilities
* @ingroup opencc_cpp_api
*/
class OPENCC_EXPORT UTF8Util {
public:
/**
* Detects a UTF-8 byte order mark (BOM) in the given file and skips it.
* Only declared here; the definition is not part of this header.
* @param fp File to inspect.
*/
static void SkipUtf8Bom(FILE* fp);
/**
 * Returns the length in bytes of the UTF8 character starting at str,
 * judged from its first byte only.
 * Lead bytes of the historical 5 and 6 byte forms are recognised as well.
 * Returns 0 if the first byte cannot start a character (a continuation
 * byte, 0xFE or 0xFF).
 */
static size_t NextCharLengthNoException(const char* str) {
  const unsigned char lead = static_cast<unsigned char>(*str);
  // The three byte form is tested first, the remaining forms follow in the
  // same order as before.
  if ((lead >> 4) == 0x0E) {  // 1110xxxx
    return 3;
  }
  if (lead < 0x80) {  // 0xxxxxxx
    return 1;
  }
  if ((lead >> 5) == 0x06) {  // 110xxxxx
    return 2;
  }
  if ((lead >> 3) == 0x1E) {  // 11110xxx
    return 4;
  }
  if ((lead >> 2) == 0x3E) {  // 111110xx
    return 5;
  }
  if ((lead >> 1) == 0x7E) {  // 1111110x
    return 6;
  }
  return 0;
}
/**
 * Returns the length in bytes of the UTF8 character starting at str.
 * @throws InvalidUTF8 if the first byte cannot start a character.
 */
static size_t NextCharLength(const char* str) {
  const size_t charLength = NextCharLengthNoException(str);
  if (charLength != 0) {
    return charLength;
  }
  throw InvalidUTF8(str);
}
/**
 * Returns the length in bytes of the UTF8 character that ends right before
 * str.
 *
 * The caller must guarantee that at least one complete character precedes
 * str. The bytes before str are inspected one at a time, walking backwards
 * over continuation bytes (10xxxxxx) until a lead byte is found, so no byte
 * in front of the previous character is ever read. (Probing fixed offsets
 * such as str - 3 first would read before the start of the buffer whenever
 * fewer bytes precede str.)
 *
 * @throws InvalidUTF8 if the lead byte found does not announce exactly the
 *         number of bytes walked, or if no lead byte is found within 6
 *         bytes.
 */
static size_t PrevCharLength(const char* str) {
  for (size_t i = 1; i <= 6; i++) {
    const char* candidate = str - i;
    if (NextCharLengthNoException(candidate) == i) {
      return i;
    }
    // Only a continuation byte may sit between a lead byte and str; anything
    // else means the sequence is malformed, so stop walking back.
    const bool isContinuation = (*candidate & 0xC0) == 0x80;
    if (!isContinuation) {
      break;
    }
  }
  throw InvalidUTF8(str);
}
/**
 * Returns the pointer just past the UTF8 character starting at str.
 * @throws InvalidUTF8 if NextCharLength() rejects the byte at str.
 */
static const char* NextChar(const char* str) {
  const size_t step = NextCharLength(str);
  return str + step;
}
/**
 * Returns the pointer to the start of the UTF8 character that ends right
 * before str.
 * @throws InvalidUTF8 if PrevCharLength() cannot determine its length.
 */
static const char* PrevChar(const char* str) {
  const size_t step = PrevCharLength(str);
  return str - step;
}
/**
 * Returns the number of UTF8 characters in a valid, '\0' terminated UTF8
 * string.
 * @throws InvalidUTF8 if NextChar() rejects a byte on the way.
 */
static size_t Length(const char* str) {
  size_t count = 0;
  for (const char* cursor = str; *cursor != '\0'; cursor = NextChar(cursor)) {
    ++count;
  }
  return count;
}
/**
 * Finds a character in the same line.
 * The text is scanned one UTF8 character at a time; ch is compared against
 * the first byte of each character.
 * @param str The text to be searched in.
 * @param ch The character to find.
 * @return The pointer that points to the found character in str or EOL/EOF.
 * @throws InvalidUTF8 if NextChar() rejects a byte on the way.
 */
static const char* FindNextInline(const char* str, const char ch) {
  for (;;) {
    const char current = *str;
    if (current == ch || IsLineEndingOrFileEnding(current)) {
      return str;
    }
    str = NextChar(str);
  }
}
/**
 * Returns true if the character is a line ending ('\n' or '\r') or the end
 * of the text ('\0').
 */
static bool IsLineEndingOrFileEnding(const char ch) {
  switch (ch) {
    case '\0':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
/**
 * Copies a substring with given length to a new std::string.
 * The result always has exactly `length` bytes: copying stops at the first
 * '\0' found in str and the remaining bytes stay '\0' (strncpy semantics).
 * @param str Start of the bytes to copy.
 * @param length Number of bytes of the resulting string.
 */
static string FromSubstr(const char* str, size_t length) {
  string newStr(length, '\0');
  if (length > 0) {
    // Write through the string's own mutable storage; modifying the array
    // returned by c_str() is not allowed.
    strncpy(&newStr[0], str, length);
  }
  return newStr;
}
/**
 * Returns true if the given '\0' terminated string has at least byteLength
 * bytes before its terminator. Never reads past the terminator.
 */
static bool NotShorterThan(const char* str, size_t byteLength) {
  for (size_t offset = 0; offset < byteLength; ++offset) {
    if (str[offset] == '\0') {
      return false;
    }
  }
  return true;
}
/**
 * Truncates a string to at most maxByteLength bytes.
 * No UTF8 character will be broken: a character that would cross the limit
 * is dropped entirely. Strings shorter than the limit are copied unchanged.
 * @throws InvalidUTF8 if NextCharLength() rejects a byte while truncating.
 */
static string TruncateUTF8(const char* str, size_t maxByteLength) {
  if (!NotShorterThan(str, maxByteLength)) {
    // Already fits, nothing to cut.
    return string(str);
  }
  size_t keptBytes = 0;
  while (true) {
    const size_t step = NextCharLength(str + keptBytes);
    if (keptBytes + step > maxByteLength) {
      break;
    }
    keptBytes += step;
  }
  return FromSubstr(str, keptBytes);
}
/**
 * Replaces all occurrences of a pattern in a string in place.
 * Text inserted by a replacement is not searched again.
 * @param str String to modify.
 * @param from Pattern to search for. An empty pattern leaves str unchanged.
 * @param to Replacement text.
 */
static void ReplaceAll(string& str, const char* from, const char* to) {
  const string::size_type fromLen = strlen(from);
  if (fromLen == 0) {
    // An empty pattern is found at every position, so the loop below would
    // keep inserting `to` and never terminate.
    return;
  }
  const string::size_type toLen = strlen(to);
  string::size_type pos = 0;
  while ((pos = str.find(from, pos, fromLen)) != string::npos) {
    str.replace(pos, fromLen, to, toLen);
    pos += toLen;
  }
}
/**
 * Joins a string vector into one string, placing the separator between
 * consecutive elements (not before the first, not after the last).
 */
static string Join(const vector<string>& strings, const string& separator) {
  std::ostringstream joined;
  for (size_t i = 0; i < strings.size(); ++i) {
    if (i != 0) {
      joined << separator;
    }
    joined << strings[i];
  }
  return joined.str();
}
/**
 * Joins a string vector into one string without any separator.
 */
static string Join(const vector<string>& strings) {
  std::ostringstream joined;
  for (vector<string>::const_iterator it = strings.begin();
       it != strings.end(); ++it) {
    joined << *it;
  }
  return joined.str();
}
static void GetByteMap(const char* str, const size_t utf8Length,
vector<size_t>* byteMap) {
if (byteMap->size() < utf8Length) {
byteMap->resize(utf8Length);
}
const char* pstr = str;
for (size_t i = 0; i < utf8Length; i++) {
(*byteMap)[i] = pstr - str;
pstr = NextChar(pstr);
}
}
};
}

View file

@ -0,0 +1,150 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __OPENCC_H_
#define __OPENCC_H_
#ifdef __cplusplus
#include <string>
#include "Export.hpp"
#include "SimpleConverter.hpp"
extern "C" {
#else
#include <stddef.h>
#endif
#ifndef OPENCC_EXPORT
#define OPENCC_EXPORT
#endif
/**
* @defgroup opencc_c_api OpenCC C API
*
* API in C language
*/
/**
* Filename of default Simplified to Traditional configuration
*
* @ingroup opencc_c_api
*/
#define OPENCC_DEFAULT_CONFIG_SIMP_TO_TRAD "s2t.json"
/**
* Filename of default Traditional to Simplified configuration
*
* @ingroup opencc_c_api
*/
#define OPENCC_DEFAULT_CONFIG_TRAD_TO_SIMP "t2s.json"
/**
* Type of opencc descriptor
*
* @ingroup opencc_c_api
*/
typedef void* opencc_t;
/**
* Makes an instance of opencc
*
* @param configFileName Location of configuration file. If this is set to NULL,
* OPENCC_DEFAULT_CONFIG_SIMP_TO_TRAD will be loaded.
* @return A descriptor of the newly allocated instance of opencc. On error
* the return value will be (opencc_t) -1, so check against that
* value rather than against NULL.
* @ingroup opencc_c_api
*/
OPENCC_EXPORT opencc_t opencc_open(const char* configFileName);
/**
* Destroys an instance of opencc
*
* @param opencc The opencc descriptor of the instance to destroy.
* @return 0 on success or non-zero number on failure.
* @ingroup opencc_c_api
*/
OPENCC_EXPORT int opencc_close(opencc_t opencc);
/**
* Converts UTF-8 string and writes the result to a caller-provided buffer
*
* @param opencc The opencc descriptor.
* @param input The UTF-8 encoded string.
* @param length The maximum length in bytes to convert. If length is
* (size_t)-1, the whole string (terminated by '\0') will be
* converted.
* @param output The buffer to store converted text. You MUST make sure this
* buffer has sufficient space; its size is not passed to this
* function.
*
* @return The length of converted string or (size_t)-1 on error.
*
* @ingroup opencc_c_api
*/
OPENCC_EXPORT size_t opencc_convert_utf8_to_buffer(opencc_t opencc,
const char* input,
size_t length,
char* output);
/**
* Converts UTF-8 string
* This function returns an allocated C-Style string, which stores
* the converted string.
* You MUST call opencc_convert_utf8_free() to release allocated memory.
*
* @param opencc The opencc descriptor.
* @param input The UTF-8 encoded string.
* @param length The maximum length in bytes to convert. If length is
* (size_t)-1, the whole string (terminated by '\0') will be
* converted.
*
* @return The newly allocated UTF-8 string that stores the converted text,
* or NULL on error.
* @ingroup opencc_c_api
*/
OPENCC_EXPORT char* opencc_convert_utf8(opencc_t opencc,
const char* input,
size_t length);
/**
* Releases the buffer allocated by opencc_convert_utf8
*
* @param str Pointer to the string buffer returned by opencc_convert_utf8.
*
* @ingroup opencc_c_api
*/
OPENCC_EXPORT void opencc_convert_utf8_free(char* str);
/**
* Returns the last error message
*
* Note that this function is the only one which is NOT thread-safe.
*
* @return The last error message as a C-Style string.
*
* @ingroup opencc_c_api
*/
OPENCC_EXPORT const char* opencc_error(void);
#ifdef __cplusplus
} // extern "C"
#endif
/**
* @defgroup opencc_cpp_api OpenCC C++ Comprehensive API
*
* Comprehensive API in C++ language
*/
#endif

BIN
winlibs/lib/libopencc.dll.a Normal file

Binary file not shown.

BIN
winlibs/lib/opencc.dll Normal file

Binary file not shown.