mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-24 12:44:07 +00:00
247 lines
6.9 KiB
C++
247 lines
6.9 KiB
C++
|
/*
|
||
|
* Open Chinese Convert
|
||
|
*
|
||
|
* Copyright 2015 Carbo Kuo <byvoid@byvoid.com>
|
||
|
*
|
||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
* you may not use this file except in compliance with the License.
|
||
|
* You may obtain a copy of the License at
|
||
|
*
|
||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||
|
*
|
||
|
* Unless required by applicable law or agreed to in writing, software
|
||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
* See the License for the specific language governing permissions and
|
||
|
* limitations under the License.
|
||
|
*/
|
||
|
|
||
|
#include <cstring>
|
||
|
|
||
|
#include "Common.hpp"
|
||
|
#include "UTF8Util.hpp"
|
||
|
|
||
|
namespace opencc {
|
||
|
|
||
|
namespace internal {
|
||
|
|
||
|
inline size_t FNVHash(const char* text, const size_t byteLength,
|
||
|
const size_t FNV_prime, const size_t FNV_offset_basis) {
|
||
|
size_t hash = FNV_offset_basis;
|
||
|
for (const char* pstr = text; pstr < text + byteLength; pstr++) {
|
||
|
hash ^= *pstr;
|
||
|
hash *= FNV_prime;
|
||
|
}
|
||
|
return hash;
|
||
|
}
|
||
|
|
||
|
template <int> size_t FNVHash(const char* text, const size_t byteLength);
|
||
|
|
||
|
template <>
|
||
|
inline size_t FNVHash<4>(const char* text, const size_t byteLength) {
|
||
|
return FNVHash(text, byteLength, 16777619UL, 2166136261UL);
|
||
|
}
|
||
|
|
||
|
#if SIZE_MAX == 0xffffffffffffffff
|
||
|
template <>
|
||
|
inline size_t FNVHash<8>(const char* text, const size_t byteLength) {
|
||
|
return FNVHash(text, byteLength, 1099511628211UL, 14695981039346656037UL);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
} // namespace internal
|
||
|
|
||
|
template <typename LENGTH_TYPE> class UTF8StringSliceBase {
|
||
|
public:
|
||
|
typedef LENGTH_TYPE LengthType;
|
||
|
|
||
|
UTF8StringSliceBase(const char* _str)
|
||
|
: str(_str), utf8Length(static_cast<LengthType>(UTF8Util::Length(_str))),
|
||
|
byteLength(static_cast<LengthType>(strlen(_str))) {}
|
||
|
|
||
|
UTF8StringSliceBase(const char* _str, const LengthType _utf8Length)
|
||
|
: str(_str), utf8Length(_utf8Length) {
|
||
|
CalculateByteLength();
|
||
|
}
|
||
|
|
||
|
UTF8StringSliceBase(const char* _str, const LengthType _utf8Length,
|
||
|
const LengthType _byteLength)
|
||
|
: str(_str), utf8Length(_utf8Length), byteLength(_byteLength) {
|
||
|
CalculateByteLength();
|
||
|
}
|
||
|
|
||
|
LengthType UTF8Length() const { return utf8Length; }
|
||
|
|
||
|
LengthType ByteLength() const { return byteLength; }
|
||
|
|
||
|
UTF8StringSliceBase Left(const LengthType numberOfCharacters) const {
|
||
|
if (numberOfCharacters == UTF8Length()) {
|
||
|
return *this;
|
||
|
} else {
|
||
|
return UTF8StringSliceBase(str, numberOfCharacters);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
UTF8StringSliceBase Right(const LengthType numberOfCharacters) const {
|
||
|
if (numberOfCharacters == UTF8Length()) {
|
||
|
return *this;
|
||
|
} else {
|
||
|
const char* pstr = str + byteLength;
|
||
|
for (size_t i = 0; i < numberOfCharacters; i++) {
|
||
|
pstr = UTF8Util::PrevChar(pstr);
|
||
|
}
|
||
|
return UTF8StringSliceBase(pstr, numberOfCharacters);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
UTF8StringSliceBase SubString(const LengthType offset,
|
||
|
const LengthType numberOfCharacters) const {
|
||
|
if (offset == 0) {
|
||
|
return Left(numberOfCharacters);
|
||
|
} else {
|
||
|
const char* pstr = str;
|
||
|
for (size_t i = 0; i < offset; i++) {
|
||
|
pstr = UTF8Util::NextChar(pstr);
|
||
|
}
|
||
|
return UTF8StringSliceBase(pstr, numberOfCharacters);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
std::string ToString() const { return std::string(str, str + byteLength); }
|
||
|
|
||
|
const char* CString() const { return str; }
|
||
|
|
||
|
LengthType CommonPrefixLength(const UTF8StringSliceBase& that) const {
|
||
|
if (str == that.str) {
|
||
|
return (std::min)(utf8Length, that.utf8Length);
|
||
|
} else {
|
||
|
const char* pstr1 = str;
|
||
|
const char* pstr2 = that.str;
|
||
|
for (size_t length = 0; length < utf8Length && length < that.utf8Length;
|
||
|
length++) {
|
||
|
size_t charLen1 = UTF8Util::NextCharLength(pstr1);
|
||
|
size_t charLen2 = UTF8Util::NextCharLength(pstr2);
|
||
|
if (charLen1 != charLen2 || strncmp(pstr1, pstr2, charLen1) != 0) {
|
||
|
return length;
|
||
|
}
|
||
|
pstr1 += charLen1;
|
||
|
pstr2 += charLen2;
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void MoveRight() {
|
||
|
if (utf8Length > 0) {
|
||
|
const size_t charLen = UTF8Util::NextCharLength(str);
|
||
|
str += charLen;
|
||
|
utf8Length--;
|
||
|
byteLength -= charLen;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void MoveLeft() {
|
||
|
if (utf8Length > 0) {
|
||
|
const size_t charLen = UTF8Util::PrevCharLength(str + byteLength);
|
||
|
utf8Length--;
|
||
|
byteLength -= charLen;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int ReverseCompare(const UTF8StringSliceBase& that) const {
|
||
|
const char* pstr1 = str + byteLength;
|
||
|
const char* pstr2 = that.str + that.byteLength;
|
||
|
const size_t length = (std::min)(utf8Length, that.utf8Length);
|
||
|
for (size_t i = 0; i < length; i++) {
|
||
|
const size_t charLen1 = UTF8Util::PrevCharLength(pstr1);
|
||
|
const size_t charLen2 = UTF8Util::PrevCharLength(pstr2);
|
||
|
pstr1 -= charLen1;
|
||
|
pstr2 -= charLen2;
|
||
|
const int cmp = strncmp(pstr1, pstr2, (std::min)(charLen1, charLen2));
|
||
|
if (cmp < 0) {
|
||
|
return -1;
|
||
|
} else if (cmp > 0) {
|
||
|
return 1;
|
||
|
} else if (charLen1 < charLen2) {
|
||
|
return -1;
|
||
|
} else if (charLen1 > charLen2) {
|
||
|
return 1;
|
||
|
}
|
||
|
}
|
||
|
if (utf8Length < that.utf8Length) {
|
||
|
return -1;
|
||
|
} else if (utf8Length > that.utf8Length) {
|
||
|
return 1;
|
||
|
} else {
|
||
|
return 0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
LengthType FindBytePosition(const UTF8StringSliceBase& pattern) const {
|
||
|
return static_cast<LengthType>(
|
||
|
ToString().find(pattern.str, 0, pattern.byteLength));
|
||
|
}
|
||
|
|
||
|
bool operator<(const UTF8StringSliceBase& that) const {
|
||
|
return Compare(that) < 0;
|
||
|
}
|
||
|
|
||
|
bool operator>(const UTF8StringSliceBase& that) const {
|
||
|
return Compare(that) > 0;
|
||
|
}
|
||
|
|
||
|
bool operator==(const UTF8StringSliceBase& that) const {
|
||
|
return (str == that.str && utf8Length == that.utf8Length) ||
|
||
|
Compare(that) == 0;
|
||
|
}
|
||
|
|
||
|
bool operator!=(const UTF8StringSliceBase& that) const {
|
||
|
return !this->operator==(that);
|
||
|
}
|
||
|
|
||
|
class Hasher {
|
||
|
public:
|
||
|
size_t operator()(const UTF8StringSliceBase& text) const {
|
||
|
return internal::FNVHash<sizeof(size_t)>(text.CString(),
|
||
|
text.ByteLength());
|
||
|
}
|
||
|
};
|
||
|
|
||
|
private:
|
||
|
inline int Compare(const UTF8StringSliceBase& that) const {
|
||
|
int cmp = strncmp(str, that.str, (std::min)(byteLength, that.byteLength));
|
||
|
if (cmp == 0) {
|
||
|
if (utf8Length < that.utf8Length) {
|
||
|
cmp = -1;
|
||
|
} else if (utf8Length > that.utf8Length) {
|
||
|
cmp = 1;
|
||
|
} else {
|
||
|
cmp = 0;
|
||
|
}
|
||
|
}
|
||
|
return cmp;
|
||
|
}
|
||
|
|
||
|
void CalculateByteLength() {
|
||
|
const char* pstr = str;
|
||
|
for (size_t i = 0; i < utf8Length; i++) {
|
||
|
pstr = UTF8Util::NextChar(pstr);
|
||
|
}
|
||
|
byteLength = static_cast<LengthType>(pstr - str);
|
||
|
}
|
||
|
|
||
|
const char* str;
|
||
|
LengthType utf8Length;
|
||
|
LengthType byteLength;
|
||
|
};
|
||
|
|
||
|
typedef UTF8StringSliceBase<size_t> UTF8StringSlice;
|
||
|
|
||
|
template <typename LENGTH_TYPE>
|
||
|
std::ostream& operator<<(::std::ostream& os,
|
||
|
const UTF8StringSliceBase<LENGTH_TYPE>& str) {
|
||
|
return os << str.ToString();
|
||
|
}
|
||
|
|
||
|
} // namespace opencc
|