goldendict-ng/winlibs/include/opencc/UTF8StringSlice.hpp

247 lines
6.9 KiB
C++
Raw Normal View History

2022-02-11 12:55:16 +00:00
/*
* Open Chinese Convert
*
* Copyright 2015 Carbo Kuo <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cstring>
2022-02-11 12:55:16 +00:00
#include "Common.hpp"
#include "UTF8Util.hpp"
namespace opencc {
namespace internal {
/**
 * Hashes a byte buffer with the FNV-1a scheme (XOR each byte into the state,
 * then multiply by the prime).
 *
 * @param text             Pointer to the first byte to hash.
 * @param byteLength       Number of bytes to read starting at text.
 * @param FNV_prime        Multiplier applied after every byte.
 * @param FNV_offset_basis Initial hash state; returned unchanged when
 *                         byteLength is 0.
 * @return The resulting hash value.
 */
inline size_t FNVHash(const char* text, const size_t byteLength,
                      const size_t FNV_prime, const size_t FNV_offset_basis) {
  size_t hash = FNV_offset_basis;
  for (const char* pstr = text; pstr < text + byteLength; pstr++) {
    // Go through unsigned char: where plain char is signed, a byte >= 0x80
    // (every byte of a multi-byte UTF-8 sequence) would otherwise be
    // sign-extended and flip all upper bits of the hash, making the result
    // depend on the platform's char signedness.
    hash ^= static_cast<size_t>(static_cast<unsigned char>(*pstr));
    hash *= FNV_prime;
  }
  return hash;
}
template <int> size_t FNVHash(const char* text, const size_t byteLength);
template <>
inline size_t FNVHash<4>(const char* text, const size_t byteLength) {
return FNVHash(text, byteLength, 16777619UL, 2166136261UL);
}
#if SIZE_MAX == 0xffffffffffffffff
/**
 * Specialization for an 8-byte hash word, using the 64-bit FNV parameters.
 * Only compiled when SIZE_MAX indicates a 64-bit size_t.
 */
template <>
inline size_t FNVHash<8>(const char* text, const size_t byteLength) {
  const size_t prime64 = 1099511628211UL;
  const size_t offsetBasis64 = 14695981039346656037UL;
  return FNVHash(text, byteLength, prime64, offsetBasis64);
}
#endif
} // namespace internal
/**
 * Non-owning view over a run of UTF-8 text.
 *
 * A slice stores a pointer into a buffer it does not manage, plus two
 * lengths: the number of UTF-8 characters and the number of bytes they
 * occupy. Character boundaries are determined by the UTF8Util helpers.
 * The buffer must stay valid for as long as the slice is used.
 *
 * @tparam LENGTH_TYPE Integer type used to store both lengths.
 */
template <typename LENGTH_TYPE> class UTF8StringSliceBase {
public:
  typedef LENGTH_TYPE LengthType;

  /** Wraps a whole NUL-terminated string; both lengths are measured here. */
  UTF8StringSliceBase(const char* _str)
      : str(_str), utf8Length(static_cast<LengthType>(UTF8Util::Length(_str))),
        byteLength(static_cast<LengthType>(strlen(_str))) {}

  /**
   * Wraps the first _utf8Length characters starting at _str; the byte length
   * is derived by walking those characters.
   */
  UTF8StringSliceBase(const char* _str, const LengthType _utf8Length)
      : str(_str), utf8Length(_utf8Length) {
    CalculateByteLength();
  }

  /**
   * Wraps the first _utf8Length characters starting at _str.
   *
   * Note: byteLength is recomputed from _utf8Length after initialization, so
   * the stored byte length is always the measured one and _byteLength is
   * effectively overridden.
   */
  UTF8StringSliceBase(const char* _str, const LengthType _utf8Length,
                      const LengthType _byteLength)
      : str(_str), utf8Length(_utf8Length), byteLength(_byteLength) {
    CalculateByteLength();
  }

  /** Number of UTF-8 characters in the slice. */
  LengthType UTF8Length() const { return utf8Length; }

  /** Number of bytes in the slice. */
  LengthType ByteLength() const { return byteLength; }

  /**
   * Returns the slice made of the first numberOfCharacters characters.
   * No bounds check is performed against UTF8Length().
   */
  UTF8StringSliceBase Left(const LengthType numberOfCharacters) const {
    if (numberOfCharacters == UTF8Length()) {
      return *this;
    } else {
      return UTF8StringSliceBase(str, numberOfCharacters);
    }
  }

  /**
   * Returns the slice made of the last numberOfCharacters characters.
   * No bounds check is performed against UTF8Length().
   */
  UTF8StringSliceBase Right(const LengthType numberOfCharacters) const {
    if (numberOfCharacters == UTF8Length()) {
      return *this;
    } else {
      // Step backwards from one-past-the-end, one character at a time.
      const char* pstr = str + byteLength;
      for (size_t i = 0; i < numberOfCharacters; i++) {
        pstr = UTF8Util::PrevChar(pstr);
      }
      return UTF8StringSliceBase(pstr, numberOfCharacters);
    }
  }

  /**
   * Returns numberOfCharacters characters starting offset characters into
   * the slice. No bounds check is performed.
   */
  UTF8StringSliceBase SubString(const LengthType offset,
                                const LengthType numberOfCharacters) const {
    if (offset == 0) {
      return Left(numberOfCharacters);
    } else {
      const char* pstr = str;
      for (size_t i = 0; i < offset; i++) {
        pstr = UTF8Util::NextChar(pstr);
      }
      return UTF8StringSliceBase(pstr, numberOfCharacters);
    }
  }

  /** Copies the bytes covered by the slice into a std::string. */
  std::string ToString() const { return std::string(str, str + byteLength); }

  /**
   * Pointer to the first byte of the slice. The data is NOT necessarily
   * NUL-terminated at ByteLength().
   */
  const char* CString() const { return str; }

  /**
   * Counts how many leading characters this slice shares with that.
   *
   * @return The number of identical leading characters, at most the length
   *         of the shorter slice.
   */
  LengthType CommonPrefixLength(const UTF8StringSliceBase& that) const {
    const LengthType limit = (std::min)(utf8Length, that.utf8Length);
    if (str == that.str) {
      // Same starting address: everything up to the shorter length matches.
      return limit;
    }
    const char* pstr1 = str;
    const char* pstr2 = that.str;
    for (size_t length = 0; length < limit; length++) {
      const size_t charLen1 = UTF8Util::NextCharLength(pstr1);
      const size_t charLen2 = UTF8Util::NextCharLength(pstr2);
      if (charLen1 != charLen2 || strncmp(pstr1, pstr2, charLen1) != 0) {
        return static_cast<LengthType>(length);
      }
      pstr1 += charLen1;
      pstr2 += charLen2;
    }
    // No mismatch found: one slice is a complete prefix of the other, so the
    // common prefix is the whole shorter slice (previously this returned 0).
    return limit;
  }

  /** Drops the first character of the slice; no-op on an empty slice. */
  void MoveRight() {
    if (utf8Length > 0) {
      const size_t charLen = UTF8Util::NextCharLength(str);
      str += charLen;
      utf8Length--;
      byteLength -= charLen;
    }
  }

  /** Drops the last character of the slice; no-op on an empty slice. */
  void MoveLeft() {
    if (utf8Length > 0) {
      const size_t charLen = UTF8Util::PrevCharLength(str + byteLength);
      utf8Length--;
      byteLength -= charLen;
    }
  }

  /**
   * Three-way comparison that walks both slices from their last character
   * towards the first.
   *
   * @return -1, 0 or 1. When all compared characters are equal, the slice
   *         with fewer characters orders first.
   */
  int ReverseCompare(const UTF8StringSliceBase& that) const {
    const char* pstr1 = str + byteLength;
    const char* pstr2 = that.str + that.byteLength;
    const size_t length = (std::min)(utf8Length, that.utf8Length);
    for (size_t i = 0; i < length; i++) {
      const size_t charLen1 = UTF8Util::PrevCharLength(pstr1);
      const size_t charLen2 = UTF8Util::PrevCharLength(pstr2);
      pstr1 -= charLen1;
      pstr2 -= charLen2;
      const int cmp = strncmp(pstr1, pstr2, (std::min)(charLen1, charLen2));
      if (cmp < 0) {
        return -1;
      } else if (cmp > 0) {
        return 1;
      } else if (charLen1 < charLen2) {
        return -1;
      } else if (charLen1 > charLen2) {
        return 1;
      }
    }
    if (utf8Length < that.utf8Length) {
      return -1;
    } else if (utf8Length > that.utf8Length) {
      return 1;
    } else {
      return 0;
    }
  }

  /**
   * Byte offset of the first occurrence of pattern inside this slice.
   *
   * @return The offset, or std::string::npos converted to LengthType when
   *         the pattern does not occur.
   */
  LengthType FindBytePosition(const UTF8StringSliceBase& pattern) const {
    return static_cast<LengthType>(
        ToString().find(pattern.str, 0, pattern.byteLength));
  }

  bool operator<(const UTF8StringSliceBase& that) const {
    return Compare(that) < 0;
  }

  bool operator>(const UTF8StringSliceBase& that) const {
    return Compare(that) > 0;
  }

  bool operator==(const UTF8StringSliceBase& that) const {
    // Same start and same character count is trivially equal; otherwise fall
    // back to comparing contents.
    return (str == that.str && utf8Length == that.utf8Length) ||
           Compare(that) == 0;
  }

  bool operator!=(const UTF8StringSliceBase& that) const {
    return !this->operator==(that);
  }

  /** Hash functor over the slice's bytes, for use with hashed containers. */
  class Hasher {
  public:
    size_t operator()(const UTF8StringSliceBase& text) const {
      return internal::FNVHash<sizeof(size_t)>(text.CString(),
                                               text.ByteLength());
    }
  };

private:
  /**
   * Forward three-way comparison: bytes first (over the shorter byte length),
   * then character count as the tie-breaker.
   */
  inline int Compare(const UTF8StringSliceBase& that) const {
    int cmp = strncmp(str, that.str, (std::min)(byteLength, that.byteLength));
    if (cmp == 0) {
      if (utf8Length < that.utf8Length) {
        cmp = -1;
      } else if (utf8Length > that.utf8Length) {
        cmp = 1;
      } else {
        cmp = 0;
      }
    }
    return cmp;
  }

  /** Sets byteLength by walking utf8Length characters forward from str. */
  void CalculateByteLength() {
    const char* pstr = str;
    for (size_t i = 0; i < utf8Length; i++) {
      pstr = UTF8Util::NextChar(pstr);
    }
    byteLength = static_cast<LengthType>(pstr - str);
  }

  const char* str;        // first byte of the slice (not owned)
  LengthType utf8Length;  // length in UTF-8 characters
  LengthType byteLength;  // length in bytes
};
// Default slice type: character and byte lengths are stored as size_t.
typedef UTF8StringSliceBase<size_t> UTF8StringSlice;
/**
 * Streams the bytes covered by a slice, by first materializing them as a
 * std::string via ToString().
 *
 * @return The same stream, to allow chaining.
 */
template <typename LENGTH_TYPE>
std::ostream& operator<<(::std::ostream& os,
                         const UTF8StringSliceBase<LENGTH_TYPE>& str) {
  os << str.ToString();
  return os;
}
} // namespace opencc