// File: goldendict-ng/winlibs/include/opencc/PhraseExtract.hpp
/*
* Open Chinese Convert
*
* Copyright 2015 Carbo Kuo <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

#include "Common.hpp"
#include "UTF8StringSlice.hpp"
namespace opencc {
class OPENCC_EXPORT PhraseExtract {
public:
typedef UTF8StringSlice::LengthType LengthType;
typedef UTF8StringSliceBase<unsigned char> UTF8StringSlice8Bit;
PhraseExtract();
virtual ~PhraseExtract();
void Extract(const std::string& text) {
2022-02-11 12:55:16 +00:00
SetFullText(text);
ExtractSuffixes();
CalculateFrequency();
CalculateSuffixEntropy();
ReleaseSuffixes();
ExtractPrefixes();
CalculatePrefixEntropy();
ReleasePrefixes();
ExtractWordCandidates();
CalculateCohesions();
SelectWords();
}
void SetFullText(const std::string& fullText) {
2022-02-11 12:55:16 +00:00
utf8FullText = UTF8StringSlice(fullText.c_str());
}
void SetFullText(const char* fullText) {
utf8FullText = UTF8StringSlice(fullText);
}
void SetFullText(const UTF8StringSlice& fullText) { utf8FullText = fullText; }
void SetWordMinLength(const LengthType _wordMinLength) {
wordMinLength = _wordMinLength;
}
void SetWordMaxLength(const LengthType _wordMaxLength) {
wordMaxLength = _wordMaxLength;
}
void SetPrefixSetLength(const LengthType _prefixSetLength) {
prefixSetLength = _prefixSetLength;
}
void SetSuffixSetLength(const LengthType _suffixSetLength) {
suffixSetLength = _suffixSetLength;
}
// PreCalculationFilter is called after frequencies statistics.
void SetPreCalculationFilter(
const std::function<bool(const PhraseExtract&,
const UTF8StringSlice8Bit&)>& filter) {
2022-02-11 12:55:16 +00:00
preCalculationFilter = filter;
}
void SetPostCalculationFilter(
const std::function<bool(const PhraseExtract&,
const UTF8StringSlice8Bit&)>& filter) {
2022-02-11 12:55:16 +00:00
postCalculationFilter = filter;
}
void ReleaseSuffixes() { std::vector<UTF8StringSlice8Bit>().swap(suffixes); }
2022-02-11 12:55:16 +00:00
void ReleasePrefixes() { std::vector<UTF8StringSlice8Bit>().swap(prefixes); }
2022-02-11 12:55:16 +00:00
const std::vector<UTF8StringSlice8Bit>& Words() const { return words; }
2022-02-11 12:55:16 +00:00
const std::vector<UTF8StringSlice8Bit>& WordCandidates() const {
2022-02-11 12:55:16 +00:00
return wordCandidates;
}
struct Signals {
size_t frequency;
double cohesion;
double suffixEntropy;
double prefixEntropy;
};
const Signals& Signal(const UTF8StringSlice8Bit& wordCandidate) const;
double Cohesion(const UTF8StringSlice8Bit& wordCandidate) const;
double Entropy(const UTF8StringSlice8Bit& wordCandidate) const;
double SuffixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
double PrefixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
size_t Frequency(const UTF8StringSlice8Bit& word) const;
double Probability(const UTF8StringSlice8Bit& word) const;
double LogProbability(const UTF8StringSlice8Bit& word) const;
void Reset();
void ExtractSuffixes();
void ExtractPrefixes();
void ExtractWordCandidates();
void CalculateFrequency();
void CalculateCohesions();
void CalculateSuffixEntropy();
void CalculatePrefixEntropy();
void SelectWords();
static bool
DefaultPreCalculationFilter(const PhraseExtract&,
const PhraseExtract::UTF8StringSlice8Bit&);
static bool
DefaultPostCalculationFilter(const PhraseExtract&,
const PhraseExtract::UTF8StringSlice8Bit&);
private:
class DictType;
// Pointwise Mutual Information
double PMI(const UTF8StringSlice8Bit& wordCandidate,
const UTF8StringSlice8Bit& part1,
const UTF8StringSlice8Bit& part2) const;
double CalculateCohesion(const UTF8StringSlice8Bit& wordCandidate) const;
double CalculateEntropy(
const std::unordered_map<UTF8StringSlice8Bit, size_t,
UTF8StringSlice8Bit::Hasher>& choices) const;
2022-02-11 12:55:16 +00:00
LengthType wordMinLength;
LengthType wordMaxLength;
LengthType prefixSetLength;
LengthType suffixSetLength;
std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
preCalculationFilter;
std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
postCalculationFilter;
bool prefixesExtracted;
bool suffixesExtracted;
bool frequenciesCalculated;
bool wordCandidatesExtracted;
bool cohesionsCalculated;
bool prefixEntropiesCalculated;
bool suffixEntropiesCalculated;
bool wordsSelected;
UTF8StringSlice utf8FullText;
size_t totalOccurrence;
double logTotalOccurrence;
std::vector<UTF8StringSlice8Bit> prefixes;
std::vector<UTF8StringSlice8Bit> suffixes;
std::vector<UTF8StringSlice8Bit> wordCandidates;
std::vector<UTF8StringSlice8Bit> words;
2022-02-11 12:55:16 +00:00
DictType* signals;
friend class PhraseExtractTest;
};
} // namespace opencc