mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-24 08:34:08 +00:00
192 lines
5.1 KiB
C++
192 lines
5.1 KiB
C++
/*
 * Open Chinese Convert
 *
 * Copyright 2015 BYVoid <byvoid@byvoid.com>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
#pragma once
|
|
|
|
#include <unordered_map>
|
|
|
|
#include "Common.hpp"
|
|
#include "UTF8StringSlice.hpp"
|
|
|
|
namespace opencc {
|
|
|
|
class OPENCC_EXPORT PhraseExtract {
|
|
public:
|
|
typedef UTF8StringSlice::LengthType LengthType;
|
|
|
|
typedef UTF8StringSliceBase<unsigned char> UTF8StringSlice8Bit;
|
|
|
|
PhraseExtract();
|
|
|
|
virtual ~PhraseExtract();
|
|
|
|
void Extract(const string& text) {
|
|
SetFullText(text);
|
|
ExtractSuffixes();
|
|
CalculateFrequency();
|
|
CalculateSuffixEntropy();
|
|
ReleaseSuffixes();
|
|
ExtractPrefixes();
|
|
CalculatePrefixEntropy();
|
|
ReleasePrefixes();
|
|
ExtractWordCandidates();
|
|
CalculateCohesions();
|
|
SelectWords();
|
|
}
|
|
|
|
void SetFullText(const string& fullText) {
|
|
utf8FullText = UTF8StringSlice(fullText.c_str());
|
|
}
|
|
|
|
void SetFullText(const char* fullText) {
|
|
utf8FullText = UTF8StringSlice(fullText);
|
|
}
|
|
|
|
void SetFullText(const UTF8StringSlice& fullText) { utf8FullText = fullText; }
|
|
|
|
void SetWordMinLength(const LengthType _wordMinLength) {
|
|
wordMinLength = _wordMinLength;
|
|
}
|
|
|
|
void SetWordMaxLength(const LengthType _wordMaxLength) {
|
|
wordMaxLength = _wordMaxLength;
|
|
}
|
|
|
|
void SetPrefixSetLength(const LengthType _prefixSetLength) {
|
|
prefixSetLength = _prefixSetLength;
|
|
}
|
|
|
|
void SetSuffixSetLength(const LengthType _suffixSetLength) {
|
|
suffixSetLength = _suffixSetLength;
|
|
}
|
|
|
|
// PreCalculationFilter is called after frequencies statistics.
|
|
void SetPreCalculationFilter(const std::function<
|
|
bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>& filter) {
|
|
preCalculationFilter = filter;
|
|
}
|
|
|
|
void SetPostCalculationFilter(const std::function<
|
|
bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>& filter) {
|
|
postCalculationFilter = filter;
|
|
}
|
|
|
|
void ReleaseSuffixes() { vector<UTF8StringSlice8Bit>().swap(suffixes); }
|
|
|
|
void ReleasePrefixes() { vector<UTF8StringSlice8Bit>().swap(prefixes); }
|
|
|
|
const vector<UTF8StringSlice8Bit>& Words() const { return words; }
|
|
|
|
const vector<UTF8StringSlice8Bit>& WordCandidates() const {
|
|
return wordCandidates;
|
|
}
|
|
|
|
struct Signals {
|
|
size_t frequency;
|
|
double cohesion;
|
|
double suffixEntropy;
|
|
double prefixEntropy;
|
|
};
|
|
|
|
const Signals& Signal(const UTF8StringSlice8Bit& wordCandidate) const;
|
|
|
|
double Cohesion(const UTF8StringSlice8Bit& wordCandidate) const;
|
|
|
|
double Entropy(const UTF8StringSlice8Bit& wordCandidate) const;
|
|
|
|
double SuffixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
|
|
|
|
double PrefixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
|
|
|
|
size_t Frequency(const UTF8StringSlice8Bit& word) const;
|
|
|
|
double Probability(const UTF8StringSlice8Bit& word) const;
|
|
|
|
double LogProbability(const UTF8StringSlice8Bit& word) const;
|
|
|
|
void Reset();
|
|
|
|
void ExtractSuffixes();
|
|
|
|
void ExtractPrefixes();
|
|
|
|
void ExtractWordCandidates();
|
|
|
|
void CalculateFrequency();
|
|
|
|
void CalculateCohesions();
|
|
|
|
void CalculateSuffixEntropy();
|
|
|
|
void CalculatePrefixEntropy();
|
|
|
|
void SelectWords();
|
|
|
|
static bool
|
|
DefaultPreCalculationFilter(const PhraseExtract&,
|
|
const PhraseExtract::UTF8StringSlice8Bit&);
|
|
|
|
static bool
|
|
DefaultPostCalculationFilter(const PhraseExtract&,
|
|
const PhraseExtract::UTF8StringSlice8Bit&);
|
|
|
|
private:
|
|
class DictType;
|
|
|
|
// Pointwise Mutual Information
|
|
double PMI(const UTF8StringSlice8Bit& wordCandidate,
|
|
const UTF8StringSlice8Bit& part1,
|
|
const UTF8StringSlice8Bit& part2) const;
|
|
|
|
double CalculateCohesion(const UTF8StringSlice8Bit& wordCandidate) const;
|
|
|
|
double CalculateEntropy(const std::unordered_map<
|
|
UTF8StringSlice8Bit, size_t, UTF8StringSlice8Bit::Hasher>& choices) const;
|
|
|
|
LengthType wordMinLength;
|
|
LengthType wordMaxLength;
|
|
LengthType prefixSetLength;
|
|
LengthType suffixSetLength;
|
|
std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
|
|
preCalculationFilter;
|
|
std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
|
|
postCalculationFilter;
|
|
|
|
bool prefixesExtracted;
|
|
bool suffixesExtracted;
|
|
bool frequenciesCalculated;
|
|
bool wordCandidatesExtracted;
|
|
bool cohesionsCalculated;
|
|
bool prefixEntropiesCalculated;
|
|
bool suffixEntropiesCalculated;
|
|
bool wordsSelected;
|
|
|
|
UTF8StringSlice utf8FullText;
|
|
size_t totalOccurrence;
|
|
double logTotalOccurrence;
|
|
vector<UTF8StringSlice8Bit> prefixes;
|
|
vector<UTF8StringSlice8Bit> suffixes;
|
|
vector<UTF8StringSlice8Bit> wordCandidates;
|
|
vector<UTF8StringSlice8Bit> words;
|
|
DictType* signals;
|
|
|
|
friend class PhraseExtractTest;
|
|
};
|
|
|
|
} // namespace opencc
|