goldendict-ng/winlibs/include/opencc/PhraseExtract.hpp

/*
 * Open Chinese Convert
 *
 * Copyright 2015 Carbo Kuo <byvoid@byvoid.com>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <functional>
#include <unordered_map>

#include "Common.hpp"
#include "UTF8StringSlice.hpp"

namespace opencc {

class OPENCC_EXPORT PhraseExtract {
public:
  typedef UTF8StringSlice::LengthType LengthType;

  typedef UTF8StringSliceBase<unsigned char> UTF8StringSlice8Bit;

  PhraseExtract();

  virtual ~PhraseExtract();

  void Extract(const std::string& text) {
    SetFullText(text);
    ExtractSuffixes();
    CalculateFrequency();
    CalculateSuffixEntropy();
    ReleaseSuffixes();
    ExtractPrefixes();
    CalculatePrefixEntropy();
    ReleasePrefixes();
    ExtractWordCandidates();
    CalculateCohesions();
    SelectWords();
  }

  void SetFullText(const std::string& fullText) {
    utf8FullText = UTF8StringSlice(fullText.c_str());
  }

  void SetFullText(const char* fullText) {
    utf8FullText = UTF8StringSlice(fullText);
  }

  void SetFullText(const UTF8StringSlice& fullText) { utf8FullText = fullText; }

  void SetWordMinLength(const LengthType _wordMinLength) {
    wordMinLength = _wordMinLength;
  }

  void SetWordMaxLength(const LengthType _wordMaxLength) {
    wordMaxLength = _wordMaxLength;
  }

  void SetPrefixSetLength(const LengthType _prefixSetLength) {
    prefixSetLength = _prefixSetLength;
  }

  void SetSuffixSetLength(const LengthType _suffixSetLength) {
    suffixSetLength = _suffixSetLength;
  }

  // PreCalculationFilter is called after frequencies statistics.
  void SetPreCalculationFilter(
      const std::function<bool(const PhraseExtract&,
                               const UTF8StringSlice8Bit&)>& filter) {
    preCalculationFilter = filter;
  }

  void SetPostCalculationFilter(
      const std::function<bool(const PhraseExtract&,
                               const UTF8StringSlice8Bit&)>& filter) {
    postCalculationFilter = filter;
  }

  void ReleaseSuffixes() { std::vector<UTF8StringSlice8Bit>().swap(suffixes); }

  void ReleasePrefixes() { std::vector<UTF8StringSlice8Bit>().swap(prefixes); }

  const std::vector<UTF8StringSlice8Bit>& Words() const { return words; }

  const std::vector<UTF8StringSlice8Bit>& WordCandidates() const {
    return wordCandidates;
  }

  struct Signals {
    size_t frequency;
    double cohesion;
    double suffixEntropy;
    double prefixEntropy;
  };

  const Signals& Signal(const UTF8StringSlice8Bit& wordCandidate) const;

  double Cohesion(const UTF8StringSlice8Bit& wordCandidate) const;

  double Entropy(const UTF8StringSlice8Bit& wordCandidate) const;

  double SuffixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;

  double PrefixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;

  size_t Frequency(const UTF8StringSlice8Bit& word) const;

  double Probability(const UTF8StringSlice8Bit& word) const;

  double LogProbability(const UTF8StringSlice8Bit& word) const;

  void Reset();

  void ExtractSuffixes();

  void ExtractPrefixes();

  void ExtractWordCandidates();

  void CalculateFrequency();

  void CalculateCohesions();

  void CalculateSuffixEntropy();

  void CalculatePrefixEntropy();

  void SelectWords();

  static bool
  DefaultPreCalculationFilter(const PhraseExtract&,
                              const PhraseExtract::UTF8StringSlice8Bit&);

  static bool
  DefaultPostCalculationFilter(const PhraseExtract&,
                               const PhraseExtract::UTF8StringSlice8Bit&);

private:
  class DictType;

  // Pointwise Mutual Information
  double PMI(const UTF8StringSlice8Bit& wordCandidate,
             const UTF8StringSlice8Bit& part1,
             const UTF8StringSlice8Bit& part2) const;

  double CalculateCohesion(const UTF8StringSlice8Bit& wordCandidate) const;

  double CalculateEntropy(
      const std::unordered_map<UTF8StringSlice8Bit, size_t,
                               UTF8StringSlice8Bit::Hasher>& choices) const;

  LengthType wordMinLength;
  LengthType wordMaxLength;
  LengthType prefixSetLength;
  LengthType suffixSetLength;
  std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
      preCalculationFilter;
  std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
      postCalculationFilter;

  bool prefixesExtracted;
  bool suffixesExtracted;
  bool frequenciesCalculated;
  bool wordCandidatesExtracted;
  bool cohesionsCalculated;
  bool prefixEntropiesCalculated;
  bool suffixEntropiesCalculated;
  bool wordsSelected;

  UTF8StringSlice utf8FullText;
  size_t totalOccurrence;
  double logTotalOccurrence;
  std::vector<UTF8StringSlice8Bit> prefixes;
  std::vector<UTF8StringSlice8Bit> suffixes;
  std::vector<UTF8StringSlice8Bit> wordCandidates;
  std::vector<UTF8StringSlice8Bit> words;
  DictType* signals;

  friend class PhraseExtractTest;
};

} // namespace opencc