From 071144c808cc1a06b5f833c4d423f2166cc2de2d Mon Sep 17 00:00:00 2001
From: stephenmk
Date: Sat, 22 Apr 2023 12:03:00 -0500
Subject: [PATCH] Use full version of sudachi dictionary

---
Notes (this section is ignored by git-am and is not part of the commit):

* Line breaks and indentation in this file were reconstructed from a
  whitespace-collapsed copy. Blank context lines were placed so that every
  hunk matches its "@@" line counts; run `git apply --check` before use.
* sudachi_rules() drops its `reading` parameter; the call site in
  JitenonKotowaza is updated in this same patch.
* The tokenizer and the inflection-type table are now created on first use
  and kept in module-level globals instead of being rebuilt on every call.
* requirements.txt swaps SudachiDict-core for SudachiDict-full, matching the
  new Dictionary(dict="full") call.
* NOTE(review): sudachi_rules() passes categories["sudachi"] as
  `inflection_types`, and tags_to_rules() then reads
  inflection_types["sudachi"] and may call back into sudachi_rules().
  Confirm that the nested key exists and that this mutual call terminates.

 bot/crawlers.py                 |  1 +
 bot/entries/jitenon_kotowaza.py |  2 +-
 bot/yomichan/grammar.py         | 70 ++++++++++++++++++++++-----------
 requirements.txt                |  2 +-
 4 files changed, 49 insertions(+), 26 deletions(-)

diff --git a/bot/crawlers.py b/bot/crawlers.py
index b705a02..c55896e 100644
--- a/bot/crawlers.py
+++ b/bot/crawlers.py
@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
 
 import bot.scraper as Scraper
 import bot.yomichan.export as YomichanExport
+
 from bot.entries.jitenon_kotowaza import JitenonKotowaza
 from bot.entries.jitenon_yoji import JitenonYoji
 
diff --git a/bot/entries/jitenon_kotowaza.py b/bot/entries/jitenon_kotowaza.py
index 9f4b8d8..6019c02 100644
--- a/bot/entries/jitenon_kotowaza.py
+++ b/bot/entries/jitenon_kotowaza.py
@@ -21,7 +21,7 @@ class JitenonKotowaza(Jitenon):
         for idx, headword in enumerate(self._headwords()):
             (expression, reading) = headword
             definition_tags = None
-            inflection_rules = Grammar.sudachi_rules(expression, reading)
+            inflection_rules = Grammar.sudachi_rules(expression)
             score = -idx
             glossary = self.yomichan_glossary
             sequence = self.sequence
diff --git a/bot/yomichan/grammar.py b/bot/yomichan/grammar.py
index 6f30061..1a47ade 100644
--- a/bot/yomichan/grammar.py
+++ b/bot/yomichan/grammar.py
@@ -1,38 +1,60 @@
 from sudachipy import tokenizer
 from sudachipy import dictionary
+import bot.data as Data
 
 
-def sudachi_rules(expression, reading):
-    tokenizer_obj = dictionary.Dictionary().create()
+__U_KANA_LIST = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
+                 "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]
+
+__SUDACHI_DICTIONARY = None
+__SUDACHI_INFLECTION_TYPES = None
+
+
+def sudachi_rules(expression):
+    global __SUDACHI_DICTIONARY
+    global __SUDACHI_INFLECTION_TYPES
+    if __SUDACHI_DICTIONARY is None:
+        __SUDACHI_DICTIONARY = dictionary.Dictionary(dict="full").create()
+    if __SUDACHI_INFLECTION_TYPES is None:
+        categories = Data.yomichan_inflection_categories()
+        __SUDACHI_INFLECTION_TYPES = categories["sudachi"]
     splitmode = tokenizer.Tokenizer.SplitMode.A
-    tokens = tokenizer_obj.tokenize(expression, splitmode)
+    tokens = __SUDACHI_DICTIONARY.tokenize(expression, splitmode)
+    if len(tokens) == 0:
+        return ""
     pos = tokens[len(tokens)-1].part_of_speech()[4]
     tags = pos.split("-")
-    rules = __sudachi_tags_to_rules(tags, expression, reading)
+    rules = tags_to_rules(expression, tags, __SUDACHI_INFLECTION_TYPES)
     return rules
 
 
-def __sudachi_tags_to_rules(tags, expression, reading):
-    u_endings = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
-                 "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]
+def tags_to_rules(expression, tags, inflection_types):
     rules = set()
+    exp_final_character = expression[len(expression)-1:]
     for tag in tags:
-        if expression.endswith("い"):
-            if tag == "形容詞" or "ナイ" in tag or "タイ" in tag:
-                rules.add("adj-i")
-        if expression.endswith("る"):
-            if "一" in tag or tag == "レル":
+        if tag in inflection_types["sahen"]:
+            if expression.endswith("する"):
+                rules.add("vs")
+            elif expression.endswith("為る"):
+                rules.add("vs")
+            elif expression.endswith("ずる"):
+                rules.add("vz")
+            elif expression.endswith("す"):
+                rules.add("v5")
+        if tag in inflection_types["godan"]:
+            if exp_final_character in __U_KANA_LIST:
+                rules.add("v5")
+        if tag in inflection_types["ichidan"]:
+            if expression.endswith("る"):
                 rules.add("v1")
-        if "二" in tag or "四" in tag or "五" in tag:
-            for u_ending in u_endings:
-                if expression.endswith(u_ending):
-                    rules.add("v5")
-                    break
-        if "サ" in tag and (expression.endswith("する") or expression == "為る"):
-            rules.add("vs")
-        if "サ" in tag and expression.endswith("ずる"):
-            rules.add("vz")
-    if expression.endswith("来る") and reading.endswith("くる"):
-        rules = set()
-        rules.add("vk")
+        if tag in inflection_types["keiyoushi"]:
+            if expression.endswith("い"):
+                rules.add("adj-i")
+        if tag in inflection_types["kahen"]:
+            if expression.endswith("くる"):
+                rules.add("vk")
+            elif expression.endswith("来る"):
+                rules.add("vk")
+        if tag in inflection_types["sudachi"]:
+            return sudachi_rules(expression)
     return " ".join(list(rules))
diff --git a/requirements.txt b/requirements.txt
index db52d42..566327c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,7 @@ idna==3.4
 requests==2.28.2
 six==1.16.0
 soupsieve==2.4
-SudachiDict-core==20230110
+SudachiDict-full==20230110
 SudachiPy==0.6.7
 urllib3==1.26.15
 webencodings==0.5.1