Use full version of sudachi dictionary

2023-04-22 12:03:00 -05:00 · 2023-04-22 12:03:00 -05:00 · 071144c808
parent eee82f0b5a
commit 071144c808
4 changed files with 49 additions and 26 deletions
--- a/bot/crawlers.py
+++ b/bot/crawlers.py
@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
 import bot.scraper as Scraper
 import bot.yomichan.export as YomichanExport
 from bot.entries.jitenon_kotowaza import JitenonKotowaza
 from bot.entries.jitenon_yoji import JitenonYoji
--- a/bot/entries/jitenon_kotowaza.py
+++ b/bot/entries/jitenon_kotowaza.py
@@ -21,7 +21,7 @@ class JitenonKotowaza(Jitenon):
        for idx, headword in enumerate(self._headwords()):
            (expression, reading) = headword
            definition_tags = None
-            inflection_rules = Grammar.sudachi_rules(expression, reading)
+            inflection_rules = Grammar.sudachi_rules(expression)
            score = -idx
            glossary = self.yomichan_glossary
            sequence = self.sequence
--- a/bot/yomichan/grammar.py
+++ b/bot/yomichan/grammar.py
@@ -1,38 +1,60 @@
 from sudachipy import tokenizer
 from sudachipy import dictionary
 import bot.data as Data
-def sudachi_rules(expression, reading):
+__U_KANA_LIST = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
-    tokenizer_obj = dictionary.Dictionary().create()
+                 "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]
 __SUDACHI_DICTIONARY = None
 __SUDACHI_INFLECTION_TYPES = None
 def sudachi_rules(expression):
    global __SUDACHI_DICTIONARY
    global __SUDACHI_INFLECTION_TYPES
    if __SUDACHI_DICTIONARY is None:
        __SUDACHI_DICTIONARY = dictionary.Dictionary(dict="full").create()
    if __SUDACHI_INFLECTION_TYPES is None:
        categories = Data.yomichan_inflection_categories()
        __SUDACHI_INFLECTION_TYPES = categories["sudachi"]
    splitmode = tokenizer.Tokenizer.SplitMode.A
-    tokens = tokenizer_obj.tokenize(expression, splitmode)
+    tokens = __SUDACHI_DICTIONARY.tokenize(expression, splitmode)
    if len(tokens) == 0:
        return ""
    pos = tokens[len(tokens)-1].part_of_speech()[4]
    tags = pos.split("-")
-    rules = __sudachi_tags_to_rules(tags, expression, reading)
+    rules = tags_to_rules(expression, tags, __SUDACHI_INFLECTION_TYPES)
    return rules
-def __sudachi_tags_to_rules(tags, expression, reading):
+def tags_to_rules(expression, tags, inflection_types):
    u_endings = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
                 "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]
    rules = set()
    exp_final_character = expression[len(expression)-1:]
    for tag in tags:
-        if expression.endswith("い"):
+        if tag in inflection_types["sahen"]:
-            if tag == "形容詞" or "ナイ" in tag or "タイ" in tag:
+            if expression.endswith("する"):
                rules.add("adj-i")
        if expression.endswith("る"):
            if "一" in tag or tag == "レル":
                rules.add("v1")
        if "二" in tag or "四" in tag or "五" in tag:
            for u_ending in u_endings:
                if expression.endswith(u_ending):
                    rules.add("v5")
                    break
        if "サ" in tag and (expression.endswith("する") or expression == "為る"):
                rules.add("vs")
-        if "サ" in tag and expression.endswith("ずる"):
+            elif expression.endswith("為る"):
                rules.add("vs")
            elif expression.endswith("ずる"):
                rules.add("vz")
-    if expression.endswith("来る") and reading.endswith("くる"):
+            elif expression.endswith("す"):
-        rules = set()
+                rules.add("v5")
        if tag in inflection_types["godan"]:
            if exp_final_character in __U_KANA_LIST:
                rules.add("v5")
        if tag in inflection_types["ichidan"]:
            if expression.endswith("る"):
                rules.add("v1")
        if tag in inflection_types["keiyoushi"]:
            if expression.endswith("い"):
                rules.add("adj-i")
        if tag in inflection_types["kahen"]:
            if expression.endswith("くる"):
                rules.add("vk")
            elif expression.endswith("来る"):
                rules.add("vk")
        if tag in inflection_types["sudachi"]:
            return sudachi_rules(expression)
    return " ".join(list(rules))
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,7 @@ idna==3.4
 requests==2.28.2
 six==1.16.0
 soupsieve==2.4
-SudachiDict-core==20230110
+SudachiDict-full==20230110
 SudachiPy==0.6.7
 urllib3==1.26.15
 webencodings==0.5.1