Use the full version of the Sudachi dictionary

2023-04-22 12:03:00 -05:00 · 2023-04-22 12:03:00 -05:00 · 071144c808
parent eee82f0b5a
commit 071144c808
4 changed files with 49 additions and 26 deletions
--- a/bot/crawlers.py
+++ b/bot/crawlers.py
@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup

 import bot.scraper as Scraper
 import bot.yomichan.export as YomichanExport
+
 from bot.entries.jitenon_kotowaza import JitenonKotowaza
 from bot.entries.jitenon_yoji import JitenonYoji

--- a/bot/entries/jitenon_kotowaza.py
+++ b/bot/entries/jitenon_kotowaza.py
@@ -21,7 +21,7 @@ class JitenonKotowaza(Jitenon):
        for idx, headword in enumerate(self._headwords()):
            (expression, reading) = headword
            definition_tags = None
-            inflection_rules = Grammar.sudachi_rules(expression, reading)
+            inflection_rules = Grammar.sudachi_rules(expression)
            score = -idx
            glossary = self.yomichan_glossary
            sequence = self.sequence
--- a/bot/yomichan/grammar.py
+++ b/bot/yomichan/grammar.py
@@ -1,38 +1,60 @@
 from sudachipy import tokenizer
 from sudachipy import dictionary

+import bot.data as Data

-def sudachi_rules(expression, reading):
-    tokenizer_obj = dictionary.Dictionary().create()
+__U_KANA_LIST = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
+                 "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]
+
+__SUDACHI_DICTIONARY = None
+__SUDACHI_INFLECTION_TYPES = None
+
+
+def sudachi_rules(expression):
+    global __SUDACHI_DICTIONARY
+    global __SUDACHI_INFLECTION_TYPES
+    if __SUDACHI_DICTIONARY is None:
+        __SUDACHI_DICTIONARY = dictionary.Dictionary(dict="full").create()
+    if __SUDACHI_INFLECTION_TYPES is None:
+        categories = Data.yomichan_inflection_categories()
+        __SUDACHI_INFLECTION_TYPES = categories["sudachi"]
    splitmode = tokenizer.Tokenizer.SplitMode.A
-    tokens = tokenizer_obj.tokenize(expression, splitmode)
+    tokens = __SUDACHI_DICTIONARY.tokenize(expression, splitmode)
+    if len(tokens) == 0:
+        return ""
    pos = tokens[len(tokens)-1].part_of_speech()[4]
    tags = pos.split("-")
-    rules = __sudachi_tags_to_rules(tags, expression, reading)
+    rules = tags_to_rules(expression, tags, __SUDACHI_INFLECTION_TYPES)
    return rules


-def __sudachi_tags_to_rules(tags, expression, reading):
-    u_endings = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
-                 "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]
+def tags_to_rules(expression, tags, inflection_types):
    rules = set()
+    exp_final_character = expression[len(expression)-1:]
    for tag in tags:
-        if expression.endswith("い"):
-            if tag == "形容詞" or "ナイ" in tag or "タイ" in tag:
-                rules.add("adj-i")
-        if expression.endswith("る"):
-            if "一" in tag or tag == "レル":
+        if tag in inflection_types["sahen"]:
+            if expression.endswith("する"):
+                rules.add("vs")
+            elif expression.endswith("為る"):
+                rules.add("vs")
+            elif expression.endswith("ずる"):
+                rules.add("vz")
+            elif expression.endswith("す"):
+                rules.add("v5")
+        if tag in inflection_types["godan"]:
+            if exp_final_character in __U_KANA_LIST:
+                rules.add("v5")
+        if tag in inflection_types["ichidan"]:
+            if expression.endswith("る"):
                rules.add("v1")
-        if "二" in tag or "四" in tag or "五" in tag:
-            for u_ending in u_endings:
-                if expression.endswith(u_ending):
-                    rules.add("v5")
-                    break
-        if "サ" in tag and (expression.endswith("する") or expression == "為る"):
-            rules.add("vs")
-        if "サ" in tag and expression.endswith("ずる"):
-            rules.add("vz")
-    if expression.endswith("来る") and reading.endswith("くる"):
-        rules = set()
-        rules.add("vk")
+        if tag in inflection_types["keiyoushi"]:
+            if expression.endswith("い"):
+                rules.add("adj-i")
+        if tag in inflection_types["kahen"]:
+            if expression.endswith("くる"):
+                rules.add("vk")
+            elif expression.endswith("来る"):
+                rules.add("vk")
+        if tag in inflection_types["sudachi"]:
+            return sudachi_rules(expression)
    return " ".join(list(rules))
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,7 @@ idna==3.4
 requests==2.28.2
 six==1.16.0
 soupsieve==2.4
-SudachiDict-core==20230110
+SudachiDict-full==20230110
 SudachiPy==0.6.7
 urllib3==1.26.15
 webencodings==0.5.1