From 071144c808cc1a06b5f833c4d423f2166cc2de2d Mon Sep 17 00:00:00 2001
From: stephenmk <stephenmk@users.noreply.github.com>
Date: Sat, 22 Apr 2023 12:03:00 -0500
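Subject: [PATCH] Use full version of sudachi dictionary

Switch the dependency from SudachiDict-core to SudachiDict-full and
create the tokenizer with dict="full".

While here, rework bot/yomichan/grammar.py:

- Create the Sudachi tokenizer once and keep it in a module-level
  global instead of building a new Dictionary on every call to
  sudachi_rules().
- Drop the `reading` parameter from sudachi_rules() and update the
  caller in bot/entries/jitenon_kotowaza.py accordingly.
- Return an empty string when the tokenizer yields no tokens.
- Replace the hard-coded part-of-speech tag checks with lookups in the
  inflection categories returned by
  bot.data.yomichan_inflection_categories(), and rename
  __sudachi_tags_to_rules() to tags_to_rules(), which now takes the
  inflection-type table as an argument.
- Behavior change: the "vk" rule no longer depends on the reading and
  no longer discards the other rules; it is now added alongside them
  when a tag is in the "kahen" category and the expression has a
  matching ending.
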
---
 bot/crawlers.py                 |  1 +
 bot/entries/jitenon_kotowaza.py |  2 +-
 bot/yomichan/grammar.py         | 70 ++++++++++++++++++++++-----------
 requirements.txt                |  2 +-
 4 files changed, 49 insertions(+), 26 deletions(-)

diff --git a/bot/crawlers.py b/bot/crawlers.py
index b705a02..c55896e 100644
--- a/bot/crawlers.py
+++ b/bot/crawlers.py
@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
 
 import bot.scraper as Scraper
 import bot.yomichan.export as YomichanExport
+
 from bot.entries.jitenon_kotowaza import JitenonKotowaza
 from bot.entries.jitenon_yoji import JitenonYoji
 
diff --git a/bot/entries/jitenon_kotowaza.py b/bot/entries/jitenon_kotowaza.py
index 9f4b8d8..6019c02 100644
--- a/bot/entries/jitenon_kotowaza.py
+++ b/bot/entries/jitenon_kotowaza.py
@@ -21,7 +21,7 @@ class JitenonKotowaza(Jitenon):
         for idx, headword in enumerate(self._headwords()):
             (expression, reading) = headword
             definition_tags = None
-            inflection_rules = Grammar.sudachi_rules(expression, reading)
+            inflection_rules = Grammar.sudachi_rules(expression)
             score = -idx
             glossary = self.yomichan_glossary
             sequence = self.sequence
diff --git a/bot/yomichan/grammar.py b/bot/yomichan/grammar.py
index 6f30061..1a47ade 100644
--- a/bot/yomichan/grammar.py
+++ b/bot/yomichan/grammar.py
@@ -1,38 +1,60 @@
 from sudachipy import tokenizer
 from sudachipy import dictionary
 
+import bot.data as Data
 
-def sudachi_rules(expression, reading):
-    tokenizer_obj = dictionary.Dictionary().create()
+__U_KANA_LIST = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
+                 "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]
+
+__SUDACHI_DICTIONARY = None
+__SUDACHI_INFLECTION_TYPES = None
+
+
+def sudachi_rules(expression):
+    global __SUDACHI_DICTIONARY
+    global __SUDACHI_INFLECTION_TYPES
+    if __SUDACHI_DICTIONARY is None:
+        __SUDACHI_DICTIONARY = dictionary.Dictionary(dict="full").create()
+    if __SUDACHI_INFLECTION_TYPES is None:
+        categories = Data.yomichan_inflection_categories()
+        __SUDACHI_INFLECTION_TYPES = categories["sudachi"]
     splitmode = tokenizer.Tokenizer.SplitMode.A
-    tokens = tokenizer_obj.tokenize(expression, splitmode)
+    tokens = __SUDACHI_DICTIONARY.tokenize(expression, splitmode)
+    if len(tokens) == 0:
+        return ""
     pos = tokens[len(tokens)-1].part_of_speech()[4]
     tags = pos.split("-")
-    rules = __sudachi_tags_to_rules(tags, expression, reading)
+    rules = tags_to_rules(expression, tags, __SUDACHI_INFLECTION_TYPES)
     return rules
 
 
-def __sudachi_tags_to_rules(tags, expression, reading):
-    u_endings = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
-                 "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]
+def tags_to_rules(expression, tags, inflection_types):
     rules = set()
+    exp_final_character = expression[len(expression)-1:]
     for tag in tags:
-        if expression.endswith("い"):
-            if tag == "形容詞" or "ナイ" in tag or "タイ" in tag:
-                rules.add("adj-i")
-        if expression.endswith("る"):
-            if "一" in tag or tag == "レル":
+        if tag in inflection_types["sahen"]:
+            if expression.endswith("する"):
+                rules.add("vs")
+            elif expression.endswith("為る"):
+                rules.add("vs")
+            elif expression.endswith("ずる"):
+                rules.add("vz")
+            elif expression.endswith("す"):
+                rules.add("v5")
+        if tag in inflection_types["godan"]:
+            if exp_final_character in __U_KANA_LIST:
+                rules.add("v5")
+        if tag in inflection_types["ichidan"]:
+            if expression.endswith("る"):
                 rules.add("v1")
-        if "二" in tag or "四" in tag or "五" in tag:
-            for u_ending in u_endings:
-                if expression.endswith(u_ending):
-                    rules.add("v5")
-                    break
-        if "サ" in tag and (expression.endswith("する") or expression == "為る"):
-            rules.add("vs")
-        if "サ" in tag and expression.endswith("ずる"):
-            rules.add("vz")
-    if expression.endswith("来る") and reading.endswith("くる"):
-        rules = set()
-        rules.add("vk")
+        if tag in inflection_types["keiyoushi"]:
+            if expression.endswith("い"):
+                rules.add("adj-i")
+        if tag in inflection_types["kahen"]:
+            if expression.endswith("くる"):
+                rules.add("vk")
+            elif expression.endswith("来る"):
+                rules.add("vk")
+        if tag in inflection_types["sudachi"]:
+            return sudachi_rules(expression)
     return " ".join(list(rules))
diff --git a/requirements.txt b/requirements.txt
index db52d42..566327c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,7 @@ idna==3.4
 requests==2.28.2
 six==1.16.0
 soupsieve==2.4
-SudachiDict-core==20230110
+SudachiDict-full==20230110
 SudachiPy==0.6.7
 urllib3==1.26.15
 webencodings==0.5.1