Use full version of sudachi dictionary

This commit is contained in:
stephenmk 2023-04-22 12:03:00 -05:00
parent eee82f0b5a
commit 071144c808
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
4 changed files with 49 additions and 26 deletions

View file

@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
import bot.scraper as Scraper
import bot.yomichan.export as YomichanExport
from bot.entries.jitenon_kotowaza import JitenonKotowaza
from bot.entries.jitenon_yoji import JitenonYoji

View file

@ -21,7 +21,7 @@ class JitenonKotowaza(Jitenon):
for idx, headword in enumerate(self._headwords()):
(expression, reading) = headword
definition_tags = None
inflection_rules = Grammar.sudachi_rules(expression, reading)
inflection_rules = Grammar.sudachi_rules(expression)
score = -idx
glossary = self.yomichan_glossary
sequence = self.sequence

View file

@ -1,38 +1,60 @@
from sudachipy import tokenizer
from sudachipy import dictionary
import bot.data as Data
# Kana that can end a word in the "u" row, including voiced and semi-voiced
# forms. Used to decide whether an expression can take the "v5" rule.
# NOTE(review): the single-kana literals in this module were blank in the
# reviewed text and have been restored from context -- confirm against the
# repository before merging.
__U_KANA_LIST = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
                 "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]

# Lazily initialised on the first call to sudachi_rules() and then reused,
# so the (large) full dictionary is only loaded once per process.
__SUDACHI_DICTIONARY = None
__SUDACHI_INFLECTION_TYPES = None


def sudachi_rules(expression, reading=None):
    """Return the inflection rules for *expression* as judged by Sudachi.

    The expression is tokenized with the "full" Sudachi dictionary in split
    mode A; the hyphen-separated tags found in field 4 of the last token's
    part-of-speech tuple are then mapped to rule identifiers by
    ``tags_to_rules`` using the "sudachi" entry of
    ``Data.yomichan_inflection_categories()``.

    Args:
        expression: The headword text to analyse.
        reading: Ignored. Accepted only so that callers still using the
            older two-argument form ``sudachi_rules(expression, reading)``
            keep working.

    Returns:
        A space-separated string of rule identifiers, or ``""`` when the
        tokenizer yields no tokens.
    """
    global __SUDACHI_DICTIONARY
    global __SUDACHI_INFLECTION_TYPES
    if __SUDACHI_DICTIONARY is None:
        # Despite the name, this holds the tokenizer built from the dictionary.
        __SUDACHI_DICTIONARY = dictionary.Dictionary(dict="full").create()
    if __SUDACHI_INFLECTION_TYPES is None:
        categories = Data.yomichan_inflection_categories()
        __SUDACHI_INFLECTION_TYPES = categories["sudachi"]
    splitmode = tokenizer.Tokenizer.SplitMode.A
    tokens = __SUDACHI_DICTIONARY.tokenize(expression, splitmode)
    if len(tokens) == 0:
        return ""
    # Only the final token is inspected: it carries the inflecting ending.
    pos = tokens[-1].part_of_speech()[4]
    tags = pos.split("-")
    return tags_to_rules(expression, tags, __SUDACHI_INFLECTION_TYPES)


def tags_to_rules(expression, tags, inflection_types):
    """Map part-of-speech tags to a space-separated string of rule names.

    Args:
        expression: The headword text; its ending decides which rule applies
            within a matched category.
        tags: Iterable of tag strings to classify.
        inflection_types: Mapping with the keys ``"sahen"``, ``"godan"``,
            ``"ichidan"``, ``"keiyoushi"``, ``"kahen"`` and ``"sudachi"``,
            each holding the collection of tags that belong to that category.

    Returns:
        The matched rule identifiers ("vs", "vz", "v5", "v1", "adj-i", "vk")
        joined by single spaces in sorted order, or ``""`` if nothing
        matched. If a tag belongs to the ``"sudachi"`` category the decision
        is delegated entirely to ``sudachi_rules(expression)``.
    """
    rules = set()
    exp_final_character = expression[-1:]  # "" for an empty expression
    for tag in tags:
        if tag in inflection_types["sahen"]:
            if expression.endswith(("する", "為る")):
                rules.add("vs")
            elif expression.endswith("ずる"):
                rules.add("vz")
            elif expression.endswith("す"):
                rules.add("v5")
        if tag in inflection_types["godan"]:
            if exp_final_character in __U_KANA_LIST:
                rules.add("v5")
        if tag in inflection_types["ichidan"]:
            if expression.endswith("る"):
                rules.add("v1")
        if tag in inflection_types["keiyoushi"]:
            if expression.endswith("い"):
                rules.add("adj-i")
        if tag in inflection_types["kahen"]:
            if expression.endswith(("くる", "来る")):
                rules.add("vk")
        if tag in inflection_types["sudachi"]:
            # NOTE(review): sudachi_rules() calls back into this function
            # with its own category table; presumably that table's "sudachi"
            # entry is empty so this cannot recurse forever -- confirm.
            return sudachi_rules(expression)
    # Sorted so the output is reproducible regardless of set iteration order.
    return " ".join(sorted(rules))

View file

@ -8,7 +8,7 @@ idna==3.4
requests==2.28.2
six==1.16.0
soupsieve==2.4
SudachiDict-core==20230110
SudachiDict-full==20230110
SudachiPy==0.6.7
urllib3==1.26.15
webencodings==0.5.1