Use the full version of the Sudachi dictionary
This commit is contained in:
parent
eee82f0b5a
commit
071144c808
|
@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
|
||||||
|
|
||||||
import bot.scraper as Scraper
|
import bot.scraper as Scraper
|
||||||
import bot.yomichan.export as YomichanExport
|
import bot.yomichan.export as YomichanExport
|
||||||
|
|
||||||
from bot.entries.jitenon_kotowaza import JitenonKotowaza
|
from bot.entries.jitenon_kotowaza import JitenonKotowaza
|
||||||
from bot.entries.jitenon_yoji import JitenonYoji
|
from bot.entries.jitenon_yoji import JitenonYoji
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,7 @@ class JitenonKotowaza(Jitenon):
|
||||||
for idx, headword in enumerate(self._headwords()):
|
for idx, headword in enumerate(self._headwords()):
|
||||||
(expression, reading) = headword
|
(expression, reading) = headword
|
||||||
definition_tags = None
|
definition_tags = None
|
||||||
inflection_rules = Grammar.sudachi_rules(expression, reading)
|
inflection_rules = Grammar.sudachi_rules(expression)
|
||||||
score = -idx
|
score = -idx
|
||||||
glossary = self.yomichan_glossary
|
glossary = self.yomichan_glossary
|
||||||
sequence = self.sequence
|
sequence = self.sequence
|
||||||
|
|
|
@ -1,38 +1,60 @@
|
||||||
from sudachipy import tokenizer
|
from sudachipy import tokenizer
|
||||||
from sudachipy import dictionary
|
from sudachipy import dictionary
|
||||||
|
|
||||||
|
import bot.data as Data
|
||||||
|
|
||||||
def sudachi_rules(expression, reading):
|
__U_KANA_LIST = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
|
||||||
tokenizer_obj = dictionary.Dictionary().create()
|
"ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]
|
||||||
|
|
||||||
|
__SUDACHI_DICTIONARY = None
|
||||||
|
__SUDACHI_INFLECTION_TYPES = None
|
||||||
|
|
||||||
|
|
||||||
|
def sudachi_rules(expression):
|
||||||
|
global __SUDACHI_DICTIONARY
|
||||||
|
global __SUDACHI_INFLECTION_TYPES
|
||||||
|
if __SUDACHI_DICTIONARY is None:
|
||||||
|
__SUDACHI_DICTIONARY = dictionary.Dictionary(dict="full").create()
|
||||||
|
if __SUDACHI_INFLECTION_TYPES is None:
|
||||||
|
categories = Data.yomichan_inflection_categories()
|
||||||
|
__SUDACHI_INFLECTION_TYPES = categories["sudachi"]
|
||||||
splitmode = tokenizer.Tokenizer.SplitMode.A
|
splitmode = tokenizer.Tokenizer.SplitMode.A
|
||||||
tokens = tokenizer_obj.tokenize(expression, splitmode)
|
tokens = __SUDACHI_DICTIONARY.tokenize(expression, splitmode)
|
||||||
|
if len(tokens) == 0:
|
||||||
|
return ""
|
||||||
pos = tokens[len(tokens)-1].part_of_speech()[4]
|
pos = tokens[len(tokens)-1].part_of_speech()[4]
|
||||||
tags = pos.split("-")
|
tags = pos.split("-")
|
||||||
rules = __sudachi_tags_to_rules(tags, expression, reading)
|
rules = tags_to_rules(expression, tags, __SUDACHI_INFLECTION_TYPES)
|
||||||
return rules
|
return rules
|
||||||
|
|
||||||
|
|
||||||
def __sudachi_tags_to_rules(tags, expression, reading):
|
def tags_to_rules(expression, tags, inflection_types):
|
||||||
u_endings = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
|
|
||||||
"ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]
|
|
||||||
rules = set()
|
rules = set()
|
||||||
|
exp_final_character = expression[len(expression)-1:]
|
||||||
for tag in tags:
|
for tag in tags:
|
||||||
if expression.endswith("い"):
|
if tag in inflection_types["sahen"]:
|
||||||
if tag == "形容詞" or "ナイ" in tag or "タイ" in tag:
|
if expression.endswith("する"):
|
||||||
rules.add("adj-i")
|
|
||||||
if expression.endswith("る"):
|
|
||||||
if "一" in tag or tag == "レル":
|
|
||||||
rules.add("v1")
|
|
||||||
if "二" in tag or "四" in tag or "五" in tag:
|
|
||||||
for u_ending in u_endings:
|
|
||||||
if expression.endswith(u_ending):
|
|
||||||
rules.add("v5")
|
|
||||||
break
|
|
||||||
if "サ" in tag and (expression.endswith("する") or expression == "為る"):
|
|
||||||
rules.add("vs")
|
rules.add("vs")
|
||||||
if "サ" in tag and expression.endswith("ずる"):
|
elif expression.endswith("為る"):
|
||||||
|
rules.add("vs")
|
||||||
|
elif expression.endswith("ずる"):
|
||||||
rules.add("vz")
|
rules.add("vz")
|
||||||
if expression.endswith("来る") and reading.endswith("くる"):
|
elif expression.endswith("す"):
|
||||||
rules = set()
|
rules.add("v5")
|
||||||
|
if tag in inflection_types["godan"]:
|
||||||
|
if exp_final_character in __U_KANA_LIST:
|
||||||
|
rules.add("v5")
|
||||||
|
if tag in inflection_types["ichidan"]:
|
||||||
|
if expression.endswith("る"):
|
||||||
|
rules.add("v1")
|
||||||
|
if tag in inflection_types["keiyoushi"]:
|
||||||
|
if expression.endswith("い"):
|
||||||
|
rules.add("adj-i")
|
||||||
|
if tag in inflection_types["kahen"]:
|
||||||
|
if expression.endswith("くる"):
|
||||||
rules.add("vk")
|
rules.add("vk")
|
||||||
|
elif expression.endswith("来る"):
|
||||||
|
rules.add("vk")
|
||||||
|
if tag in inflection_types["sudachi"]:
|
||||||
|
return sudachi_rules(expression)
|
||||||
return " ".join(list(rules))
|
return " ".join(list(rules))
|
||||||
|
|
|
@ -8,7 +8,7 @@ idna==3.4
|
||||||
requests==2.28.2
|
requests==2.28.2
|
||||||
six==1.16.0
|
six==1.16.0
|
||||||
soupsieve==2.4
|
soupsieve==2.4
|
||||||
SudachiDict-core==20230110
|
SudachiDict-full==20230110
|
||||||
SudachiPy==0.6.7
|
SudachiPy==0.6.7
|
||||||
urllib3==1.26.15
|
urllib3==1.26.15
|
||||||
webencodings==0.5.1
|
webencodings==0.5.1
|
||||||
|
|
Loading…
Reference in a new issue