jitenbot/jitenon_yoji.py
stephenmk f9ad9e6d21
First version
Support for Jitenon's yoji dictionary
2023-04-07 22:05:36 -05:00

117 lines
4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import yomichan as Yomichan
import util as Util
class JitenonYoji:
columns = {
"四字熟語": ["yojijukugo", ""],
"読み方": ["yomikata", ""],
"意味": ["imi", ""],
"出典": ["shutten", ""],
"漢検級": ["kankenkyuu", ""],
"場面用途": ["bamenyouto", ""],
"異形": ["ikei", []],
"類義語": ["ruigigo", []],
}
def __init__(self, sequence):
self.sequence = sequence
self.yomichan_glossary = [""]
for column in self.columns.values():
setattr(self, column[0], column[1])
def add_soup(self, yoji_soup):
table = yoji_soup.find(class_="kanjirighttb")
rows = table.find("tbody").find_all("tr")
colname = ""
for row in rows:
colname = row.th.text if row.th is not None else colname
colval = row.td.decode_contents()
self.__set_column(colname, colval)
self.yomichan_glossary = [Yomichan.soup_to_gloss(table)]
def yomichan_terms(self):
terms = []
for idx, headword in enumerate(self.__headwords()):
(yoji, reading) = headword
definition_tags = None
inflection_rules = ""
score = -idx
glossary = self.yomichan_glossary
sequence = self.sequence
term_tags = ""
term = [
yoji, reading, definition_tags, inflection_rules,
score, glossary, sequence, term_tags
]
terms.append(term)
return terms
def __set_column(self, colname, colval):
attr_name = self.columns[colname][0]
attr_value = getattr(self, attr_name)
colval = colval.replace("\n", "").replace(",", "").strip()
if isinstance(attr_value, str):
setattr(self, attr_name, colval)
elif isinstance(attr_value, list):
if len(attr_value) == 0:
setattr(self, attr_name, [colval])
else:
attr_value.append(colval)
setattr(self, attr_name, attr_value)
def __headwords(self):
words = []
for yomikata in self.__yomikatas():
headword = [self.yojijukugo, yomikata]
if headword in words:
words.remove(headword)
words.append(headword)
for headword in self.__ikei_headwords():
if headword in words:
words.remove(headword)
words.append(headword)
return words
def __yomikatas(self):
m = re.search(r"^[ぁ-ヿ]+$", self.yomikata)
if m:
return [self.yomikata]
m = re.search(r"^([ぁ-ヿ]+)<br/>", self.yomikata)
if m:
return [m.group(1)]
m = re.search(r"^[ぁ-ヿ]+[ぁ-ヿ][ぁ-ヿ]+$", self.yomikata)
if m:
return Util.expand_shouryaku(self.yomikata)
m = re.search(r"^([ぁ-ヿ]+)([ぁ-ヿ/\s]+)$", self.yomikata)
if m:
yomikatas = [m.group(1)]
alts = m.group(2).split("/")
for alt in alts:
yomikatas.append(alt.strip())
return yomikatas
raise Exception(f"Invalid 読み方 format: {self.yomikata}\n{self}")
def __ikei_headwords(self):
ikei_headwords = []
for val in self.ikei:
m = re.search(r"^([^]+)([ぁ-ヿ]+)$", val)
if m:
headword = [m.group(1), m.group(2)]
ikei_headwords.append(headword)
else:
raise Exception(f"Invalid 異形 format: {val}\n{self}")
return ikei_headwords
def __str__(self):
colvals = [str(self.sequence)]
for attr in self.columns.values():
attr_val = getattr(self, attr[0])
if isinstance(attr_val, str):
colvals.append(attr_val)
elif isinstance(attr_val, list):
colvals.append("".join(attr_val))
return ",".join(colvals)