jitenbot/jitenon_yoji.py
2023-04-10 11:14:52 -05:00

132 lines
4.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from datetime import datetime, date
from bs4 import BeautifulSoup
import yomichan as Yomichan
import util as Util
class JitenonYoji:
columns = {
"四字熟語": ["expression", ""],
"読み方": ["yomikata", ""],
"意味": ["imi", ""],
"出典": ["shutten", ""],
"漢検級": ["kankenkyuu", ""],
"場面用途": ["bamenyouto", ""],
"異形": ["ikei", []],
"類義語": ["ruigigo", []],
}
def __init__(self, sequence):
self.sequence = sequence
self.yomichan_glossary = [""]
self.modified_date = date(1970, 1, 1)
self.attribution = ""
for column in self.columns.values():
setattr(self, column[0], column[1])
def add_document(self, html):
yoji_soup = BeautifulSoup(html, features="html5lib")
self.__set_modified_date(html)
self.attribution = yoji_soup.find(class_="copyright").text
table = yoji_soup.find(class_="kanjirighttb")
rows = table.find("tbody").find_all("tr")
colname = ""
for row in rows:
colname = row.th.text if row.th is not None else colname
colval = row.td.decode_contents()
self.__set_column(colname, colval)
self.yomichan_glossary = [Yomichan.soup_to_gloss(table)]
def yomichan_terms(self):
terms = []
for idx, headword in enumerate(self.__headwords()):
(yoji, reading) = headword
definition_tags = None
inflection_rules = ""
score = -idx
glossary = self.yomichan_glossary
sequence = self.sequence
term_tags = ""
term = [
yoji, reading, definition_tags, inflection_rules,
score, glossary, sequence, term_tags
]
terms.append(term)
return terms
def __set_modified_date(self, html):
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
if not m:
return
date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
self.modified_date = date
def __set_column(self, colname, colval):
attr_name = self.columns[colname][0]
attr_value = getattr(self, attr_name)
colval = colval.replace("\n", "").replace(",", "").strip()
if isinstance(attr_value, str):
setattr(self, attr_name, colval)
elif isinstance(attr_value, list):
if len(attr_value) == 0:
setattr(self, attr_name, [colval])
else:
attr_value.append(colval)
setattr(self, attr_name, attr_value)
def __headwords(self):
words = []
for yomikata in self.__yomikatas():
headword = [self.expression, yomikata]
if headword in words:
words.remove(headword)
words.append(headword)
for headword in self.__ikei_headwords():
if headword in words:
words.remove(headword)
words.append(headword)
return words
def __yomikatas(self):
yomikata = self.yomikata.replace(" ", "")
m = re.search(r"^[ぁ-ヿ]+$", yomikata)
if m:
return [yomikata]
m = re.search(r"^([ぁ-ヿ]+)<br/>", yomikata)
if m:
return [m.group(1)]
m = re.search(r"^[ぁ-ヿ]+[ぁ-ヿ][ぁ-ヿ]+$", yomikata)
if m:
return Util.expand_shouryaku(yomikata)
m = re.search(r"^([ぁ-ヿ]+)([ぁ-ヿ/\s]+)$", yomikata)
if m:
yomikatas = [m.group(1)]
alts = m.group(2).split("/")
for alt in alts:
yomikatas.append(alt.strip())
return yomikatas
raise Exception(f"Invalid 読み方 format: {self.yomikata}\n{self}")
def __ikei_headwords(self):
ikei_headwords = []
for val in self.ikei:
m = re.search(r"^([^]+)([ぁ-ヿ]+)$", val)
if m:
headword = [m.group(1), m.group(2)]
ikei_headwords.append(headword)
else:
raise Exception(f"Invalid 異形 format: {val}\n{self}")
return ikei_headwords
def __str__(self):
colvals = [str(self.sequence)]
for attr in self.columns.values():
attr_val = getattr(self, attr[0])
if isinstance(attr_val, str):
colvals.append(attr_val)
elif isinstance(attr_val, list):
colvals.append("".join(attr_val))
return ",".join(colvals)