jitenbot/bot/entries/jitenon.py

173 lines
5.6 KiB
Python
Raw Normal View History

import re
from datetime import datetime, date
from bs4 import BeautifulSoup
2023-04-23 02:01:52 +00:00
import bot.expressions as Expressions
2023-04-22 22:56:52 +00:00
class JitenonEntry:
def __init__(self, entry_id):
self.entry_id = entry_id
self.markup = ""
self.modified_date = date(1970, 1, 1)
self.attribution = ""
for column in self.COLUMNS.values():
setattr(self, column[0], column[1])
self._headwords = None
def set_markup(self, path):
with open(path, "r") as f:
html = f.read()
soup = BeautifulSoup(html, features="html5lib")
self.__set_modified_date(html)
self.attribution = soup.find(class_="copyright").text
table = soup.find(class_="kanjirighttb")
rows = table.find("tbody").find_all("tr")
colname = ""
for row in rows:
colname = row.th.text if row.th is not None else colname
colval = self.__clean_text(row.td.text)
self.__set_column(colname, colval)
self.markup = table.decode()
def get_headwords(self):
if self._headwords is not None:
return self._headwords
self._set_headwords()
return self._headwords
def get_first_expression(self):
headwords = self.get_headwords()
expressions = next(iter(headwords.values()))
expression = expressions[0]
return expression
def get_first_reading(self):
headwords = self.get_headwords()
reading = next(iter(headwords.keys()))
return reading
def _set_headwords(self):
headwords = {}
for yomikata in self.__yomikatas():
headwords[yomikata] = [self.expression]
ikei_headwords = self.__ikei_headwords()
for reading, expressions in ikei_headwords.items():
if reading not in headwords:
headwords[reading] = []
for expression in expressions:
if expression not in headwords[reading]:
headwords[reading].append(expression)
self._headwords = headwords
def __set_modified_date(self, html):
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
if not m:
return
date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
self.modified_date = date
def __set_column(self, colname, colval):
attr_name = self.COLUMNS[colname][0]
attr_value = getattr(self, attr_name)
if isinstance(attr_value, str):
setattr(self, attr_name, colval)
elif isinstance(attr_value, list):
if len(attr_value) == 0:
setattr(self, attr_name, [colval])
else:
attr_value.append(colval)
def __yomikatas(self):
yomikata = self.yomikata
2023-04-10 20:20:33 +00:00
m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
if m:
2023-04-10 16:14:52 +00:00
return [yomikata]
2023-04-10 20:20:33 +00:00
m = re.search(r"^([ぁ-ヿ、]+)※", yomikata)
if m:
return [m.group(1)]
2023-04-10 20:20:33 +00:00
m = re.search(r"^[ぁ-ヿ、]+[ぁ-ヿ、][ぁ-ヿ、]+$", yomikata)
if m:
2023-04-23 02:01:52 +00:00
return Expressions.expand_shouryaku(yomikata)
2023-04-10 20:20:33 +00:00
m = re.search(r"^([ぁ-ヿ、]+)([ぁ-ヿ/\s、]+)$", yomikata)
if m:
yomikatas = [m.group(1)]
alts = m.group(2).split("/")
for alt in alts:
yomikatas.append(alt.strip())
return yomikatas
2023-04-10 20:20:33 +00:00
print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
return [""]
def __ikei_headwords(self):
ikei_headwords = {}
for val in self.ikei:
2023-04-10 20:20:33 +00:00
m = re.search(r"^([^]+)([ぁ-ヿ、]+)$", val)
if not m:
2023-04-10 20:20:33 +00:00
print(f"Invalid 異形 format: {val}\n{self}\n")
continue
expression = m.group(1)
reading = m.group(2)
if reading not in ikei_headwords:
ikei_headwords[reading] = []
if expression not in ikei_headwords[reading]:
ikei_headwords[reading].append(expression)
return ikei_headwords
@staticmethod
def __clean_text(text):
text = text.replace("\n", "")
text = text.replace(",", "")
text = text.replace(" ", "")
text = text.strip()
return text
def __str__(self):
colvals = [str(self.entry_id)]
for attr in self.COLUMNS.values():
attr_val = getattr(self, attr[0])
if isinstance(attr_val, str):
colvals.append(attr_val)
elif isinstance(attr_val, list):
colvals.append("".join(attr_val))
return ",".join(colvals)
class JitenonYojiEntry(JitenonEntry):
    """Entry whose table is headed by the 四字熟語 (four-character idiom) row."""

    # Table header text -> [attribute name, default value].
    # "" marks a single-valued column, [] a multi-valued one.
    COLUMNS = {
        "四字熟語": ["expression", ""],   # the idiom itself
        "読み方": ["yomikata", ""],       # reading
        "意味": ["imi", ""],              # meaning
        "出典": ["shutten", ""],          # source
        "漢検級": ["kankenkyuu", ""],     # kanji-kentei level
        "場面用途": ["bamenyouto", ""],   # situation / usage
        "異形": ["ikei", []],             # variant forms
        "類義語": ["ruigigo", []],        # synonyms
    }

    def __init__(self, sequence):
        """Create an empty entry; ``sequence`` is passed on as the entry id."""
        super().__init__(sequence)
class JitenonKotowazaEntry(JitenonEntry):
    """Entry whose table is headed by the 言葉 (word / phrase) row."""

    # Table header text -> [attribute name, default value].
    # "" marks a single-valued column, [] a multi-valued one.
    COLUMNS = {
        "言葉": ["expression", ""],   # the phrase itself
        "読み方": ["yomikata", ""],   # reading
        "意味": ["imi", ""],          # meaning
        "出典": ["shutten", ""],      # source
        "例文": ["reibun", ""],       # example sentence
        "異形": ["ikei", []],         # variant forms
        "類句": ["ruiku", []],        # similar phrases
    }

    def __init__(self, sequence):
        """Create an empty entry; ``sequence`` is passed on as the entry id."""
        super().__init__(sequence)

    def _set_headwords(self):
        """Build the headword mapping, special-casing one combined entry.

        The expression "金棒引き・鉄棒引き" holds two spellings joined by
        "・"; it is mapped by hand to a single reading with both
        spellings. Every other entry uses the inherited logic.
        """
        if self.expression != "金棒引き・鉄棒引き":
            super()._set_headwords()
            return
        self._headwords = {
            "かなぼうひき": ["金棒引き", "鉄棒引き"]
        }