jitenbot/bot/entries/jitenon.py
2023-04-22 21:01:52 -05:00

162 lines
5.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from datetime import datetime, date
from bs4 import BeautifulSoup
import bot.expressions as Expressions
class JitenonEntry:
    """Base class for one dictionary entry parsed from a saved HTML page.

    Subclasses must define ``COLUMNS``: a mapping from the header text of a
    table row to a two-item list ``[attribute_name, default_value]``.  A
    ``str`` default means the column holds a single value; a ``list``
    default means the values of repeated rows are accumulated.
    """

    # NOTE(review): the reading/variant patterns below use full-width
    # parentheses (U+FF08 / U+FF09) as delimiters.  They are spelled as
    # escapes because the literal characters were missing from the patterns,
    # which left the variant pattern with a single capture group (so
    # ``group(2)`` raised IndexError) and made the abbreviation pattern
    # unreachable.  Confirm the delimiter against real page data.
    _RE_MODIFIED = re.compile(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})")
    _RE_PLAIN = re.compile(r"^[ぁ-ヿ、]+$")
    _RE_NOTE = re.compile(r"^([ぁ-ヿ、]+)※")
    _RE_SHOURYAKU = re.compile(
        r"^[ぁ-ヿ、]+\uFF08[ぁ-ヿ、]\uFF09[ぁ-ヿ、]+$")
    _RE_ALTS = re.compile(r"^([ぁ-ヿ、]+)\uFF08([ぁ-ヿ/\s、]+)\uFF09$")
    _RE_IKEI = re.compile(r"^([^\uFF08]+)\uFF08([ぁ-ヿ、]+)\uFF09$")

    def __init__(self, entry_id):
        """Create an empty entry whose column attributes hold their defaults.

        Args:
            entry_id: identifier of the entry; stored as-is.
        """
        self.entry_id = entry_id
        self.markup = ""
        self.modified_date = date(1970, 1, 1)
        self.attribution = ""
        for column in self.COLUMNS.values():
            default = column[1]
            if isinstance(default, list):
                # COLUMNS is a class attribute: copy list defaults so that
                # instances never share (and mutate) the same list object.
                default = list(default)
            setattr(self, column[0], default)
        self._headwords = None

    def set_markup(self, path):
        """Load the HTML file at ``path`` and populate the entry from it.

        Sets ``modified_date``, ``attribution``, every column attribute
        found in the ``kanjirighttb`` table, and ``markup`` (the table's
        own HTML).

        Raises:
            KeyError: if a row header is not a key of ``COLUMNS``.
            AttributeError: if an expected element is absent from the page.
        """
        # Decode explicitly rather than with the platform default encoding.
        # NOTE(review): assumes the cached pages are stored as UTF-8.
        with open(path, "r", encoding="utf-8") as f:
            html = f.read()
        soup = BeautifulSoup(html, features="html5lib")
        self.__set_modified_date(html)
        self.attribution = soup.find(class_="copyright").text
        table = soup.find(class_="kanjirighttb")
        rows = table.find("tbody").find_all("tr")
        colname = ""
        for row in rows:
            # A row without a header cell continues the previous column.
            if row.th is not None:
                colname = row.th.text
            colval = self.__clean_text(row.td.text)
            self.__set_column(colname, colval)
        self.markup = table.decode()
        # Column values changed, so any previously computed headwords are
        # stale and must be rebuilt on the next get_headwords() call.
        self._headwords = None

    def get_headwords(self):
        """Return a dict mapping each reading to its list of expressions.

        The result is computed on first use and cached.
        """
        if self._headwords is None:
            self._set_headwords()
        return self._headwords

    def _set_headwords(self):
        """Build the headword mapping from the reading and variant columns."""
        headwords = {}
        for yomikata in self.__yomikatas():
            headwords[yomikata] = [self.expression]
        for reading, expressions in self.__ikei_headwords().items():
            known = headwords.setdefault(reading, [])
            for expression in expressions:
                if expression not in known:
                    known.append(expression)
        self._headwords = headwords

    def __set_modified_date(self, html):
        """Set ``modified_date`` from the page's ``dateModified`` value.

        Leaves the current value untouched when no date is found.
        """
        m = self._RE_MODIFIED.search(html)
        if not m:
            return
        # Named ``modified`` so the imported ``date`` class is not shadowed.
        modified = datetime.strptime(m.group(1), '%Y-%m-%d').date()
        self.modified_date = modified

    def __set_column(self, colname, colval):
        """Store ``colval`` on the attribute that ``COLUMNS`` maps to.

        String attributes are overwritten; list attributes are extended.

        Raises:
            KeyError: if ``colname`` is not a key of ``COLUMNS``.
        """
        attr_name = self.COLUMNS[colname][0]
        attr_value = getattr(self, attr_name)
        if isinstance(attr_value, str):
            setattr(self, attr_name, colval)
        elif isinstance(attr_value, list):
            if not attr_value:
                # Rebind instead of mutating, in case the empty list is
                # shared with something else.
                setattr(self, attr_name, [colval])
            else:
                attr_value.append(colval)

    def __yomikatas(self):
        """Return the list of readings described by ``self.yomikata``.

        Recognised forms, tried in order: a plain reading; a reading
        followed by a "※" note (the note is dropped); a reading with one
        parenthesised character (expanded by
        ``Expressions.expand_shouryaku``); a reading followed by
        parenthesised, "/"-separated alternatives.  For any other format a
        message is printed and ``[""]`` is returned.
        """
        yomikata = self.yomikata
        if self._RE_PLAIN.search(yomikata):
            return [yomikata]
        m = self._RE_NOTE.search(yomikata)
        if m:
            return [m.group(1)]
        if self._RE_SHOURYAKU.search(yomikata):
            return Expressions.expand_shouryaku(yomikata)
        m = self._RE_ALTS.search(yomikata)
        if m:
            yomikatas = [m.group(1)]
            for alt in m.group(2).split("/"):
                yomikatas.append(alt.strip())
            return yomikatas
        print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
        return [""]

    def __ikei_headwords(self):
        """Return a dict mapping reading -> expressions parsed from ``ikei``.

        Each value is expected to be an expression followed by its
        parenthesised reading; values in any other format are reported
        with ``print`` and skipped.
        """
        ikei_headwords = {}
        for val in self.ikei:
            m = self._RE_IKEI.search(val)
            if not m:
                print(f"Invalid 異形 format: {val}\n{self}\n")
                continue
            expression = m.group(1)
            reading = m.group(2)
            expressions = ikei_headwords.setdefault(reading, [])
            if expression not in expressions:
                expressions.append(expression)
        return ikei_headwords

    @staticmethod
    def __clean_text(text):
        """Return ``text`` without newlines, commas, spaces or outer blanks."""
        text = text.replace("\n", "")
        text = text.replace(",", "")
        text = text.replace(" ", "")
        text = text.strip()
        return text

    def __str__(self):
        """Return the entry id and all column values joined by commas."""
        colvals = [str(self.entry_id)]
        for attr in self.COLUMNS.values():
            attr_val = getattr(self, attr[0])
            if isinstance(attr_val, str):
                colvals.append(attr_val)
            elif isinstance(attr_val, list):
                colvals.append("".join(attr_val))
        return ",".join(colvals)
class JitenonYojiEntry(JitenonEntry):
    """Entry whose table uses the 四字熟語 column layout."""

    # Table header text -> [attribute name, default value].
    # A str default holds a single value; a list default collects the
    # values of repeated rows.
    COLUMNS = {
        "四字熟語": ["expression", ""],
        "読み方": ["yomikata", ""],
        "意味": ["imi", ""],
        "出典": ["shutten", ""],
        "漢検級": ["kankenkyuu", ""],
        "場面用途": ["bamenyouto", ""],
        "異形": ["ikei", []],
        "類義語": ["ruigigo", []],
    }

    def __init__(self, sequence):
        """Initialise the entry, using ``sequence`` as its entry id."""
        super().__init__(sequence)
class JitenonKotowazaEntry(JitenonEntry):
    """Entry whose table uses the 言葉 (kotowaza) column layout."""

    # Table header text -> [attribute name, default value].
    # A str default holds a single value; a list default collects the
    # values of repeated rows.
    COLUMNS = {
        "言葉": ["expression", ""],
        "読み方": ["yomikata", ""],
        "意味": ["imi", ""],
        "出典": ["shutten", ""],
        "例文": ["reibun", ""],
        "異形": ["ikei", []],
        "類句": ["ruiku", []],
    }

    def __init__(self, sequence):
        """Initialise the entry, using ``sequence`` as its entry id."""
        super().__init__(sequence)

    def _set_headwords(self):
        """Build the headword mapping, hard-coding one special entry.

        The expression "金棒引き・鉄棒引き" holds two spellings in a single
        field, so its headwords are given explicitly; every other entry
        uses the inherited logic.
        """
        if self.expression != "金棒引き・鉄棒引き":
            super()._set_headwords()
            return
        self._headwords = {
            "かなぼうひき": ["金棒引き", "鉄棒引き"]
        }