jitenbot/entries/jitenon.py

107 lines
3.7 KiB
Python
Raw Normal View History

import re
from datetime import datetime, date
from bs4 import BeautifulSoup
2023-04-10 20:20:33 +00:00
import yomichan.soup as YomichanSoup
import util as Util
2023-04-10 20:20:33 +00:00
class Jitenon:
    """Base record for one dictionary entry scraped from an HTML page.

    The class reads a ``columns`` mapping that is not defined here; it is
    expected to be provided by a subclass (NOTE(review): confirm against the
    subclasses).  As used below, each key of ``columns`` is the header text
    of a table row and each value is a sequence whose first item is the
    attribute name to store the cell under and whose second item is that
    attribute's default value (a ``str`` or a ``list``).

    ``_headwords`` additionally reads the attributes ``expression``,
    ``yomikata`` (a string) and ``ikei`` (a list of strings); these are
    presumably created through ``columns`` -- verify against the subclasses.
    """

    # Character class used for readings: hiragana/katakana plus the
    # ideographic comma.
    _KANA = "[ぁ-ヿ、]"

    # NOTE(review): the patterns below use full-width parentheses as
    # delimiters.  The previous patterns had no delimiters at all, which
    # made the abbreviation branch unreachable and left the variant-form
    # pattern with a single capture group (so ``m.group(2)`` raised
    # IndexError).  Confirm the delimiter against real page data.
    _RE_DATE_MODIFIED = re.compile(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})")
    _RE_PLAIN_READING = re.compile(rf"^{_KANA}+$")
    _RE_NOTED_READING = re.compile(rf"^({_KANA}+)※")
    _RE_ABBREVIATED_READING = re.compile(rf"^{_KANA}+（{_KANA}）{_KANA}+$")
    _RE_ALTERNATIVE_READINGS = re.compile(rf"^({_KANA}+)（([ぁ-ヿ/\s、]+)）$")
    _RE_IKEI = re.compile(rf"^([^（]+)（({_KANA}+)）$")

    def __init__(self, sequence):
        """Create an empty entry.

        Args:
            sequence: Identifier of the entry; only stored and echoed back
                by ``__str__``.
        """
        self.sequence = sequence
        self.yomichan_glossary = [""]
        self.modified_date = date(1970, 1, 1)
        self.attribution = ""
        for column in self.columns.values():
            default = column[1]
            # Copy list defaults so that instances never share (and never
            # mutate) the list object stored in ``columns``.
            if isinstance(default, list):
                default = list(default)
            setattr(self, column[0], default)

    def add_document(self, html):
        """Populate the entry from the HTML source of its page.

        Sets ``modified_date``, ``attribution``, every column attribute found
        in the ``kanjirighttb`` table, and ``yomichan_glossary``.

        Args:
            html: The page source as a string.

        Raises:
            AttributeError: If the ``copyright`` element, the table, its
                ``tbody`` or a row's ``td`` cell is missing.
            KeyError: If a row header is not a key of ``columns``.
        """
        yoji_soup = BeautifulSoup(html, features="html5lib")
        self.__set_modified_date(html)
        self.attribution = yoji_soup.find(class_="copyright").text
        table = yoji_soup.find(class_="kanjirighttb")
        rows = table.find("tbody").find_all("tr")
        colname = ""
        for row in rows:
            # A row without a header cell continues the previous column.
            if row.th is not None:
                colname = row.th.text
            self.__set_column(colname, row.td.text)
        gloss = YomichanSoup.make_gloss(table)  # note: modifies table
        self.yomichan_glossary = [gloss]

    def __set_modified_date(self, html):
        """Store the page's ``dateModified`` value, if one is present.

        ``modified_date`` is left unchanged when the marker is not found.
        """
        m = self._RE_DATE_MODIFIED.search(html)
        if m is None:
            return
        self.modified_date = datetime.strptime(m.group(1), "%Y-%m-%d").date()

    def __set_column(self, colname, colval):
        """Store one table cell under the attribute mapped to ``colname``.

        String attributes are overwritten; list attributes get the value
        appended.  Attributes of any other type are left untouched.

        Raises:
            KeyError: If ``colname`` is not a key of ``columns``.
        """
        attr_name = self.columns[colname][0]
        attr_value = getattr(self, attr_name)
        colval = colval.replace("\n", "").replace(",", "").strip()
        if isinstance(attr_value, str):
            setattr(self, attr_name, colval)
        elif isinstance(attr_value, list):
            # Build a new list instead of appending in place so a default
            # list shared through ``columns`` can never be modified.
            setattr(self, attr_name, attr_value + [colval])

    def _headwords(self):
        """Return the entry's ``[expression, reading]`` pairs.

        Pairs built from ``yomikata`` come first, followed by the pairs
        parsed from ``ikei``.  A pair that occurs more than once is kept only
        at the position of its last occurrence.
        """
        candidates = [[self.expression, y] for y in self.__yomikatas()]
        candidates.extend(self.__ikei_headwords())
        words = []
        for headword in candidates:
            if headword in words:
                words.remove(headword)
            words.append(headword)
        return words

    def __yomikatas(self):
        """Parse ``self.yomikata`` into a list of readings.

        Returns an empty list (after printing a diagnostic) when the value
        matches none of the recognised formats.
        """
        yomikata = self.yomikata.replace(" ", "")
        # Plain reading.
        if self._RE_PLAIN_READING.search(yomikata):
            return [yomikata]
        # Reading followed by a "※" annotation: keep only the reading.
        m = self._RE_NOTED_READING.search(yomikata)
        if m:
            return [m.group(1)]
        # Reading with one parenthesised character in the middle; the
        # expansion is delegated to Util.expand_shouryaku.
        if self._RE_ABBREVIATED_READING.search(yomikata):
            return Util.expand_shouryaku(yomikata)
        # Reading followed by parenthesised, "/"-separated alternatives.
        m = self._RE_ALTERNATIVE_READINGS.search(yomikata)
        if m:
            yomikatas = [m.group(1)]
            yomikatas.extend(alt.strip() for alt in m.group(2).split("/"))
            return yomikatas
        print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
        return []

    def __ikei_headwords(self):
        """Parse ``self.ikei`` into ``[expression, reading]`` pairs.

        Each value is expected to look like ``expression（reading）``; values
        that do not are reported with a printed diagnostic and skipped.
        """
        ikei_headwords = []
        for val in self.ikei:
            val = val.replace(" ", "")
            m = self._RE_IKEI.search(val)
            if m:
                ikei_headwords.append([m.group(1), m.group(2)])
            else:
                print(f"Invalid 異形 format: {val}\n{self}\n")
        return ikei_headwords

    def __str__(self):
        """Return the sequence and all str/list column values, comma-joined."""
        colvals = [str(self.sequence)]
        for attr in self.columns.values():
            attr_val = getattr(self, attr[0])
            if isinstance(attr_val, str):
                colvals.append(attr_val)
            elif isinstance(attr_val, list):
                colvals.append("".join(attr_val))
        return ",".join(colvals)