2023-04-08 03:05:36 +00:00
|
|
|
|
import re
|
2023-04-08 23:17:09 +00:00
|
|
|
|
from datetime import datetime, date
|
|
|
|
|
from bs4 import BeautifulSoup
|
2023-04-08 03:05:36 +00:00
|
|
|
|
|
2023-04-11 02:08:36 +00:00
|
|
|
|
import yomichan.html_gloss as YomichanGloss
|
2023-04-08 03:05:36 +00:00
|
|
|
|
import util as Util
|
|
|
|
|
|
|
|
|
|
|
2023-04-10 20:20:33 +00:00
|
|
|
|
class Jitenon:
    """Parse one Jitenon entry page into column attributes and headwords.

    This class reads ``self.columns`` but does not define it, so it has to be
    supplied elsewhere (presumably by a subclass -- confirm against callers).
    From the way it is used here, ``columns`` maps a table header text to a
    sequence whose item 0 is an attribute name and whose item 1 is that
    attribute's default value (a ``str`` or a ``list``).

    The methods below also read ``self.expression``, ``self.yomikata`` and
    ``self.ikei``; those are expected to be among the attributes created from
    ``columns``.

    The reading/variant patterns use U+FF08 / U+FF09 (FULLWIDTH LEFT / RIGHT
    PARENTHESIS) as literal delimiters.  They are spelled as ``\\uFF08`` and
    ``\\uFF09`` escapes so they cannot be confused with ASCII parentheses,
    which the regex engine would treat as capture groups.
    """

    # Link text of the form "<word>（<kana>）", optionally with one
    # parenthesised kana nested inside the reading.  Compiled once because
    # they are applied to every anchor of the table.
    _LINK_PATTERNS = (
        re.compile(r"^(.+)\uFF08[ぁ-ヿ、\s]+\uFF09$"),
        re.compile(
            r"^(.+)\uFF08[ぁ-ヿ、\s]+\uFF08[ぁ-ヿ、\s]\uFF09[ぁ-ヿ、\s]+\uFF09$"
        ),
    )

    def __init__(self, sequence):
        """Create an empty entry identified by *sequence*.

        Every attribute named in ``self.columns`` is initialised to its
        declared default.
        """
        self.sequence = sequence
        self.yomichan_glossary = [""]
        self.modified_date = date(1970, 1, 1)
        self.attribution = ""
        for column in self.columns.values():
            default = column[1]
            # Copy list defaults so that instances never share (and never
            # mutate) the list object stored in the ``columns`` declaration.
            if isinstance(default, list):
                default = list(default)
            setattr(self, column[0], default)

    def add_document(self, html):
        """Populate this entry from the HTML source of an entry page.

        Sets ``modified_date`` (when the page declares one), ``attribution``,
        every column attribute found in the ``kanjirighttb`` table, and
        ``yomichan_glossary``.

        Raises:
            AttributeError: if the expected elements are missing from *html*.
            KeyError: if a table header is not a key of ``self.columns``.
        """
        soup = BeautifulSoup(html, features="html5lib")
        self.__set_modified_date(html)
        self.attribution = soup.find(class_="copyright").text
        table = soup.find(class_="kanjirighttb")
        rows = table.find("tbody").find_all("tr")
        colname = ""
        for row in rows:
            # A row without a header cell continues the previous column.
            colname = row.th.text if row.th is not None else colname
            colval = self.__clean(row.td.text)
            self.__set_column(colname, colval)
        # The table is rewritten in place only after the plain-text column
        # values have been extracted from it.
        self.__prepare_yomichan_soup(table)
        gloss = YomichanGloss.make_gloss(table)
        self.yomichan_glossary = [gloss]

    def __set_modified_date(self, html):
        """Set ``modified_date`` from the page's ``"dateModified"`` value.

        The date is left unchanged when the page has no such value or when
        the value is not a real calendar date.
        """
        m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
        if not m:
            return
        try:
            modified = datetime.strptime(m.group(1), '%Y-%m-%d').date()
        except ValueError:
            # Shaped like a date but not a valid one (e.g. month 13).
            return
        self.modified_date = modified

    def __clean(self, text):
        """Return *text* without newlines or spaces, with "," turned into "、"."""
        text = text.replace("\n", "")
        text = text.replace(",", "、")
        text = text.replace(" ", "")
        text = text.strip()
        return text

    def __set_column(self, colname, colval):
        """Store *colval* in the attribute that ``columns`` maps *colname* to.

        String attributes are overwritten; list attributes are extended.

        Raises:
            KeyError: if *colname* is not a key of ``self.columns``.
        """
        attr_name = self.columns[colname][0]
        attr_value = getattr(self, attr_name)
        if isinstance(attr_value, str):
            setattr(self, attr_name, colval)
        elif isinstance(attr_value, list):
            if not attr_value:
                # Bind a fresh list instead of appending to the empty one,
                # which may still be a default object shared with others.
                setattr(self, attr_name, [colval])
            else:
                attr_value.append(colval)

    def __prepare_yomichan_soup(self, soup):
        """Rewrite *soup* in place before it is converted to a gloss.

        * Anchors whose text looks like "<word>（<reading>）" get an ``href``
          that queries for the word part only.
        * ``<p>`` elements are renamed to ``<span>``.
        * ``<th>`` elements receive a centring inline style.
        """
        for a in soup.find_all("a"):
            for pattern in self._LINK_PATTERNS:
                m = pattern.search(a.text)
                if m:
                    a['href'] = f"?query={m.group(1)}&wildcards=off"
                    break
        for p in soup.find_all("p"):
            p.name = "span"
        for th in soup.find_all("th"):
            th['style'] = "vertical-align: middle; text-align: center;"

    def _headwords(self):
        """Return the ``[expression, reading]`` pairs for this entry.

        Pairs built from ``self.expression`` and each reading come first,
        followed by the pairs parsed from ``self.ikei``.  A pair that occurs
        more than once is kept only at its last position.
        """
        candidates = [
            [self.expression, yomikata] for yomikata in self.__yomikatas()
        ]
        candidates.extend(self.__ikei_headwords())
        words = []
        for headword in candidates:
            if headword in words:
                words.remove(headword)
            words.append(headword)
        return words

    def __yomikatas(self):
        """Return the list of readings encoded in ``self.yomikata``.

        Recognised formats, tried in order:

        1. kana only;
        2. kana followed by a "※" note (the note is dropped);
        3. kana with one optional kana in fullwidth parentheses, expanded by
           ``Util.expand_shouryaku``;
        4. kana followed by "/"-separated alternatives in fullwidth
           parentheses.

        Any other format is reported on stdout and yields ``[""]``.
        """
        yomikata = self.yomikata
        m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
        if m:
            return [yomikata]
        m = re.search(r"^([ぁ-ヿ、]+)※", yomikata)
        if m:
            return [m.group(1)]
        m = re.search(r"^[ぁ-ヿ、]+\uFF08[ぁ-ヿ、]\uFF09[ぁ-ヿ、]+$", yomikata)
        if m:
            return Util.expand_shouryaku(yomikata)
        m = re.search(r"^([ぁ-ヿ、]+)\uFF08([ぁ-ヿ/\s、]+)\uFF09$", yomikata)
        if m:
            yomikatas = [m.group(1)]
            yomikatas.extend(alt.strip() for alt in m.group(2).split("/"))
            return yomikatas
        print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
        return [""]

    def __ikei_headwords(self):
        """Return ``[expression, reading]`` pairs parsed from ``self.ikei``.

        Each value must look like "<expression>（<kana reading>）" with
        fullwidth parentheses; other values are reported on stdout and
        skipped.
        """
        ikei_headwords = []
        for val in self.ikei:
            m = re.search(r"^([^\uFF08]+)\uFF08([ぁ-ヿ、]+)\uFF09$", val)
            if m:
                ikei_headwords.append([m.group(1), m.group(2)])
            else:
                print(f"Invalid 異形 format: {val}\n{self}\n")
        return ikei_headwords

    def __str__(self):
        """Return the sequence and column values as one comma-joined line.

        List-valued columns are joined with ";".  Column values that are
        neither ``str`` nor ``list`` are omitted.
        """
        colvals = [str(self.sequence)]
        for attr in self.columns.values():
            attr_val = getattr(self, attr[0])
            if isinstance(attr_val, str):
                colvals.append(attr_val)
            elif isinstance(attr_val, list):
                colvals.append(";".join(attr_val))
        return ",".join(colvals)
|