2023-04-08 03:05:36 +00:00
|
|
|
|
import re
|
2023-07-08 21:49:03 +00:00
|
|
|
|
from abc import abstractmethod
|
2023-04-08 23:17:09 +00:00
|
|
|
|
from datetime import datetime, date
|
|
|
|
|
from bs4 import BeautifulSoup
|
2023-04-08 03:05:36 +00:00
|
|
|
|
|
2023-07-27 00:28:50 +00:00
|
|
|
|
from bot.entries.base.entry import Entry
|
|
|
|
|
import bot.entries.base.expressions as Expressions
|
2023-04-08 03:05:36 +00:00
|
|
|
|
|
|
|
|
|
|
2023-07-27 00:28:50 +00:00
|
|
|
|
class JitenonEntry(Entry):
    """Base class for dictionary entries parsed from a Jitenon HTML page.

    The page's ``kanjirighttb`` table is read row by row; each row's header
    text is translated to an attribute name through the mapping returned by
    ``_get_column_map`` (supplied by subclasses) and the row's cell text is
    stored on that attribute.
    """

    def __init__(self, target, entry_id):
        """Create an entry whose parsed fields start out empty.

        Args:
            target: Forwarded to ``Entry.__init__``. Its ``value`` attribute
                is read by ``get_global_identifier``.
            entry_id: Forwarded to ``Entry.__init__``. Used in the global
                identifier and as the first column of ``__str__``.
        """
        super().__init__(target, entry_id)
        self.expression = ""
        self.yomikata = ""
        self.definition = ""
        self.other_forms = []
        # Placeholder kept when the page carries no "dateModified" value.
        self.modified_date = date(1970, 1, 1)
        self.attribution = ""

    def get_global_identifier(self):
        """Return ``@<target value>-<entry id>``, the id zero-padded to 6."""
        return f"@{self.target.value}-{self.entry_id:06}"

    def set_page(self, page):
        """Parse an HTML page and populate this entry's fields from it.

        Args:
            page: HTML source text of the entry page.

        Raises:
            ValueError: If the page has no element with class
                ``kanjirighttb``.
        """
        soup = BeautifulSoup(page, features="html5lib")
        # The modification date is searched for in the raw text, not the
        # parsed tree.
        self.__set_modified_date(page)
        self.__set_attribution(soup)
        table = soup.find(class_="kanjirighttb")
        if table is None:
            raise ValueError("Error: table data not found in page.")
        colname = ""
        for row in table.find("tbody").find_all("tr"):
            # A row without its own header cell reuses the header of the
            # preceding row.
            if row.th is not None:
                colname = row.th.text
            colval = self.__clean_text(row.td.text)
            self.__set_column(colname, colval)
        # Only the table markup is retained, not the whole page.
        self._page = table.decode()

    def get_page_soup(self):
        """Return a new ``BeautifulSoup`` tree built from the stored table."""
        return BeautifulSoup(self._page, "html5lib")

    def get_part_of_speech_tags(self):
        """Return an empty list; Jitenon has no part-of-speech tags."""
        # Jitenon doesn't have any
        return []

    def _get_headwords(self):
        """Build a mapping of reading -> list of expressions.

        Every reading from ``_get_readings`` maps to the main expression;
        the expressions from ``_other_form_headwords`` are then merged in
        under their own readings without duplicates.
        """
        headwords = {}
        for reading in self._get_readings():
            headwords[reading] = [self.expression]
        for reading, expressions in self._other_form_headwords().items():
            known = headwords.setdefault(reading, [])
            for expression in expressions:
                if expression not in known:
                    known.append(expression)
        return headwords

    @abstractmethod
    def _get_column_map(self):
        """Return a mapping of table header text to attribute name.

        The keys are looked up with the header text of each table row and
        the values are used with ``getattr``/``setattr`` on this object.
        """
        raise NotImplementedError

    def __set_modified_date(self, page):
        """Set ``modified_date`` from a ``"dateModified": "YYYY-MM-DD`` value.

        The attribute is left untouched when the page text has no such value.
        """
        m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
        if m is None:
            return
        self.modified_date = datetime.strptime(m.group(1), '%Y-%m-%d').date()

    def __set_attribution(self, soup):
        """Set ``attribution`` to the text of the ``copyright`` element.

        Falls back to an empty string when the element is absent.
        """
        # Look the element up once and reuse the result.
        element = soup.find(class_="copyright")
        self.attribution = element.text if element is not None else ""

    def __set_column(self, colname, colval):
        """Store one table cell value on the attribute mapped to ``colname``.

        String attributes are overwritten; list attributes accumulate
        values. Attributes of any other type are left unchanged.

        Raises:
            KeyError: If ``colname`` is not a key of the column map.
        """
        attr_name = self._get_column_map()[colname]
        attr_value = getattr(self, attr_name)
        if isinstance(attr_value, str):
            setattr(self, attr_name, colval)
        elif isinstance(attr_value, list):
            if not attr_value:
                # An empty list is replaced by a new list object rather
                # than being appended to in place.
                setattr(self, attr_name, [colval])
            else:
                attr_value.append(colval)

    def _get_readings(self):
        """Return the list of readings encoded in ``self.yomikata``.

        Recognized formats, tried in order:

        1. kana only -> the value itself;
        2. kana followed by ``※`` and anything else -> the leading kana;
        3. kana with one parenthesized kana inside -> whatever
           ``Expressions.expand_abbreviation`` returns for the value;
        4. kana followed by parenthesized alternatives separated by ``/``
           -> the leading kana plus each stripped alternative.

        Any other value is reported with ``print`` and ``[""]`` is returned.
        """
        yomikata = self.yomikata
        if re.search(r"^[ぁ-ヿ、]+$", yomikata):
            return [yomikata]
        m = re.search(r"^([ぁ-ヿ、]+)※", yomikata)
        if m:
            return [m.group(1)]
        # The parentheses must be matched literally: written as bare regex
        # groups, this pattern could only match values already accepted by
        # the kana-only pattern above, so this branch was unreachable.
        # NOTE(review): both ASCII and full-width parentheses are accepted
        # because the form used by the page text is not visible here;
        # confirm that expand_abbreviation handles the same form(s).
        if re.search(r"^[ぁ-ヿ、]+[（(][ぁ-ヿ、][）)][ぁ-ヿ、]+$", yomikata):
            return Expressions.expand_abbreviation(yomikata)
        # Same literal-parenthesis fix; group 2 is the text between them.
        m = re.search(r"^([ぁ-ヿ、]+)[（(]([ぁ-ヿ/\s、]+)[）)]$", yomikata)
        if m:
            return [m.group(1)] + [alt.strip() for alt in m.group(2).split("/")]
        print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
        return [""]

    def _other_form_headwords(self):
        """Build a mapping of reading -> expressions from ``other_forms``.

        Each value is expected to look like ``expression(reading)``, the
        reading being kana. Values in any other format are reported with
        ``print`` and skipped.
        """
        other_form_headwords = {}
        for val in self.other_forms:
            # Literal parentheses (ASCII or full-width) delimit the reading.
            # As bare regex groups they matched no parenthesis at all, which
            # rejected "expression(reading)" and split unparenthesized text
            # before its last kana character.
            m = re.search(r"^([^（(]+)[（(]([ぁ-ヿ、]+)[）)]$", val)
            if not m:
                print(f"Invalid 異形 format: {val}\n{self}\n")
                continue
            expression, reading = m.group(1), m.group(2)
            expressions = other_form_headwords.setdefault(reading, [])
            if expression not in expressions:
                expressions.append(expression)
        return other_form_headwords

    @staticmethod
    def __clean_text(text):
        """Normalize table cell text.

        Removes newlines and spaces, replaces ``,`` with ``、`` and strips
        any remaining leading/trailing whitespace.
        """
        text = text.replace("\n", "").replace(",", "、").replace(" ", "")
        return text.strip()

    def __str__(self):
        """Return the entry id and mapped attributes as one comma-joined line.

        String attributes are emitted as-is and list attributes are joined
        with ``;``. Attributes of other types are omitted.
        """
        colvals = [str(self.entry_id)]
        for attr_name in self._get_column_map().values():
            attr_val = getattr(self, attr_name)
            if isinstance(attr_val, str):
                colvals.append(attr_val)
            elif isinstance(attr_val, list):
                colvals.append(";".join(attr_val))
        return ",".join(colvals)
|