jitenbot/bot/entries/base/jitenon_entry.py

143 lines
4.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from abc import abstractmethod
from datetime import datetime, date
from bs4 import BeautifulSoup
from bot.entries.base.entry import Entry
import bot.entries.base.expressions as Expressions
class JitenonEntry(Entry):
    """Entry whose fields are read from the ``kanjirighttb`` table of a page.

    Subclasses must implement ``_get_column_map()``; it maps each table
    header text to the name of the attribute that receives that column.
    """

    # One reading character: hiragana through katakana (U+3041-U+30FF) or
    # the 、 separator.
    _KANA = "[ぁ-ヿ、]"
    # Fullwidth parentheses （ ） spelled as regex escapes so the delimiters
    # stay visible in tools that hide or strip confusable Unicode characters.
    _OPEN = r"\uFF08"
    _CLOSE = r"\uFF09"

    # A reading made of kana only.
    _PLAIN_READING = re.compile(rf"^{_KANA}+$")
    # A kana reading followed by a ※ annotation; only the kana part is kept.
    _ANNOTATED_READING = re.compile(rf"^({_KANA}+)※")
    # A reading with one parenthesised kana in the middle.
    _ABBREVIATED_READING = re.compile(
        rf"^{_KANA}+{_OPEN}{_KANA}{_CLOSE}{_KANA}+$"
    )
    # A reading followed by parenthesised, "/"-separated alternatives.
    _ALTERNATE_READINGS = re.compile(
        rf"^({_KANA}+){_OPEN}([ぁ-ヿ/\s、]+){_CLOSE}$"
    )
    # An alternative written form followed by its parenthesised reading.
    _OTHER_FORM = re.compile(rf"^([^{_OPEN}]+){_OPEN}({_KANA}+){_CLOSE}$")

    def __init__(self, target, entry_id):
        """Create an entry with empty fields and a 1970-01-01 modified date."""
        super().__init__(target, entry_id)
        self.expression = ""
        self.yomikata = ""
        self.definition = ""
        self.other_forms = []
        self.modified_date = date(1970, 1, 1)
        self.attribution = ""

    def get_global_identifier(self):
        """Return ``@<target value>-<entry id zero-padded to 6 digits>``."""
        return f"@{self.target.value}-{self.entry_id:06}"

    def set_page(self, page):
        """Parse *page* (HTML text) and populate this entry's fields.

        Sets the modified date and attribution, copies every row of the
        ``kanjirighttb`` table into the attribute chosen by the column map,
        and stores the table's markup in ``self._page``.

        Raises:
            ValueError: if the page has no element with class
                ``kanjirighttb``.

        A header that the column map cannot resolve propagates the error
        raised by the map lookup.
        """
        soup = BeautifulSoup(page, features="html5lib")
        self.__set_modified_date(page)
        self.__set_attribution(soup)
        table = soup.find(class_="kanjirighttb")
        if table is None:
            raise ValueError("Error: table data not found in page.")
        colname = ""
        for row in table.find("tbody").find_all("tr"):
            # A row without its own header cell belongs to the column named
            # by the most recent header.
            if row.th is not None:
                colname = row.th.text
            self.__set_column(colname, self.__clean_text(row.td.text))
        # Only the table markup is kept, not the whole page.
        self._page = table.decode()

    def get_page_soup(self):
        """Return the stored table markup parsed with html5lib."""
        return BeautifulSoup(self._page, "html5lib")

    def get_part_of_speech_tags(self):
        """Return an empty list."""
        # Jitenon doesn't have any
        return []

    def _get_headwords(self):
        """Return a dict mapping each reading to its list of expressions.

        Every reading of the entry maps to the main expression; readings and
        expressions from ``other_forms`` are merged in without duplicates.
        """
        headwords = {
            reading: [self.expression] for reading in self._get_readings()
        }
        for reading, expressions in self._other_form_headwords().items():
            known = headwords.setdefault(reading, [])
            for expression in expressions:
                if expression not in known:
                    known.append(expression)
        return headwords

    @abstractmethod
    def _get_column_map(self):
        """Return the mapping from table header text to attribute name."""
        raise NotImplementedError

    def __set_modified_date(self, page):
        """Set ``modified_date`` from the page's ``"dateModified"`` value.

        The attribute is left unchanged when no such value is found.
        """
        m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
        if m is None:
            return
        self.modified_date = datetime.strptime(m.group(1), '%Y-%m-%d').date()

    def __set_attribution(self, soup):
        """Set ``attribution`` to the text of the ``copyright`` element.

        Falls back to an empty string when the element is absent.
        """
        attribution = soup.find(class_="copyright")
        self.attribution = "" if attribution is None else attribution.text

    def __set_column(self, colname, colval):
        """Store *colval* in the attribute that the column map gives *colname*.

        A string attribute is overwritten; a list attribute accumulates the
        values. Attributes of any other type are left untouched.
        """
        attr_name = self._get_column_map()[colname]
        attr_value = getattr(self, attr_name)
        if isinstance(attr_value, str):
            setattr(self, attr_name, colval)
        elif isinstance(attr_value, list):
            if len(attr_value) == 0:
                # An empty list is replaced rather than mutated.
                setattr(self, attr_name, [colval])
            else:
                attr_value.append(colval)

    def _get_readings(self):
        """Return the list of readings encoded in ``self.yomikata``.

        Recognised formats, tried in order:

        * kana only -> that single reading;
        * kana followed by ``※`` and anything else -> the kana part;
        * kana with one parenthesised kana in the middle -> whatever
          ``Expressions.expand_abbreviation`` returns for it;
        * kana followed by parenthesised alternatives separated by ``/``
          -> the leading reading plus each stripped alternative.

        Any other format is reported on stdout and yields ``[""]``.
        """
        yomikata = self.yomikata
        if self._PLAIN_READING.search(yomikata):
            return [yomikata]
        m = self._ANNOTATED_READING.search(yomikata)
        if m:
            return [m.group(1)]
        if self._ABBREVIATED_READING.search(yomikata):
            return Expressions.expand_abbreviation(yomikata)
        m = self._ALTERNATE_READINGS.search(yomikata)
        if m:
            alternatives = [alt.strip() for alt in m.group(2).split("/")]
            return [m.group(1), *alternatives]
        print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
        return [""]

    def _other_form_headwords(self):
        """Return a dict mapping reading -> expressions from ``other_forms``.

        Each value must be an expression immediately followed by its kana
        reading in fullwidth parentheses; values in any other format are
        reported on stdout and skipped.
        """
        other_form_headwords = {}
        for val in self.other_forms:
            m = self._OTHER_FORM.search(val)
            if not m:
                print(f"Invalid 異形 format: {val}\n{self}\n")
                continue
            expression, reading = m.group(1), m.group(2)
            expressions = other_form_headwords.setdefault(reading, [])
            if expression not in expressions:
                expressions.append(expression)
        return other_form_headwords

    @staticmethod
    def __clean_text(text):
        """Return *text* without newlines, ASCII commas, or spaces."""
        text = text.replace("\n", "")
        # NOTE(review): commas are deleted outright here — confirm that no
        # replacement character was intended.
        text = text.replace(",", "")
        text = text.replace(" ", "")
        return text.strip()

    def __str__(self):
        """Return the entry id and mapped column values joined by commas.

        Only string and list attributes named by the column map are
        included; other types are skipped.
        """
        colvals = [str(self.entry_id)]
        for attr_name in self._get_column_map().values():
            attr_val = getattr(self, attr_name)
            if isinstance(attr_val, str):
                colvals.append(attr_val)
            elif isinstance(attr_val, list):
                # NOTE(review): list items are concatenated with an empty
                # separator — confirm that none was intended.
                colvals.append("".join(attr_val))
        return ",".join(colvals)