2023-04-23 01:26:54 +00:00
|
|
|
|
import re
|
2023-05-06 03:53:17 +00:00
|
|
|
|
import os
|
|
|
|
|
from bs4 import BeautifulSoup
|
2023-04-23 01:26:54 +00:00
|
|
|
|
|
2023-05-06 03:53:17 +00:00
|
|
|
|
import bot.icons as Icons
|
2023-04-23 01:26:54 +00:00
|
|
|
|
from bot.yomichan.glossary.gloss import make_gloss
|
|
|
|
|
|
|
|
|
|
|
2023-05-06 03:53:17 +00:00
|
|
|
|
class JitenonGlossary():
    """Shared HTML clean-up steps used to turn an entry page into a glossary.

    Every helper mutates the parsed document ("soup") it is given in place.
    Subclasses are expected to assign real values to ``_id_pattern`` and
    ``_expression_header``; both are ``None`` on the base class.
    """

    def __init__(self):
        # Regex applied to link hrefs with ``re.search``; its first capture
        # group is converted to ``int`` and used as a key into
        # ``entry.ID_TO_ENTRY``.
        self._id_pattern = None
        # Text of the <th> cell whose table row is dropped entirely.
        self._expression_header = None

    def _replace_punctuation(self, soup):
        """Apply the punctuation substitutions to every string node of *soup*."""
        # NOTE(review): as written, the first pair maps "/" to itself, so it
        # has no effect. Possibly a full-width character was lost somewhere;
        # confirm against the upstream file before changing it.
        punctuation = {
            "/": "/",
            ",": "、",
        }
        for el in soup.find_all(string=True):
            text = el.text
            for old, new in punctuation.items():
                text = text.replace(old, new)
            el.replace_with(text)

    def _add_internal_links(self, soup, entry):
        """Rewrite links that point at other entries into ``?query=`` links.

        Links matching ``self._id_pattern`` are replaced by a query on the
        first expression of the referenced entry. Links that already start
        with ``http:``, ``https:`` or ``?`` are left alone.

        Raises:
            Exception: if an href matches neither form.
            KeyError: if an ``<a>`` has no ``href`` attribute, or the captured
                id is missing from ``entry.ID_TO_ENTRY``.
        """
        for el in soup.find_all("a"):
            href = el.attrs["href"]
            m = re.search(self._id_pattern, href)
            if m is not None:
                ref_entry_id = int(m.group(1))
                ref_entry = entry.ID_TO_ENTRY[ref_entry_id]
                expression = ref_entry.get_first_expression()
                el.attrs["href"] = f"?query={expression}&wildcards=off"
            elif re.match(r"^(?:https?:|\?)[\w\W]*", href):
                # External or already-rewritten link: keep as is.
                pass
            else:
                raise Exception(f"Invalid href format: {href}")

    def _convert_paragraphs(self, soup):
        """Rename every ``<p>`` element to ``<div>``."""
        for p in soup.find_all("p"):
            p.name = "div"

    def _style_table_headers(self, soup):
        """Set a centering inline style on every ``<th>`` element."""
        for th in soup.find_all("th"):
            th['style'] = "vertical-align: middle; text-align: center;"

    def _unwrap_table_body(self, soup):
        """Unwrap the first ``<tbody>`` (if any), keeping its children."""
        tbody = soup.find("tbody")
        if tbody is not None:
            tbody.unwrap()

    def _decompose_table_rows(self, soup, entry):
        """Drop or relocate table rows depending on their header cell.

        * Row whose header equals ``self._expression_header``: removed.
        * ``読み方`` row: removed only when the reading is shown in the
          headword line (see ``_do_display_yomikata_in_headword``).
        * ``意味`` row: its ``<td>`` is renamed to ``<div>`` and moved to the
          start of ``<body>``, then the row is removed.

        Rows without a ``<th>`` are left untouched. If no ``<tr>`` remains
        afterwards, the first ``<table>`` is removed as well.
        """
        for tr in soup.find_all("tr"):
            th = tr.find("th")
            if th is None:
                continue
            header = th.text
            if header == self._expression_header:
                tr.decompose()
            elif header == "読み方":
                if self._do_display_yomikata_in_headword(entry):
                    tr.decompose()
            elif header == "意味":
                definition = tr.td
                definition.name = "div"
                # Move the cell out of the row before the row is destroyed.
                soup.body.insert(0, definition)
                tr.decompose()
        if soup.find("tr") is None:
            # A document may contain no <table> at all; in that case there is
            # nothing to remove and ``soup.table`` is None.
            table = soup.table
            if table is not None:
                table.decompose()

    def _insert_headword_line(self, soup, entry):
        """Insert a ``<span>`` headword line as the first child of ``<body>``.

        The line is ``{yomikata}【{expression}】`` when the reading is to be
        displayed, otherwise just ``【{expression}】``.
        """
        headword_line = soup.new_tag("span")
        if self._do_display_yomikata_in_headword(entry):
            headword_line.string = f"{entry.yomikata}【{entry.expression}】"
        else:
            headword_line.string = f"【{entry.expression}】"
        soup.body.insert(0, headword_line)

    def _do_display_yomikata_in_headword(self, entry):
        """Return True if ``entry.yomikata`` should appear in the headword line.

        The reading qualifies when it consists solely of characters in the
        range ``ぁ``-``ヿ`` or ``、`` and is at most 10 characters long.
        """
        if not re.match(r"^[ぁ-ヿ、]+$", entry.yomikata):
            return False
        return len(entry.yomikata) <= 10
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class JitenonKokugoGlossary(JitenonGlossary):
    """Glossary builder whose entry links match ``kokugo.jitenon.jp/word/p<ID>``."""

    def __init__(self):
        super().__init__()
        self._expression_header = "言葉"
        self._id_pattern = r"kokugo.jitenon.jp/word/p([0-9]+)$"

    def make_glossary(self, entry, image_dir):
        """Build the glossary list for *entry*.

        The page soup obtained from ``entry.get_page_soup()`` is cleaned up in
        place by a fixed sequence of steps, then ``<body>`` is handed to
        ``make_gloss``. Number icons are written under *image_dir*.

        Returns a one-element list holding the result of ``make_gloss``.
        """
        soup = entry.get_page_soup()
        # Steps specific to this dictionary come first.
        self._remove_antonym_list_item(soup)
        self._replace_number_icons(soup, image_dir)
        # Shared clean-up steps; the order is kept as in the original.
        self._replace_punctuation(soup)
        self._add_internal_links(soup, entry)
        self._convert_paragraphs(soup)
        self._style_table_headers(soup)
        self._unwrap_table_body(soup)
        self._decompose_table_rows(soup, entry)
        self._insert_headword_line(soup, entry)
        return [make_gloss(soup.body)]

    def _remove_antonym_list_item(self, soup):
        """Remove every ``<li>`` whose text is exactly ``対義語辞典``."""
        for item in soup.find_all("li"):
            if item.text != "対義語辞典":
                continue
            item.decompose()

    def _replace_number_icons(self, soup, image_dir):
        """Turn each ``<img>`` into a ``<span>`` wrapping a generated SVG icon.

        The first run of digits in the image's ``alt`` text names the icon
        file (``<digits>-fill.svg``), which is generated inside *image_dir*
        via ``Icons.make_monochrome_fill_rectangle``.

        Raises ``KeyError`` if an image has no ``alt`` attribute and
        ``AttributeError`` if the ``alt`` text contains no digits.
        """
        for icon in soup.find_all("img"):
            alt = icon.attrs["alt"]
            number = re.search(r"[0-9]+", alt).group(0)
            filename = f"{number}-fill.svg"
            path = os.path.join(image_dir, filename)
            Icons.make_monochrome_fill_rectangle(path, number)
            ratio = Icons.calculate_ratio(path)
            # Whichever dimension is not 1.0 carries the ratio.
            if ratio > 1.0:
                height, width = 1.0, ratio
            else:
                height, width = ratio, 1.0
            img = BeautifulSoup("<img/>", "xml").img
            img.attrs = {
                "height": height,
                "width": width,
                "sizeUnits": "em",
                "collapsible": False,
                "collapsed": False,
                "background": False,
                "appearance": "monochrome",
                "title": alt,
                "path": f"{os.path.basename(image_dir)}/{filename}",
            }
            # The original <img> tag becomes the wrapping <span>.
            icon.name = "span"
            icon.append(img)
            icon.attrs["style"] = "margin-right: 0.25em;"

    def _do_display_yomikata_in_headword(self, entry):
        """Return True when ``entry.yomikata`` is at most 10 characters long."""
        return len(entry.yomikata) <= 10
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class JitenonYojiGlossary(JitenonGlossary):
    """Glossary builder whose entry links match ``yoji.jitenon.jp/yoji?/<ID>.html``."""

    def __init__(self):
        super().__init__()
        self._expression_header = "四字熟語"
        self._id_pattern = r"yoji.jitenon.jp/yoji.?/([0-9]+)\.html$"

    def make_glossary(self, entry, image_dir):
        """Build the glossary list for *entry*.

        The page soup from ``entry.get_page_soup()`` is cleaned up in place by
        the inherited helper steps, then ``<body>`` is passed to
        ``make_gloss``. *image_dir* is accepted but not used here.

        Returns a one-element list holding the result of ``make_gloss``.
        """
        soup = entry.get_page_soup()
        # Clean-up steps run in a fixed order, as in the original.
        self._replace_punctuation(soup)
        self._add_internal_links(soup, entry)
        self._convert_paragraphs(soup)
        self._style_table_headers(soup)
        self._unwrap_table_body(soup)
        self._decompose_table_rows(soup, entry)
        self._insert_headword_line(soup, entry)
        return [make_gloss(soup.body)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class JitenonKotowazaGlossary(JitenonGlossary):
    """Glossary builder whose entry links match ``kotowaza.jitenon.jp/kotowaza/<ID>.php``."""

    def __init__(self):
        super().__init__()
        self._expression_header = "言葉"
        self._id_pattern = r"kotowaza.jitenon.jp/kotowaza/([0-9]+)\.php$"

    def make_glossary(self, entry, image_dir):
        """Build the glossary list for *entry*.

        The page soup from ``entry.get_page_soup()`` is cleaned up in place by
        the inherited helper steps, then ``<body>`` is passed to
        ``make_gloss``. *image_dir* is accepted but not used here.

        Returns a one-element list holding the result of ``make_gloss``.
        """
        soup = entry.get_page_soup()
        # Clean-up steps run in a fixed order, as in the original.
        self._replace_punctuation(soup)
        self._add_internal_links(soup, entry)
        self._convert_paragraphs(soup)
        self._style_table_headers(soup)
        self._unwrap_table_body(soup)
        self._decompose_table_rows(soup, entry)
        self._insert_headword_line(soup, entry)
        return [make_gloss(soup.body)]
|