import re from bs4 import BeautifulSoup from bot.yomichan.glossary.gloss import make_gloss def make_glossary(entry): soup = BeautifulSoup(entry.markup, "html5lib") __replace_punctuation(soup) __add_internal_links(soup) __convert_paragraphs(soup) __style_table_headers(soup) __unwrap_table_body(soup) __decompose_table_rows(soup, entry) __insert_headword_line(soup, entry) gloss = make_gloss(soup.body) glossary = [gloss] return glossary def __replace_punctuation(soup): punctuation = { "/": "/", ",": "、", } for el in soup.find_all(string=True): text = el.text for old, new in punctuation.items(): text = text.replace(old, new) el.replace_with(text) def __add_internal_links(soup): patterns = [ r"^(.+)([ぁ-ヿ、\s]+)$", r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$" ] for a in soup.find_all("a"): for pattern in patterns: m = re.search(pattern, a.text) if m: a['href'] = f"?query={m.group(1)}&wildcards=off" break def __convert_paragraphs(soup): for p in soup.find_all("p"): p.name = "span" def __style_table_headers(soup): for th in soup.find_all("th"): th['style'] = "vertical-align: middle; text-align: center;" def __unwrap_table_body(soup): if soup.find("tbody") is not None: soup.tbody.unwrap() def __decompose_table_rows(soup, entry): for tr in soup.find_all("tr"): if tr.find("th") is None: continue elif tr.th.text in ["四字熟語", "言葉"]: tr.decompose() elif tr.th.text == "読み方": if re.match(r"^[ぁ-ヿ、]+$", entry.yomikata): tr.decompose() elif tr.th.text == "意味": imi = tr.td imi.name = "div" soup.body.insert(0, imi) tr.decompose() if soup.find("tr") is None: soup.table.decompose() def __insert_headword_line(soup, entry): headword_line = soup.new_tag("span") headword_line.string = f"{entry.get_first_reading()}【{entry.expression}】" soup.body.insert(0, headword_line)