From a5bb8d6f40f93e412490d9550b73dd89e2504dff Mon Sep 17 00:00:00 2001 From: stephenmk Date: Fri, 28 Jul 2023 17:00:01 -0500 Subject: [PATCH] sankoku8 bugfix: account for images in headwords MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Affects ビャンビャンめん, さんだつ (簒奪), and some words with hyphens such as J-POP and CD-ROM. --- bot/entries/sankoku8/base_entry.py | 7 +++++++ bot/entries/sankoku8/preprocess.py | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/bot/entries/sankoku8/base_entry.py b/bot/entries/sankoku8/base_entry.py index 93c0515..8d7a394 100644 --- a/bot/entries/sankoku8/base_entry.py +++ b/bot/entries/sankoku8/base_entry.py @@ -67,6 +67,7 @@ class BaseEntry(SanseidoEntry): def _find_expressions(self, soup): expressions = [] for hyouki in soup.find_all(self._hyouki_name): + self._fill_alts(hyouki) for expression in parse_hyouki_soup(hyouki, [""]): expressions.append(expression) return expressions @@ -95,3 +96,9 @@ class BaseEntry(SanseidoEntry): ] for name in unused_nodes: Soup.delete_soup_nodes(soup, name) + + @staticmethod + def _fill_alts(soup): + for img in soup.find_all("img"): + if img.has_attr("alt"): + img.string = img.attrs["alt"] diff --git a/bot/entries/sankoku8/preprocess.py b/bot/entries/sankoku8/preprocess.py index 73fb31a..aa47d00 100644 --- a/bot/entries/sankoku8/preprocess.py +++ b/bot/entries/sankoku8/preprocess.py @@ -4,9 +4,17 @@ from bs4 import BeautifulSoup from bot.data import get_adobe_glyph +__GAIJI = { + "svg-gaiji/byan.svg": "𰻞", + "svg-gaiji/G16EF.svg": "簒", +} + + def preprocess_page(page): soup = BeautifulSoup(page, features="xml") __replace_glyph_codes(soup) + __add_image_alt_text(soup) + __replace_tatehyphen(soup) page = __strip_page(soup) return page @@ -20,6 +28,21 @@ def __replace_glyph_codes(soup): geta.replace_with(glyph) +def __add_image_alt_text(soup): + for img in soup.find_all("img"): + if not img.has_attr("src"): + continue + src = img.attrs["src"] 
+ if src in __GAIJI: + img.attrs["alt"] = __GAIJI[src] + + +def __replace_tatehyphen(soup): + for img in soup.find_all("img", {"src": "svg-gaiji/tatehyphen.svg"}): + img.string = "−" + img.unwrap() + + def __strip_page(soup): koumoku = soup.find(["項目"]) if koumoku is not None: