sankoku8 bugfix: account for images in headwords

Affects ビャンビャンめん, さんだつ (簒奪), and some words with hyphens
such as J-POP and CD-ROM.
This commit is contained in:
stephenmk 2023-07-28 17:00:01 -05:00
parent 4eb7e12f37
commit a5bb8d6f40
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
2 changed files with 30 additions and 0 deletions

View file

@ -67,6 +67,7 @@ class BaseEntry(SanseidoEntry):
def _find_expressions(self, soup):
    """Collect the expressions found in the headword nodes of *soup*.

    Every node whose name is ``self._hyouki_name`` first has its image
    alt text copied into the image's string content (``_fill_alts``),
    and is then handed to ``parse_hyouki_soup`` together with ``[""]``.
    The results for all nodes are returned as one flat list, in the
    order the nodes are found.
    """
    found = []
    for hyouki_node in soup.find_all(self._hyouki_name):
        # Images must carry their alt text as content before parsing,
        # otherwise they contribute nothing to the parsed expression.
        self._fill_alts(hyouki_node)
        found.extend(parse_hyouki_soup(hyouki_node, [""]))
    return found
@ -95,3 +96,9 @@ class BaseEntry(SanseidoEntry):
]
for name in unused_nodes:
Soup.delete_soup_nodes(soup, name)
@staticmethod
def _fill_alts(soup):
    """Give every ``img`` that has an ``alt`` attribute that text as content.

    Modifies *soup* in place; images without an ``alt`` attribute are
    left untouched.
    """
    for image in soup.find_all("img"):
        if not image.has_attr("alt"):
            continue
        image.string = image.attrs["alt"]

View file

@ -4,9 +4,17 @@ from bs4 import BeautifulSoup
from bot.data import get_adobe_glyph
# Maps the ``src`` of an <img> element to the text that
# __add_image_alt_text stores in that image's ``alt`` attribute.
__GAIJI = {
"svg-gaiji/byan.svg": "𰻞",
# NOTE(review): this value is an empty string as shown here — confirm
# whether a replacement character is intended and was lost in transit.
"svg-gaiji/G16EF.svg": "",
}
def preprocess_page(page):
    """Parse *page* as XML, apply the in-place fixups, and strip the result.

    The fixups run in a fixed order: glyph codes are replaced, then alt
    text is added to known gaiji images, then tatehyphen images are
    replaced. The return value is whatever ``__strip_page`` produces for
    the modified soup.
    """
    soup = BeautifulSoup(page, features="xml")
    fixups = (
        __replace_glyph_codes,
        __add_image_alt_text,
        __replace_tatehyphen,
    )
    for apply_fixup in fixups:
        apply_fixup(soup)
    return __strip_page(soup)
@ -20,6 +28,21 @@ def __replace_glyph_codes(soup):
geta.replace_with(glyph)
def __add_image_alt_text(soup):
    """Set ``alt`` on every ``img`` whose ``src`` is a key of ``__GAIJI``.

    The ``alt`` value is the text mapped to that ``src``. Images with no
    ``src`` attribute, or with a ``src`` that is not listed, are left
    unchanged. Modifies *soup* in place.
    """
    for image in soup.find_all("img"):
        if image.has_attr("src") and image.attrs["src"] in __GAIJI:
            image.attrs["alt"] = __GAIJI[image.attrs["src"]]
def __replace_tatehyphen(soup):
    """Replace each tatehyphen gaiji image with plain text, in place.

    Every ``img`` whose ``src`` is ``svg-gaiji/tatehyphen.svg`` has its
    content set to the literal below and is then unwrapped, so only that
    text remains where the tag used to be.
    """
    tatehyphen_attrs = {"src": "svg-gaiji/tatehyphen.svg"}
    for hyphen_image in soup.find_all("img", tatehyphen_attrs):
        # NOTE(review): the replacement text is an empty string as shown
        # here — confirm a hyphen character was not lost in transit.
        hyphen_image.string = ""
        hyphen_image.unwrap()
def __strip_page(soup):
koumoku = soup.find(["項目"])
if koumoku is not None: