jitenbot/bot/entries/sankoku8/preprocess.py
stephenmk a5bb8d6f40
sankoku8 bugfix: account for images in headwords
Affects ビャンビャンめん, さんだつ (簒奪), and some words with hyphens
such as J-POP and CD-ROM.
2023-07-28 17:00:01 -05:00

52 lines
1.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from bs4 import BeautifulSoup
from bot.data import get_adobe_glyph
# Replacement text for gaiji (external character) images, keyed by the
# image's "src" attribute. __add_image_alt_text copies the mapped value into
# the "alt" attribute of every <img> whose "src" is listed here.
# NOTE(review): the value for G16EF.svg appears empty in this view of the
# file, which is flagged as containing ambiguous Unicode characters --
# confirm the intended character against the repository copy.
__GAIJI = {
"svg-gaiji/byan.svg": "𰻞",
"svg-gaiji/G16EF.svg": "",
}
def preprocess_page(page):
    """Clean up a raw dictionary page and return its primary 項目 element.

    The page is parsed as XML, then three in-place passes run over the
    tree in order: glyph-code replacement, alt-text insertion for known
    gaiji images, and tatehyphen image replacement. Finally the first
    項目 element is serialized and returned.

    Args:
        page: Markup of a single page, in a form BeautifulSoup accepts.

    Returns:
        The serialized 項目 element as produced by ``__strip_page``.

    Raises:
        Exception: If the parsed page contains no 項目 element.
    """
    soup = BeautifulSoup(page, features="xml")
    # Order matters only in that every pass mutates the same tree before
    # it is serialized.
    for transform in (
        __replace_glyph_codes,
        __add_image_alt_text,
        __replace_tatehyphen,
    ):
        transform(soup)
    return __strip_page(soup)
def __replace_glyph_codes(soup):
    """Replace placeholder text inside <glyph> elements with real glyphs.

    Each <glyph> element must carry a ``style`` attribute of the form
    ``glyph:<digits>`` (optionally followed by ``;``). The digits are read
    as an integer code, and every text node inside the element that
    matches the ``string=`` filter is replaced with the value returned by
    ``get_adobe_glyph(code)``.

    NOTE(review): the ``string=`` literal appears empty in this view of
    the file, which is flagged as containing ambiguous Unicode; confirm
    the intended placeholder character against the repository copy.

    Args:
        soup: Parsed page tree; modified in place.

    Raises:
        KeyError: If a <glyph> element has no ``style`` attribute.
        AttributeError: If the ``style`` value does not match the
            expected ``glyph:<digits>`` pattern.
    """
    style_pattern = r"^glyph:([0-9]+);?$"
    for glyph_el in soup.find_all("glyph"):
        match = re.search(style_pattern, glyph_el.attrs["style"])
        code = int(match.group(1))
        # The glyph lookup happens per placeholder, so it is only
        # performed when there is actually something to replace.
        for placeholder in glyph_el.find_all(string=""):
            placeholder.replace_with(get_adobe_glyph(code))
def __add_image_alt_text(soup):
    """Set the ``alt`` attribute on <img> tags that show known gaiji.

    An image is updated only when it has a ``src`` attribute and that
    value is a key of ``__GAIJI``; the mapped value becomes its ``alt``
    text. All other images are left untouched.

    Args:
        soup: Parsed page tree; modified in place.
    """
    for image in soup.find_all("img"):
        # Short-circuit keeps the attrs lookup from running on images
        # that have no "src" at all.
        if image.has_attr("src") and image.attrs["src"] in __GAIJI:
            image.attrs["alt"] = __GAIJI[image.attrs["src"]]
def __replace_tatehyphen(soup):
    """Replace tatehyphen gaiji images with plain text.

    Every <img> whose ``src`` is ``svg-gaiji/tatehyphen.svg`` has its
    content set to the replacement string and is then unwrapped, so only
    that text remains in the tree where the image used to be.

    NOTE(review): the replacement literal appears empty in this view of
    the file, which is flagged as containing ambiguous Unicode; confirm
    the intended character against the repository copy.

    Args:
        soup: Parsed page tree; modified in place.
    """
    tatehyphen_filter = {"src": "svg-gaiji/tatehyphen.svg"}
    for image in soup.find_all("img", tatehyphen_filter):
        image.string = ""
        # unwrap() removes the tag itself but keeps the text set above.
        image.unwrap()
def __strip_page(soup):
    """Return the first 項目 element of the page, serialized to a string.

    Everything outside that element is discarded.

    Args:
        soup: Parsed page tree.

    Returns:
        The result of calling ``decode()`` on the first 項目 element.

    Raises:
        Exception: If no 項目 element exists; the message includes the
            prettified page to help diagnose the failure.
    """
    entry = soup.find(["項目"])
    if entry is None:
        raise Exception(f"Primary 項目 not found in page:\n{soup.prettify()}")
    return entry.decode()