jitenbot/bot/entries/daijirin2/preprocess.py
2023-07-26 19:28:50 -05:00

57 lines
1.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from bs4 import BeautifulSoup
from bot.data import get_adobe_glyph
# Replacement text for gaiji elements, keyed by the element's `src` path.
# `__add_gaiji_alt_text` copies the mapped value into the element's `alt`
# attribute.
# NOTE(review): the value of the second entry is an empty string in this copy
# of the file; it may have held a character (or a non-printing variation
# selector) that was lost -- confirm against the upstream file.
__GAIJI = {
"gaiji/DJRK0002.svg": "𦬇",
"gaiji/U芸E0102.svg": "",
}
def preprocess_page(page):
    """Parse a page as XML, clean it up, and return its primary entry markup.

    The parsed document is passed, in order, through the glyph-code,
    gaiji alt-text, and halfwidth-brace clean-up steps, each of which
    modifies the document in place.

    Args:
        page: Markup of a single page, as accepted by ``BeautifulSoup``.

    Returns:
        The value produced by ``__strip_page`` for the cleaned document.

    Raises:
        Exception: Propagated from ``__strip_page`` when the document has
            no 項目 element.
    """
    document = BeautifulSoup(page, features="xml")
    cleanup_steps = (
        __replace_glyph_codes,
        __add_gaiji_alt_text,
        __replace_halfwidth_braces,
    )
    for step in cleanup_steps:
        step(document)
    return __strip_page(document)
def __replace_glyph_codes(soup):
    """Resolve ``style="glyph:<number>"`` markers on elements, in place.

    For every element whose ``style`` attribute consists solely of a
    ``glyph:<digits>`` declaration (optionally ending in ``;``), the
    ``style`` attribute is removed. Then:

    * if the element has an ``alt`` attribute, the element's content is
      replaced by that ``alt`` text;
    * otherwise each matching placeholder string inside the element is
      replaced by ``get_adobe_glyph(<number>)``.

    Elements whose ``style`` does not match the pattern are left untouched.
    """
    glyph_style = re.compile(r"^glyph:([0-9]+);?$")
    for node in soup.find_all(style=True):
        match = glyph_style.search(node.attrs["style"])
        if match is None:
            continue
        del node.attrs["style"]
        if node.has_attr("alt"):
            # An explicit alt text takes precedence over the glyph lookup.
            node.string = node.attrs["alt"]
        else:
            glyph_code = int(match.group(1))
            # NOTE(review): the search string below is empty in this copy of
            # the file. The loop variable name suggests a "geta" placeholder
            # character was intended and may have been lost -- confirm
            # against the upstream file.
            for geta in node.find_all(string=""):
                # Looked up lazily so the glyph table is only consulted when
                # a placeholder is actually present.
                geta.replace_with(get_adobe_glyph(glyph_code))
def __add_gaiji_alt_text(soup):
    """Set ``alt`` on gaiji elements whose ``src`` is listed in ``__GAIJI``.

    Every element with class ``gaiji`` is examined. An element without a
    ``src`` attribute is looked up under the empty string. Elements whose
    ``src`` is not a key of ``__GAIJI`` are left unchanged. The document is
    modified in place.
    """
    for gaiji_el in soup.find_all(class_="gaiji"):
        source = gaiji_el.attrs.get("src", "")
        if source not in __GAIJI:
            continue
        gaiji_el.attrs["alt"] = __GAIJI[source]
def __replace_halfwidth_braces(soup):
    """Replace lone ``(`` / ``)`` strings inside 送り仮名省略 elements, in place.

    Within each 送り仮名省略 element, every string that is exactly ``(`` is
    replaced first, followed by every string that is exactly ``)``.
    """
    for container in soup.find_all("送り仮名省略"):
        for brace in ("(", ")"):
            for text_node in container.find_all(string=brace):
                # NOTE(review): the replacement text is an empty string in
                # this copy of the file, which removes the brace. The
                # function name suggests a substitute character was intended
                # and may have been lost -- confirm against the upstream file.
                text_node.replace_with("")
def __strip_page(soup):
    """Return the first 項目 element of the document in decoded form.

    Args:
        soup: The parsed document to search.

    Returns:
        The result of calling ``decode()`` on the first 項目 element found.

    Raises:
        Exception: If the document contains no 項目 element. The message
            includes the prettified document to aid debugging.
    """
    entry = soup.find("項目")
    if entry is not None:
        return entry.decode()
    raise Exception(f"Primary 項目 not found in page:\n{soup.prettify()}")