57 lines
1.5 KiB
Python
57 lines
1.5 KiB
Python
|
import re
|
|||
|
from bs4 import BeautifulSoup
|
|||
|
|
|||
|
from bot.data import get_adobe_glyph
|
|||
|
|
|||
|
|
|||
|
__GAIJI = {
|
|||
|
"gaiji/DJRK0002.svg": "𦬇",
|
|||
|
"gaiji/U芸E0102.svg": "芸",
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
def preprocess_page(page):
|
|||
|
soup = BeautifulSoup(page, features="xml")
|
|||
|
__replace_glyph_codes(soup)
|
|||
|
__add_gaiji_alt_text(soup)
|
|||
|
__replace_halfwidth_braces(soup)
|
|||
|
page = __strip_page(soup)
|
|||
|
return page
|
|||
|
|
|||
|
|
|||
|
def __replace_glyph_codes(soup):
|
|||
|
for el in soup.find_all(style=True):
|
|||
|
m = re.search(r"^glyph:([0-9]+);?$", el.attrs["style"])
|
|||
|
if not m:
|
|||
|
continue
|
|||
|
del el.attrs["style"]
|
|||
|
if el.has_attr("alt"):
|
|||
|
el.string = el.attrs["alt"]
|
|||
|
continue
|
|||
|
code = int(m.group(1))
|
|||
|
for geta in el.find_all(string="〓"):
|
|||
|
glyph = get_adobe_glyph(code)
|
|||
|
geta.replace_with(glyph)
|
|||
|
|
|||
|
|
|||
|
def __add_gaiji_alt_text(soup):
|
|||
|
for gaiji in soup.find_all(class_="gaiji"):
|
|||
|
src = gaiji.attrs["src"] if gaiji.has_attr("src") else ""
|
|||
|
if src in __GAIJI:
|
|||
|
gaiji.attrs["alt"] = __GAIJI[src]
|
|||
|
|
|||
|
|
|||
|
def __replace_halfwidth_braces(soup):
|
|||
|
for x in soup.find_all("送り仮名省略"):
|
|||
|
for el in x.find_all(string="("):
|
|||
|
el.replace_with("(")
|
|||
|
for el in x.find_all(string=")"):
|
|||
|
el.replace_with(")")
|
|||
|
|
|||
|
|
|||
|
def __strip_page(soup):
|
|||
|
koumoku = soup.find("項目")
|
|||
|
if koumoku is None:
|
|||
|
raise Exception(f"Primary 項目 not found in page:\n{soup.prettify()}")
|
|||
|
return koumoku.decode()
|