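"""Page preprocessing: resolve <glyph> codes and gaiji <img> references,
then strip each page down to its 項目 element."""
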
import re

from bs4 import BeautifulSoup

from bot.data import get_adobe_glyph


# Gaiji image files mapped to the characters they depict.
__GAIJI = {
    "svg-gaiji/byan.svg": "𰻞",
    "svg-gaiji/G16EF.svg": "篡",
}


def preprocess_page(page):
    """Resolve glyph and gaiji markup in a raw page and return its 項目 element as serialized XML."""
    soup = BeautifulSoup(page, features="xml")
    __replace_glyph_codes(soup)
    __add_image_alt_text(soup)
    __replace_tatehyphen(soup)
    page = __strip_page(soup)
    return page


def __replace_glyph_codes(soup):
    # Replace 〓 (geta) placeholders inside <glyph> tags with the character
    # looked up from the Adobe glyph code given in the tag's style attribute.
    for el in soup.find_all("glyph"):
        m = re.search(r"^glyph:([0-9]+);?$", el.attrs["style"])
        code = int(m.group(1))
        for geta in el.find_all(string="〓"):
            glyph = get_adobe_glyph(code)
            geta.replace_with(glyph)


def __add_image_alt_text(soup):
    # Give known gaiji images an alt attribute so the character they depict
    # is preserved as text.
    for img in soup.find_all("img"):
        if not img.has_attr("src"):
            continue
        src = img.attrs["src"]
        if src in __GAIJI:
            img.attrs["alt"] = __GAIJI[src]


def __replace_tatehyphen(soup):
    # Replace the vertical-hyphen gaiji image with a literal "−" and drop
    # the <img> tag itself.
    for img in soup.find_all("img", {"src": "svg-gaiji/tatehyphen.svg"}):
        img.string = "−"
        img.unwrap()


def __strip_page(soup):
    # Discard everything except the page's main 項目 (entry) element.
    koumoku = soup.find("項目")
    if koumoku is not None:
        return koumoku.decode()
    else:
        raise Exception(f"Primary 項目 not found in page:\n{soup.prettify()}")
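

# Illustrative usage sketch (not part of the original module; the file path
# below is a made-up example): `preprocess_page` expects the raw XML markup
# of one page as a string and returns the serialized 項目 element.
#
#     with open("page.xml", encoding="utf-8") as f:
#         entry_xml = preprocess_page(f.read())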