jitenbot/bot/entries/smk8_preprocess.py

92 lines
2.9 KiB
Python
Raw Normal View History

import re
from bs4 import BeautifulSoup
from bot.data import get_adobe_glyph
# Map of gaiji (外字, "external character") image paths to the replacement
# text written into each image's "alt" attribute by __replace_glyph_codes.
# NOTE(review): the values render as empty strings here — presumably
# non-ASCII glyph characters lost during extraction; verify against the
# upstream repository before relying on them.
__GAIJI = {
    "gaiji/5350.svg": "",
    "gaiji/62cb.svg": "",
    "gaiji/7be1.svg": "",
}
def preprocess_page(page):
    """Run the full SMK8 page-preprocessing pipeline.

    The raw page is reduced to its primary entry element, proprietary
    glyph codes are resolved to characters, and hyougai (表外) marks are
    converted to explicit XML tags. Returns the transformed page string.
    """
    pipeline = (
        __strip_page,
        __replace_glyph_codes,
        __format_hyougai_marks,
    )
    for transform in pipeline:
        page = transform(page)
    return page
def __strip_page(page):
    """Reduce a raw page to its primary entry element.

    Parses the page as XML and returns the serialized markup of the first
    項目 ("entry") or 字音語参照項目 ("on-reading cross-reference entry")
    element found.

    Raises:
        ValueError: if neither element is present in the page.
            (Previously a bare ``Exception``; ``ValueError`` is a
            subclass, so existing ``except Exception`` handlers still
            catch it.)
    """
    soup = BeautifulSoup(page, features="xml")
    koumoku = soup.find(["項目", "字音語参照項目"])
    if koumoku is None:
        # Every well-formed SMK8 page is expected to contain one of these
        # elements; fail loudly with the offending markup for debugging.
        raise ValueError(f"Primary 項目 not found in page:\n{soup.prettify()}")
    return koumoku.decode()
def __replace_glyph_codes(page):
    """Resolve proprietary glyph references in a page to real characters.

    Three encodings are handled:
      * ``<span style="glyph:NNNN;">`` wrappers — the style attribute is
        removed and matching placeholder text inside is replaced with the
        Adobe glyph for code NNNN.
      * ``{CIDNNNN}`` escapes inside the ``alt`` attribute of 親見出表記
        ("parent headword notation") elements.
      * 外字 ("external character") images whose ``src`` appears in
        ``__GAIJI`` — the mapped text is written to the image's ``alt``.

    Returns the modified page re-serialized as XML.
    """
    soup = BeautifulSoup(page, features="xml")
    for span in soup.find_all("span"):
        if "style" not in span.attrs:
            continue
        m = re.search(r"^glyph:([0-9]+);$", span.attrs["style"])
        # The style attribute is stripped even when it is not a glyph
        # reference, preserving the original behavior.
        del span.attrs["style"]
        if m is None:
            continue
        code = int(m.group(1))
        # NOTE(review): the search string renders empty here — presumably
        # a geta-style placeholder character lost in extraction; kept
        # byte-identical.
        for geta in span.find_all(string=""):
            geta.replace_with(get_adobe_glyph(code))
    for hyouki in soup.find_all("親見出表記"):
        if "alt" not in hyouki.attrs:
            continue
        alt = hyouki.attrs["alt"]
        for cid in re.findall(r"{CID([0-9]+)}", alt):
            alt = alt.replace(f"{{CID{cid}}}", get_adobe_glyph(int(cid)))
        hyouki.attrs["alt"] = alt
    for gaiji in soup.find_all("外字"):
        img = gaiji.img
        if img is None:
            # Fix: a 外字 element without a nested <img> previously
            # crashed on the has_attr call below; skip it instead.
            continue
        src = img.attrs["src"] if img.has_attr("src") else ""
        if src in __GAIJI:
            img.attrs["alt"] = __GAIJI[src]
    return soup.decode()
def __format_hyougai_marks(page):
    """Wrap characters flagged with hyougai (表外, "outside the joyo
    table") marks in explicit <表外音訓> and <表外字> XML tags.

    The page is scanned twice: once as whitespace-stripped plain text to
    find each mark and the character it annotates, then the raw markup is
    rewritten mark-by-mark.

    NOTE(review): several string literals below render as empty strings
    and some bracket/sentinel characters may have been garbled during
    extraction — the code is kept byte-identical; confirm the literals
    against the original repository before modifying.
    """
    soup = BeautifulSoup(page, features="xml")
    # Replace 外字 (external character) content with a placeholder so the
    # plain-text scan treats each one as a single character.
    for el in soup.find_all("外字"):
        el.string = ""
    text = soup.text
    # Remove whitespace so each mark is immediately adjacent to the
    # character it applies to in `text`.
    for x in ["\n", "\t", " "]:
        text = text.replace(x, "")
    text = re.sub(r"〈([^〈]+)〉", r"\1", text)
    # Rewrite the bracketed spans in the raw markup with sentinel
    # characters so they can be located after tag insertion.
    page = re.sub(r"〈([^〈]+)〉", r"\1␃", page)
    # 《 marks an out-of-table reading (表外音訓) on the next character.
    for mark in re.findall(r"《.", text):
        if mark[1] == "":
            # Marked character is the placeholder: insert an empty tag.
            page = page.replace("", "<表外音訓/>", 1)
        else:
            # Wrap the first occurrence of the marked character after
            # the opening bracket.
            page = re.sub(f"《([^{mark[1]}]*)({mark[1]})",
                          r"\1<表外音訓>\2</表外音訓>",
                          page, count=1)
    # 〈 marks an out-of-table character (表外字).
    for mark in re.findall(r"〈.", text):
        if mark[1] == "":
            page = page.replace("", "<表外字/>", 1)
        else:
            page = re.sub(f"〈([^{mark[1]}]*)({mark[1]})",
                          r"\1<表外字>\2</表外字>",
                          page, count=1)
    # Strip the sentinel characters introduced above.
    page = page.replace("", "")
    page = page.replace("", "")
    soup = BeautifulSoup(page, features="xml")
    # When a tag came out empty, pull the annotated character (its next
    # sibling node) inside the tag.
    # NOTE(review): el.next_sibling could be None here, which would make
    # append raise — presumably never happens on well-formed pages;
    # confirm before hardening.
    for el in soup.find_all("表外音訓"):
        if el.text == "":
            el.append(el.next_sibling)
    for el in soup.find_all("表外字"):
        if el.text == "":
            el.append(el.next_sibling)
    return soup.decode()