jitenbot/bot/entries/smk8/preprocess.py
2023-07-26 19:28:50 -05:00

110 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from bs4 import BeautifulSoup
from bot.data import get_adobe_glyph
# Maps gaiji (external character) SVG paths to plain-text replacement
# characters, consumed by __replace_glyph_codes.
# NOTE(review): the replacement values appear as empty strings here.
# The original characters (plausibly the kanji at the hex codepoints
# named by the filenames: 5350, 62cb, 7be1) seem to have been stripped
# by an "ambiguous Unicode" filter when this file was exported —
# restore them from the upstream source before relying on this table.
__GAIJI = {
    "gaiji/5350.svg": "",
    "gaiji/62cb.svg": "",
    "gaiji/7be1.svg": "",
}
def preprocess_page(page):
    """Run the full preprocessing pipeline over one raw dictionary page.

    The page is stripped down to its primary entry element, private
    glyph codes are replaced, hyougai (table-external) marks are
    wrapped in XML tags, and pronunciation parentheses are removed.

    Returns the transformed XML string.
    """
    pipeline = (
        __strip_page,
        __replace_glyph_codes,
        __format_hyougai_marks,
        __remove_pronunciation_parentheses,
    )
    for step in pipeline:
        page = step(page)
    return page
def __strip_page(page):
    """Return the serialized primary entry element of the page.

    Looks for the first 項目 or 字音語参照項目 element and returns its
    XML serialization; raises if neither is present.
    """
    doc = BeautifulSoup(page, features="xml")
    entry = doc.find(["項目", "字音語参照項目"])
    if entry is None:
        raise Exception(f"Primary 項目 not found in page:\n{doc.prettify()}")
    return entry.decode()
def __replace_glyph_codes(page):
    """Replace private glyph references in the page with real characters.

    Three kinds of references are handled:
      * <span style="glyph:NNN;"> elements — the style attribute is
        removed and matching placeholder text nodes are swapped for the
        Adobe glyph resolved from NNN;
      * {CIDnnn} tokens inside 親見出表記 "alt" attributes;
      * 外字 images whose "src" is listed in __GAIJI (alt is rewritten).

    Returns the rewritten XML string.
    """
    doc = BeautifulSoup(page, features="xml")
    for span in doc.find_all("span"):
        if "style" not in span.attrs:
            continue
        match = re.search(r"^glyph:([0-9]+);$", span.attrs["style"])
        del span.attrs["style"]
        if match is None:
            continue
        glyph_code = int(match.group(1))
        # NOTE(review): the search string below is empty as shown; a
        # placeholder character (e.g. 〓) was likely stripped during
        # export — confirm against the upstream file.
        for placeholder in span.find_all(string=""):
            placeholder.replace_with(get_adobe_glyph(glyph_code))
    for hyouki in doc.find_all("親見出表記"):
        if "alt" not in hyouki.attrs:
            continue
        alt_text = hyouki.attrs["alt"]
        for cid in re.findall(r"{CID([0-9]+)}", alt_text):
            alt_text = alt_text.replace(f"{{CID{cid}}}", get_adobe_glyph(int(cid)))
        hyouki.attrs["alt"] = alt_text
    for gaiji in doc.find_all("外字"):
        image = gaiji.img
        source = image.attrs["src"] if image.has_attr("src") else ""
        if source in __GAIJI:
            image.attrs["alt"] = __GAIJI[source]
    return doc.decode()
def __format_hyougai_marks(page):
    """Wrap hyougai (table-external) reading/character marks in XML tags.

    Scans the page's plain text for 《…》 (hyougai reading) and 〈…〉
    (hyougai character) marks, then rewrites the raw page markup so the
    marked character is enclosed in <表外音訓> / <表外字> elements, each
    followed by a visible mark element (︽ / ︿).

    NOTE(review): several string literals in this function appear empty
    ('' comparisons and .replace('', ...) calls).  They almost certainly
    contained sentinel/placeholder characters (e.g. 〓 or private-use
    codepoints) that were stripped by an "ambiguous Unicode" filter when
    this file was exported — confirm against the upstream source before
    changing any logic here.
    """
    soup = BeautifulSoup(page, features="xml")
    # Blank out 外字 (gaiji) elements so they don't perturb the
    # plain-text scan below.  The assigned string was presumably a
    # placeholder character, now stripped — TODO confirm.
    for el in soup.find_all("外字"):
        el.string = ""
    text = soup.text
    # Strip whitespace so mark sequences found in `text` line up with
    # the same sequences in the raw `page` markup.
    for x in ["\n", "\t", " "]:
        text = text.replace(x, "")
    # Unwrap 〈…〉 groups in the scan text; in the raw page, tag each
    # group's end with a ␃ sentinel for later cleanup.
    text = re.sub(r"〈([^〈]+)〉", r"\1", text)
    page = re.sub(r"〈([^〈]+)〉", r"\1␃", page)
    # For each 《 mark, tag the character that follows it in the raw
    # page (count=1 keeps substitutions aligned with scan order).
    for mark in re.findall(r"《.", text):
        if mark[1] == "":
            page = page.replace("", "<表外音訓/>", 1)
        else:
            page = re.sub(f"《([^{mark[1]}]*)({mark[1]})",
                          r"\1<表外音訓>\2</表外音訓>",
                          page, count=1)
    # Same procedure for 〈 marks → <表外字>.
    for mark in re.findall(r"〈.", text):
        if mark[1] == "":
            page = page.replace("", "<表外字/>", 1)
        else:
            page = re.sub(f"〈([^{mark[1]}]*)({mark[1]})",
                          r"\1<表外字>\2</表外字>",
                          page, count=1)
    # Remove the leftover mark/sentinel characters.  Both literals look
    # empty here (stripped during export) — TODO confirm originals.
    page = page.replace("", "")
    page = page.replace("", "")
    soup = BeautifulSoup(page, features="xml")
    # If a tag came out empty (self-closing case above), pull the
    # following node inside it; then append the visible mark glyph.
    for el in soup.find_all("表外音訓"):
        if el.text == "":
            el.append(el.next_sibling)
        mark_xml = "<表外音訓マーク>︽</表外音訓マーク>"
        mark_soup = BeautifulSoup(mark_xml, "xml")
        el.append(mark_soup.表外音訓マーク)
    for el in soup.find_all("表外字"):
        if el.text == "":
            el.append(el.next_sibling)
        mark_xml = "<表外字マーク>︿</表外字マーク>"
        mark_soup = BeautifulSoup(mark_xml, "xml")
        el.append(mark_soup.表外字マーク)
    return soup.decode()
def __remove_pronunciation_parentheses(page):
    """Remove the bracket characters surrounding 表音表記 content.

    NOTE(review): as displayed, both .replace() calls have identical
    search and replacement strings, which would make this function a
    no-op.  The search strings almost certainly contained bracket
    characters (e.g. one just after the opening tag and one just before
    the closing tag) that were stripped by an "ambiguous Unicode"
    filter during export — restore them from the upstream source.
    """
    page = page.replace("<表音表記>", "<表音表記>")
    page = page.replace("</表音表記>", "</表音表記>")
    return page