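# Preprocessing for raw XML entry pages: strip each page down to its primary
# 項目 element, resolve glyph and CID references to real characters, and tag
# hyougai (表外) marks.
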
import re
from bs4 import BeautifulSoup

from bot.data import get_adobe_glyph
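

# Gaiji images known to depict ordinary characters, keyed by image path;
# used below to fill in alt text for 外字 elements.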
__GAIJI = {
    "gaiji/5350.svg": "卐",
    "gaiji/62cb.svg": "抛",
    "gaiji/7be1.svg": "簒",
}
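

# Run the full preprocessing pipeline over a single raw page.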
def preprocess_page(page):
    page = __strip_page(page)
    page = __replace_glyph_codes(page)
    page = __format_hyougai_marks(page)
    return page
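

# Keep only the entry body: the first 項目 (or 字音語参照項目) element found
# in the page.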
def __strip_page(page):
    soup = BeautifulSoup(page, features="xml")
    koumoku = soup.find(["項目", "字音語参照項目"])
    if koumoku is not None:
        return koumoku.decode()
    else:
        raise Exception(f"Primary 項目 not found in page:\n{soup.prettify()}")
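

# Resolve glyph references to real characters via get_adobe_glyph: geta marks
# inside glyph-styled spans and {CID...} tokens in 親見出表記 alt text. Also
# fills in alt text for the gaiji images listed in __GAIJI.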
def __replace_glyph_codes(page):
    soup = BeautifulSoup(page, features="xml")
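    # Spans styled "glyph:<code>;" contain geta placeholders (〓) standing in
    # for a specific glyph; replace each one with the character for that code.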
    for span in soup.find_all("span"):
        if "style" in span.attrs:
            m = re.search(r"^glyph:([0-9]+);$", span.attrs["style"])
            del span.attrs["style"]
            if m is None:
                continue
            code = int(m.group(1))
            for geta in span.find_all(string="〓"):
                glyph = get_adobe_glyph(code)
                geta.replace_with(glyph)
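    # 親見出表記 alt attributes embed glyphs as {CID<code>} tokens; expand
    # them to the corresponding characters.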
    for hyouki in soup.find_all("親見出表記"):
        if "alt" not in hyouki.attrs:
            continue
        alt = hyouki.attrs["alt"]
        codes = re.findall(r"{CID([0-9]+)}", alt)
        for code in codes:
            glyph = get_adobe_glyph(int(code))
            alt = alt.replace(f"{{CID{code}}}", glyph)
        hyouki.attrs["alt"] = alt
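    # Gaiji images listed in __GAIJI get the matching character as alt text.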
    for gaiji in soup.find_all("外字"):
        img = gaiji.img
        src = img.attrs["src"] if img.has_attr("src") else ""
        if src in __GAIJI:
            img.attrs["alt"] = __GAIJI[src]
    return soup.decode()
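

# Convert hyougai marks in the text into XML tags: the character following a
# 《 mark is wrapped in <表外音訓>, the character following a 〈 mark in
# <表外字>. Paired 〈…〉 brackets are left untouched.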
def __format_hyougai_marks(page):
    soup = BeautifulSoup(page, features="xml")
    for el in soup.find_all("外字"):
        el.string = "〓"
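    # Build a whitespace-free copy of the text (with every gaiji collapsed to
    # 〓 above) to find which character each mark applies to. Paired 〈…〉
    # brackets are dropped from the copy and shielded in the page with
    # sentinel characters so that only standalone 〈 marks get tagged.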
    text = soup.text
    for x in ["\n", "\t", " "]:
        text = text.replace(x, "")
    text = re.sub(r"〈([^〈]+)〉", r"\1", text)
    page = re.sub(r"〈([^〈]+)〉", r"␂\1␃", page)
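    # Wrap the character that each mark points at in the corresponding tag.
    # If that character is a geta (a collapsed gaiji image), it has no text to
    # wrap, so the mark itself is replaced with a self-closing tag instead.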
    for mark in re.findall(r"《.", text):
        if mark[1] == "〓":
            page = page.replace("《", "<表外音訓/>", 1)
        else:
            page = re.sub(f"《([^{mark[1]}]*)({mark[1]})",
                          r"\1<表外音訓>\2</表外音訓>",
                          page, count=1)
    for mark in re.findall(r"〈.", text):
        if mark[1] == "〓":
            page = page.replace("〈", "<表外字/>", 1)
        else:
            page = re.sub(f"〈([^{mark[1]}]*)({mark[1]})",
                          r"\1<表外字>\2</表外字>",
                          page, count=1)
    page = page.replace("␂", "〈")
    page = page.replace("␃", "〉")
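    # Tags emitted as self-closing above are empty after re-parsing; move the
    # node that follows each one inside it so the tag encloses what it marks.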
    soup = BeautifulSoup(page, features="xml")
    for el in soup.find_all("表外音訓"):
        if el.text == "":
            el.append(el.next_sibling)
    for el in soup.find_all("表外字"):
        if el.text == "":
            el.append(el.next_sibling)
    return soup.decode()