"""Preprocessing steps applied to a raw XML dictionary page."""

import re

from bs4 import BeautifulSoup

from bot.data import get_adobe_glyph


# Replacement characters for <外字> images, keyed by the image's ``src``
# attribute value.
__GAIJI = {
    "gaiji/5350.svg": "卐",
    "gaiji/62cb.svg": "抛",
    "gaiji/7be1.svg": "簒",
}


def preprocess_page(page):
    """Run every preprocessing step on *page* and return the resulting XML.

    The steps run in a fixed order: isolate the primary entry element,
    substitute glyph codes, convert the 《 / 〈 marks into elements, and
    finally drop the parentheses wrapped around <表音表記> content.

    Raises:
        Exception: if the page contains no primary entry element
            (see ``__strip_page``).
    """
    page = __strip_page(page)
    page = __replace_glyph_codes(page)
    page = __format_hyougai_marks(page)
    page = __remove_pronunciation_parentheses(page)
    return page


def __strip_page(page):
    """Return only the first <項目> or <字音語参照項目> element of *page*.

    Raises:
        Exception: if neither element is present; the message includes the
            prettified page to aid debugging.
    """
    soup = BeautifulSoup(page, features="xml")
    koumoku = soup.find(["項目", "字音語参照項目"])
    if koumoku is None:
        raise Exception(f"Primary 項目 not found in page:\n{soup.prettify()}")
    return koumoku.decode()


def __replace_glyph_codes(page):
    """Substitute glyph placeholders in *page* and return the new XML.

    Three substitutions are made:

    * ``<span style="glyph:N;">``: each "〓" string inside the span is
      replaced with ``get_adobe_glyph(N)``.  The ``style`` attribute is
      removed from every span that has one, whether or not it matched.
    * ``<親見出表記 alt="...">``: each ``{CIDN}`` token in the ``alt``
      attribute is replaced with ``get_adobe_glyph(N)``.
    * ``<外字><img src="..."/></外字>``: when ``src`` is a key of
      ``__GAIJI``, the mapped character is stored in the image's ``alt``.

    NOTE(review): ``get_adobe_glyph`` is defined elsewhere; it presumably
    maps an Adobe CID number to its character -- confirm.
    """
    soup = BeautifulSoup(page, features="xml")

    for span in soup.find_all("span"):
        if "style" not in span.attrs:
            continue
        m = re.search(r"^glyph:([0-9]+);$", span.attrs["style"])
        del span.attrs["style"]
        if m is None:
            continue
        code = int(m.group(1))
        # The glyph lookup stays inside the loop so that it only runs when
        # the span really contains a placeholder to replace.
        for geta in span.find_all(string="〓"):
            geta.replace_with(get_adobe_glyph(code))

    for hyouki in soup.find_all("親見出表記"):
        if "alt" not in hyouki.attrs:
            continue
        alt = hyouki.attrs["alt"]
        for code in re.findall(r"{CID([0-9]+)}", alt):
            alt = alt.replace(f"{{CID{code}}}", get_adobe_glyph(int(code)))
        hyouki.attrs["alt"] = alt

    for gaiji in soup.find_all("外字"):
        img = gaiji.img
        if img is None:
            # A <外字> without an <img> child has nothing to annotate.
            continue
        src = img.attrs.get("src", "")
        if src in __GAIJI:
            img.attrs["alt"] = __GAIJI[src]

    return soup.decode()


def __wrap_marked_characters(page, text, bracket, tag):
    """Replace each *bracket* mark in *page* with a *tag* element.

    *text* is the tag-free, whitespace-free text of the page; it is used to
    discover which character follows each occurrence of *bracket*.  For each
    occurrence, in order:

    * if that character is the "〓" placeholder, the first remaining
      *bracket* in *page* becomes an empty ``<tag/>`` element;
    * otherwise the first remaining *bracket* is removed and the first
      following occurrence of the character is wrapped in
      ``<tag>...</tag>``; any markup in between is kept in place.
    """
    for mark in re.findall(f"{bracket}.", text):
        marked = mark[1]
        if marked == "〓":
            page = page.replace(bracket, f"<{tag}/>", 1)
            continue
        # The character comes straight from the page text, so it may be a
        # regex metacharacter; escape it before building the pattern.
        char = re.escape(marked)
        page = re.sub(
            f"{bracket}([^{char}]*)({char})",
            rf"\1<{tag}>\2</{tag}>",
            page,
            count=1,
        )
    return page


def __append_mark_elements(soup, tag, mark_char):
    """Finish every *tag* element in *soup* by appending its mark element.

    An element with no text (the self-closing form) first adopts its next
    sibling as its content.  Every element then receives a trailing
    ``<tagマーク>`` child whose text is *mark_char*.
    """
    mark_tag = f"{tag}マーク"
    for el in soup.find_all(tag):
        if el.text == "":
            sibling = el.next_sibling
            # Appending None raises inside BeautifulSoup, so skip when the
            # empty element is the last node of its parent.
            if sibling is not None:
                el.append(sibling)
        # A fresh snippet is parsed per element because appending moves the
        # node out of the document it was parsed into.
        mark_soup = BeautifulSoup(
            f"<{mark_tag}>{mark_char}</{mark_tag}>", "xml")
        el.append(mark_soup.find(mark_tag))


def __format_hyougai_marks(page):
    """Turn the 《 and 〈 marks in *page* into XML elements.

    A 《 mark becomes a <表外音訓> element around the character that follows
    it, and an unpaired 〈 mark becomes a <表外字> element in the same way.
    Each element is given a trailing <表外音訓マーク>︽ or <表外字マーク>︿
    child.  Paired 〈...〉 brackets are left untouched.
    """
    soup = BeautifulSoup(page, features="xml")
    # Stand in "〓" for every <外字> so it counts as one character in the
    # text.  Only this throwaway soup is modified, not the page string.
    for el in soup.find_all("外字"):
        el.string = "〓"
    text = soup.text.translate(str.maketrans("", "", "\n\t "))

    # Paired 〈...〉 are ordinary brackets, not marks: drop them from the
    # text and shield them in the page with temporary sentinels.
    text = re.sub(r"〈([^〈]+)〉", r"\1", text)
    page = re.sub(r"〈([^〈]+)〉", r"␂\1␃", page)

    page = __wrap_marked_characters(page, text, "《", "表外音訓")
    page = __wrap_marked_characters(page, text, "〈", "表外字")

    page = page.replace("␂", "〈")
    page = page.replace("␃", "〉")

    soup = BeautifulSoup(page, features="xml")
    __append_mark_elements(soup, "表外音訓", "︽")
    __append_mark_elements(soup, "表外字", "︿")
    return soup.decode()


def __remove_pronunciation_parentheses(page):
    """Strip the full-width parentheses enclosing <表音表記> content.

    Only a "(" directly after the opening tag and a ")" directly before
    the closing tag are removed; parentheses elsewhere in the page are
    left alone.
    """
    page = page.replace("<表音表記>(", "<表音表記>")
    page = page.replace(")</表音表記>", "</表音表記>")
    return page