66 lines
2.3 KiB
Python
66 lines
2.3 KiB
Python
|
from bs4 import BeautifulSoup
|
|||
|
|
|||
|
|
|||
|
def parse_hyouki_soup(soup, base_exps):
    """Expand a notation element into the list of plain strings it encodes.

    Children of ``soup`` are processed in document order, and each child
    extends every expression accumulated so far:

    * text node: its text is appended after delimiter characters are removed;
    * ``言換G``: one branch per ``言換`` element found inside it;
    * ``補足表記``: one branch for ``表記対象`` and one for ``表記内容G``;
    * ``省略``: one branch without the content and one branch with it;
    * any other element: expanded in place by recursing into it.

    Args:
        soup: Element exposing ``children``, ``find`` and ``find_all``
            (``parse_hyouki_pattern`` passes a BeautifulSoup tag).
        base_exps: Prefix strings to extend. The argument is not modified.

    Returns:
        A new list with every expanded string. Duplicates are not removed.
    """
    # A single translation table deletes every delimiter in one pass instead
    # of calling str.replace once per character.
    strip_delimiters = str.maketrans("", "", "/〈〉()⦅⦆:…")

    exps = list(base_exps)
    for child in soup.children:
        if child.name == "言換G":
            # Alternatives: each 言換 is expanded on its own and then appended
            # to every prefix (alternative-major order, as before).
            branches = [
                parse_hyouki_soup(alt, [""]) for alt in child.find_all("言換")
            ]
            exps = [
                exp + part
                for parts in branches
                for exp in exps
                for part in parts
            ]
        elif child.name == "補足表記":
            # find() returns None for a missing part; recursing into None
            # would raise AttributeError, so only existing parts are expanded.
            present = [
                node
                for node in (child.find("表記対象"), child.find("表記内容G"))
                if node is not None
            ]
            if present:
                parts = [
                    part
                    for node in present
                    for part in parse_hyouki_soup(node, [""])
                ]
                exps = [exp + part for exp in exps for part in parts]
            else:
                # Neither part exists: treat the element as a plain wrapper
                # rather than crashing or discarding every expression.
                exps = parse_hyouki_soup(child, exps)
        elif child.name == "省略":
            # Optional content: the leading "" keeps each prefix unchanged
            # before the variants that include the content.
            parts = [""] + parse_hyouki_soup(child, [""])
            exps = [exp + part for exp in exps for part in parts]
        elif child.name is not None:
            # Unrecognised element: expand its children onto the prefixes.
            exps = parse_hyouki_soup(child, exps)
        else:
            # Text node (its name is None).
            text = child.text.translate(strip_delimiters)
            exps = [exp + text for exp in exps]
    return exps
|
|||
|
|
|||
|
|
|||
|
def parse_hyouki_pattern(pattern):
    """Expand a compact notation pattern into every string it encodes.

    Each delimiter character in ``pattern`` is rewritten into the element
    markup that ``parse_hyouki_soup`` understands; the result is parsed as
    XML and expanded.

    Args:
        pattern: Pattern string using the delimiter characters listed in
            ``replacements`` below.

    Returns:
        The list of strings produced by ``parse_hyouki_soup`` for the
        generated markup.
    """
    # Where a delimiter is kept as literal text inside its element, it is
    # one of the characters parse_hyouki_soup strips from text nodes.
    replacements = {
        "(": "<省略>(",
        ")": ")</省略>",
        "{": "<補足表記><表記対象>",
        "・": "</表記対象><表記内容G>(<表記内容>",
        "}": "</表記内容>)</表記内容G></補足表記>",
        "〈": "<言換G>〈<言換>",
        "/": "</言換>/<言換>",
        "〉": "</言換>〉</言換G>",
        "⦅": "<補足表記><表記対象>",
        # The backslash must be escaped: a bare "\" swallows the closing
        # quote and makes the whole dict literal a syntax error.
        "\\": "</表記対象><表記内容G>⦅<表記内容>",
        "⦆": "</表記内容>⦆</表記内容G></補足表記>",
    }
    # str.translate rewrites each character of the pattern exactly once.
    # Chained str.replace calls would re-scan previously inserted markup, so
    # the "/" rule would mangle the "/" inside every closing tag.
    body = pattern.translate(str.maketrans(replacements))
    # The wrapper is added after translation so "</span>" is left intact.
    markup = f"<span>{body}</span>"
    soup = BeautifulSoup(markup, "xml")
    hyouki_soup = soup.find("span")
    return parse_hyouki_soup(hyouki_soup, [""])
|