from bs4 import BeautifulSoup

# Delimiter characters that only mark structure in a hyouki pattern; they are
# removed from text nodes so that they never end up in a generated expression.
_OMITTED_CHARACTERS = str.maketrans("", "", "／〈〉（）⦅⦆：…")

# Maps each delimiter of the pattern notation to the XML markup it stands for.
# Every opening delimiter opens elements and the matching closing delimiter
# closes them, so the resulting markup is well-formed.
#
# NOTE: the entries are applied in insertion order, and the order matters:
# "（" and "）" must be rewritten before "・" and "｝", because the replacements
# for "・" and "｝" themselves insert literal "（" / "）" characters that must
# not be turned into <省略> elements.
_HYOUKI_REPLACEMENTS = {
    "（": "<省略>（",
    "）": "）</省略>",
    "｛": "<補足表記><表記対象>",
    "・": "</表記対象><表記内容G>（<表記内容>",
    "｝": "</表記内容>）</表記内容G></補足表記>",
    "〈": "<言換G>〈<言換>",
    "／": "</言換>／<言換>",
    "〉": "</言換>〉</言換G>",
    "⦅": "<補足表記><表記対象>",
    "＼": "</表記対象><表記内容G>⦅<表記内容>",
    "⦆": "</表記内容>⦆</表記内容G></補足表記>",
}


def parse_hyouki_soup(soup, base_exps):
    """Expand a parsed hyouki element into every expression it can produce.

    The children of ``soup`` are processed in document order. Each child
    extends every expression collected so far:

    * ``言換G``: one variant per ``言換`` descendant (alternatives).
    * ``補足表記``: one variant for the ``表記対象`` content and one for the
      ``表記内容G`` content.
    * ``省略``: one variant without the element's content and one with it.
    * any other element: its children are expanded recursively in place.
    * text node: its text is appended after the delimiter characters
      ``／〈〉（）⦅⦆：…`` have been removed.

    Args:
        soup: A BeautifulSoup element whose ``children`` are iterated.
        base_exps: List of expression prefixes (strings) to extend. It is
            not modified.

    Returns:
        A new list of strings containing every expanded expression.
        Duplicates are not removed.

    Raises:
        AttributeError: If a ``補足表記`` element lacks a ``表記対象`` or
            ``表記内容G`` child (``find`` returns ``None``).
    """
    # Work on a copy so the caller's list is never aliased or mutated.
    exps = list(base_exps)
    for child in soup.children:
        if child.name == "言換G":
            expanded = []
            # Output is grouped by alternative first, then by prefix.
            for alt in child.find_all("言換"):
                parts = parse_hyouki_soup(alt, [""])
                expanded.extend(exp + part for exp in exps for part in parts)
            exps = expanded
        elif child.name == "補足表記":
            target = child.find("表記対象")
            content = child.find("表記内容G")
            parts = (
                parse_hyouki_soup(target, [""])
                + parse_hyouki_soup(content, [""])
            )
            exps = [exp + part for exp in exps for part in parts]
        elif child.name == "省略":
            # The leading "" keeps the variant in which the content is omitted.
            parts = [""] + parse_hyouki_soup(child, [""])
            exps = [exp + part for exp in exps for part in parts]
        elif child.name is not None:
            # Unrecognized wrapper element: expand its children in place.
            exps = parse_hyouki_soup(child, exps)
        else:
            # Text node (its ``name`` is None).
            text = child.text.translate(_OMITTED_CHARACTERS)
            exps = [exp + text for exp in exps]
    return exps


def parse_hyouki_pattern(pattern):
    """Return every expression described by a hyouki pattern string.

    The delimiters in ``pattern`` are rewritten into XML elements, the result
    is wrapped in a ``<span>`` root element and parsed with BeautifulSoup's
    ``"xml"`` parser, and the tree is expanded with ``parse_hyouki_soup``.

    For example, ``"あ（い）う"`` is expected to yield ``["あう", "あいう"]``
    and ``"〈あ／い〉う"`` is expected to yield ``["あう", "いう"]``.

    Args:
        pattern: The pattern text. It is inserted into the markup as-is, so
            any XML markup already present in it is parsed as markup.

    Returns:
        A list of expression strings.
    """
    # The <span> wrapper gives the XML document the single root element that
    # is looked up below; without it there is nothing for find() to return.
    markup = f"<span>{pattern}</span>"
    for key, val in _HYOUKI_REPLACEMENTS.items():
        markup = markup.replace(key, val)
    soup = BeautifulSoup(markup, "xml")
    hyouki_soup = soup.find("span")
    return parse_hyouki_soup(hyouki_soup, [""])