Reorganize file structure of entries modules

parent 0cd530585f
commit 9b3fdc86d1
bot/entries/{expressions.py → base/expressions.py}

@@ -85,40 +85,3 @@ def expand_abbreviation_list(expressions):
             if new_exp not in new_exps:
                 new_exps.append(new_exp)
     return new_exps
-
-
-def expand_smk_alternatives(text):
-    """Return a list of strings described by △ notation."""
-    m = re.search(r"△([^(]+)(([^(]+))", text)
-    if m is None:
-        return [text]
-    alt_parts = [m.group(1)]
-    for alt_part in m.group(2).split("・"):
-        alt_parts.append(alt_part)
-    alts = []
-    for alt_part in alt_parts:
-        alt_exp = re.sub(r"△[^(]+([^(]+)", alt_part, text)
-        alts.append(alt_exp)
-    return alts
-
-
-def expand_daijirin_alternatives(text):
-    """Return a list of strings described by = notation."""
-    group_pattern = r"([^=]+)(=([^(]+)(=([^(]+)))?"
-    groups = re.findall(group_pattern, text)
-    expressions = [""]
-    for group in groups:
-        new_exps = []
-        for expression in expressions:
-            new_exps.append(expression + group[0])
-        expressions = new_exps.copy()
-        if group[1] == "":
-            continue
-        new_exps = []
-        for expression in expressions:
-            new_exps.append(expression + group[2])
-        for expression in expressions:
-            for alt in group[3].split("・"):
-                new_exps.append(expression + alt)
-        expressions = new_exps.copy()
-    return expressions
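
These two helpers do not disappear: they resurface below as module-level `parse_phrase` functions in `bot/entries/smk8/phrase_entry.py` and `bot/entries/daijirin2/phrase_entry.py`. As a rough sketch of the notation they expand, with made-up input strings (the real inputs come from the dictionaries' phrase-reading data):

    # △ notation: the segment before the fullwidth parentheses and each
    # "・"-separated segment inside them fill the same slot.
    expand_smk_alternatives("△打(ぶ)つ")
    # -> ["打つ", "ぶつ"]

    # = notation: an optional "(=...)" group lists replacements for the
    # "="-prefixed segment preceding it.
    expand_daijirin_alternatives("揚げ=足(=脚)を取る")
    # -> ["揚げ足を取る", "揚げ脚を取る"]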
bot/entries/{jitenon.py → base/jitenon_entry.py}

@@ -3,11 +3,11 @@ from abc import abstractmethod
 from datetime import datetime, date
 from bs4 import BeautifulSoup
 
-from bot.entries.entry import Entry
-import bot.entries.expressions as Expressions
+from bot.entries.base.entry import Entry
+import bot.entries.base.expressions as Expressions
 
 
-class _JitenonEntry(Entry):
+class JitenonEntry(Entry):
     def __init__(self, target, entry_id):
         super().__init__(target, entry_id)
         self.expression = ""
@@ -140,104 +140,3 @@ class _JitenonEntry(Entry):
         elif isinstance(attr_val, list):
             colvals.append(";".join(attr_val))
         return ",".join(colvals)
-
-
-class JitenonYojiEntry(_JitenonEntry):
-    def __init__(self, target, entry_id):
-        super().__init__(target, entry_id)
-        self.origin = ""
-        self.kanken_level = ""
-        self.category = ""
-        self.related_expressions = []
-
-    def _get_column_map(self):
-        return {
-            "四字熟語": "expression",
-            "読み方": "yomikata",
-            "意味": "definition",
-            "異形": "other_forms",
-            "出典": "origin",
-            "漢検級": "kanken_level",
-            "場面用途": "category",
-            "類義語": "related_expressions",
-        }
-
-    def _add_variant_expressions(self, headwords):
-        for expressions in headwords.values():
-            Expressions.add_variant_kanji(expressions)
-
-
-class JitenonKotowazaEntry(_JitenonEntry):
-    def __init__(self, target, entry_id):
-        super().__init__(target, entry_id)
-        self.origin = ""
-        self.example = ""
-        self.related_expressions = []
-
-    def _get_column_map(self):
-        return {
-            "言葉": "expression",
-            "読み方": "yomikata",
-            "意味": "definition",
-            "異形": "other_forms",
-            "出典": "origin",
-            "例文": "example",
-            "類句": "related_expressions",
-        }
-
-    def _get_headwords(self):
-        if self.expression == "金棒引き・鉄棒引き":
-            headwords = {
-                "かなぼうひき": ["金棒引き", "鉄棒引き"]
-            }
-        else:
-            headwords = super()._get_headwords()
-        return headwords
-
-    def _add_variant_expressions(self, headwords):
-        for expressions in headwords.values():
-            Expressions.add_variant_kanji(expressions)
-            Expressions.add_fullwidth(expressions)
-
-
-class JitenonKokugoEntry(_JitenonEntry):
-    def __init__(self, target, entry_id):
-        super().__init__(target, entry_id)
-        self.example = ""
-        self.alt_expression = ""
-        self.antonym = ""
-        self.attachments = ""
-        self.compounds = ""
-        self.related_words = ""
-
-    def _get_column_map(self):
-        return {
-            "言葉": "expression",
-            "読み方": "yomikata",
-            "意味": "definition",
-            "例文": "example",
-            "別表記": "alt_expression",
-            "対義語": "antonym",
-            "活用": "attachments",
-            "用例": "compounds",
-            "類語": "related_words",
-        }
-
-    def _get_headwords(self):
-        headwords = {}
-        for reading in self.yomikata.split("・"):
-            if reading not in headwords:
-                headwords[reading] = []
-            for expression in self.expression.split("・"):
-                headwords[reading].append(expression)
-            if self.alt_expression.strip() != "":
-                for expression in self.alt_expression.split("・"):
-                    headwords[reading].append(expression)
-        return headwords
-
-    def _add_variant_expressions(self, headwords):
-        for expressions in headwords.values():
-            Expressions.add_variant_kanji(expressions)
-            Expressions.add_fullwidth(expressions)
-            Expressions.remove_iteration_mark(expressions)
-            Expressions.add_iteration_mark(expressions)
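
The three subclasses deleted here reappear essentially unchanged as the `Entry` classes of the new `jitenon_yoji`, `jitenon_kotowaza`, and `jitenon_kokugo` packages further down. Each `_get_column_map` translates the header cell of a scraped jitenon.jp table row into an attribute name; a standalone sketch of that consumption (the `apply_columns` helper and the sample row are illustrative, not part of the codebase):

    def apply_columns(entry, rows):
        # rows: table-header text -> cell text for one scraped entry
        column_map = entry._get_column_map()
        for header, value in rows.items():
            setattr(entry, column_map[header], value)

    # hypothetical kotowaza row:
    # apply_columns(entry, {"言葉": "猫に小判", "読み方": "ねこにこばん"})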

bot/entries/base/sanseido_entry.py (new file, 60 lines)

@@ -0,0 +1,60 @@
from abc import abstractmethod
from bs4 import BeautifulSoup

from bot.entries.base.entry import Entry
import bot.entries.base.expressions as Expressions


class SanseidoEntry(Entry):
    def set_page(self, page):
        page = self._decompose_subentries(page)
        self._page = page

    def get_page_soup(self):
        soup = BeautifulSoup(self._page, "xml")
        return soup

    def get_global_identifier(self):
        parent_part = format(self.entry_id[0], '06')
        child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
        return f"@{self.target.value}-{parent_part}-{child_part}"

    def _decompose_subentries(self, page):
        soup = BeautifulSoup(page, features="xml")
        for x in self._get_subentry_parameters():
            subentry_class, tags, subentry_list = x
            for tag in tags:
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(self.target, subentry_id)
                    page = tag_soup.decode()
                    subentry.set_page(page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @abstractmethod
    def _get_subentry_parameters(self):
        pass

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    @staticmethod
    def id_string_to_entry_id(id_string):
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        elif len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        else:
            raise Exception(f"Invalid entry ID: {id_string}")
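
Pulling `set_page`, `get_global_identifier`, `_decompose_subentries`, and `id_string_to_entry_id` into this shared base removes the copies that the smk8, daijirin2, and sankoku8 modules each carried (visible in the deleted files below). Traced by hand from the code above; the `"smk8"` target value is only an assumption for the example:

    SanseidoEntry.id_string_to_entry_id("12345")       # -> (12345, 0)
    SanseidoEntry.id_string_to_entry_id("12345-000f")  # -> (12345, 15)

    # For an entry with entry_id == (12345, 15) and target.value == "smk8":
    # get_global_identifier() -> "@smk8-012345-000F"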

bot/entries/daijirin2.py (deleted, 231 lines)

@@ -1,231 +0,0 @@
from bs4 import BeautifulSoup

import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.data import load_phrase_readings
from bot.data import load_daijirin2_kana_abbreviations
from bot.entries.entry import Entry
from bot.entries.daijirin2_preprocess import preprocess_page


class _BaseDaijirin2Entry(Entry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self._kana_abbreviations = load_daijirin2_kana_abbreviations()

    def get_global_identifier(self):
        parent_part = format(self.entry_id[0], '06')
        child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
        return f"@{self.target.value}-{parent_part}-{child_part}"

    def set_page(self, page):
        page = self.__decompose_subentries(page)
        self._page = page

    def get_page_soup(self):
        soup = BeautifulSoup(self._page, "xml")
        return soup

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        for pos_group in soup.find_all("品詞G"):
            if pos_group.parent.name == "大語義":
                self._set_part_of_speech_tags(pos_group)
        return self._part_of_speech_tags

    def _set_part_of_speech_tags(self, el):
        pos_names = ["品詞", "品詞活用", "品詞行", "用法"]
        for child in el.children:
            if child.name is not None:
                self._set_part_of_speech_tags(child)
                continue
            pos = str(child)
            if el.name not in pos_names:
                continue
            elif pos in ["[", "]"]:
                continue
            elif pos in self._part_of_speech_tags:
                continue
            else:
                self._part_of_speech_tags.append(pos)

    def _get_regular_headwords(self, soup):
        self._fill_alts(soup)
        reading = soup.find("見出仮名").text
        expressions = []
        for el in soup.find_all("標準表記"):
            expression = self._clean_expression(el.text)
            if "—" in expression:
                kana_abbrs = self._kana_abbreviations[self.entry_id]
                for abbr in kana_abbrs:
                    expression = expression.replace("—", abbr, 1)
            expressions.append(expression)
        expressions = Expressions.expand_abbreviation_list(expressions)
        if len(expressions) == 0:
            expressions.append(reading)
        headwords = {reading: expressions}
        return headwords

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    def __decompose_subentries(self, page):
        soup = BeautifulSoup(page, features="xml")
        subentry_parameters = [
            [Daijirin2ChildEntry, ["子項目"], self.children],
            [Daijirin2PhraseEntry, ["句項目"], self.phrases],
        ]
        for x in subentry_parameters:
            subentry_class, tags, subentry_list = x
            for tag in tags:
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(self.target, subentry_id)
                    page = tag_soup.decode()
                    subentry.set_page(page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @staticmethod
    def id_string_to_entry_id(id_string):
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        elif len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        else:
            raise Exception(f"Invalid entry ID: {id_string}")

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "漢字音logo", "活用分節", "連語句活用分節", "語構成",
            "表外字マーク", "表外字マーク", "ルビG"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)

    @staticmethod
    def _clean_expression(expression):
        for x in ["〈", "〉", "《", "》", " "]:
            expression = expression.replace(x, "")
        return expression

    @staticmethod
    def _fill_alts(soup):
        for gaiji in soup.find_all(class_="gaiji"):
            if gaiji.name == "img" and gaiji.has_attr("alt"):
                gaiji.name = "span"
                gaiji.string = gaiji.attrs["alt"]


class Daijirin2Entry(_BaseDaijirin2Entry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        if soup.find("漢字見出") is not None:
            headwords = self._get_kanji_headwords(soup)
        elif soup.find("略語G") is not None:
            headwords = self._get_acronym_headwords(soup)
        else:
            headwords = self._get_regular_headwords(soup)
        return headwords

    def _get_kanji_headwords(self, soup):
        readings = []
        for el in soup.find_all("漢字音"):
            hira = Expressions.kata_to_hira(el.text)
            readings.append(hira)
        if soup.find("漢字音") is None:
            readings.append("")
        expressions = []
        for el in soup.find_all("漢字見出"):
            expressions.append(el.text)
        headwords = {}
        for reading in readings:
            headwords[reading] = expressions
        return headwords

    def _get_acronym_headwords(self, soup):
        expressions = []
        for el in soup.find_all("略語"):
            expression_parts = []
            for part in el.find_all(["欧字", "和字"]):
                expression_parts.append(part.text)
            expression = "".join(expression_parts)
            expressions.append(expression)
        headwords = {"": expressions}
        return headwords


class Daijirin2ChildEntry(_BaseDaijirin2Entry):
    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        headwords = self._get_regular_headwords(soup)
        return headwords


class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        headwords = {}
        expressions = self._find_expressions(soup)
        readings = self._find_readings()
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        self._delete_unused_nodes(soup)
        text = soup.find("句表記").text
        text = self._clean_expression(text)
        alternatives = Expressions.expand_daijirin_alternatives(text)
        expressions = []
        for alt in alternatives:
            for exp in Expressions.expand_abbreviation(alt):
                expressions.append(exp)
        return expressions

    def _find_readings(self):
        phrase_readings = load_phrase_readings(self.target)
        text = phrase_readings[self.entry_id]
        alternatives = Expressions.expand_daijirin_alternatives(text)
        readings = []
        for alt in alternatives:
            for reading in Expressions.expand_abbreviation(alt):
                readings.append(reading)
        return readings

bot/entries/daijirin2/base_entry.py (new file, 88 lines)

@@ -0,0 +1,88 @@
import bot.soup as Soup
from bot.data import load_daijirin2_kana_abbreviations
from bot.entries.base.sanseido_entry import SanseidoEntry
import bot.entries.base.expressions as Expressions


class BaseEntry(SanseidoEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self._kana_abbreviations = load_daijirin2_kana_abbreviations()

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        for pos_group in soup.find_all("品詞G"):
            if pos_group.parent.name == "大語義":
                self._set_part_of_speech_tags(pos_group)
        return self._part_of_speech_tags

    def _set_part_of_speech_tags(self, el):
        pos_names = ["品詞", "品詞活用", "品詞行", "用法"]
        for child in el.children:
            if child.name is not None:
                self._set_part_of_speech_tags(child)
                continue
            pos = str(child)
            if el.name not in pos_names:
                continue
            elif pos in ["[", "]"]:
                continue
            elif pos in self._part_of_speech_tags:
                continue
            else:
                self._part_of_speech_tags.append(pos)

    def _get_regular_headwords(self, soup):
        self._fill_alts(soup)
        reading = soup.find("見出仮名").text
        expressions = []
        for el in soup.find_all("標準表記"):
            expression = self._clean_expression(el.text)
            if "—" in expression:
                kana_abbrs = self._kana_abbreviations[self.entry_id]
                for abbr in kana_abbrs:
                    expression = expression.replace("—", abbr, 1)
            expressions.append(expression)
        expressions = Expressions.expand_abbreviation_list(expressions)
        if len(expressions) == 0:
            expressions.append(reading)
        headwords = {reading: expressions}
        return headwords

    def _get_subentry_parameters(self):
        from bot.entries.daijirin2.child_entry import ChildEntry
        from bot.entries.daijirin2.phrase_entry import PhraseEntry
        subentry_parameters = [
            [ChildEntry, ["子項目"], self.children],
            [PhraseEntry, ["句項目"], self.phrases],
        ]
        return subentry_parameters

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "漢字音logo", "活用分節", "連語句活用分節", "語構成",
            "表外字マーク", "表外字マーク", "ルビG"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)

    @staticmethod
    def _clean_expression(expression):
        for x in ["〈", "〉", "《", "》", " "]:
            expression = expression.replace(x, "")
        return expression

    @staticmethod
    def _fill_alts(soup):
        for gaiji in soup.find_all(class_="gaiji"):
            if gaiji.name == "img" and gaiji.has_attr("alt"):
                gaiji.name = "span"
                gaiji.string = gaiji.attrs["alt"]
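
The `—` placeholders in 標準表記 elements appear to stand for kana runs that the dictionary abbreviates; `load_daijirin2_kana_abbreviations` maps each entry ID to the expansions, consumed left to right by `_get_regular_headwords`. A sketch with invented data (the entry ID and abbreviation list are hypothetical):

    # assuming _kana_abbreviations[(12345, 0)] == ["やか"]
    # a 標準表記 text of "穏—" then expands via:
    "穏—".replace("—", "やか", 1)
    # -> "穏やか"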

bot/entries/daijirin2/child_entry.py (new file, 9 lines)

@@ -0,0 +1,9 @@
from bot.entries.daijirin2.base_entry import BaseEntry


class ChildEntry(BaseEntry):
    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        headwords = self._get_regular_headwords(soup)
        return headwords

bot/entries/daijirin2/entry.py (new file, 50 lines)

@@ -0,0 +1,50 @@
import bot.entries.base.expressions as Expressions
from bot.entries.daijirin2.base_entry import BaseEntry
from bot.entries.daijirin2.preprocess import preprocess_page


class Entry(BaseEntry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        if soup.find("漢字見出") is not None:
            headwords = self._get_kanji_headwords(soup)
        elif soup.find("略語G") is not None:
            headwords = self._get_acronym_headwords(soup)
        else:
            headwords = self._get_regular_headwords(soup)
        return headwords

    def _get_kanji_headwords(self, soup):
        readings = []
        for el in soup.find_all("漢字音"):
            hira = Expressions.kata_to_hira(el.text)
            readings.append(hira)
        if soup.find("漢字音") is None:
            readings.append("")
        expressions = []
        for el in soup.find_all("漢字見出"):
            expressions.append(el.text)
        headwords = {}
        for reading in readings:
            headwords[reading] = expressions
        return headwords

    def _get_acronym_headwords(self, soup):
        expressions = []
        for el in soup.find_all("略語"):
            expression_parts = []
            for part in el.find_all(["欧字", "和字"]):
                expression_parts.append(part.text)
            expression = "".join(expression_parts)
            expressions.append(expression)
        headwords = {"": expressions}
        return headwords

bot/entries/daijirin2/phrase_entry.py (new file, 67 lines)

@@ -0,0 +1,67 @@
import re

import bot.entries.base.expressions as Expressions
from bot.data import load_phrase_readings
from bot.entries.daijirin2.base_entry import BaseEntry


class PhraseEntry(BaseEntry):
    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        headwords = {}
        expressions = self._find_expressions(soup)
        readings = self._find_readings()
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        self._delete_unused_nodes(soup)
        text = soup.find("句表記").text
        text = self._clean_expression(text)
        alternatives = parse_phrase(text)
        expressions = []
        for alt in alternatives:
            for exp in Expressions.expand_abbreviation(alt):
                expressions.append(exp)
        return expressions

    def _find_readings(self):
        phrase_readings = load_phrase_readings(self.target)
        text = phrase_readings[self.entry_id]
        alternatives = parse_phrase(text)
        readings = []
        for alt in alternatives:
            for reading in Expressions.expand_abbreviation(alt):
                readings.append(reading)
        return readings


def parse_phrase(text):
    """Return a list of strings described by = notation."""
    group_pattern = r"([^=]+)(=([^(]+)(=([^(]+)))?"
    groups = re.findall(group_pattern, text)
    expressions = [""]
    for group in groups:
        new_exps = []
        for expression in expressions:
            new_exps.append(expression + group[0])
        expressions = new_exps.copy()
        if group[1] == "":
            continue
        new_exps = []
        for expression in expressions:
            new_exps.append(expression + group[2])
        for expression in expressions:
            for alt in group[3].split("・"):
                new_exps.append(expression + alt)
        expressions = new_exps.copy()
    return expressions
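
`parse_phrase` here is the old `expand_daijirin_alternatives`, moved next to its only caller. Traced by hand on a hypothetical phrase string:

    parse_phrase("揚げ=足(=脚)を取る")
    # group tuples: ("揚げ", "=足(=脚)", "足", "脚"), ("を取る", "", "", "")
    # -> ["揚げ足を取る", "揚げ脚を取る"]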

@@ -1,20 +1,7 @@
-from bot.targets import Targets
-
-from bot.entries.jitenon import JitenonKokugoEntry
-from bot.entries.jitenon import JitenonYojiEntry
-from bot.entries.jitenon import JitenonKotowazaEntry
-from bot.entries.smk8 import Smk8Entry
-from bot.entries.daijirin2 import Daijirin2Entry
-from bot.entries.sankoku8 import Sankoku8Entry
+import importlib
 
 
 def new_entry(target, page_id):
-    entry_map = {
-        Targets.JITENON_KOKUGO: JitenonKokugoEntry,
-        Targets.JITENON_YOJI: JitenonYojiEntry,
-        Targets.JITENON_KOTOWAZA: JitenonKotowazaEntry,
-        Targets.SMK8: Smk8Entry,
-        Targets.DAIJIRIN2: Daijirin2Entry,
-        Targets.SANKOKU8: Sankoku8Entry,
-    }
-    return entry_map[target](target, page_id)
+    module_path = f"bot.entries.{target.name.lower()}.entry"
+    module = importlib.import_module(module_path)
+    return module.Entry(target, page_id)
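
The table-based factory becomes a naming convention: every target package must expose an `Entry` class in `bot/entries/<target>/entry.py`. Resolution for one member of the `Targets` enum, traced from the code above:

    target = Targets.JITENON_KOKUGO
    module_path = f"bot.entries.{target.name.lower()}.entry"
    # module_path == "bot.entries.jitenon_kokugo.entry"
    # importlib.import_module(module_path).Entry replaces the old entry_map lookup

One trade-off of this design: a missing or misnamed package now surfaces as a ModuleNotFoundError at lookup time rather than a KeyError from the map.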

bot/entries/jitenon_kokugo/entry.py (new file, 45 lines)

@@ -0,0 +1,45 @@
from bot.entries.base.jitenon_entry import JitenonEntry
import bot.entries.base.expressions as Expressions


class Entry(JitenonEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.example = ""
        self.alt_expression = ""
        self.antonym = ""
        self.attachments = ""
        self.compounds = ""
        self.related_words = ""

    def _get_column_map(self):
        return {
            "言葉": "expression",
            "読み方": "yomikata",
            "意味": "definition",
            "例文": "example",
            "別表記": "alt_expression",
            "対義語": "antonym",
            "活用": "attachments",
            "用例": "compounds",
            "類語": "related_words",
        }

    def _get_headwords(self):
        headwords = {}
        for reading in self.yomikata.split("・"):
            if reading not in headwords:
                headwords[reading] = []
            for expression in self.expression.split("・"):
                headwords[reading].append(expression)
            if self.alt_expression.strip() != "":
                for expression in self.alt_expression.split("・"):
                    headwords[reading].append(expression)
        return headwords

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)
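
The kokugo headword logic pairs every reading with every surface form, including the 別表記 alternatives. The same pairing as a standalone function, with an illustrative row (肴 is a genuine alternate spelling of さかな, but the row itself is made up):

    def cross_headwords(yomikata, expression, alt_expression=""):
        headwords = {}
        for reading in yomikata.split("・"):
            headwords.setdefault(reading, [])
            for exp in expression.split("・"):
                headwords[reading].append(exp)
            if alt_expression.strip() != "":
                for exp in alt_expression.split("・"):
                    headwords[reading].append(exp)
        return headwords

    cross_headwords("さかな", "魚", "肴")
    # -> {"さかな": ["魚", "肴"]}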

bot/entries/jitenon_kotowaza/entry.py (new file, 35 lines)

@@ -0,0 +1,35 @@
from bot.entries.base.jitenon_entry import JitenonEntry
import bot.entries.base.expressions as Expressions


class Entry(JitenonEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.origin = ""
        self.example = ""
        self.related_expressions = []

    def _get_column_map(self):
        return {
            "言葉": "expression",
            "読み方": "yomikata",
            "意味": "definition",
            "異形": "other_forms",
            "出典": "origin",
            "例文": "example",
            "類句": "related_expressions",
        }

    def _get_headwords(self):
        if self.expression == "金棒引き・鉄棒引き":
            headwords = {
                "かなぼうひき": ["金棒引き", "鉄棒引き"]
            }
        else:
            headwords = super()._get_headwords()
        return headwords

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)

bot/entries/jitenon_yoji/entry.py (new file, 27 lines)

@@ -0,0 +1,27 @@
import bot.entries.base.expressions as Expressions
from bot.entries.base.jitenon_entry import JitenonEntry


class Entry(JitenonEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.origin = ""
        self.kanken_level = ""
        self.category = ""
        self.related_expressions = []

    def _get_column_map(self):
        return {
            "四字熟語": "expression",
            "読み方": "yomikata",
            "意味": "definition",
            "異形": "other_forms",
            "出典": "origin",
            "漢検級": "kanken_level",
            "場面用途": "category",
            "類義語": "related_expressions",
        }

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)

bot/entries/sankoku8.py (deleted, 260 lines)

@@ -1,260 +0,0 @@
from bs4 import BeautifulSoup
import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.entries.entry import Entry
from bot.data import load_phrase_readings
from bot.entries.sankoku8_preprocess import preprocess_page


class _BaseSankoku8Entry(Entry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self._hyouki_name = "表記"
        self._midashi_name = None
        self._midashi_kana_name = None

    def get_global_identifier(self):
        parent_part = format(self.entry_id[0], '06')
        child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
        return f"@{self.target.value}-{parent_part}-{child_part}"

    def set_page(self, page):
        page = self.__decompose_subentries(page)
        self._page = page

    def get_page_soup(self):
        soup = BeautifulSoup(self._page, "xml")
        return soup

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        readings = self._find_readings(soup)
        expressions = self._find_expressions(soup)
        headwords = {}
        for reading in readings:
            headwords[reading] = []
        if len(readings) == 1:
            reading = readings[0]
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                headwords[reading].append(reading)
            for exp in expressions:
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        elif len(readings) > 1 and len(expressions) == 0:
            for reading in readings:
                headwords[reading].append(reading)
        elif len(readings) > 1 and len(expressions) == 1:
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            expression = expressions[0]
            for reading in readings:
                if expression not in headwords[reading]:
                    headwords[reading].append(expression)
        elif len(readings) > 1 and len(expressions) == len(readings):
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            for idx, reading in enumerate(readings):
                exp = expressions[idx]
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        else:
            raise Exception()  # shouldn't happen
        return headwords

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        for midashi in soup.find_all([self._midashi_name, "見出部要素"]):
            pos_group = midashi.find("品詞G")
            if pos_group is None:
                continue
            for tag in pos_group.find_all("a"):
                if tag.text not in self._part_of_speech_tags:
                    self._part_of_speech_tags.append(tag.text)
        return self._part_of_speech_tags

    def _find_expressions(self, soup):
        expressions = []
        for hyouki in soup.find_all(self._hyouki_name):
            for expression in parse_hyouki_soup(hyouki, [""]):
                expressions.append(expression)
        return expressions

    def _find_readings(self, soup):
        midasi_kana = soup.find(self._midashi_kana_name)
        readings = parse_hyouki_soup(midasi_kana, [""])
        return readings

    def __decompose_subentries(self, page):
        soup = BeautifulSoup(page, features="xml")
        subentry_parameters = [
            [Sankoku8ChildEntry, ["子項目"], self.children],
            [Sankoku8PhraseEntry, ["句項目"], self.phrases],
        ]
        for x in subentry_parameters:
            subentry_class, tags, subentry_list = x
            for tag in tags:
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(self.target, subentry_id)
                    page = tag_soup.decode()
                    subentry.set_page(page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @staticmethod
    def id_string_to_entry_id(id_string):
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        elif len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        else:
            raise Exception(f"Invalid entry ID: {id_string}")

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク",
            "アクセント分節", "活用分節", "ルビG", "分書"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)


class Sankoku8Entry(_BaseSankoku8Entry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)
        self._midashi_name = "見出部"
        self._midashi_kana_name = "見出仮名"

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)


class Sankoku8ChildEntry(_BaseSankoku8Entry):
    def __init__(self, target, page_id):
        super().__init__(target, page_id)
        self._midashi_name = "子見出部"
        self._midashi_kana_name = "子見出仮名"


class Sankoku8PhraseEntry(_BaseSankoku8Entry):
    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        expressions = self._find_expressions(soup)
        readings = self._find_readings(soup)
        headwords = {}
        if len(expressions) != len(readings):
            raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}")
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        phrase_soup = soup.find("句表記")
        expressions = parse_hyouki_soup(phrase_soup, [""])
        return expressions

    def _find_readings(self, soup):
        reading_patterns = load_phrase_readings(self.target)
        reading_pattern = reading_patterns[self.entry_id]
        readings = parse_hyouki_pattern(reading_pattern)
        return readings


def parse_hyouki_soup(soup, base_exps):
    omitted_characters = [
        "/", "〈", "〉", "(", ")", "⦅", "⦆", ":", "…"
    ]
    exps = base_exps.copy()
    for child in soup.children:
        new_exps = []
        if child.name == "言換G":
            for alt in child.find_all("言換"):
                parts = parse_hyouki_soup(alt, [""])
                for exp in exps:
                    for part in parts:
                        new_exps.append(exp + part)
        elif child.name == "補足表記":
            alt1 = child.find("表記対象")
            alt2 = child.find("表記内容G")
            parts1 = parse_hyouki_soup(alt1, [""])
            parts2 = parse_hyouki_soup(alt2, [""])
            for exp in exps:
                for part in parts1:
                    new_exps.append(exp + part)
                for part in parts2:
                    new_exps.append(exp + part)
        elif child.name == "省略":
            parts = parse_hyouki_soup(child, [""])
            for exp in exps:
                new_exps.append(exp)
                for part in parts:
                    new_exps.append(exp + part)
        elif child.name is not None:
            new_exps = parse_hyouki_soup(child, exps)
        else:
            text = child.text
            for char in omitted_characters:
                text = text.replace(char, "")
            for exp in exps:
                new_exps.append(exp + text)
        exps = new_exps.copy()
    return exps


def parse_hyouki_pattern(pattern):
    replacements = {
        "(": "<省略>(",
        ")": ")</省略>",
        "{": "<補足表記><表記対象>",
        "・": "</表記対象><表記内容G>(<表記内容>",
        "}": "</表記内容>)</表記内容G></補足表記>",
        "〈": "<言換G>〈<言換>",
        "/": "</言換>/<言換>",
        "〉": "</言換>〉</言換G>",
        "⦅": "<補足表記><表記対象>",
        "\": "</表記対象><表記内容G>⦅<表記内容>",
        "⦆": "</表記内容>⦆</表記内容G></補足表記>",
    }
    markup = f"<span>{pattern}</span>"
    for key, val in replacements.items():
        markup = markup.replace(key, val)
    soup = BeautifulSoup(markup, "xml")
    hyouki_soup = soup.find("span")
    exps = parse_hyouki_soup(hyouki_soup, [""])
    return exps

bot/entries/sankoku8/base_entry.py (new file, 97 lines)

@@ -0,0 +1,97 @@
import bot.soup as Soup
from bot.entries.base.sanseido_entry import SanseidoEntry
from bot.entries.sankoku8.parse import parse_hyouki_soup


class BaseEntry(SanseidoEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self._hyouki_name = "表記"
        self._midashi_name = None
        self._midashi_kana_name = None

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        readings = self._find_readings(soup)
        expressions = self._find_expressions(soup)
        headwords = {}
        for reading in readings:
            headwords[reading] = []
        if len(readings) == 1:
            reading = readings[0]
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                headwords[reading].append(reading)
            for exp in expressions:
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        elif len(readings) > 1 and len(expressions) == 0:
            for reading in readings:
                headwords[reading].append(reading)
        elif len(readings) > 1 and len(expressions) == 1:
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            expression = expressions[0]
            for reading in readings:
                if expression not in headwords[reading]:
                    headwords[reading].append(expression)
        elif len(readings) > 1 and len(expressions) == len(readings):
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            for idx, reading in enumerate(readings):
                exp = expressions[idx]
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        else:
            raise Exception()  # shouldn't happen
        return headwords

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        for midashi in soup.find_all([self._midashi_name, "見出部要素"]):
            pos_group = midashi.find("品詞G")
            if pos_group is None:
                continue
            for tag in pos_group.find_all("a"):
                if tag.text not in self._part_of_speech_tags:
                    self._part_of_speech_tags.append(tag.text)
        return self._part_of_speech_tags

    def _find_expressions(self, soup):
        expressions = []
        for hyouki in soup.find_all(self._hyouki_name):
            for expression in parse_hyouki_soup(hyouki, [""]):
                expressions.append(expression)
        return expressions

    def _find_readings(self, soup):
        midasi_kana = soup.find(self._midashi_kana_name)
        readings = parse_hyouki_soup(midasi_kana, [""])
        return readings

    def _get_subentry_parameters(self):
        from bot.entries.sankoku8.child_entry import ChildEntry
        from bot.entries.sankoku8.phrase_entry import PhraseEntry
        subentry_parameters = [
            [ChildEntry, ["子項目"], self.children],
            [PhraseEntry, ["句項目"], self.phrases],
        ]
        return subentry_parameters

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク",
            "アクセント分節", "活用分節", "ルビG", "分書"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)

bot/entries/sankoku8/child_entry.py (new file, 8 lines)

@@ -0,0 +1,8 @@
from bot.entries.sankoku8.base_entry import BaseEntry


class ChildEntry(BaseEntry):
    def __init__(self, target, page_id):
        super().__init__(target, page_id)
        self._midashi_name = "子見出部"
        self._midashi_kana_name = "子見出仮名"

bot/entries/sankoku8/entry.py (new file, 14 lines)

@@ -0,0 +1,14 @@
from bot.entries.sankoku8.base_entry import BaseEntry
from bot.entries.sankoku8.preprocess import preprocess_page


class Entry(BaseEntry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)
        self._midashi_name = "見出部"
        self._midashi_kana_name = "見出仮名"

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)

bot/entries/sankoku8/parse.py (new file, 65 lines)

@@ -0,0 +1,65 @@
from bs4 import BeautifulSoup


def parse_hyouki_soup(soup, base_exps):
    omitted_characters = [
        "/", "〈", "〉", "(", ")", "⦅", "⦆", ":", "…"
    ]
    exps = base_exps.copy()
    for child in soup.children:
        new_exps = []
        if child.name == "言換G":
            for alt in child.find_all("言換"):
                parts = parse_hyouki_soup(alt, [""])
                for exp in exps:
                    for part in parts:
                        new_exps.append(exp + part)
        elif child.name == "補足表記":
            alt1 = child.find("表記対象")
            alt2 = child.find("表記内容G")
            parts1 = parse_hyouki_soup(alt1, [""])
            parts2 = parse_hyouki_soup(alt2, [""])
            for exp in exps:
                for part in parts1:
                    new_exps.append(exp + part)
                for part in parts2:
                    new_exps.append(exp + part)
        elif child.name == "省略":
            parts = parse_hyouki_soup(child, [""])
            for exp in exps:
                new_exps.append(exp)
                for part in parts:
                    new_exps.append(exp + part)
        elif child.name is not None:
            new_exps = parse_hyouki_soup(child, exps)
        else:
            text = child.text
            for char in omitted_characters:
                text = text.replace(char, "")
            for exp in exps:
                new_exps.append(exp + text)
        exps = new_exps.copy()
    return exps


def parse_hyouki_pattern(pattern):
    replacements = {
        "(": "<省略>(",
        ")": ")</省略>",
        "{": "<補足表記><表記対象>",
        "・": "</表記対象><表記内容G>(<表記内容>",
        "}": "</表記内容>)</表記内容G></補足表記>",
        "〈": "<言換G>〈<言換>",
        "/": "</言換>/<言換>",
        "〉": "</言換>〉</言換G>",
        "⦅": "<補足表記><表記対象>",
        "\": "</表記対象><表記内容G>⦅<表記内容>",
        "⦆": "</表記内容>⦆</表記内容G></補足表記>",
    }
    markup = f"<span>{pattern}</span>"
    for key, val in replacements.items():
        markup = markup.replace(key, val)
    soup = BeautifulSoup(markup, "xml")
    hyouki_soup = soup.find("span")
    exps = parse_hyouki_soup(hyouki_soup, [""])
    return exps
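
`parse_hyouki_pattern` bootstraps on `parse_hyouki_soup`: the plain-text reading pattern is rewritten into the same XML vocabulary the dictionary itself uses, then parsed like any other 表記 element. Hand-traced examples with made-up patterns (fullwidth `/` and parentheses, matching the replacement table above):

    parse_hyouki_pattern("〈住/棲〉み分け")
    # markup becomes:
    #   <span><言換G>〈<言換>住</言換>/<言換>棲</言換>〉</言換G>み分け</span>
    # -> ["住み分け", "棲み分け"]

    parse_hyouki_pattern("あいまい(な)")
    # 省略 marks an optional tail:
    # -> ["あいまい", "あいまいな"]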

bot/entries/sankoku8/phrase_entry.py (new file, 37 lines)

@@ -0,0 +1,37 @@
from bot.data import load_phrase_readings
from bot.entries.sankoku8.base_entry import BaseEntry
from bot.entries.sankoku8.parse import parse_hyouki_soup
from bot.entries.sankoku8.parse import parse_hyouki_pattern


class PhraseEntry(BaseEntry):
    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        expressions = self._find_expressions(soup)
        readings = self._find_readings(soup)
        headwords = {}
        if len(expressions) != len(readings):
            raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}")
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        phrase_soup = soup.find("句表記")
        expressions = parse_hyouki_soup(phrase_soup, [""])
        return expressions

    def _find_readings(self, soup):
        reading_patterns = load_phrase_readings(self.target)
        reading_pattern = reading_patterns[self.entry_id]
        readings = parse_hyouki_pattern(reading_pattern)
        return readings

bot/entries/smk8.py (deleted, 221 lines)

@@ -1,221 +0,0 @@
from bs4 import BeautifulSoup

import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.data import load_phrase_readings
from bot.entries.entry import Entry
from bot.entries.smk8_preprocess import preprocess_page


class _BaseSmk8Entry(Entry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self.kanjis = []

    def get_global_identifier(self):
        parent_part = format(self.entry_id[0], '06')
        child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
        return f"@{self.target.value}-{parent_part}-{child_part}"

    def set_page(self, page):
        page = self.__decompose_subentries(page)
        self._page = page

    def get_page_soup(self):
        soup = BeautifulSoup(self._page, "xml")
        return soup

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        headword_info = soup.find("見出要素")
        if headword_info is None:
            return self._part_of_speech_tags
        for tag in headword_info.find_all("品詞M"):
            if tag.text not in self._part_of_speech_tags:
                self._part_of_speech_tags.append(tag.text)
        return self._part_of_speech_tags

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    def _find_reading(self, soup):
        midasi_kana = soup.find("見出仮名")
        reading = midasi_kana.text
        for x in [" ", "・"]:
            reading = reading.replace(x, "")
        return reading

    def _find_expressions(self, soup):
        clean_expressions = []
        for expression in soup.find_all("標準表記"):
            clean_expression = self._clean_expression(expression.text)
            clean_expressions.append(clean_expression)
        expressions = Expressions.expand_abbreviation_list(clean_expressions)
        return expressions

    def __decompose_subentries(self, page):
        soup = BeautifulSoup(page, features="xml")
        subentry_parameters = [
            [Smk8ChildEntry, ["子項目F", "子項目"], self.children],
            [Smk8PhraseEntry, ["句項目F", "句項目"], self.phrases],
            [Smk8KanjiEntry, ["造語成分項目"], self.kanjis],
        ]
        for x in subentry_parameters:
            subentry_class, tags, subentry_list = x
            for tag in tags:
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(self.target, subentry_id)
                    page = tag_soup.decode()
                    subentry.set_page(page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @staticmethod
    def id_string_to_entry_id(id_string):
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        elif len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        else:
            raise Exception(f"Invalid entry ID: {id_string}")

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "表音表記", "表外音訓マーク", "表外字マーク", "ルビG"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)

    @staticmethod
    def _clean_expression(expression):
        for x in ["〈", "〉", "{", "}", "…", " "]:
            expression = expression.replace(x, "")
        return expression

    @staticmethod
    def _fill_alts(soup):
        for el in soup.find_all(["親見出仮名", "親見出表記"]):
            el.string = el.attrs["alt"]
        for gaiji in soup.find_all("外字"):
            gaiji.string = gaiji.img.attrs["alt"]


class Smk8Entry(_BaseSmk8Entry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        reading = self._find_reading(soup)
        expressions = []
        if soup.find("見出部").find("標準表記") is None:
            expressions.append(reading)
        for expression in self._find_expressions(soup):
            if expression not in expressions:
                expressions.append(expression)
        headwords = {reading: expressions}
        return headwords


class Smk8ChildEntry(_BaseSmk8Entry):
    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        reading = self._find_reading(soup)
        expressions = []
        if soup.find("子見出部").find("標準表記") is None:
            expressions.append(reading)
        for expression in self._find_expressions(soup):
            if expression not in expressions:
                expressions.append(expression)
        headwords = {reading: expressions}
        return headwords


class Smk8PhraseEntry(_BaseSmk8Entry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.__phrase_readings = load_phrase_readings(self.target)

    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        headwords = {}
        expressions = self._find_expressions(soup)
        readings = self._find_readings()
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        text = soup.find("標準表記").text
        text = self._clean_expression(text)
        alternatives = Expressions.expand_smk_alternatives(text)
        expressions = []
        for alt in alternatives:
            for exp in Expressions.expand_abbreviation(alt):
                expressions.append(exp)
        return expressions

    def _find_readings(self):
        text = self.__phrase_readings[self.entry_id]
        alternatives = Expressions.expand_smk_alternatives(text)
        readings = []
        for alt in alternatives:
            for reading in Expressions.expand_abbreviation(alt):
                readings.append(reading)
        return readings


class Smk8KanjiEntry(_BaseSmk8Entry):
    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        reading = self.__get_parent_reading()
        expressions = self._find_expressions(soup)
        headwords = {reading: expressions}
        return headwords

    def __get_parent_reading(self):
        parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
        parent = self.ID_TO_ENTRY[parent_id]
        reading = parent.get_first_reading()
        return reading

bot/entries/smk8/base_entry.py (new file, 73 lines)

@@ -0,0 +1,73 @@
import bot.soup as Soup
import bot.entries.base.expressions as Expressions
from bot.entries.base.sanseido_entry import SanseidoEntry


class BaseEntry(SanseidoEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self.kanjis = []

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        headword_info = soup.find("見出要素")
        if headword_info is None:
            return self._part_of_speech_tags
        for tag in headword_info.find_all("品詞M"):
            if tag.text not in self._part_of_speech_tags:
                self._part_of_speech_tags.append(tag.text)
        return self._part_of_speech_tags

    def _find_reading(self, soup):
        midasi_kana = soup.find("見出仮名")
        reading = midasi_kana.text
        for x in [" ", "・"]:
            reading = reading.replace(x, "")
        return reading

    def _find_expressions(self, soup):
        clean_expressions = []
        for expression in soup.find_all("標準表記"):
            clean_expression = self._clean_expression(expression.text)
            clean_expressions.append(clean_expression)
        expressions = Expressions.expand_abbreviation_list(clean_expressions)
        return expressions

    def _get_subentry_parameters(self):
        from bot.entries.smk8.child_entry import ChildEntry
        from bot.entries.smk8.phrase_entry import PhraseEntry
        from bot.entries.smk8.kanji_entry import KanjiEntry
        subentry_parameters = [
            [ChildEntry, ["子項目F", "子項目"], self.children],
            [PhraseEntry, ["句項目F", "句項目"], self.phrases],
            [KanjiEntry, ["造語成分項目"], self.kanjis],
        ]
        return subentry_parameters

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "表音表記", "表外音訓マーク", "表外字マーク", "ルビG"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)

    @staticmethod
    def _clean_expression(expression):
        for x in ["〈", "〉", "{", "}", "…", " "]:
            expression = expression.replace(x, "")
        return expression

    @staticmethod
    def _fill_alts(soup):
        for elm in soup.find_all(["親見出仮名", "親見出表記"]):
            elm.string = elm.attrs["alt"]
        for gaiji in soup.find_all("外字"):
            gaiji.string = gaiji.img.attrs["alt"]
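
`_fill_alts` recovers gaiji (characters rendered as inline images) from their `alt` attributes before the headword text is read. A self-contained demonstration of the same bs4 manipulation on a made-up fragment, using the geta mark "〓" as a stand-in glyph:

    from bs4 import BeautifulSoup

    fragment = '<見出仮名>か<外字><img alt="〓"/></外字>く</見出仮名>'
    soup = BeautifulSoup(fragment, "xml")
    for gaiji in soup.find_all("外字"):
        gaiji.string = gaiji.img.attrs["alt"]
    soup.find("見出仮名").text
    # -> "か〓く"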

bot/entries/smk8/child_entry.py (new file, 17 lines)

@@ -0,0 +1,17 @@
from bot.entries.smk8.base_entry import BaseEntry
|
||||
|
||||
|
||||
class ChildEntry(BaseEntry):
|
||||
def _get_headwords(self):
|
||||
soup = self.get_page_soup()
|
||||
self._delete_unused_nodes(soup)
|
||||
self._fill_alts(soup)
|
||||
reading = self._find_reading(soup)
|
||||
expressions = []
|
||||
if soup.find("子見出部").find("標準表記") is None:
|
||||
expressions.append(reading)
|
||||
for expression in self._find_expressions(soup):
|
||||
if expression not in expressions:
|
||||
expressions.append(expression)
|
||||
headwords = {reading: expressions}
|
||||
return headwords
|
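The branch above guards kana-only child entries: when the 子見出部 carries no 標準表記 spelling, the reading itself doubles as the expression, so the headword map is never empty. A sketch with hypothetical values:

# Hypothetical values; mirrors the fallback in ChildEntry._get_headwords.
reading = "がてら"   # from the 見出仮名 element
expressions = []     # the 子見出部 had no 標準表記
if not expressions:
    expressions.append(reading)
assert {reading: expressions} == {"がてら": ["がてら"]}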
26 bot/entries/smk8/entry.py Normal file
@@ -0,0 +1,26 @@
from bot.entries.smk8.base_entry import BaseEntry
from bot.entries.smk8.preprocess import preprocess_page


class Entry(BaseEntry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        reading = self._find_reading(soup)
        expressions = []
        if soup.find("見出部").find("標準表記") is None:
            expressions.append(reading)
        for expression in self._find_expressions(soup):
            if expression not in expressions:
                expressions.append(expression)
        headwords = {reading: expressions}
        return headwords
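Worth noting from __init__ above: a page's top-level entry is keyed by the pair (page_id, 0). That subentries receive other sub-indices is an inference, not shown in this file; the lines below are purely illustrative.

# Hypothetical illustration of the (page_id, subindex) entry-id scheme.
page_id = 4083              # made-up page number
entry_id = (page_id, 0)     # the page's top-level entry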
22 bot/entries/smk8/kanji_entry.py Normal file
@@ -0,0 +1,22 @@
from bot.entries.smk8.base_entry import BaseEntry


class KanjiEntry(BaseEntry):
    def get_part_of_speech_tags(self):
        # kanji entries do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        reading = self.__get_parent_reading()
        expressions = self._find_expressions(soup)
        headwords = {reading: expressions}
        return headwords

    def __get_parent_reading(self):
        parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
        parent = self.ID_TO_ENTRY[parent_id]
        reading = parent.get_first_reading()
        return reading
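KanjiEntry borrows its reading from its parent entry via two lookup tables. How those registries are populated lives elsewhere in the repo, so the snippet below only illustrates the assumed data flow with made-up values.

# Made-up registry contents illustrating __get_parent_reading's lookups.
class FakeParent:
    def get_first_reading(self):
        return "かん"

SUBENTRY_ID_TO_ENTRY_ID = {(4083, 1): (4083, 0)}   # subentry id -> parent id
ID_TO_ENTRY = {(4083, 0): FakeParent()}            # parent id -> entry object

parent_id = SUBENTRY_ID_TO_ENTRY_ID[(4083, 1)]
print(ID_TO_ENTRY[parent_id].get_first_reading())  # -> かん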
64 bot/entries/smk8/phrase_entry.py Normal file
@@ -0,0 +1,64 @@
import re

import bot.entries.base.expressions as Expressions
from bot.data import load_phrase_readings
from bot.entries.smk8.base_entry import BaseEntry


class PhraseEntry(BaseEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.__phrase_readings = load_phrase_readings(self.target)

    def get_part_of_speech_tags(self):
        # phrase entries do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        headwords = {}
        expressions = self._find_expressions(soup)
        readings = self._find_readings()
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        text = soup.find("標準表記").text
        text = self._clean_expression(text)
        alternatives = parse_phrase(text)
        expressions = []
        for alt in alternatives:
            for exp in Expressions.expand_abbreviation(alt):
                expressions.append(exp)
        return expressions

    def _find_readings(self):
        text = self.__phrase_readings[self.entry_id]
        alternatives = parse_phrase(text)
        readings = []
        for alt in alternatives:
            for reading in Expressions.expand_abbreviation(alt):
                readings.append(reading)
        return readings


def parse_phrase(text):
    """Return a list of strings described by △ notation."""
    match = re.search(r"△([^（]+)（([^）]+)）", text)
    if match is None:
        return [text]
    alt_parts = [match.group(1)]
    for alt_part in match.group(2).split("・"):
        alt_parts.append(alt_part)
    alts = []
    for alt_part in alt_parts:
        alt_exp = re.sub(r"△[^（]+（[^）]+）", alt_part, text)
        alts.append(alt_exp)
    return alts
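A quick usage sketch of parse_phrase; the same cases are exercised by tests/test_smk_phrases.py further down, and running it assumes the repository root is on PYTHONPATH. The notation names one default part before the full-width parentheses plus ・-separated alternatives inside them.

from bot.entries.smk8.phrase_entry import parse_phrase

print(parse_phrase("目と鼻の△先（間）"))
# -> ['目と鼻の先', '目と鼻の間']
print(parse_phrase("△金（時間・暇）に飽かして"))
# -> ['金に飽かして', '時間に飽かして', '暇に飽かして']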
@@ -1,4 +1,4 @@
-from bot.entries.daijirin2 import Daijirin2PhraseEntry as PhraseEntry
+from bot.entries.daijirin2.phrase_entry import PhraseEntry

from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.daijirin2 import make_glossary
@@ -6,9 +6,6 @@ from bot.yomichan.grammar import sudachi_rules, tags_to_rules


class Daijirin2Terminator(Terminator):
-    def __init__(self, target):
-        super().__init__(target)
-
    def _definition_tags(self, entry):
        return ""
@@ -1,4 +1,4 @@
-from bot.entries.sankoku8 import Sankoku8PhraseEntry as PhraseEntry
+from bot.entries.sankoku8.phrase_entry import PhraseEntry

from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.sankoku8 import make_glossary
@@ -6,9 +6,6 @@ from bot.yomichan.grammar import sudachi_rules, tags_to_rules


class Sankoku8Terminator(Terminator):
-    def __init__(self, target):
-        super().__init__(target)
-
    def _definition_tags(self, entry):
        return ""
@@ -1,5 +1,5 @@
-from bot.entries.smk8 import Smk8KanjiEntry as KanjiEntry
-from bot.entries.smk8 import Smk8PhraseEntry as PhraseEntry
+from bot.entries.smk8.kanji_entry import KanjiEntry
+from bot.entries.smk8.phrase_entry import PhraseEntry

from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.smk8 import make_glossary
21 tests/test_daijirin_phrases.py Normal file
@@ -0,0 +1,21 @@
import unittest
from bot.entries.daijirin2.phrase_entry import parse_phrase


class TestDaijirin2PhraseParse(unittest.TestCase):
    def test1(self):
        text = "同じ穴の＝狢（＝狐・狸）"
        exps = parse_phrase(text)
        self.assertEqual(len(exps), 3)
        self.assertIn("同じ穴の狢", exps)
        self.assertIn("同じ穴の狐", exps)
        self.assertIn("同じ穴の狸", exps)

    def test2(self):
        text = "聞くは＝一時（＝一旦）の恥、聞かぬは＝末代（＝一生）の恥"
        exps = parse_phrase(text)
        self.assertEqual(len(exps), 4)
        self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps)
        self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps)
        self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps)
        self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps)
@@ -1,5 +1,5 @@
import unittest
-import bot.entries.expressions as Expressions
+import bot.entries.base.expressions as Expressions


class TestExpressions(unittest.TestCase):
@@ -69,28 +69,3 @@ class TestExpressions(unittest.TestCase):
        self.assertIn("有合わせ", abbrs)
        self.assertIn("有り合せ", abbrs)
        self.assertIn("有合せ", abbrs)
-
-    def test_smk_expand_alternatives(self):
-        text = "△金（時間・暇）に飽かして"
-        exps = Expressions.expand_smk_alternatives(text)
-        self.assertEqual(len(exps), 3)
-        self.assertIn("金に飽かして", exps)
-        self.assertIn("時間に飽かして", exps)
-        self.assertIn("暇に飽かして", exps)
-
-    def test_daijirin_expand_alternatives(self):
-        text = "同じ穴の＝狢（＝狐・狸）"
-        exps = Expressions.expand_daijirin_alternatives(text)
-        self.assertEqual(len(exps), 3)
-        self.assertIn("同じ穴の狢", exps)
-        self.assertIn("同じ穴の狐", exps)
-        self.assertIn("同じ穴の狸", exps)
-
-    def test_daijirin_expand_alternatives2(self):
-        text = "聞くは＝一時（＝一旦）の恥、聞かぬは＝末代（＝一生）の恥"
-        exps = Expressions.expand_daijirin_alternatives(text)
-        self.assertEqual(len(exps), 4)
-        self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps)
-        self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps)
-        self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps)
-        self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps)
@@ -1,16 +1,16 @@
import unittest
-from bot.entries.sankoku8 import parse_hyouki_pattern
+from bot.entries.sankoku8.parse import parse_hyouki_pattern


-class TestSankokuPhrases(unittest.TestCase):
-    def test_sankoku_phrases1(self):
+class TestSankoku8PhraseParse(unittest.TestCase):
+    def test1(self):
        pattern = '耳にたこ（ができる）'
        exps = parse_hyouki_pattern(pattern)
        self.assertEqual(len(exps), 2)
        self.assertIn("耳にたこ", exps)
        self.assertIn("耳にたこができる", exps)

-    def test_sankoku_phrases2(self):
+    def test2(self):
        pattern = '一斑を〈見て/もって〉全豹を〈卜す/推す〉'
        exps = parse_hyouki_pattern(pattern)
        self.assertEqual(len(exps), 4)
@@ -19,14 +19,14 @@ class TestSankokuPhrases(unittest.TestCase):
        self.assertIn("一斑をもって全豹を卜す", exps)
        self.assertIn("一斑をもって全豹を推す", exps)

-    def test_sankoku_phrases3(self):
+    def test3(self):
        pattern = '{かじ・舵}を切る'
        exps = parse_hyouki_pattern(pattern)
        self.assertEqual(len(exps), 2)
        self.assertIn("かじを切る", exps)
        self.assertIn("舵を切る", exps)

-    def test_sankoku_phrases4(self):
+    def test4(self):
        pattern = '重箱の隅を（⦅ようじ\楊枝⦆で）〈つつく/ほじくる〉'
        exps = parse_hyouki_pattern(pattern)
        self.assertEqual(len(exps), 6)
@@ -37,7 +37,7 @@ class TestSankokuPhrases(unittest.TestCase):
        self.assertIn("重箱の隅をようじでほじくる", exps)
        self.assertIn("重箱の隅を楊枝でほじくる", exps)

-    def test_sankoku_phrases5(self):
+    def test5(self):
        pattern = '群盲象を〈{な・撫}でる/評する〉'
        exps = parse_hyouki_pattern(pattern)
        self.assertEqual(len(exps), 3)
19 tests/test_smk_phrases.py Normal file
@@ -0,0 +1,19 @@
import unittest
from bot.entries.smk8.phrase_entry import parse_phrase


class TestSmk8PhraseParse(unittest.TestCase):
    def test1(self):
        text = "目と鼻の△先（間）"
        exps = parse_phrase(text)
        self.assertEqual(len(exps), 2)
        self.assertIn("目と鼻の先", exps)
        self.assertIn("目と鼻の間", exps)

    def test2(self):
        text = "△金（時間・暇）に飽かして"
        exps = parse_phrase(text)
        self.assertEqual(len(exps), 3)
        self.assertIn("金に飽かして", exps)
        self.assertIn("時間に飽かして", exps)
        self.assertIn("暇に飽かして", exps)