Reorganize file structure of entries modules

stephenmk 2023-07-26 19:28:50 -05:00
parent 0cd530585f
commit 9b3fdc86d1
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
35 changed files with 863 additions and 913 deletions

View file

@@ -85,40 +85,3 @@ def expand_abbreviation_list(expressions):
if new_exp not in new_exps:
new_exps.append(new_exp)
return new_exps
def expand_smk_alternatives(text):
"""Return a list of strings described by △ notation."""
m = re.search(r"△([^]+)([^]+)", text)
if m is None:
return [text]
alt_parts = [m.group(1)]
for alt_part in m.group(2).split("・"):
alt_parts.append(alt_part)
alts = []
for alt_part in alt_parts:
alt_exp = re.sub(r"△[^]+[^]+", alt_part, text)
alts.append(alt_exp)
return alts
def expand_daijirin_alternatives(text):
"""Return a list of strings described by notation."""
group_pattern = r"([^]+)(([^]+)([^]+))?"
groups = re.findall(group_pattern, text)
expressions = [""]
for group in groups:
new_exps = []
for expression in expressions:
new_exps.append(expression + group[0])
expressions = new_exps.copy()
if group[1] == "":
continue
new_exps = []
for expression in expressions:
new_exps.append(expression + group[2])
for expression in expressions:
for alt in group[3].split("・"):
new_exps.append(expression + alt)
expressions = new_exps.copy()
return expressions

View file

@@ -3,11 +3,11 @@ from abc import abstractmethod
from datetime import datetime, date
from bs4 import BeautifulSoup
from bot.entries.entry import Entry
import bot.entries.expressions as Expressions
from bot.entries.base.entry import Entry
import bot.entries.base.expressions as Expressions
class _JitenonEntry(Entry):
class JitenonEntry(Entry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.expression = ""
@@ -140,104 +140,3 @@ class _JitenonEntry(Entry):
elif isinstance(attr_val, list):
colvals.append("".join(attr_val))
return ",".join(colvals)
class JitenonYojiEntry(_JitenonEntry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.origin = ""
self.kanken_level = ""
self.category = ""
self.related_expressions = []
def _get_column_map(self):
return {
"四字熟語": "expression",
"読み方": "yomikata",
"意味": "definition",
"異形": "other_forms",
"出典": "origin",
"漢検級": "kanken_level",
"場面用途": "category",
"類義語": "related_expressions",
}
def _add_variant_expressions(self, headwords):
for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
class JitenonKotowazaEntry(_JitenonEntry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.origin = ""
self.example = ""
self.related_expressions = []
def _get_column_map(self):
return {
"言葉": "expression",
"読み方": "yomikata",
"意味": "definition",
"異形": "other_forms",
"出典": "origin",
"例文": "example",
"類句": "related_expressions",
}
def _get_headwords(self):
if self.expression == "金棒引き・鉄棒引き":
headwords = {
"かなぼうひき": ["金棒引き", "鉄棒引き"]
}
else:
headwords = super()._get_headwords()
return headwords
def _add_variant_expressions(self, headwords):
for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
class JitenonKokugoEntry(_JitenonEntry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.example = ""
self.alt_expression = ""
self.antonym = ""
self.attachments = ""
self.compounds = ""
self.related_words = ""
def _get_column_map(self):
return {
"言葉": "expression",
"読み方": "yomikata",
"意味": "definition",
"例文": "example",
"別表記": "alt_expression",
"対義語": "antonym",
"活用": "attachments",
"用例": "compounds",
"類語": "related_words",
}
def _get_headwords(self):
headwords = {}
for reading in self.yomikata.split("・"):
if reading not in headwords:
headwords[reading] = []
for expression in self.expression.split("・"):
headwords[reading].append(expression)
if self.alt_expression.strip() != "":
for expression in self.alt_expression.split("・"):
headwords[reading].append(expression)
return headwords
def _add_variant_expressions(self, headwords):
for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
Expressions.add_iteration_mark(expressions)

View file

@@ -0,0 +1,60 @@
from abc import abstractmethod
from bs4 import BeautifulSoup
from bot.entries.base.entry import Entry
import bot.entries.base.expressions as Expressions
class SanseidoEntry(Entry):
def set_page(self, page):
page = self._decompose_subentries(page)
self._page = page
def get_page_soup(self):
soup = BeautifulSoup(self._page, "xml")
return soup
def get_global_identifier(self):
parent_part = format(self.entry_id[0], '06')
child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
return f"@{self.target.value}-{parent_part}-{child_part}"
def _decompose_subentries(self, page):
soup = BeautifulSoup(page, features="xml")
for x in self._get_subentry_parameters():
subentry_class, tags, subentry_list = x
for tag in tags:
tag_soup = soup.find(tag)
while tag_soup is not None:
tag_soup.name = "項目"
subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
subentry = subentry_class(self.target, subentry_id)
page = tag_soup.decode()
subentry.set_page(page)
subentry_list.append(subentry)
tag_soup.decompose()
tag_soup = soup.find(tag)
return soup.decode()
@abstractmethod
def _get_subentry_parameters(self):
pass
def _add_variant_expressions(self, headwords):
for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
Expressions.add_iteration_mark(expressions)
@staticmethod
def id_string_to_entry_id(id_string):
parts = id_string.split("-")
if len(parts) == 1:
return (int(parts[0]), 0)
elif len(parts) == 2:
# subentries have a hexadecimal part
return (int(parts[0]), int(parts[1], 16))
else:
raise Exception(f"Invalid entry ID: {id_string}")

View file

@@ -1,231 +0,0 @@
from bs4 import BeautifulSoup
import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.data import load_phrase_readings
from bot.data import load_daijirin2_kana_abbreviations
from bot.entries.entry import Entry
from bot.entries.daijirin2_preprocess import preprocess_page
class _BaseDaijirin2Entry(Entry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.children = []
self.phrases = []
self._kana_abbreviations = load_daijirin2_kana_abbreviations()
def get_global_identifier(self):
parent_part = format(self.entry_id[0], '06')
child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
return f"@{self.target.value}-{parent_part}-{child_part}"
def set_page(self, page):
page = self.__decompose_subentries(page)
self._page = page
def get_page_soup(self):
soup = BeautifulSoup(self._page, "xml")
return soup
def get_part_of_speech_tags(self):
if self._part_of_speech_tags is not None:
return self._part_of_speech_tags
self._part_of_speech_tags = []
soup = self.get_page_soup()
for pos_group in soup.find_all("品詞G"):
if pos_group.parent.name == "大語義":
self._set_part_of_speech_tags(pos_group)
return self._part_of_speech_tags
def _set_part_of_speech_tags(self, el):
pos_names = ["品詞", "品詞活用", "品詞行", "用法"]
for child in el.children:
if child.name is not None:
self._set_part_of_speech_tags(child)
continue
pos = str(child)
if el.name not in pos_names:
continue
elif pos in ["（", "）"]:
continue
elif pos in self._part_of_speech_tags:
continue
else:
self._part_of_speech_tags.append(pos)
def _get_regular_headwords(self, soup):
self._fill_alts(soup)
reading = soup.find("見出仮名").text
expressions = []
for el in soup.find_all("標準表記"):
expression = self._clean_expression(el.text)
if "" in expression:
kana_abbrs = self._kana_abbreviations[self.entry_id]
for abbr in kana_abbrs:
expression = expression.replace("", abbr, 1)
expressions.append(expression)
expressions = Expressions.expand_abbreviation_list(expressions)
if len(expressions) == 0:
expressions.append(reading)
headwords = {reading: expressions}
return headwords
def _add_variant_expressions(self, headwords):
for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
Expressions.add_iteration_mark(expressions)
def __decompose_subentries(self, page):
soup = BeautifulSoup(page, features="xml")
subentry_parameters = [
[Daijirin2ChildEntry, ["子項目"], self.children],
[Daijirin2PhraseEntry, ["句項目"], self.phrases],
]
for x in subentry_parameters:
subentry_class, tags, subentry_list = x
for tag in tags:
tag_soup = soup.find(tag)
while tag_soup is not None:
tag_soup.name = "項目"
subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
subentry = subentry_class(self.target, subentry_id)
page = tag_soup.decode()
subentry.set_page(page)
subentry_list.append(subentry)
tag_soup.decompose()
tag_soup = soup.find(tag)
return soup.decode()
@staticmethod
def id_string_to_entry_id(id_string):
parts = id_string.split("-")
if len(parts) == 1:
return (int(parts[0]), 0)
elif len(parts) == 2:
# subentries have a hexadecimal part
return (int(parts[0]), int(parts[1], 16))
else:
raise Exception(f"Invalid entry ID: {id_string}")
@staticmethod
def _delete_unused_nodes(soup):
"""Remove extra markup elements that appear in the entry
headword line which are not part of the entry headword"""
unused_nodes = [
"漢字音logo", "活用分節", "連語句活用分節", "語構成",
"表外字マーク", "表外字マーク", "ルビG"
]
for name in unused_nodes:
Soup.delete_soup_nodes(soup, name)
@staticmethod
def _clean_expression(expression):
for x in ["", "", "", "", " "]:
expression = expression.replace(x, "")
return expression
@staticmethod
def _fill_alts(soup):
for gaiji in soup.find_all(class_="gaiji"):
if gaiji.name == "img" and gaiji.has_attr("alt"):
gaiji.name = "span"
gaiji.string = gaiji.attrs["alt"]
class Daijirin2Entry(_BaseDaijirin2Entry):
def __init__(self, target, page_id):
entry_id = (page_id, 0)
super().__init__(target, entry_id)
def set_page(self, page):
page = preprocess_page(page)
super().set_page(page)
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
if soup.find("漢字見出") is not None:
headwords = self._get_kanji_headwords(soup)
elif soup.find("略語G") is not None:
headwords = self._get_acronym_headwords(soup)
else:
headwords = self._get_regular_headwords(soup)
return headwords
def _get_kanji_headwords(self, soup):
readings = []
for el in soup.find_all("漢字音"):
hira = Expressions.kata_to_hira(el.text)
readings.append(hira)
if soup.find("漢字音") is None:
readings.append("")
expressions = []
for el in soup.find_all("漢字見出"):
expressions.append(el.text)
headwords = {}
for reading in readings:
headwords[reading] = expressions
return headwords
def _get_acronym_headwords(self, soup):
expressions = []
for el in soup.find_all("略語"):
expression_parts = []
for part in el.find_all(["欧字", "和字"]):
expression_parts.append(part.text)
expression = "".join(expression_parts)
expressions.append(expression)
headwords = {"": expressions}
return headwords
class Daijirin2ChildEntry(_BaseDaijirin2Entry):
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
headwords = self._get_regular_headwords(soup)
return headwords
class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
def get_part_of_speech_tags(self):
# phrases do not contain these tags
return []
def _get_headwords(self):
soup = self.get_page_soup()
headwords = {}
expressions = self._find_expressions(soup)
readings = self._find_readings()
for idx, expression in enumerate(expressions):
reading = readings[idx]
if reading in headwords:
headwords[reading].append(expression)
else:
headwords[reading] = [expression]
return headwords
def _find_expressions(self, soup):
self._delete_unused_nodes(soup)
text = soup.find("句表記").text
text = self._clean_expression(text)
alternatives = Expressions.expand_daijirin_alternatives(text)
expressions = []
for alt in alternatives:
for exp in Expressions.expand_abbreviation(alt):
expressions.append(exp)
return expressions
def _find_readings(self):
phrase_readings = load_phrase_readings(self.target)
text = phrase_readings[self.entry_id]
alternatives = Expressions.expand_daijirin_alternatives(text)
readings = []
for alt in alternatives:
for reading in Expressions.expand_abbreviation(alt):
readings.append(reading)
return readings

View file

@@ -0,0 +1,88 @@
import bot.soup as Soup
from bot.data import load_daijirin2_kana_abbreviations
from bot.entries.base.sanseido_entry import SanseidoEntry
import bot.entries.base.expressions as Expressions
class BaseEntry(SanseidoEntry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.children = []
self.phrases = []
self._kana_abbreviations = load_daijirin2_kana_abbreviations()
def get_part_of_speech_tags(self):
if self._part_of_speech_tags is not None:
return self._part_of_speech_tags
self._part_of_speech_tags = []
soup = self.get_page_soup()
for pos_group in soup.find_all("品詞G"):
if pos_group.parent.name == "大語義":
self._set_part_of_speech_tags(pos_group)
return self._part_of_speech_tags
def _set_part_of_speech_tags(self, el):
pos_names = ["品詞", "品詞活用", "品詞行", "用法"]
for child in el.children:
if child.name is not None:
self._set_part_of_speech_tags(child)
continue
pos = str(child)
if el.name not in pos_names:
continue
elif pos in ["（", "）"]:
continue
elif pos in self._part_of_speech_tags:
continue
else:
self._part_of_speech_tags.append(pos)
def _get_regular_headwords(self, soup):
self._fill_alts(soup)
reading = soup.find("見出仮名").text
expressions = []
for el in soup.find_all("標準表記"):
expression = self._clean_expression(el.text)
if "" in expression:
kana_abbrs = self._kana_abbreviations[self.entry_id]
for abbr in kana_abbrs:
expression = expression.replace("", abbr, 1)
expressions.append(expression)
expressions = Expressions.expand_abbreviation_list(expressions)
if len(expressions) == 0:
expressions.append(reading)
headwords = {reading: expressions}
return headwords
def _get_subentry_parameters(self):
from bot.entries.daijirin2.child_entry import ChildEntry
from bot.entries.daijirin2.phrase_entry import PhraseEntry
subentry_parameters = [
[ChildEntry, ["子項目"], self.children],
[PhraseEntry, ["句項目"], self.phrases],
]
return subentry_parameters
@staticmethod
def _delete_unused_nodes(soup):
"""Remove extra markup elements that appear in the entry
headword line which are not part of the entry headword"""
unused_nodes = [
"漢字音logo", "活用分節", "連語句活用分節", "語構成",
"表外字マーク", "表外字マーク", "ルビG"
]
for name in unused_nodes:
Soup.delete_soup_nodes(soup, name)
@staticmethod
def _clean_expression(expression):
for x in ["", "", "", "", " "]:
expression = expression.replace(x, "")
return expression
@staticmethod
def _fill_alts(soup):
for gaiji in soup.find_all(class_="gaiji"):
if gaiji.name == "img" and gaiji.has_attr("alt"):
gaiji.name = "span"
gaiji.string = gaiji.attrs["alt"]

View file

@@ -0,0 +1,9 @@
from bot.entries.daijirin2.base_entry import BaseEntry
class ChildEntry(BaseEntry):
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
headwords = self._get_regular_headwords(soup)
return headwords

View file

@@ -0,0 +1,50 @@
import bot.entries.base.expressions as Expressions
from bot.entries.daijirin2.base_entry import BaseEntry
from bot.entries.daijirin2.preprocess import preprocess_page
class Entry(BaseEntry):
def __init__(self, target, page_id):
entry_id = (page_id, 0)
super().__init__(target, entry_id)
def set_page(self, page):
page = preprocess_page(page)
super().set_page(page)
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
if soup.find("漢字見出") is not None:
headwords = self._get_kanji_headwords(soup)
elif soup.find("略語G") is not None:
headwords = self._get_acronym_headwords(soup)
else:
headwords = self._get_regular_headwords(soup)
return headwords
def _get_kanji_headwords(self, soup):
readings = []
for el in soup.find_all("漢字音"):
hira = Expressions.kata_to_hira(el.text)
readings.append(hira)
if soup.find("漢字音") is None:
readings.append("")
expressions = []
for el in soup.find_all("漢字見出"):
expressions.append(el.text)
headwords = {}
for reading in readings:
headwords[reading] = expressions
return headwords
def _get_acronym_headwords(self, soup):
expressions = []
for el in soup.find_all("略語"):
expression_parts = []
for part in el.find_all(["欧字", "和字"]):
expression_parts.append(part.text)
expression = "".join(expression_parts)
expressions.append(expression)
headwords = {"": expressions}
return headwords

View file

@@ -0,0 +1,67 @@
import re
import bot.entries.base.expressions as Expressions
from bot.data import load_phrase_readings
from bot.entries.daijirin2.base_entry import BaseEntry
class PhraseEntry(BaseEntry):
def get_part_of_speech_tags(self):
# phrases do not contain these tags
return []
def _get_headwords(self):
soup = self.get_page_soup()
headwords = {}
expressions = self._find_expressions(soup)
readings = self._find_readings()
for idx, expression in enumerate(expressions):
reading = readings[idx]
if reading in headwords:
headwords[reading].append(expression)
else:
headwords[reading] = [expression]
return headwords
def _find_expressions(self, soup):
self._delete_unused_nodes(soup)
text = soup.find("句表記").text
text = self._clean_expression(text)
alternatives = parse_phrase(text)
expressions = []
for alt in alternatives:
for exp in Expressions.expand_abbreviation(alt):
expressions.append(exp)
return expressions
def _find_readings(self):
phrase_readings = load_phrase_readings(self.target)
text = phrase_readings[self.entry_id]
alternatives = parse_phrase(text)
readings = []
for alt in alternatives:
for reading in Expressions.expand_abbreviation(alt):
readings.append(reading)
return readings
def parse_phrase(text):
"""Return a list of strings described by notation."""
group_pattern = r"([^]+)(([^]+)([^]+))?"
groups = re.findall(group_pattern, text)
expressions = [""]
for group in groups:
new_exps = []
for expression in expressions:
new_exps.append(expression + group[0])
expressions = new_exps.copy()
if group[1] == "":
continue
new_exps = []
for expression in expressions:
new_exps.append(expression + group[2])
for expression in expressions:
for alt in group[3].split("・"):
new_exps.append(expression + alt)
expressions = new_exps.copy()
return expressions
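As a usage sketch, the first case from the new Daijirin2 phrase tests exercises this function like so:

from bot.entries.daijirin2.phrase_entry import parse_phrase

exps = parse_phrase("同じ穴の=狢(=狐・狸)")
# exps == ["同じ穴の狢", "同じ穴の狐", "同じ穴の狸"]  (order follows the expansion loop above)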

View file

@@ -1,20 +1,7 @@
from bot.targets import Targets
from bot.entries.jitenon import JitenonKokugoEntry
from bot.entries.jitenon import JitenonYojiEntry
from bot.entries.jitenon import JitenonKotowazaEntry
from bot.entries.smk8 import Smk8Entry
from bot.entries.daijirin2 import Daijirin2Entry
from bot.entries.sankoku8 import Sankoku8Entry
import importlib
def new_entry(target, page_id):
entry_map = {
Targets.JITENON_KOKUGO: JitenonKokugoEntry,
Targets.JITENON_YOJI: JitenonYojiEntry,
Targets.JITENON_KOTOWAZA: JitenonKotowazaEntry,
Targets.SMK8: Smk8Entry,
Targets.DAIJIRIN2: Daijirin2Entry,
Targets.SANKOKU8: Sankoku8Entry,
}
return entry_map[target](target, page_id)
module_path = f"bot.entries.{target.name.lower()}.entry"
module = importlib.import_module(module_path)
return module.Entry(target, page_id)
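The explicit entry map is replaced by a naming convention: every target package must provide a module bot.entries.<target>.entry that exposes a class named Entry. A minimal sketch of what the lookup resolves to (the page_id value here is arbitrary):

import importlib
from bot.targets import Targets

# Targets.SMK8.name.lower() == "smk8", so this loads bot.entries.smk8.entry
module = importlib.import_module(f"bot.entries.{Targets.SMK8.name.lower()}.entry")
entry = module.Entry(Targets.SMK8, 1)  # 1 is an arbitrary page_id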

View file

@@ -0,0 +1,45 @@
from bot.entries.base.jitenon_entry import JitenonEntry
import bot.entries.base.expressions as Expressions
class Entry(JitenonEntry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.example = ""
self.alt_expression = ""
self.antonym = ""
self.attachments = ""
self.compounds = ""
self.related_words = ""
def _get_column_map(self):
return {
"言葉": "expression",
"読み方": "yomikata",
"意味": "definition",
"例文": "example",
"別表記": "alt_expression",
"対義語": "antonym",
"活用": "attachments",
"用例": "compounds",
"類語": "related_words",
}
def _get_headwords(self):
headwords = {}
for reading in self.yomikata.split("・"):
if reading not in headwords:
headwords[reading] = []
for expression in self.expression.split("・"):
headwords[reading].append(expression)
if self.alt_expression.strip() != "":
for expression in self.alt_expression.split("・"):
headwords[reading].append(expression)
return headwords
def _add_variant_expressions(self, headwords):
for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
Expressions.add_iteration_mark(expressions)

View file

@@ -0,0 +1,35 @@
from bot.entries.base.jitenon_entry import JitenonEntry
import bot.entries.base.expressions as Expressions
class Entry(JitenonEntry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.origin = ""
self.example = ""
self.related_expressions = []
def _get_column_map(self):
return {
"言葉": "expression",
"読み方": "yomikata",
"意味": "definition",
"異形": "other_forms",
"出典": "origin",
"例文": "example",
"類句": "related_expressions",
}
def _get_headwords(self):
if self.expression == "金棒引き・鉄棒引き":
headwords = {
"かなぼうひき": ["金棒引き", "鉄棒引き"]
}
else:
headwords = super()._get_headwords()
return headwords
def _add_variant_expressions(self, headwords):
for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)

View file

@@ -0,0 +1,27 @@
import bot.entries.base.expressions as Expressions
from bot.entries.base.jitenon_entry import JitenonEntry
class Entry(JitenonEntry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.origin = ""
self.kanken_level = ""
self.category = ""
self.related_expressions = []
def _get_column_map(self):
return {
"四字熟語": "expression",
"読み方": "yomikata",
"意味": "definition",
"異形": "other_forms",
"出典": "origin",
"漢検級": "kanken_level",
"場面用途": "category",
"類義語": "related_expressions",
}
def _add_variant_expressions(self, headwords):
for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)

View file

@@ -1,260 +0,0 @@
from bs4 import BeautifulSoup
import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.entries.entry import Entry
from bot.data import load_phrase_readings
from bot.entries.sankoku8_preprocess import preprocess_page
class _BaseSankoku8Entry(Entry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.children = []
self.phrases = []
self._hyouki_name = "表記"
self._midashi_name = None
self._midashi_kana_name = None
def get_global_identifier(self):
parent_part = format(self.entry_id[0], '06')
child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
return f"@{self.target.value}-{parent_part}-{child_part}"
def set_page(self, page):
page = self.__decompose_subentries(page)
self._page = page
def get_page_soup(self):
soup = BeautifulSoup(self._page, "xml")
return soup
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
readings = self._find_readings(soup)
expressions = self._find_expressions(soup)
headwords = {}
for reading in readings:
headwords[reading] = []
if len(readings) == 1:
reading = readings[0]
if soup.find(self._midashi_name).find(self._hyouki_name) is None:
headwords[reading].append(reading)
for exp in expressions:
if exp not in headwords[reading]:
headwords[reading].append(exp)
elif len(readings) > 1 and len(expressions) == 0:
for reading in readings:
headwords[reading].append(reading)
elif len(readings) > 1 and len(expressions) == 1:
if soup.find(self._midashi_name).find(self._hyouki_name) is None:
for reading in readings:
headwords[reading].append(reading)
expression = expressions[0]
for reading in readings:
if expression not in headwords[reading]:
headwords[reading].append(expression)
elif len(readings) > 1 and len(expressions) == len(readings):
if soup.find(self._midashi_name).find(self._hyouki_name) is None:
for reading in readings:
headwords[reading].append(reading)
for idx, reading in enumerate(readings):
exp = expressions[idx]
if exp not in headwords[reading]:
headwords[reading].append(exp)
else:
raise Exception() # shouldn't happen
return headwords
def _add_variant_expressions(self, headwords):
for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
Expressions.add_iteration_mark(expressions)
def get_part_of_speech_tags(self):
if self._part_of_speech_tags is not None:
return self._part_of_speech_tags
self._part_of_speech_tags = []
soup = self.get_page_soup()
for midashi in soup.find_all([self._midashi_name, "見出部要素"]):
pos_group = midashi.find("品詞G")
if pos_group is None:
continue
for tag in pos_group.find_all("a"):
if tag.text not in self._part_of_speech_tags:
self._part_of_speech_tags.append(tag.text)
return self._part_of_speech_tags
def _find_expressions(self, soup):
expressions = []
for hyouki in soup.find_all(self._hyouki_name):
for expression in parse_hyouki_soup(hyouki, [""]):
expressions.append(expression)
return expressions
def _find_readings(self, soup):
midasi_kana = soup.find(self._midashi_kana_name)
readings = parse_hyouki_soup(midasi_kana, [""])
return readings
def __decompose_subentries(self, page):
soup = BeautifulSoup(page, features="xml")
subentry_parameters = [
[Sankoku8ChildEntry, ["子項目"], self.children],
[Sankoku8PhraseEntry, ["句項目"], self.phrases],
]
for x in subentry_parameters:
subentry_class, tags, subentry_list = x
for tag in tags:
tag_soup = soup.find(tag)
while tag_soup is not None:
tag_soup.name = "項目"
subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
subentry = subentry_class(self.target, subentry_id)
page = tag_soup.decode()
subentry.set_page(page)
subentry_list.append(subentry)
tag_soup.decompose()
tag_soup = soup.find(tag)
return soup.decode()
@staticmethod
def id_string_to_entry_id(id_string):
parts = id_string.split("-")
if len(parts) == 1:
return (int(parts[0]), 0)
elif len(parts) == 2:
# subentries have a hexadecimal part
return (int(parts[0]), int(parts[1], 16))
else:
raise Exception(f"Invalid entry ID: {id_string}")
@staticmethod
def _delete_unused_nodes(soup):
"""Remove extra markup elements that appear in the entry
headword line which are not part of the entry headword"""
unused_nodes = [
"語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク",
"アクセント分節", "活用分節", "ルビG", "分書"
]
for name in unused_nodes:
Soup.delete_soup_nodes(soup, name)
class Sankoku8Entry(_BaseSankoku8Entry):
def __init__(self, target, page_id):
entry_id = (page_id, 0)
super().__init__(target, entry_id)
self._midashi_name = "見出部"
self._midashi_kana_name = "見出仮名"
def set_page(self, page):
page = preprocess_page(page)
super().set_page(page)
class Sankoku8ChildEntry(_BaseSankoku8Entry):
def __init__(self, target, page_id):
super().__init__(target, page_id)
self._midashi_name = "子見出部"
self._midashi_kana_name = "子見出仮名"
class Sankoku8PhraseEntry(_BaseSankoku8Entry):
def get_part_of_speech_tags(self):
# phrases do not contain these tags
return []
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
expressions = self._find_expressions(soup)
readings = self._find_readings(soup)
headwords = {}
if len(expressions) != len(readings):
raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}")
for idx, expression in enumerate(expressions):
reading = readings[idx]
if reading in headwords:
headwords[reading].append(expression)
else:
headwords[reading] = [expression]
return headwords
def _find_expressions(self, soup):
phrase_soup = soup.find("句表記")
expressions = parse_hyouki_soup(phrase_soup, [""])
return expressions
def _find_readings(self, soup):
reading_patterns = load_phrase_readings(self.target)
reading_pattern = reading_patterns[self.entry_id]
readings = parse_hyouki_pattern(reading_pattern)
return readings
def parse_hyouki_soup(soup, base_exps):
omitted_characters = [
"", "", "", "", "", "", "", "", ""
]
exps = base_exps.copy()
for child in soup.children:
new_exps = []
if child.name == "言換G":
for alt in child.find_all("言換"):
parts = parse_hyouki_soup(alt, [""])
for exp in exps:
for part in parts:
new_exps.append(exp + part)
elif child.name == "補足表記":
alt1 = child.find("表記対象")
alt2 = child.find("表記内容G")
parts1 = parse_hyouki_soup(alt1, [""])
parts2 = parse_hyouki_soup(alt2, [""])
for exp in exps:
for part in parts1:
new_exps.append(exp + part)
for part in parts2:
new_exps.append(exp + part)
elif child.name == "省略":
parts = parse_hyouki_soup(child, [""])
for exp in exps:
new_exps.append(exp)
for part in parts:
new_exps.append(exp + part)
elif child.name is not None:
new_exps = parse_hyouki_soup(child, exps)
else:
text = child.text
for char in omitted_characters:
text = text.replace(char, "")
for exp in exps:
new_exps.append(exp + text)
exps = new_exps.copy()
return exps
def parse_hyouki_pattern(pattern):
replacements = {
"": "<省略>",
"": "</省略>",
"": "<補足表記><表記対象>",
"": "</表記対象><表記内容G><表記内容>",
"": "</表記内容></表記内容G></補足表記>",
"": "<言換G>〈<言換>",
"": "</言換><言換>",
"": "</言換>〉</言換G>",
"": "<補足表記><表記対象>",
"": "</表記対象><表記内容G>⦅<表記内容>",
"": "</表記内容>⦆</表記内容G></補足表記>",
}
markup = f"<span>{pattern}</span>"
for key, val in replacements.items():
markup = markup.replace(key, val)
soup = BeautifulSoup(markup, "xml")
hyouki_soup = soup.find("span")
exps = parse_hyouki_soup(hyouki_soup, [""])
return exps

View file

@@ -0,0 +1,97 @@
import bot.soup as Soup
from bot.entries.base.sanseido_entry import SanseidoEntry
from bot.entries.sankoku8.parse import parse_hyouki_soup
class BaseEntry(SanseidoEntry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.children = []
self.phrases = []
self._hyouki_name = "表記"
self._midashi_name = None
self._midashi_kana_name = None
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
readings = self._find_readings(soup)
expressions = self._find_expressions(soup)
headwords = {}
for reading in readings:
headwords[reading] = []
if len(readings) == 1:
reading = readings[0]
if soup.find(self._midashi_name).find(self._hyouki_name) is None:
headwords[reading].append(reading)
for exp in expressions:
if exp not in headwords[reading]:
headwords[reading].append(exp)
elif len(readings) > 1 and len(expressions) == 0:
for reading in readings:
headwords[reading].append(reading)
elif len(readings) > 1 and len(expressions) == 1:
if soup.find(self._midashi_name).find(self._hyouki_name) is None:
for reading in readings:
headwords[reading].append(reading)
expression = expressions[0]
for reading in readings:
if expression not in headwords[reading]:
headwords[reading].append(expression)
elif len(readings) > 1 and len(expressions) == len(readings):
if soup.find(self._midashi_name).find(self._hyouki_name) is None:
for reading in readings:
headwords[reading].append(reading)
for idx, reading in enumerate(readings):
exp = expressions[idx]
if exp not in headwords[reading]:
headwords[reading].append(exp)
else:
raise Exception() # shouldn't happen
return headwords
def get_part_of_speech_tags(self):
if self._part_of_speech_tags is not None:
return self._part_of_speech_tags
self._part_of_speech_tags = []
soup = self.get_page_soup()
for midashi in soup.find_all([self._midashi_name, "見出部要素"]):
pos_group = midashi.find("品詞G")
if pos_group is None:
continue
for tag in pos_group.find_all("a"):
if tag.text not in self._part_of_speech_tags:
self._part_of_speech_tags.append(tag.text)
return self._part_of_speech_tags
def _find_expressions(self, soup):
expressions = []
for hyouki in soup.find_all(self._hyouki_name):
for expression in parse_hyouki_soup(hyouki, [""]):
expressions.append(expression)
return expressions
def _find_readings(self, soup):
midasi_kana = soup.find(self._midashi_kana_name)
readings = parse_hyouki_soup(midasi_kana, [""])
return readings
def _get_subentry_parameters(self):
from bot.entries.sankoku8.child_entry import ChildEntry
from bot.entries.sankoku8.phrase_entry import PhraseEntry
subentry_parameters = [
[ChildEntry, ["子項目"], self.children],
[PhraseEntry, ["句項目"], self.phrases],
]
return subentry_parameters
@staticmethod
def _delete_unused_nodes(soup):
"""Remove extra markup elements that appear in the entry
headword line which are not part of the entry headword"""
unused_nodes = [
"語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク",
"アクセント分節", "活用分節", "ルビG", "分書"
]
for name in unused_nodes:
Soup.delete_soup_nodes(soup, name)

View file

@@ -0,0 +1,8 @@
from bot.entries.sankoku8.base_entry import BaseEntry
class ChildEntry(BaseEntry):
def __init__(self, target, page_id):
super().__init__(target, page_id)
self._midashi_name = "子見出部"
self._midashi_kana_name = "子見出仮名"

View file

@@ -0,0 +1,14 @@
from bot.entries.sankoku8.base_entry import BaseEntry
from bot.entries.sankoku8.preprocess import preprocess_page
class Entry(BaseEntry):
def __init__(self, target, page_id):
entry_id = (page_id, 0)
super().__init__(target, entry_id)
self._midashi_name = "見出部"
self._midashi_kana_name = "見出仮名"
def set_page(self, page):
page = preprocess_page(page)
super().set_page(page)

View file

@@ -0,0 +1,65 @@
from bs4 import BeautifulSoup
def parse_hyouki_soup(soup, base_exps):
omitted_characters = [
"", "", "", "", "", "", "", "", ""
]
exps = base_exps.copy()
for child in soup.children:
new_exps = []
if child.name == "言換G":
for alt in child.find_all("言換"):
parts = parse_hyouki_soup(alt, [""])
for exp in exps:
for part in parts:
new_exps.append(exp + part)
elif child.name == "補足表記":
alt1 = child.find("表記対象")
alt2 = child.find("表記内容G")
parts1 = parse_hyouki_soup(alt1, [""])
parts2 = parse_hyouki_soup(alt2, [""])
for exp in exps:
for part in parts1:
new_exps.append(exp + part)
for part in parts2:
new_exps.append(exp + part)
elif child.name == "省略":
parts = parse_hyouki_soup(child, [""])
for exp in exps:
new_exps.append(exp)
for part in parts:
new_exps.append(exp + part)
elif child.name is not None:
new_exps = parse_hyouki_soup(child, exps)
else:
text = child.text
for char in omitted_characters:
text = text.replace(char, "")
for exp in exps:
new_exps.append(exp + text)
exps = new_exps.copy()
return exps
def parse_hyouki_pattern(pattern):
replacements = {
"": "<省略>",
"": "</省略>",
"": "<補足表記><表記対象>",
"": "</表記対象><表記内容G><表記内容>",
"": "</表記内容></表記内容G></補足表記>",
"": "<言換G>〈<言換>",
"": "</言換><言換>",
"": "</言換>〉</言換G>",
"": "<補足表記><表記対象>",
"": "</表記対象><表記内容G>⦅<表記内容>",
"": "</表記内容>⦆</表記内容G></補足表記>",
}
markup = f"<span>{pattern}</span>"
for key, val in replacements.items():
markup = markup.replace(key, val)
soup = BeautifulSoup(markup, "xml")
hyouki_soup = soup.find("span")
exps = parse_hyouki_soup(hyouki_soup, [""])
return exps
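As a usage sketch mirroring the first case in the updated Sankoku8 phrase tests:

from bot.entries.sankoku8.parse import parse_hyouki_pattern

exps = parse_hyouki_pattern("耳にたこ(ができる)")
# exps == ["耳にたこ", "耳にたこができる"]  (the 省略 group yields both the bare and expanded forms)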

View file

@@ -0,0 +1,37 @@
from bot.data import load_phrase_readings
from bot.entries.sankoku8.base_entry import BaseEntry
from bot.entries.sankoku8.parse import parse_hyouki_soup
from bot.entries.sankoku8.parse import parse_hyouki_pattern
class PhraseEntry(BaseEntry):
def get_part_of_speech_tags(self):
# phrases do not contain these tags
return []
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
expressions = self._find_expressions(soup)
readings = self._find_readings(soup)
headwords = {}
if len(expressions) != len(readings):
raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}")
for idx, expression in enumerate(expressions):
reading = readings[idx]
if reading in headwords:
headwords[reading].append(expression)
else:
headwords[reading] = [expression]
return headwords
def _find_expressions(self, soup):
phrase_soup = soup.find("句表記")
expressions = parse_hyouki_soup(phrase_soup, [""])
return expressions
def _find_readings(self, soup):
reading_patterns = load_phrase_readings(self.target)
reading_pattern = reading_patterns[self.entry_id]
readings = parse_hyouki_pattern(reading_pattern)
return readings

View file

@@ -1,221 +0,0 @@
from bs4 import BeautifulSoup
import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.data import load_phrase_readings
from bot.entries.entry import Entry
from bot.entries.smk8_preprocess import preprocess_page
class _BaseSmk8Entry(Entry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.children = []
self.phrases = []
self.kanjis = []
def get_global_identifier(self):
parent_part = format(self.entry_id[0], '06')
child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
return f"@{self.target.value}-{parent_part}-{child_part}"
def set_page(self, page):
page = self.__decompose_subentries(page)
self._page = page
def get_page_soup(self):
soup = BeautifulSoup(self._page, "xml")
return soup
def get_part_of_speech_tags(self):
if self._part_of_speech_tags is not None:
return self._part_of_speech_tags
self._part_of_speech_tags = []
soup = self.get_page_soup()
headword_info = soup.find("見出要素")
if headword_info is None:
return self._part_of_speech_tags
for tag in headword_info.find_all("品詞M"):
if tag.text not in self._part_of_speech_tags:
self._part_of_speech_tags.append(tag.text)
return self._part_of_speech_tags
def _add_variant_expressions(self, headwords):
for expressions in headwords.values():
Expressions.add_variant_kanji(expressions)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
Expressions.add_iteration_mark(expressions)
def _find_reading(self, soup):
midasi_kana = soup.find("見出仮名")
reading = midasi_kana.text
for x in [" ", ""]:
reading = reading.replace(x, "")
return reading
def _find_expressions(self, soup):
clean_expressions = []
for expression in soup.find_all("標準表記"):
clean_expression = self._clean_expression(expression.text)
clean_expressions.append(clean_expression)
expressions = Expressions.expand_abbreviation_list(clean_expressions)
return expressions
def __decompose_subentries(self, page):
soup = BeautifulSoup(page, features="xml")
subentry_parameters = [
[Smk8ChildEntry, ["子項目F", "子項目"], self.children],
[Smk8PhraseEntry, ["句項目F", "句項目"], self.phrases],
[Smk8KanjiEntry, ["造語成分項目"], self.kanjis],
]
for x in subentry_parameters:
subentry_class, tags, subentry_list = x
for tag in tags:
tag_soup = soup.find(tag)
while tag_soup is not None:
tag_soup.name = "項目"
subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
subentry = subentry_class(self.target, subentry_id)
page = tag_soup.decode()
subentry.set_page(page)
subentry_list.append(subentry)
tag_soup.decompose()
tag_soup = soup.find(tag)
return soup.decode()
@staticmethod
def id_string_to_entry_id(id_string):
parts = id_string.split("-")
if len(parts) == 1:
return (int(parts[0]), 0)
elif len(parts) == 2:
# subentries have a hexadecimal part
return (int(parts[0]), int(parts[1], 16))
else:
raise Exception(f"Invalid entry ID: {id_string}")
@staticmethod
def _delete_unused_nodes(soup):
"""Remove extra markup elements that appear in the entry
headword line which are not part of the entry headword"""
unused_nodes = [
"表音表記", "表外音訓マーク", "表外字マーク", "ルビG"
]
for name in unused_nodes:
Soup.delete_soup_nodes(soup, name)
@staticmethod
def _clean_expression(expression):
for x in ["", "", "", "", "", " "]:
expression = expression.replace(x, "")
return expression
@staticmethod
def _fill_alts(soup):
for el in soup.find_all(["親見出仮名", "親見出表記"]):
el.string = el.attrs["alt"]
for gaiji in soup.find_all("外字"):
gaiji.string = gaiji.img.attrs["alt"]
class Smk8Entry(_BaseSmk8Entry):
def __init__(self, target, page_id):
entry_id = (page_id, 0)
super().__init__(target, entry_id)
def set_page(self, page):
page = preprocess_page(page)
super().set_page(page)
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
self._fill_alts(soup)
reading = self._find_reading(soup)
expressions = []
if soup.find("見出部").find("標準表記") is None:
expressions.append(reading)
for expression in self._find_expressions(soup):
if expression not in expressions:
expressions.append(expression)
headwords = {reading: expressions}
return headwords
class Smk8ChildEntry(_BaseSmk8Entry):
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
self._fill_alts(soup)
reading = self._find_reading(soup)
expressions = []
if soup.find("子見出部").find("標準表記") is None:
expressions.append(reading)
for expression in self._find_expressions(soup):
if expression not in expressions:
expressions.append(expression)
headwords = {reading: expressions}
return headwords
class Smk8PhraseEntry(_BaseSmk8Entry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.__phrase_readings = load_phrase_readings(self.target)
def get_part_of_speech_tags(self):
# phrases do not contain these tags
return []
def _get_headwords(self):
soup = self.get_page_soup()
headwords = {}
expressions = self._find_expressions(soup)
readings = self._find_readings()
for idx, expression in enumerate(expressions):
reading = readings[idx]
if reading in headwords:
headwords[reading].append(expression)
else:
headwords[reading] = [expression]
return headwords
def _find_expressions(self, soup):
self._delete_unused_nodes(soup)
self._fill_alts(soup)
text = soup.find("標準表記").text
text = self._clean_expression(text)
alternatives = Expressions.expand_smk_alternatives(text)
expressions = []
for alt in alternatives:
for exp in Expressions.expand_abbreviation(alt):
expressions.append(exp)
return expressions
def _find_readings(self):
text = self.__phrase_readings[self.entry_id]
alternatives = Expressions.expand_smk_alternatives(text)
readings = []
for alt in alternatives:
for reading in Expressions.expand_abbreviation(alt):
readings.append(reading)
return readings
class Smk8KanjiEntry(_BaseSmk8Entry):
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
self._fill_alts(soup)
reading = self.__get_parent_reading()
expressions = self._find_expressions(soup)
headwords = {reading: expressions}
return headwords
def __get_parent_reading(self):
parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
parent = self.ID_TO_ENTRY[parent_id]
reading = parent.get_first_reading()
return reading

View file

@@ -0,0 +1,73 @@
import bot.soup as Soup
import bot.entries.base.expressions as Expressions
from bot.entries.base.sanseido_entry import SanseidoEntry
class BaseEntry(SanseidoEntry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.children = []
self.phrases = []
self.kanjis = []
def get_part_of_speech_tags(self):
if self._part_of_speech_tags is not None:
return self._part_of_speech_tags
self._part_of_speech_tags = []
soup = self.get_page_soup()
headword_info = soup.find("見出要素")
if headword_info is None:
return self._part_of_speech_tags
for tag in headword_info.find_all("品詞M"):
if tag.text not in self._part_of_speech_tags:
self._part_of_speech_tags.append(tag.text)
return self._part_of_speech_tags
def _find_reading(self, soup):
midasi_kana = soup.find("見出仮名")
reading = midasi_kana.text
for x in [" ", ""]:
reading = reading.replace(x, "")
return reading
def _find_expressions(self, soup):
clean_expressions = []
for expression in soup.find_all("標準表記"):
clean_expression = self._clean_expression(expression.text)
clean_expressions.append(clean_expression)
expressions = Expressions.expand_abbreviation_list(clean_expressions)
return expressions
def _get_subentry_parameters(self):
from bot.entries.smk8.child_entry import ChildEntry
from bot.entries.smk8.phrase_entry import PhraseEntry
from bot.entries.smk8.kanji_entry import KanjiEntry
subentry_parameters = [
[ChildEntry, ["子項目F", "子項目"], self.children],
[PhraseEntry, ["句項目F", "句項目"], self.phrases],
[KanjiEntry, ["造語成分項目"], self.kanjis],
]
return subentry_parameters
@staticmethod
def _delete_unused_nodes(soup):
"""Remove extra markup elements that appear in the entry
headword line which are not part of the entry headword"""
unused_nodes = [
"表音表記", "表外音訓マーク", "表外字マーク", "ルビG"
]
for name in unused_nodes:
Soup.delete_soup_nodes(soup, name)
@staticmethod
def _clean_expression(expression):
for x in ["", "", "", "", "", " "]:
expression = expression.replace(x, "")
return expression
@staticmethod
def _fill_alts(soup):
for elm in soup.find_all(["親見出仮名", "親見出表記"]):
elm.string = elm.attrs["alt"]
for gaiji in soup.find_all("外字"):
gaiji.string = gaiji.img.attrs["alt"]

View file

@@ -0,0 +1,17 @@
from bot.entries.smk8.base_entry import BaseEntry
class ChildEntry(BaseEntry):
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
self._fill_alts(soup)
reading = self._find_reading(soup)
expressions = []
if soup.find("子見出部").find("標準表記") is None:
expressions.append(reading)
for expression in self._find_expressions(soup):
if expression not in expressions:
expressions.append(expression)
headwords = {reading: expressions}
return headwords

bot/entries/smk8/entry.py Normal file
View file

@@ -0,0 +1,26 @@
from bot.entries.smk8.base_entry import BaseEntry
from bot.entries.smk8.preprocess import preprocess_page
class Entry(BaseEntry):
def __init__(self, target, page_id):
entry_id = (page_id, 0)
super().__init__(target, entry_id)
def set_page(self, page):
page = preprocess_page(page)
super().set_page(page)
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
self._fill_alts(soup)
reading = self._find_reading(soup)
expressions = []
if soup.find("見出部").find("標準表記") is None:
expressions.append(reading)
for expression in self._find_expressions(soup):
if expression not in expressions:
expressions.append(expression)
headwords = {reading: expressions}
return headwords

View file

@@ -0,0 +1,22 @@
from bot.entries.smk8.base_entry import BaseEntry
class KanjiEntry(BaseEntry):
def get_part_of_speech_tags(self):
# kanji entries do not contain these tags
return []
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
self._fill_alts(soup)
reading = self.__get_parent_reading()
expressions = self._find_expressions(soup)
headwords = {reading: expressions}
return headwords
def __get_parent_reading(self):
parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
parent = self.ID_TO_ENTRY[parent_id]
reading = parent.get_first_reading()
return reading

View file

@@ -0,0 +1,64 @@
import re
import bot.entries.base.expressions as Expressions
from bot.data import load_phrase_readings
from bot.entries.smk8.base_entry import BaseEntry
class PhraseEntry(BaseEntry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.__phrase_readings = load_phrase_readings(self.target)
def get_part_of_speech_tags(self):
# phrase entries do not contain these tags
return []
def _get_headwords(self):
soup = self.get_page_soup()
headwords = {}
expressions = self._find_expressions(soup)
readings = self._find_readings()
for idx, expression in enumerate(expressions):
reading = readings[idx]
if reading in headwords:
headwords[reading].append(expression)
else:
headwords[reading] = [expression]
return headwords
def _find_expressions(self, soup):
self._delete_unused_nodes(soup)
self._fill_alts(soup)
text = soup.find("標準表記").text
text = self._clean_expression(text)
alternatives = parse_phrase(text)
expressions = []
for alt in alternatives:
for exp in Expressions.expand_abbreviation(alt):
expressions.append(exp)
return expressions
def _find_readings(self):
text = self.__phrase_readings[self.entry_id]
alternatives = parse_phrase(text)
readings = []
for alt in alternatives:
for reading in Expressions.expand_abbreviation(alt):
readings.append(reading)
return readings
def parse_phrase(text):
"""Return a list of strings described by △ notation."""
match = re.search(r"△([^(]+)\(([^)]+)\)", text)
if match is None:
return [text]
alt_parts = [match.group(1)]
for alt_part in match.group(2).split("・"):
alt_parts.append(alt_part)
alts = []
for alt_part in alt_parts:
alt_exp = re.sub(r"△[^(]+\([^)]+\)", alt_part, text)
alts.append(alt_exp)
return alts
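A usage sketch mirroring tests/test_smk_phrases.py:

from bot.entries.smk8.phrase_entry import parse_phrase

exps = parse_phrase("目と鼻の△先(間)")
# exps == ["目と鼻の先", "目と鼻の間"]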

View file

@@ -1,4 +1,4 @@
from bot.entries.daijirin2 import Daijirin2PhraseEntry as PhraseEntry
from bot.entries.daijirin2.phrase_entry import PhraseEntry
from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.daijirin2 import make_glossary
@@ -6,9 +6,6 @@ from bot.yomichan.grammar import sudachi_rules, tags_to_rules
class Daijirin2Terminator(Terminator):
def __init__(self, target):
super().__init__(target)
def _definition_tags(self, entry):
return ""

View file

@@ -1,4 +1,4 @@
from bot.entries.sankoku8 import Sankoku8PhraseEntry as PhraseEntry
from bot.entries.sankoku8.phrase_entry import PhraseEntry
from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.sankoku8 import make_glossary
@@ -6,9 +6,6 @@ from bot.yomichan.grammar import sudachi_rules, tags_to_rules
class Sankoku8Terminator(Terminator):
def __init__(self, target):
super().__init__(target)
def _definition_tags(self, entry):
return ""

View file

@@ -1,5 +1,5 @@
from bot.entries.smk8 import Smk8KanjiEntry as KanjiEntry
from bot.entries.smk8 import Smk8PhraseEntry as PhraseEntry
from bot.entries.smk8.kanji_entry import KanjiEntry
from bot.entries.smk8.phrase_entry import PhraseEntry
from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.smk8 import make_glossary

View file

@@ -0,0 +1,21 @@
import unittest
from bot.entries.daijirin2.phrase_entry import parse_phrase
class TestDaijirin2PhraseParse(unittest.TestCase):
def test1(self):
text = "同じ穴の=狢(=狐・狸)"
exps = parse_phrase(text)
self.assertEqual(len(exps), 3)
self.assertIn("同じ穴の狢", exps)
self.assertIn("同じ穴の狐", exps)
self.assertIn("同じ穴の狸", exps)
def test2(self):
text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥"
exps = parse_phrase(text)
self.assertEqual(len(exps), 4)
self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps)
self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps)
self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps)
self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps)

View file

@@ -1,5 +1,5 @@
import unittest
import bot.entries.expressions as Expressions
import bot.entries.base.expressions as Expressions
class TestExpressions(unittest.TestCase):
@@ -69,28 +69,3 @@ class TestExpressions(unittest.TestCase):
self.assertIn("有合わせ", abbrs)
self.assertIn("有り合せ", abbrs)
self.assertIn("有合せ", abbrs)
def test_smk_expand_alternatives(self):
text = "△金(時間・暇)に飽かして"
exps = Expressions.expand_smk_alternatives(text)
self.assertEqual(len(exps), 3)
self.assertIn("金に飽かして", exps)
self.assertIn("時間に飽かして", exps)
self.assertIn("暇に飽かして", exps)
def test_daijirin_expand_alternatives(self):
text = "同じ穴の=狢(=狐・狸)"
exps = Expressions.expand_daijirin_alternatives(text)
self.assertEqual(len(exps), 3)
self.assertIn("同じ穴の狢", exps)
self.assertIn("同じ穴の狐", exps)
self.assertIn("同じ穴の狸", exps)
def test_daijirin_expand_alternatives2(self):
text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥"
exps = Expressions.expand_daijirin_alternatives(text)
self.assertEqual(len(exps), 4)
self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps)
self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps)
self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps)
self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps)

View file

@@ -1,16 +1,16 @@
import unittest
from bot.entries.sankoku8 import parse_hyouki_pattern
from bot.entries.sankoku8.parse import parse_hyouki_pattern
class TestSankokuPhrases(unittest.TestCase):
def test_sankoku_phrases1(self):
class TestSankoku8PhraseParse(unittest.TestCase):
def test1(self):
pattern = '耳にたこ(ができる)'
exps = parse_hyouki_pattern(pattern)
self.assertEqual(len(exps), 2)
self.assertIn("耳にたこ", exps)
self.assertIn("耳にたこができる", exps)
def test_sankoku_phrases2(self):
def test2(self):
pattern = '一斑を〈見て/もって〉全豹を〈卜す/推す〉'
exps = parse_hyouki_pattern(pattern)
self.assertEqual(len(exps), 4)
@@ -19,14 +19,14 @@ class TestSankokuPhrases(unittest.TestCase):
self.assertIn("一斑をもって全豹を卜す", exps)
self.assertIn("一斑をもって全豹を推す", exps)
def test_sankoku_phrases3(self):
def test3(self):
pattern = '{かじ・舵}を切る'
exps = parse_hyouki_pattern(pattern)
self.assertEqual(len(exps), 2)
self.assertIn("かじを切る", exps)
self.assertIn("舵を切る", exps)
def test_sankoku_phrases4(self):
def test4(self):
pattern = '重箱の隅を(⦅ようじ\楊枝⦆で)〈つつく/ほじくる〉'
exps = parse_hyouki_pattern(pattern)
self.assertEqual(len(exps), 6)
@@ -37,7 +37,7 @@ class TestSankokuPhrases(unittest.TestCase):
self.assertIn("重箱の隅をようじでほじくる", exps)
self.assertIn("重箱の隅を楊枝でほじくる", exps)
def test_sankoku_phrases5(self):
def test5(self):
pattern = '群盲象を〈{な・撫}でる/評する〉'
exps = parse_hyouki_pattern(pattern)
self.assertEqual(len(exps), 3)

tests/test_smk_phrases.py Normal file
View file

@@ -0,0 +1,19 @@
import unittest
from bot.entries.smk8.phrase_entry import parse_phrase
class TestSmk8PhraseParse(unittest.TestCase):
def test1(self):
text = "目と鼻の△先(間)"
exps = parse_phrase(text)
self.assertEqual(len(exps), 2)
self.assertIn("目と鼻の先", exps)
self.assertIn("目と鼻の間", exps)
def test2(self):
text = "△金(時間・暇)に飽かして"
exps = parse_phrase(text)
self.assertEqual(len(exps), 3)
self.assertIn("金に飽かして", exps)
self.assertIn("時間に飽かして", exps)
self.assertIn("暇に飽かして", exps)