diff --git a/bot/entries/entry.py b/bot/entries/base/entry.py
similarity index 100%
rename from bot/entries/entry.py
rename to bot/entries/base/entry.py
diff --git a/bot/entries/expressions.py b/bot/entries/base/expressions.py
similarity index 67%
rename from bot/entries/expressions.py
rename to bot/entries/base/expressions.py
index 687a325..7d20891 100644
--- a/bot/entries/expressions.py
+++ b/bot/entries/base/expressions.py
@@ -85,40 +85,3 @@ def expand_abbreviation_list(expressions):
if new_exp not in new_exps:
new_exps.append(new_exp)
return new_exps
-
-
-def expand_smk_alternatives(text):
- """Return a list of strings described by △ notation."""
-    m = re.search(r"△([^（]+)（([^）]+)）", text)
- if m is None:
- return [text]
- alt_parts = [m.group(1)]
- for alt_part in m.group(2).split("・"):
- alt_parts.append(alt_part)
- alts = []
- for alt_part in alt_parts:
-        alt_exp = re.sub(r"△[^（]+（[^）]+）", alt_part, text)
- alts.append(alt_exp)
- return alts
-
-
-def expand_daijirin_alternatives(text):
-    """Return a list of strings described by ＝ notation."""
-    group_pattern = r"([^＝]+)(＝([^（]+)（＝([^）]+)）)?"
- groups = re.findall(group_pattern, text)
- expressions = [""]
- for group in groups:
- new_exps = []
- for expression in expressions:
- new_exps.append(expression + group[0])
- expressions = new_exps.copy()
- if group[1] == "":
- continue
- new_exps = []
- for expression in expressions:
- new_exps.append(expression + group[2])
- for expression in expressions:
- for alt in group[3].split("・"):
- new_exps.append(expression + alt)
- expressions = new_exps.copy()
- return expressions
diff --git a/bot/entries/jitenon.py b/bot/entries/base/jitenon_entry.py
similarity index 58%
rename from bot/entries/jitenon.py
rename to bot/entries/base/jitenon_entry.py
index 65c4d2e..7af845b 100644
--- a/bot/entries/jitenon.py
+++ b/bot/entries/base/jitenon_entry.py
@@ -3,11 +3,11 @@ from abc import abstractmethod
from datetime import datetime, date
from bs4 import BeautifulSoup
-from bot.entries.entry import Entry
-import bot.entries.expressions as Expressions
+from bot.entries.base.entry import Entry
+import bot.entries.base.expressions as Expressions
-class _JitenonEntry(Entry):
+class JitenonEntry(Entry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.expression = ""
@@ -140,104 +140,3 @@ class _JitenonEntry(Entry):
elif isinstance(attr_val, list):
colvals.append(";".join(attr_val))
return ",".join(colvals)
-
-
-class JitenonYojiEntry(_JitenonEntry):
- def __init__(self, target, entry_id):
- super().__init__(target, entry_id)
- self.origin = ""
- self.kanken_level = ""
- self.category = ""
- self.related_expressions = []
-
- def _get_column_map(self):
- return {
- "四字熟語": "expression",
- "読み方": "yomikata",
- "意味": "definition",
- "異形": "other_forms",
- "出典": "origin",
- "漢検級": "kanken_level",
- "場面用途": "category",
- "類義語": "related_expressions",
- }
-
- def _add_variant_expressions(self, headwords):
- for expressions in headwords.values():
- Expressions.add_variant_kanji(expressions)
-
-
-class JitenonKotowazaEntry(_JitenonEntry):
- def __init__(self, target, entry_id):
- super().__init__(target, entry_id)
- self.origin = ""
- self.example = ""
- self.related_expressions = []
-
- def _get_column_map(self):
- return {
- "言葉": "expression",
- "読み方": "yomikata",
- "意味": "definition",
- "異形": "other_forms",
- "出典": "origin",
- "例文": "example",
- "類句": "related_expressions",
- }
-
- def _get_headwords(self):
- if self.expression == "金棒引き・鉄棒引き":
- headwords = {
- "かなぼうひき": ["金棒引き", "鉄棒引き"]
- }
- else:
- headwords = super()._get_headwords()
- return headwords
-
- def _add_variant_expressions(self, headwords):
- for expressions in headwords.values():
- Expressions.add_variant_kanji(expressions)
- Expressions.add_fullwidth(expressions)
-
-
-class JitenonKokugoEntry(_JitenonEntry):
- def __init__(self, target, entry_id):
- super().__init__(target, entry_id)
- self.example = ""
- self.alt_expression = ""
- self.antonym = ""
- self.attachments = ""
- self.compounds = ""
- self.related_words = ""
-
- def _get_column_map(self):
- return {
- "言葉": "expression",
- "読み方": "yomikata",
- "意味": "definition",
- "例文": "example",
- "別表記": "alt_expression",
- "対義語": "antonym",
- "活用": "attachments",
- "用例": "compounds",
- "類語": "related_words",
- }
-
- def _get_headwords(self):
- headwords = {}
- for reading in self.yomikata.split("・"):
- if reading not in headwords:
- headwords[reading] = []
- for expression in self.expression.split("・"):
- headwords[reading].append(expression)
- if self.alt_expression.strip() != "":
- for expression in self.alt_expression.split("・"):
- headwords[reading].append(expression)
- return headwords
-
- def _add_variant_expressions(self, headwords):
- for expressions in headwords.values():
- Expressions.add_variant_kanji(expressions)
- Expressions.add_fullwidth(expressions)
- Expressions.remove_iteration_mark(expressions)
- Expressions.add_iteration_mark(expressions)
diff --git a/bot/entries/base/sanseido_entry.py b/bot/entries/base/sanseido_entry.py
new file mode 100644
index 0000000..4e1098d
--- /dev/null
+++ b/bot/entries/base/sanseido_entry.py
@@ -0,0 +1,60 @@
+from abc import abstractmethod
+from bs4 import BeautifulSoup
+
+from bot.entries.base.entry import Entry
+import bot.entries.base.expressions as Expressions
+
+
+class SanseidoEntry(Entry):
+ def set_page(self, page):
+ page = self._decompose_subentries(page)
+ self._page = page
+
+ def get_page_soup(self):
+ soup = BeautifulSoup(self._page, "xml")
+ return soup
+
+ def get_global_identifier(self):
+ parent_part = format(self.entry_id[0], '06')
+ child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
+ return f"@{self.target.value}-{parent_part}-{child_part}"
+
+ def _decompose_subentries(self, page):
+ soup = BeautifulSoup(page, features="xml")
+ for x in self._get_subentry_parameters():
+ subentry_class, tags, subentry_list = x
+ for tag in tags:
+ tag_soup = soup.find(tag)
+ while tag_soup is not None:
+ tag_soup.name = "項目"
+ subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
+ self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
+ subentry = subentry_class(self.target, subentry_id)
+ page = tag_soup.decode()
+ subentry.set_page(page)
+ subentry_list.append(subentry)
+ tag_soup.decompose()
+ tag_soup = soup.find(tag)
+ return soup.decode()
+
+ @abstractmethod
+ def _get_subentry_parameters(self):
+ pass
+
+ def _add_variant_expressions(self, headwords):
+ for expressions in headwords.values():
+ Expressions.add_variant_kanji(expressions)
+ Expressions.add_fullwidth(expressions)
+ Expressions.remove_iteration_mark(expressions)
+ Expressions.add_iteration_mark(expressions)
+
+ @staticmethod
+ def id_string_to_entry_id(id_string):
+ parts = id_string.split("-")
+ if len(parts) == 1:
+ return (int(parts[0]), 0)
+ elif len(parts) == 2:
+ # subentries have a hexadecimal part
+ return (int(parts[0]), int(parts[1], 16))
+ else:
+ raise Exception(f"Invalid entry ID: {id_string}")
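
The two ID helpers that every Sanseido-style dictionary now shares can be exercised in isolation; a minimal sketch (the ID values are made up, and the "smk8" prefix assumes Targets.SMK8.value == "smk8"):

    from bot.entries.base.sanseido_entry import SanseidoEntry

    # A bare page ID maps to child part 0; a "-" suffix is parsed as hex.
    assert SanseidoEntry.id_string_to_entry_id("12345") == (12345, 0)
    assert SanseidoEntry.id_string_to_entry_id("12345-00AF") == (12345, 175)
    # get_global_identifier renders such a tuple back out,
    # e.g. "@smk8-012345-00AF".
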
diff --git a/bot/entries/daijirin2.py b/bot/entries/daijirin2.py
deleted file mode 100644
index f7a629c..0000000
--- a/bot/entries/daijirin2.py
+++ /dev/null
@@ -1,231 +0,0 @@
-from bs4 import BeautifulSoup
-
-import bot.entries.expressions as Expressions
-import bot.soup as Soup
-from bot.data import load_phrase_readings
-from bot.data import load_daijirin2_kana_abbreviations
-from bot.entries.entry import Entry
-from bot.entries.daijirin2_preprocess import preprocess_page
-
-
-class _BaseDaijirin2Entry(Entry):
- def __init__(self, target, entry_id):
- super().__init__(target, entry_id)
- self.children = []
- self.phrases = []
- self._kana_abbreviations = load_daijirin2_kana_abbreviations()
-
- def get_global_identifier(self):
- parent_part = format(self.entry_id[0], '06')
- child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
- return f"@{self.target.value}-{parent_part}-{child_part}"
-
- def set_page(self, page):
- page = self.__decompose_subentries(page)
- self._page = page
-
- def get_page_soup(self):
- soup = BeautifulSoup(self._page, "xml")
- return soup
-
- def get_part_of_speech_tags(self):
- if self._part_of_speech_tags is not None:
- return self._part_of_speech_tags
- self._part_of_speech_tags = []
- soup = self.get_page_soup()
- for pos_group in soup.find_all("品詞G"):
- if pos_group.parent.name == "大語義":
- self._set_part_of_speech_tags(pos_group)
- return self._part_of_speech_tags
-
- def _set_part_of_speech_tags(self, el):
- pos_names = ["品詞", "品詞活用", "品詞行", "用法"]
- for child in el.children:
- if child.name is not None:
- self._set_part_of_speech_tags(child)
- continue
- pos = str(child)
- if el.name not in pos_names:
- continue
- elif pos in ["[", "]"]:
- continue
- elif pos in self._part_of_speech_tags:
- continue
- else:
- self._part_of_speech_tags.append(pos)
-
- def _get_regular_headwords(self, soup):
- self._fill_alts(soup)
- reading = soup.find("見出仮名").text
- expressions = []
- for el in soup.find_all("標準表記"):
- expression = self._clean_expression(el.text)
- if "—" in expression:
- kana_abbrs = self._kana_abbreviations[self.entry_id]
- for abbr in kana_abbrs:
- expression = expression.replace("—", abbr, 1)
- expressions.append(expression)
- expressions = Expressions.expand_abbreviation_list(expressions)
- if len(expressions) == 0:
- expressions.append(reading)
- headwords = {reading: expressions}
- return headwords
-
- def _add_variant_expressions(self, headwords):
- for expressions in headwords.values():
- Expressions.add_variant_kanji(expressions)
- Expressions.add_fullwidth(expressions)
- Expressions.remove_iteration_mark(expressions)
- Expressions.add_iteration_mark(expressions)
-
- def __decompose_subentries(self, page):
- soup = BeautifulSoup(page, features="xml")
- subentry_parameters = [
- [Daijirin2ChildEntry, ["子項目"], self.children],
- [Daijirin2PhraseEntry, ["句項目"], self.phrases],
- ]
- for x in subentry_parameters:
- subentry_class, tags, subentry_list = x
- for tag in tags:
- tag_soup = soup.find(tag)
- while tag_soup is not None:
- tag_soup.name = "項目"
- subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
- self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
- subentry = subentry_class(self.target, subentry_id)
- page = tag_soup.decode()
- subentry.set_page(page)
- subentry_list.append(subentry)
- tag_soup.decompose()
- tag_soup = soup.find(tag)
- return soup.decode()
-
- @staticmethod
- def id_string_to_entry_id(id_string):
- parts = id_string.split("-")
- if len(parts) == 1:
- return (int(parts[0]), 0)
- elif len(parts) == 2:
- # subentries have a hexadecimal part
- return (int(parts[0]), int(parts[1], 16))
- else:
- raise Exception(f"Invalid entry ID: {id_string}")
-
- @staticmethod
- def _delete_unused_nodes(soup):
- """Remove extra markup elements that appear in the entry
- headword line which are not part of the entry headword"""
- unused_nodes = [
- "漢字音logo", "活用分節", "連語句活用分節", "語構成",
-        "表外字マーク", "表外音訓マーク", "ルビG"
- ]
- for name in unused_nodes:
- Soup.delete_soup_nodes(soup, name)
-
- @staticmethod
- def _clean_expression(expression):
- for x in ["〈", "〉", "《", "》", " "]:
- expression = expression.replace(x, "")
- return expression
-
- @staticmethod
- def _fill_alts(soup):
- for gaiji in soup.find_all(class_="gaiji"):
- if gaiji.name == "img" and gaiji.has_attr("alt"):
- gaiji.name = "span"
- gaiji.string = gaiji.attrs["alt"]
-
-
-class Daijirin2Entry(_BaseDaijirin2Entry):
- def __init__(self, target, page_id):
- entry_id = (page_id, 0)
- super().__init__(target, entry_id)
-
- def set_page(self, page):
- page = preprocess_page(page)
- super().set_page(page)
-
- def _get_headwords(self):
- soup = self.get_page_soup()
- self._delete_unused_nodes(soup)
- if soup.find("漢字見出") is not None:
- headwords = self._get_kanji_headwords(soup)
- elif soup.find("略語G") is not None:
- headwords = self._get_acronym_headwords(soup)
- else:
- headwords = self._get_regular_headwords(soup)
- return headwords
-
- def _get_kanji_headwords(self, soup):
- readings = []
- for el in soup.find_all("漢字音"):
- hira = Expressions.kata_to_hira(el.text)
- readings.append(hira)
- if soup.find("漢字音") is None:
- readings.append("")
- expressions = []
- for el in soup.find_all("漢字見出"):
- expressions.append(el.text)
- headwords = {}
- for reading in readings:
- headwords[reading] = expressions
- return headwords
-
- def _get_acronym_headwords(self, soup):
- expressions = []
- for el in soup.find_all("略語"):
- expression_parts = []
- for part in el.find_all(["欧字", "和字"]):
- expression_parts.append(part.text)
- expression = "".join(expression_parts)
- expressions.append(expression)
- headwords = {"": expressions}
- return headwords
-
-
-class Daijirin2ChildEntry(_BaseDaijirin2Entry):
- def _get_headwords(self):
- soup = self.get_page_soup()
- self._delete_unused_nodes(soup)
- headwords = self._get_regular_headwords(soup)
- return headwords
-
-
-class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
- def get_part_of_speech_tags(self):
- # phrases do not contain these tags
- return []
-
- def _get_headwords(self):
- soup = self.get_page_soup()
- headwords = {}
- expressions = self._find_expressions(soup)
- readings = self._find_readings()
- for idx, expression in enumerate(expressions):
- reading = readings[idx]
- if reading in headwords:
- headwords[reading].append(expression)
- else:
- headwords[reading] = [expression]
- return headwords
-
- def _find_expressions(self, soup):
- self._delete_unused_nodes(soup)
- text = soup.find("句表記").text
- text = self._clean_expression(text)
- alternatives = Expressions.expand_daijirin_alternatives(text)
- expressions = []
- for alt in alternatives:
- for exp in Expressions.expand_abbreviation(alt):
- expressions.append(exp)
- return expressions
-
- def _find_readings(self):
- phrase_readings = load_phrase_readings(self.target)
- text = phrase_readings[self.entry_id]
- alternatives = Expressions.expand_daijirin_alternatives(text)
- readings = []
- for alt in alternatives:
- for reading in Expressions.expand_abbreviation(alt):
- readings.append(reading)
- return readings
diff --git a/bot/entries/daijirin2/base_entry.py b/bot/entries/daijirin2/base_entry.py
new file mode 100644
index 0000000..1113404
--- /dev/null
+++ b/bot/entries/daijirin2/base_entry.py
@@ -0,0 +1,88 @@
+import bot.soup as Soup
+from bot.data import load_daijirin2_kana_abbreviations
+from bot.entries.base.sanseido_entry import SanseidoEntry
+import bot.entries.base.expressions as Expressions
+
+
+class BaseEntry(SanseidoEntry):
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.children = []
+ self.phrases = []
+ self._kana_abbreviations = load_daijirin2_kana_abbreviations()
+
+ def get_part_of_speech_tags(self):
+ if self._part_of_speech_tags is not None:
+ return self._part_of_speech_tags
+ self._part_of_speech_tags = []
+ soup = self.get_page_soup()
+ for pos_group in soup.find_all("品詞G"):
+ if pos_group.parent.name == "大語義":
+ self._set_part_of_speech_tags(pos_group)
+ return self._part_of_speech_tags
+
+ def _set_part_of_speech_tags(self, el):
+ pos_names = ["品詞", "品詞活用", "品詞行", "用法"]
+ for child in el.children:
+ if child.name is not None:
+ self._set_part_of_speech_tags(child)
+ continue
+ pos = str(child)
+ if el.name not in pos_names:
+ continue
+ elif pos in ["[", "]"]:
+ continue
+ elif pos in self._part_of_speech_tags:
+ continue
+ else:
+ self._part_of_speech_tags.append(pos)
+
+ def _get_regular_headwords(self, soup):
+ self._fill_alts(soup)
+ reading = soup.find("見出仮名").text
+ expressions = []
+ for el in soup.find_all("標準表記"):
+ expression = self._clean_expression(el.text)
+ if "—" in expression:
+ kana_abbrs = self._kana_abbreviations[self.entry_id]
+ for abbr in kana_abbrs:
+ expression = expression.replace("—", abbr, 1)
+ expressions.append(expression)
+ expressions = Expressions.expand_abbreviation_list(expressions)
+ if len(expressions) == 0:
+ expressions.append(reading)
+ headwords = {reading: expressions}
+ return headwords
+
+ def _get_subentry_parameters(self):
+ from bot.entries.daijirin2.child_entry import ChildEntry
+ from bot.entries.daijirin2.phrase_entry import PhraseEntry
+ subentry_parameters = [
+ [ChildEntry, ["子項目"], self.children],
+ [PhraseEntry, ["句項目"], self.phrases],
+ ]
+ return subentry_parameters
+
+ @staticmethod
+ def _delete_unused_nodes(soup):
+ """Remove extra markup elements that appear in the entry
+ headword line which are not part of the entry headword"""
+ unused_nodes = [
+ "漢字音logo", "活用分節", "連語句活用分節", "語構成",
+        "表外字マーク", "表外音訓マーク", "ルビG"
+ ]
+ for name in unused_nodes:
+ Soup.delete_soup_nodes(soup, name)
+
+ @staticmethod
+ def _clean_expression(expression):
+ for x in ["〈", "〉", "《", "》", " "]:
+ expression = expression.replace(x, "")
+ return expression
+
+ @staticmethod
+ def _fill_alts(soup):
+ for gaiji in soup.find_all(class_="gaiji"):
+ if gaiji.name == "img" and gaiji.has_attr("alt"):
+ gaiji.name = "span"
+ gaiji.string = gaiji.attrs["alt"]
diff --git a/bot/entries/daijirin2/child_entry.py b/bot/entries/daijirin2/child_entry.py
new file mode 100644
index 0000000..42685a0
--- /dev/null
+++ b/bot/entries/daijirin2/child_entry.py
@@ -0,0 +1,9 @@
+from bot.entries.daijirin2.base_entry import BaseEntry
+
+
+class ChildEntry(BaseEntry):
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ self._delete_unused_nodes(soup)
+ headwords = self._get_regular_headwords(soup)
+ return headwords
diff --git a/bot/entries/daijirin2/entry.py b/bot/entries/daijirin2/entry.py
new file mode 100644
index 0000000..0b6970f
--- /dev/null
+++ b/bot/entries/daijirin2/entry.py
@@ -0,0 +1,50 @@
+import bot.entries.base.expressions as Expressions
+from bot.entries.daijirin2.base_entry import BaseEntry
+from bot.entries.daijirin2.preprocess import preprocess_page
+
+
+class Entry(BaseEntry):
+ def __init__(self, target, page_id):
+ entry_id = (page_id, 0)
+ super().__init__(target, entry_id)
+
+ def set_page(self, page):
+ page = preprocess_page(page)
+ super().set_page(page)
+
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ self._delete_unused_nodes(soup)
+ if soup.find("漢字見出") is not None:
+ headwords = self._get_kanji_headwords(soup)
+ elif soup.find("略語G") is not None:
+ headwords = self._get_acronym_headwords(soup)
+ else:
+ headwords = self._get_regular_headwords(soup)
+ return headwords
+
+ def _get_kanji_headwords(self, soup):
+ readings = []
+ for el in soup.find_all("漢字音"):
+ hira = Expressions.kata_to_hira(el.text)
+ readings.append(hira)
+ if soup.find("漢字音") is None:
+ readings.append("")
+ expressions = []
+ for el in soup.find_all("漢字見出"):
+ expressions.append(el.text)
+ headwords = {}
+ for reading in readings:
+ headwords[reading] = expressions
+ return headwords
+
+ def _get_acronym_headwords(self, soup):
+ expressions = []
+ for el in soup.find_all("略語"):
+ expression_parts = []
+ for part in el.find_all(["欧字", "和字"]):
+ expression_parts.append(part.text)
+ expression = "".join(expression_parts)
+ expressions.append(expression)
+ headwords = {"": expressions}
+ return headwords
diff --git a/bot/entries/daijirin2/phrase_entry.py b/bot/entries/daijirin2/phrase_entry.py
new file mode 100644
index 0000000..0470d7d
--- /dev/null
+++ b/bot/entries/daijirin2/phrase_entry.py
@@ -0,0 +1,67 @@
+import re
+
+import bot.entries.base.expressions as Expressions
+from bot.data import load_phrase_readings
+from bot.entries.daijirin2.base_entry import BaseEntry
+
+
+class PhraseEntry(BaseEntry):
+ def get_part_of_speech_tags(self):
+ # phrases do not contain these tags
+ return []
+
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ headwords = {}
+ expressions = self._find_expressions(soup)
+ readings = self._find_readings()
+ for idx, expression in enumerate(expressions):
+ reading = readings[idx]
+ if reading in headwords:
+ headwords[reading].append(expression)
+ else:
+ headwords[reading] = [expression]
+ return headwords
+
+ def _find_expressions(self, soup):
+ self._delete_unused_nodes(soup)
+ text = soup.find("句表記").text
+ text = self._clean_expression(text)
+ alternatives = parse_phrase(text)
+ expressions = []
+ for alt in alternatives:
+ for exp in Expressions.expand_abbreviation(alt):
+ expressions.append(exp)
+ return expressions
+
+ def _find_readings(self):
+ phrase_readings = load_phrase_readings(self.target)
+ text = phrase_readings[self.entry_id]
+ alternatives = parse_phrase(text)
+ readings = []
+ for alt in alternatives:
+ for reading in Expressions.expand_abbreviation(alt):
+ readings.append(reading)
+ return readings
+
+
+def parse_phrase(text):
+    """Return a list of strings described by ＝ notation."""
+    group_pattern = r"([^＝]+)(＝([^（]+)（＝([^）]+)）)?"
+ groups = re.findall(group_pattern, text)
+ expressions = [""]
+ for group in groups:
+ new_exps = []
+ for expression in expressions:
+ new_exps.append(expression + group[0])
+ expressions = new_exps.copy()
+ if group[1] == "":
+ continue
+ new_exps = []
+ for expression in expressions:
+ new_exps.append(expression + group[2])
+ for expression in expressions:
+ for alt in group[3].split("・"):
+ new_exps.append(expression + alt)
+ expressions = new_exps.copy()
+ return expressions
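
The ＝-notation expansion moves here from expressions.py (formerly expand_daijirin_alternatives); as the new tests/test_daijirin_phrases.py below exercises, each ＝ group multiplies the candidate expressions. A minimal usage sketch:

    from bot.entries.daijirin2.phrase_entry import parse_phrase

    # "＝A（＝B・C）" expands to A plus each bracketed alternative.
    exps = parse_phrase("同じ穴の＝狢（＝狐・狸）")
    assert exps == ["同じ穴の狢", "同じ穴の狐", "同じ穴の狸"]
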
diff --git a/bot/entries/daijirin2_preprocess.py b/bot/entries/daijirin2/preprocess.py
similarity index 100%
rename from bot/entries/daijirin2_preprocess.py
rename to bot/entries/daijirin2/preprocess.py
diff --git a/bot/entries/factory.py b/bot/entries/factory.py
index 162c102..594762f 100644
--- a/bot/entries/factory.py
+++ b/bot/entries/factory.py
@@ -1,20 +1,7 @@
-from bot.targets import Targets
-
-from bot.entries.jitenon import JitenonKokugoEntry
-from bot.entries.jitenon import JitenonYojiEntry
-from bot.entries.jitenon import JitenonKotowazaEntry
-from bot.entries.smk8 import Smk8Entry
-from bot.entries.daijirin2 import Daijirin2Entry
-from bot.entries.sankoku8 import Sankoku8Entry
+import importlib
def new_entry(target, page_id):
- entry_map = {
- Targets.JITENON_KOKUGO: JitenonKokugoEntry,
- Targets.JITENON_YOJI: JitenonYojiEntry,
- Targets.JITENON_KOTOWAZA: JitenonKotowazaEntry,
- Targets.SMK8: Smk8Entry,
- Targets.DAIJIRIN2: Daijirin2Entry,
- Targets.SANKOKU8: Sankoku8Entry,
- }
- return entry_map[target](target, page_id)
+ module_path = f"bot.entries.{target.name.lower()}.entry"
+ module = importlib.import_module(module_path)
+ return module.Entry(target, page_id)
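
The rewritten factory resolves the entry class from the Targets enum name alone, so the per-dictionary imports become unnecessary. A sketch of the convention it relies on (the page ID is made up):

    from bot.targets import Targets
    from bot.entries.factory import new_entry

    # Targets.DAIJIRIN2.name.lower() == "daijirin2", so this loads
    # bot.entries.daijirin2.entry and instantiates its Entry class.
    entry = new_entry(Targets.DAIJIRIN2, 12345)

Adding a new dictionary now only requires a bot/entries/<target>/entry.py defining a class named Entry; factory.py itself no longer needs editing.
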
diff --git a/bot/entries/jitenon_kokugo/entry.py b/bot/entries/jitenon_kokugo/entry.py
new file mode 100644
index 0000000..523ac63
--- /dev/null
+++ b/bot/entries/jitenon_kokugo/entry.py
@@ -0,0 +1,45 @@
+from bot.entries.base.jitenon_entry import JitenonEntry
+import bot.entries.base.expressions as Expressions
+
+
+class Entry(JitenonEntry):
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.example = ""
+ self.alt_expression = ""
+ self.antonym = ""
+ self.attachments = ""
+ self.compounds = ""
+ self.related_words = ""
+
+ def _get_column_map(self):
+ return {
+ "言葉": "expression",
+ "読み方": "yomikata",
+ "意味": "definition",
+ "例文": "example",
+ "別表記": "alt_expression",
+ "対義語": "antonym",
+ "活用": "attachments",
+ "用例": "compounds",
+ "類語": "related_words",
+ }
+
+ def _get_headwords(self):
+ headwords = {}
+ for reading in self.yomikata.split("・"):
+ if reading not in headwords:
+ headwords[reading] = []
+ for expression in self.expression.split("・"):
+ headwords[reading].append(expression)
+ if self.alt_expression.strip() != "":
+ for expression in self.alt_expression.split("・"):
+ headwords[reading].append(expression)
+ return headwords
+
+ def _add_variant_expressions(self, headwords):
+ for expressions in headwords.values():
+ Expressions.add_variant_kanji(expressions)
+ Expressions.add_fullwidth(expressions)
+ Expressions.remove_iteration_mark(expressions)
+ Expressions.add_iteration_mark(expressions)
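
Here _get_headwords fans every reading out over every surface form, including the alternative spellings; a minimal sketch of that mapping with hypothetical field values:

    # Hypothetical jitenon-kokugo field values, for illustration only.
    yomikata = "あいそ・あいそう"
    expression = "愛想"
    headwords = {
        reading: expression.split("・")
        for reading in yomikata.split("・")
    }
    assert headwords == {"あいそ": ["愛想"], "あいそう": ["愛想"]}
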
diff --git a/bot/entries/jitenon_kotowaza/entry.py b/bot/entries/jitenon_kotowaza/entry.py
new file mode 100644
index 0000000..71dc35f
--- /dev/null
+++ b/bot/entries/jitenon_kotowaza/entry.py
@@ -0,0 +1,35 @@
+from bot.entries.base.jitenon_entry import JitenonEntry
+import bot.entries.base.expressions as Expressions
+
+
+class Entry(JitenonEntry):
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.origin = ""
+ self.example = ""
+ self.related_expressions = []
+
+ def _get_column_map(self):
+ return {
+ "言葉": "expression",
+ "読み方": "yomikata",
+ "意味": "definition",
+ "異形": "other_forms",
+ "出典": "origin",
+ "例文": "example",
+ "類句": "related_expressions",
+ }
+
+ def _get_headwords(self):
+ if self.expression == "金棒引き・鉄棒引き":
+ headwords = {
+ "かなぼうひき": ["金棒引き", "鉄棒引き"]
+ }
+ else:
+ headwords = super()._get_headwords()
+ return headwords
+
+ def _add_variant_expressions(self, headwords):
+ for expressions in headwords.values():
+ Expressions.add_variant_kanji(expressions)
+ Expressions.add_fullwidth(expressions)
diff --git a/bot/entries/jitenon_yoji/entry.py b/bot/entries/jitenon_yoji/entry.py
new file mode 100644
index 0000000..e0e8b13
--- /dev/null
+++ b/bot/entries/jitenon_yoji/entry.py
@@ -0,0 +1,27 @@
+import bot.entries.base.expressions as Expressions
+from bot.entries.base.jitenon_entry import JitenonEntry
+
+
+class Entry(JitenonEntry):
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.origin = ""
+ self.kanken_level = ""
+ self.category = ""
+ self.related_expressions = []
+
+ def _get_column_map(self):
+ return {
+ "四字熟語": "expression",
+ "読み方": "yomikata",
+ "意味": "definition",
+ "異形": "other_forms",
+ "出典": "origin",
+ "漢検級": "kanken_level",
+ "場面用途": "category",
+ "類義語": "related_expressions",
+ }
+
+ def _add_variant_expressions(self, headwords):
+ for expressions in headwords.values():
+ Expressions.add_variant_kanji(expressions)
diff --git a/bot/entries/sankoku8.py b/bot/entries/sankoku8.py
deleted file mode 100644
index 9653f68..0000000
--- a/bot/entries/sankoku8.py
+++ /dev/null
@@ -1,260 +0,0 @@
-from bs4 import BeautifulSoup
-import bot.entries.expressions as Expressions
-import bot.soup as Soup
-from bot.entries.entry import Entry
-from bot.data import load_phrase_readings
-from bot.entries.sankoku8_preprocess import preprocess_page
-
-
-class _BaseSankoku8Entry(Entry):
- def __init__(self, target, entry_id):
- super().__init__(target, entry_id)
- self.children = []
- self.phrases = []
- self._hyouki_name = "表記"
- self._midashi_name = None
- self._midashi_kana_name = None
-
- def get_global_identifier(self):
- parent_part = format(self.entry_id[0], '06')
- child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
- return f"@{self.target.value}-{parent_part}-{child_part}"
-
- def set_page(self, page):
- page = self.__decompose_subentries(page)
- self._page = page
-
- def get_page_soup(self):
- soup = BeautifulSoup(self._page, "xml")
- return soup
-
- def _get_headwords(self):
- soup = self.get_page_soup()
- self._delete_unused_nodes(soup)
- readings = self._find_readings(soup)
- expressions = self._find_expressions(soup)
- headwords = {}
- for reading in readings:
- headwords[reading] = []
- if len(readings) == 1:
- reading = readings[0]
- if soup.find(self._midashi_name).find(self._hyouki_name) is None:
- headwords[reading].append(reading)
- for exp in expressions:
- if exp not in headwords[reading]:
- headwords[reading].append(exp)
- elif len(readings) > 1 and len(expressions) == 0:
- for reading in readings:
- headwords[reading].append(reading)
- elif len(readings) > 1 and len(expressions) == 1:
- if soup.find(self._midashi_name).find(self._hyouki_name) is None:
- for reading in readings:
- headwords[reading].append(reading)
- expression = expressions[0]
- for reading in readings:
- if expression not in headwords[reading]:
- headwords[reading].append(expression)
- elif len(readings) > 1 and len(expressions) == len(readings):
- if soup.find(self._midashi_name).find(self._hyouki_name) is None:
- for reading in readings:
- headwords[reading].append(reading)
- for idx, reading in enumerate(readings):
- exp = expressions[idx]
- if exp not in headwords[reading]:
- headwords[reading].append(exp)
- else:
- raise Exception() # shouldn't happen
- return headwords
-
- def _add_variant_expressions(self, headwords):
- for expressions in headwords.values():
- Expressions.add_variant_kanji(expressions)
- Expressions.add_fullwidth(expressions)
- Expressions.remove_iteration_mark(expressions)
- Expressions.add_iteration_mark(expressions)
-
- def get_part_of_speech_tags(self):
- if self._part_of_speech_tags is not None:
- return self._part_of_speech_tags
- self._part_of_speech_tags = []
- soup = self.get_page_soup()
- for midashi in soup.find_all([self._midashi_name, "見出部要素"]):
- pos_group = midashi.find("品詞G")
- if pos_group is None:
- continue
- for tag in pos_group.find_all("a"):
- if tag.text not in self._part_of_speech_tags:
- self._part_of_speech_tags.append(tag.text)
- return self._part_of_speech_tags
-
- def _find_expressions(self, soup):
- expressions = []
- for hyouki in soup.find_all(self._hyouki_name):
- for expression in parse_hyouki_soup(hyouki, [""]):
- expressions.append(expression)
- return expressions
-
- def _find_readings(self, soup):
- midasi_kana = soup.find(self._midashi_kana_name)
- readings = parse_hyouki_soup(midasi_kana, [""])
- return readings
-
- def __decompose_subentries(self, page):
- soup = BeautifulSoup(page, features="xml")
- subentry_parameters = [
- [Sankoku8ChildEntry, ["子項目"], self.children],
- [Sankoku8PhraseEntry, ["句項目"], self.phrases],
- ]
- for x in subentry_parameters:
- subentry_class, tags, subentry_list = x
- for tag in tags:
- tag_soup = soup.find(tag)
- while tag_soup is not None:
- tag_soup.name = "項目"
- subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
- self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
- subentry = subentry_class(self.target, subentry_id)
- page = tag_soup.decode()
- subentry.set_page(page)
- subentry_list.append(subentry)
- tag_soup.decompose()
- tag_soup = soup.find(tag)
- return soup.decode()
-
- @staticmethod
- def id_string_to_entry_id(id_string):
- parts = id_string.split("-")
- if len(parts) == 1:
- return (int(parts[0]), 0)
- elif len(parts) == 2:
- # subentries have a hexadecimal part
- return (int(parts[0]), int(parts[1], 16))
- else:
- raise Exception(f"Invalid entry ID: {id_string}")
-
- @staticmethod
- def _delete_unused_nodes(soup):
- """Remove extra markup elements that appear in the entry
- headword line which are not part of the entry headword"""
- unused_nodes = [
- "語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク",
- "アクセント分節", "活用分節", "ルビG", "分書"
- ]
- for name in unused_nodes:
- Soup.delete_soup_nodes(soup, name)
-
-
-class Sankoku8Entry(_BaseSankoku8Entry):
- def __init__(self, target, page_id):
- entry_id = (page_id, 0)
- super().__init__(target, entry_id)
- self._midashi_name = "見出部"
- self._midashi_kana_name = "見出仮名"
-
- def set_page(self, page):
- page = preprocess_page(page)
- super().set_page(page)
-
-
-class Sankoku8ChildEntry(_BaseSankoku8Entry):
- def __init__(self, target, page_id):
- super().__init__(target, page_id)
- self._midashi_name = "子見出部"
- self._midashi_kana_name = "子見出仮名"
-
-
-class Sankoku8PhraseEntry(_BaseSankoku8Entry):
- def get_part_of_speech_tags(self):
- # phrases do not contain these tags
- return []
-
- def _get_headwords(self):
- soup = self.get_page_soup()
- self._delete_unused_nodes(soup)
- expressions = self._find_expressions(soup)
- readings = self._find_readings(soup)
- headwords = {}
- if len(expressions) != len(readings):
- raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}")
- for idx, expression in enumerate(expressions):
- reading = readings[idx]
- if reading in headwords:
- headwords[reading].append(expression)
- else:
- headwords[reading] = [expression]
- return headwords
-
- def _find_expressions(self, soup):
- phrase_soup = soup.find("句表記")
- expressions = parse_hyouki_soup(phrase_soup, [""])
- return expressions
-
- def _find_readings(self, soup):
- reading_patterns = load_phrase_readings(self.target)
- reading_pattern = reading_patterns[self.entry_id]
- readings = parse_hyouki_pattern(reading_pattern)
- return readings
-
-
-def parse_hyouki_soup(soup, base_exps):
- omitted_characters = [
- "/", "〈", "〉", "(", ")", "⦅", "⦆", ":", "…"
- ]
- exps = base_exps.copy()
- for child in soup.children:
- new_exps = []
- if child.name == "言換G":
- for alt in child.find_all("言換"):
- parts = parse_hyouki_soup(alt, [""])
- for exp in exps:
- for part in parts:
- new_exps.append(exp + part)
- elif child.name == "補足表記":
- alt1 = child.find("表記対象")
- alt2 = child.find("表記内容G")
- parts1 = parse_hyouki_soup(alt1, [""])
- parts2 = parse_hyouki_soup(alt2, [""])
- for exp in exps:
- for part in parts1:
- new_exps.append(exp + part)
- for part in parts2:
- new_exps.append(exp + part)
- elif child.name == "省略":
- parts = parse_hyouki_soup(child, [""])
- for exp in exps:
- new_exps.append(exp)
- for part in parts:
- new_exps.append(exp + part)
- elif child.name is not None:
- new_exps = parse_hyouki_soup(child, exps)
- else:
- text = child.text
- for char in omitted_characters:
- text = text.replace(char, "")
- for exp in exps:
- new_exps.append(exp + text)
- exps = new_exps.copy()
- return exps
-
-
-def parse_hyouki_pattern(pattern):
-    replacements = {
-        "(": "<省略>(",
-        ")": ")</省略>",
-        "{": "<補足表記><表記対象>",
-        "・": "</表記対象><表記内容G>(<表記内容>",
-        "}": "</表記内容>)</表記内容G></補足表記>",
-        "〈": "<言換G>〈<言換>",
-        "/": "</言換>/<言換>",
-        "〉": "</言換>〉</言換G>",
-        "⦅": "<補足表記><表記対象>",
-        "\\": "</表記対象><表記内容G>⦅<表記内容>",
-        "⦆": "</表記内容>⦆</表記内容G></補足表記>",
-    }
-    markup = f"<span>{pattern}</span>"
-    for key, val in replacements.items():
-        markup = markup.replace(key, val)
-    soup = BeautifulSoup(markup, "xml")
-    hyouki_soup = soup.find("span")
-    exps = parse_hyouki_soup(hyouki_soup, [""])
-    return exps
diff --git a/bot/entries/sankoku8/base_entry.py b/bot/entries/sankoku8/base_entry.py
new file mode 100644
index 0000000..93c0515
--- /dev/null
+++ b/bot/entries/sankoku8/base_entry.py
@@ -0,0 +1,97 @@
+import bot.soup as Soup
+from bot.entries.base.sanseido_entry import SanseidoEntry
+from bot.entries.sankoku8.parse import parse_hyouki_soup
+
+
+class BaseEntry(SanseidoEntry):
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.children = []
+ self.phrases = []
+ self._hyouki_name = "表記"
+ self._midashi_name = None
+ self._midashi_kana_name = None
+
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ self._delete_unused_nodes(soup)
+ readings = self._find_readings(soup)
+ expressions = self._find_expressions(soup)
+ headwords = {}
+ for reading in readings:
+ headwords[reading] = []
+ if len(readings) == 1:
+ reading = readings[0]
+ if soup.find(self._midashi_name).find(self._hyouki_name) is None:
+ headwords[reading].append(reading)
+ for exp in expressions:
+ if exp not in headwords[reading]:
+ headwords[reading].append(exp)
+ elif len(readings) > 1 and len(expressions) == 0:
+ for reading in readings:
+ headwords[reading].append(reading)
+ elif len(readings) > 1 and len(expressions) == 1:
+ if soup.find(self._midashi_name).find(self._hyouki_name) is None:
+ for reading in readings:
+ headwords[reading].append(reading)
+ expression = expressions[0]
+ for reading in readings:
+ if expression not in headwords[reading]:
+ headwords[reading].append(expression)
+ elif len(readings) > 1 and len(expressions) == len(readings):
+ if soup.find(self._midashi_name).find(self._hyouki_name) is None:
+ for reading in readings:
+ headwords[reading].append(reading)
+ for idx, reading in enumerate(readings):
+ exp = expressions[idx]
+ if exp not in headwords[reading]:
+ headwords[reading].append(exp)
+ else:
+ raise Exception() # shouldn't happen
+ return headwords
+
+ def get_part_of_speech_tags(self):
+ if self._part_of_speech_tags is not None:
+ return self._part_of_speech_tags
+ self._part_of_speech_tags = []
+ soup = self.get_page_soup()
+ for midashi in soup.find_all([self._midashi_name, "見出部要素"]):
+ pos_group = midashi.find("品詞G")
+ if pos_group is None:
+ continue
+ for tag in pos_group.find_all("a"):
+ if tag.text not in self._part_of_speech_tags:
+ self._part_of_speech_tags.append(tag.text)
+ return self._part_of_speech_tags
+
+ def _find_expressions(self, soup):
+ expressions = []
+ for hyouki in soup.find_all(self._hyouki_name):
+ for expression in parse_hyouki_soup(hyouki, [""]):
+ expressions.append(expression)
+ return expressions
+
+ def _find_readings(self, soup):
+ midasi_kana = soup.find(self._midashi_kana_name)
+ readings = parse_hyouki_soup(midasi_kana, [""])
+ return readings
+
+ def _get_subentry_parameters(self):
+ from bot.entries.sankoku8.child_entry import ChildEntry
+ from bot.entries.sankoku8.phrase_entry import PhraseEntry
+ subentry_parameters = [
+ [ChildEntry, ["子項目"], self.children],
+ [PhraseEntry, ["句項目"], self.phrases],
+ ]
+ return subentry_parameters
+
+ @staticmethod
+ def _delete_unused_nodes(soup):
+ """Remove extra markup elements that appear in the entry
+ headword line which are not part of the entry headword"""
+ unused_nodes = [
+ "語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク",
+ "アクセント分節", "活用分節", "ルビG", "分書"
+ ]
+ for name in unused_nodes:
+ Soup.delete_soup_nodes(soup, name)
diff --git a/bot/entries/sankoku8/child_entry.py b/bot/entries/sankoku8/child_entry.py
new file mode 100644
index 0000000..9f6b1c1
--- /dev/null
+++ b/bot/entries/sankoku8/child_entry.py
@@ -0,0 +1,8 @@
+from bot.entries.sankoku8.base_entry import BaseEntry
+
+
+class ChildEntry(BaseEntry):
+ def __init__(self, target, page_id):
+ super().__init__(target, page_id)
+ self._midashi_name = "子見出部"
+ self._midashi_kana_name = "子見出仮名"
diff --git a/bot/entries/sankoku8/entry.py b/bot/entries/sankoku8/entry.py
new file mode 100644
index 0000000..533ac66
--- /dev/null
+++ b/bot/entries/sankoku8/entry.py
@@ -0,0 +1,14 @@
+from bot.entries.sankoku8.base_entry import BaseEntry
+from bot.entries.sankoku8.preprocess import preprocess_page
+
+
+class Entry(BaseEntry):
+ def __init__(self, target, page_id):
+ entry_id = (page_id, 0)
+ super().__init__(target, entry_id)
+ self._midashi_name = "見出部"
+ self._midashi_kana_name = "見出仮名"
+
+ def set_page(self, page):
+ page = preprocess_page(page)
+ super().set_page(page)
diff --git a/bot/entries/sankoku8/parse.py b/bot/entries/sankoku8/parse.py
new file mode 100644
index 0000000..a57574b
--- /dev/null
+++ b/bot/entries/sankoku8/parse.py
@@ -0,0 +1,65 @@
+from bs4 import BeautifulSoup
+
+
+def parse_hyouki_soup(soup, base_exps):
+ omitted_characters = [
+ "/", "〈", "〉", "(", ")", "⦅", "⦆", ":", "…"
+ ]
+ exps = base_exps.copy()
+ for child in soup.children:
+ new_exps = []
+ if child.name == "言換G":
+ for alt in child.find_all("言換"):
+ parts = parse_hyouki_soup(alt, [""])
+ for exp in exps:
+ for part in parts:
+ new_exps.append(exp + part)
+ elif child.name == "補足表記":
+ alt1 = child.find("表記対象")
+ alt2 = child.find("表記内容G")
+ parts1 = parse_hyouki_soup(alt1, [""])
+ parts2 = parse_hyouki_soup(alt2, [""])
+ for exp in exps:
+ for part in parts1:
+ new_exps.append(exp + part)
+ for part in parts2:
+ new_exps.append(exp + part)
+ elif child.name == "省略":
+ parts = parse_hyouki_soup(child, [""])
+ for exp in exps:
+ new_exps.append(exp)
+ for part in parts:
+ new_exps.append(exp + part)
+ elif child.name is not None:
+ new_exps = parse_hyouki_soup(child, exps)
+ else:
+ text = child.text
+ for char in omitted_characters:
+ text = text.replace(char, "")
+ for exp in exps:
+ new_exps.append(exp + text)
+ exps = new_exps.copy()
+ return exps
+
+
+def parse_hyouki_pattern(pattern):
+    replacements = {
+        "(": "<省略>(",
+        ")": ")</省略>",
+        "{": "<補足表記><表記対象>",
+        "・": "</表記対象><表記内容G>(<表記内容>",
+        "}": "</表記内容>)</表記内容G></補足表記>",
+        "〈": "<言換G>〈<言換>",
+        "/": "</言換>/<言換>",
+        "〉": "</言換>〉</言換G>",
+        "⦅": "<補足表記><表記対象>",
+        "\\": "</表記対象><表記内容G>⦅<表記内容>",
+        "⦆": "</表記内容>⦆</表記内容G></補足表記>",
+    }
+    markup = f"<span>{pattern}</span>"
+    for key, val in replacements.items():
+        markup = markup.replace(key, val)
+    soup = BeautifulSoup(markup, "xml")
+    hyouki_soup = soup.find("span")
+    exps = parse_hyouki_soup(hyouki_soup, [""])
+    return exps
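
parse_hyouki_pattern works by rewriting the bracket notation into the same XML markup that parse_hyouki_soup already understands, then parsing that markup. Per tests/test_sankoku_phrases.py:

    from bot.entries.sankoku8.parse import parse_hyouki_pattern

    # "(...)" marks an omissible segment; the pattern is first rewritten to
    # <span>耳にたこ<省略>(ができる)</省略></span> and then expanded.
    exps = parse_hyouki_pattern("耳にたこ(ができる)")
    assert exps == ["耳にたこ", "耳にたこができる"]
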
diff --git a/bot/entries/sankoku8/phrase_entry.py b/bot/entries/sankoku8/phrase_entry.py
new file mode 100644
index 0000000..e5da208
--- /dev/null
+++ b/bot/entries/sankoku8/phrase_entry.py
@@ -0,0 +1,37 @@
+from bot.data import load_phrase_readings
+from bot.entries.sankoku8.base_entry import BaseEntry
+from bot.entries.sankoku8.parse import parse_hyouki_soup
+from bot.entries.sankoku8.parse import parse_hyouki_pattern
+
+
+class PhraseEntry(BaseEntry):
+ def get_part_of_speech_tags(self):
+ # phrases do not contain these tags
+ return []
+
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ self._delete_unused_nodes(soup)
+ expressions = self._find_expressions(soup)
+ readings = self._find_readings(soup)
+ headwords = {}
+ if len(expressions) != len(readings):
+ raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}")
+ for idx, expression in enumerate(expressions):
+ reading = readings[idx]
+ if reading in headwords:
+ headwords[reading].append(expression)
+ else:
+ headwords[reading] = [expression]
+ return headwords
+
+ def _find_expressions(self, soup):
+ phrase_soup = soup.find("句表記")
+ expressions = parse_hyouki_soup(phrase_soup, [""])
+ return expressions
+
+ def _find_readings(self, soup):
+ reading_patterns = load_phrase_readings(self.target)
+ reading_pattern = reading_patterns[self.entry_id]
+ readings = parse_hyouki_pattern(reading_pattern)
+ return readings
diff --git a/bot/entries/sankoku8_preprocess.py b/bot/entries/sankoku8/preprocess.py
similarity index 100%
rename from bot/entries/sankoku8_preprocess.py
rename to bot/entries/sankoku8/preprocess.py
diff --git a/bot/entries/smk8.py b/bot/entries/smk8.py
deleted file mode 100644
index 2d43e4a..0000000
--- a/bot/entries/smk8.py
+++ /dev/null
@@ -1,221 +0,0 @@
-from bs4 import BeautifulSoup
-
-import bot.entries.expressions as Expressions
-import bot.soup as Soup
-from bot.data import load_phrase_readings
-from bot.entries.entry import Entry
-from bot.entries.smk8_preprocess import preprocess_page
-
-
-class _BaseSmk8Entry(Entry):
- def __init__(self, target, entry_id):
- super().__init__(target, entry_id)
- self.children = []
- self.phrases = []
- self.kanjis = []
-
- def get_global_identifier(self):
- parent_part = format(self.entry_id[0], '06')
- child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
- return f"@{self.target.value}-{parent_part}-{child_part}"
-
- def set_page(self, page):
- page = self.__decompose_subentries(page)
- self._page = page
-
- def get_page_soup(self):
- soup = BeautifulSoup(self._page, "xml")
- return soup
-
- def get_part_of_speech_tags(self):
- if self._part_of_speech_tags is not None:
- return self._part_of_speech_tags
- self._part_of_speech_tags = []
- soup = self.get_page_soup()
- headword_info = soup.find("見出要素")
- if headword_info is None:
- return self._part_of_speech_tags
- for tag in headword_info.find_all("品詞M"):
- if tag.text not in self._part_of_speech_tags:
- self._part_of_speech_tags.append(tag.text)
- return self._part_of_speech_tags
-
- def _add_variant_expressions(self, headwords):
- for expressions in headwords.values():
- Expressions.add_variant_kanji(expressions)
- Expressions.add_fullwidth(expressions)
- Expressions.remove_iteration_mark(expressions)
- Expressions.add_iteration_mark(expressions)
-
- def _find_reading(self, soup):
- midasi_kana = soup.find("見出仮名")
- reading = midasi_kana.text
- for x in [" ", "・"]:
- reading = reading.replace(x, "")
- return reading
-
- def _find_expressions(self, soup):
- clean_expressions = []
- for expression in soup.find_all("標準表記"):
- clean_expression = self._clean_expression(expression.text)
- clean_expressions.append(clean_expression)
- expressions = Expressions.expand_abbreviation_list(clean_expressions)
- return expressions
-
- def __decompose_subentries(self, page):
- soup = BeautifulSoup(page, features="xml")
- subentry_parameters = [
- [Smk8ChildEntry, ["子項目F", "子項目"], self.children],
- [Smk8PhraseEntry, ["句項目F", "句項目"], self.phrases],
- [Smk8KanjiEntry, ["造語成分項目"], self.kanjis],
- ]
- for x in subentry_parameters:
- subentry_class, tags, subentry_list = x
- for tag in tags:
- tag_soup = soup.find(tag)
- while tag_soup is not None:
- tag_soup.name = "項目"
- subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
- self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
- subentry = subentry_class(self.target, subentry_id)
- page = tag_soup.decode()
- subentry.set_page(page)
- subentry_list.append(subentry)
- tag_soup.decompose()
- tag_soup = soup.find(tag)
- return soup.decode()
-
- @staticmethod
- def id_string_to_entry_id(id_string):
- parts = id_string.split("-")
- if len(parts) == 1:
- return (int(parts[0]), 0)
- elif len(parts) == 2:
- # subentries have a hexadecimal part
- return (int(parts[0]), int(parts[1], 16))
- else:
- raise Exception(f"Invalid entry ID: {id_string}")
-
- @staticmethod
- def _delete_unused_nodes(soup):
- """Remove extra markup elements that appear in the entry
- headword line which are not part of the entry headword"""
- unused_nodes = [
- "表音表記", "表外音訓マーク", "表外字マーク", "ルビG"
- ]
- for name in unused_nodes:
- Soup.delete_soup_nodes(soup, name)
-
- @staticmethod
- def _clean_expression(expression):
- for x in ["〈", "〉", "{", "}", "…", " "]:
- expression = expression.replace(x, "")
- return expression
-
- @staticmethod
- def _fill_alts(soup):
- for el in soup.find_all(["親見出仮名", "親見出表記"]):
- el.string = el.attrs["alt"]
- for gaiji in soup.find_all("外字"):
- gaiji.string = gaiji.img.attrs["alt"]
-
-
-class Smk8Entry(_BaseSmk8Entry):
- def __init__(self, target, page_id):
- entry_id = (page_id, 0)
- super().__init__(target, entry_id)
-
- def set_page(self, page):
- page = preprocess_page(page)
- super().set_page(page)
-
- def _get_headwords(self):
- soup = self.get_page_soup()
- self._delete_unused_nodes(soup)
- self._fill_alts(soup)
- reading = self._find_reading(soup)
- expressions = []
- if soup.find("見出部").find("標準表記") is None:
- expressions.append(reading)
- for expression in self._find_expressions(soup):
- if expression not in expressions:
- expressions.append(expression)
- headwords = {reading: expressions}
- return headwords
-
-
-class Smk8ChildEntry(_BaseSmk8Entry):
- def _get_headwords(self):
- soup = self.get_page_soup()
- self._delete_unused_nodes(soup)
- self._fill_alts(soup)
- reading = self._find_reading(soup)
- expressions = []
- if soup.find("子見出部").find("標準表記") is None:
- expressions.append(reading)
- for expression in self._find_expressions(soup):
- if expression not in expressions:
- expressions.append(expression)
- headwords = {reading: expressions}
- return headwords
-
-
-class Smk8PhraseEntry(_BaseSmk8Entry):
- def __init__(self, target, entry_id):
- super().__init__(target, entry_id)
- self.__phrase_readings = load_phrase_readings(self.target)
-
- def get_part_of_speech_tags(self):
- # phrases do not contain these tags
- return []
-
- def _get_headwords(self):
- soup = self.get_page_soup()
- headwords = {}
- expressions = self._find_expressions(soup)
- readings = self._find_readings()
- for idx, expression in enumerate(expressions):
- reading = readings[idx]
- if reading in headwords:
- headwords[reading].append(expression)
- else:
- headwords[reading] = [expression]
- return headwords
-
- def _find_expressions(self, soup):
- self._delete_unused_nodes(soup)
- self._fill_alts(soup)
- text = soup.find("標準表記").text
- text = self._clean_expression(text)
- alternatives = Expressions.expand_smk_alternatives(text)
- expressions = []
- for alt in alternatives:
- for exp in Expressions.expand_abbreviation(alt):
- expressions.append(exp)
- return expressions
-
- def _find_readings(self):
- text = self.__phrase_readings[self.entry_id]
- alternatives = Expressions.expand_smk_alternatives(text)
- readings = []
- for alt in alternatives:
- for reading in Expressions.expand_abbreviation(alt):
- readings.append(reading)
- return readings
-
-
-class Smk8KanjiEntry(_BaseSmk8Entry):
- def _get_headwords(self):
- soup = self.get_page_soup()
- self._delete_unused_nodes(soup)
- self._fill_alts(soup)
- reading = self.__get_parent_reading()
- expressions = self._find_expressions(soup)
- headwords = {reading: expressions}
- return headwords
-
- def __get_parent_reading(self):
- parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
- parent = self.ID_TO_ENTRY[parent_id]
- reading = parent.get_first_reading()
- return reading
diff --git a/bot/entries/smk8/base_entry.py b/bot/entries/smk8/base_entry.py
new file mode 100644
index 0000000..7bf32c2
--- /dev/null
+++ b/bot/entries/smk8/base_entry.py
@@ -0,0 +1,73 @@
+import bot.soup as Soup
+import bot.entries.base.expressions as Expressions
+from bot.entries.base.sanseido_entry import SanseidoEntry
+
+
+class BaseEntry(SanseidoEntry):
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.children = []
+ self.phrases = []
+ self.kanjis = []
+
+ def get_part_of_speech_tags(self):
+ if self._part_of_speech_tags is not None:
+ return self._part_of_speech_tags
+ self._part_of_speech_tags = []
+ soup = self.get_page_soup()
+ headword_info = soup.find("見出要素")
+ if headword_info is None:
+ return self._part_of_speech_tags
+ for tag in headword_info.find_all("品詞M"):
+ if tag.text not in self._part_of_speech_tags:
+ self._part_of_speech_tags.append(tag.text)
+ return self._part_of_speech_tags
+
+ def _find_reading(self, soup):
+ midasi_kana = soup.find("見出仮名")
+ reading = midasi_kana.text
+ for x in [" ", "・"]:
+ reading = reading.replace(x, "")
+ return reading
+
+ def _find_expressions(self, soup):
+ clean_expressions = []
+ for expression in soup.find_all("標準表記"):
+ clean_expression = self._clean_expression(expression.text)
+ clean_expressions.append(clean_expression)
+ expressions = Expressions.expand_abbreviation_list(clean_expressions)
+ return expressions
+
+ def _get_subentry_parameters(self):
+ from bot.entries.smk8.child_entry import ChildEntry
+ from bot.entries.smk8.phrase_entry import PhraseEntry
+ from bot.entries.smk8.kanji_entry import KanjiEntry
+ subentry_parameters = [
+ [ChildEntry, ["子項目F", "子項目"], self.children],
+ [PhraseEntry, ["句項目F", "句項目"], self.phrases],
+ [KanjiEntry, ["造語成分項目"], self.kanjis],
+ ]
+ return subentry_parameters
+
+ @staticmethod
+ def _delete_unused_nodes(soup):
+ """Remove extra markup elements that appear in the entry
+ headword line which are not part of the entry headword"""
+ unused_nodes = [
+ "表音表記", "表外音訓マーク", "表外字マーク", "ルビG"
+ ]
+ for name in unused_nodes:
+ Soup.delete_soup_nodes(soup, name)
+
+ @staticmethod
+ def _clean_expression(expression):
+ for x in ["〈", "〉", "{", "}", "…", " "]:
+ expression = expression.replace(x, "")
+ return expression
+
+ @staticmethod
+ def _fill_alts(soup):
+ for elm in soup.find_all(["親見出仮名", "親見出表記"]):
+ elm.string = elm.attrs["alt"]
+ for gaiji in soup.find_all("外字"):
+ gaiji.string = gaiji.img.attrs["alt"]
diff --git a/bot/entries/smk8/child_entry.py b/bot/entries/smk8/child_entry.py
new file mode 100644
index 0000000..0dbe375
--- /dev/null
+++ b/bot/entries/smk8/child_entry.py
@@ -0,0 +1,17 @@
+from bot.entries.smk8.base_entry import BaseEntry
+
+
+class ChildEntry(BaseEntry):
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ self._delete_unused_nodes(soup)
+ self._fill_alts(soup)
+ reading = self._find_reading(soup)
+ expressions = []
+ if soup.find("子見出部").find("標準表記") is None:
+ expressions.append(reading)
+ for expression in self._find_expressions(soup):
+ if expression not in expressions:
+ expressions.append(expression)
+ headwords = {reading: expressions}
+ return headwords
diff --git a/bot/entries/smk8/entry.py b/bot/entries/smk8/entry.py
new file mode 100644
index 0000000..4baed42
--- /dev/null
+++ b/bot/entries/smk8/entry.py
@@ -0,0 +1,26 @@
+from bot.entries.smk8.base_entry import BaseEntry
+from bot.entries.smk8.preprocess import preprocess_page
+
+
+class Entry(BaseEntry):
+ def __init__(self, target, page_id):
+ entry_id = (page_id, 0)
+ super().__init__(target, entry_id)
+
+ def set_page(self, page):
+ page = preprocess_page(page)
+ super().set_page(page)
+
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ self._delete_unused_nodes(soup)
+ self._fill_alts(soup)
+ reading = self._find_reading(soup)
+ expressions = []
+ if soup.find("見出部").find("標準表記") is None:
+ expressions.append(reading)
+ for expression in self._find_expressions(soup):
+ if expression not in expressions:
+ expressions.append(expression)
+ headwords = {reading: expressions}
+ return headwords
diff --git a/bot/entries/smk8/kanji_entry.py b/bot/entries/smk8/kanji_entry.py
new file mode 100644
index 0000000..3e77faf
--- /dev/null
+++ b/bot/entries/smk8/kanji_entry.py
@@ -0,0 +1,22 @@
+from bot.entries.smk8.base_entry import BaseEntry
+
+
+class KanjiEntry(BaseEntry):
+ def get_part_of_speech_tags(self):
+ # kanji entries do not contain these tags
+ return []
+
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ self._delete_unused_nodes(soup)
+ self._fill_alts(soup)
+ reading = self.__get_parent_reading()
+ expressions = self._find_expressions(soup)
+ headwords = {reading: expressions}
+ return headwords
+
+ def __get_parent_reading(self):
+ parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
+ parent = self.ID_TO_ENTRY[parent_id]
+ reading = parent.get_first_reading()
+ return reading
diff --git a/bot/entries/smk8/phrase_entry.py b/bot/entries/smk8/phrase_entry.py
new file mode 100644
index 0000000..aac9b84
--- /dev/null
+++ b/bot/entries/smk8/phrase_entry.py
@@ -0,0 +1,64 @@
+import re
+
+import bot.entries.base.expressions as Expressions
+from bot.data import load_phrase_readings
+from bot.entries.smk8.base_entry import BaseEntry
+
+
+class PhraseEntry(BaseEntry):
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.__phrase_readings = load_phrase_readings(self.target)
+
+ def get_part_of_speech_tags(self):
+ # phrase entries do not contain these tags
+ return []
+
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ headwords = {}
+ expressions = self._find_expressions(soup)
+ readings = self._find_readings()
+ for idx, expression in enumerate(expressions):
+ reading = readings[idx]
+ if reading in headwords:
+ headwords[reading].append(expression)
+ else:
+ headwords[reading] = [expression]
+ return headwords
+
+ def _find_expressions(self, soup):
+ self._delete_unused_nodes(soup)
+ self._fill_alts(soup)
+ text = soup.find("標準表記").text
+ text = self._clean_expression(text)
+ alternatives = parse_phrase(text)
+ expressions = []
+ for alt in alternatives:
+ for exp in Expressions.expand_abbreviation(alt):
+ expressions.append(exp)
+ return expressions
+
+ def _find_readings(self):
+ text = self.__phrase_readings[self.entry_id]
+ alternatives = parse_phrase(text)
+ readings = []
+ for alt in alternatives:
+ for reading in Expressions.expand_abbreviation(alt):
+ readings.append(reading)
+ return readings
+
+
+def parse_phrase(text):
+ """Return a list of strings described by △ notation."""
+    match = re.search(r"△([^（]+)（([^）]+)）", text)
+ if match is None:
+ return [text]
+ alt_parts = [match.group(1)]
+ for alt_part in match.group(2).split("・"):
+ alt_parts.append(alt_part)
+ alts = []
+ for alt_part in alt_parts:
+        alt_exp = re.sub(r"△[^（]+（[^）]+）", alt_part, text)
+ alts.append(alt_exp)
+ return alts
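
Likewise, the △-notation expansion (formerly expand_smk_alternatives in expressions.py) now lives next to the only class that uses it; per the new tests/test_smk_phrases.py below:

    from bot.entries.smk8.phrase_entry import parse_phrase

    # "△A（B・C）" yields one expression per alternative headword part.
    exps = parse_phrase("目と鼻の△先（間）")
    assert exps == ["目と鼻の先", "目と鼻の間"]
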
diff --git a/bot/entries/smk8_preprocess.py b/bot/entries/smk8/preprocess.py
similarity index 100%
rename from bot/entries/smk8_preprocess.py
rename to bot/entries/smk8/preprocess.py
diff --git a/bot/yomichan/terms/daijirin2.py b/bot/yomichan/terms/daijirin2.py
index 10aaa76..281fac4 100644
--- a/bot/yomichan/terms/daijirin2.py
+++ b/bot/yomichan/terms/daijirin2.py
@@ -1,4 +1,4 @@
-from bot.entries.daijirin2 import Daijirin2PhraseEntry as PhraseEntry
+from bot.entries.daijirin2.phrase_entry import PhraseEntry
from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.daijirin2 import make_glossary
@@ -6,9 +6,6 @@ from bot.yomichan.grammar import sudachi_rules, tags_to_rules
class Daijirin2Terminator(Terminator):
- def __init__(self, target):
- super().__init__(target)
-
def _definition_tags(self, entry):
return ""
diff --git a/bot/yomichan/terms/sankoku8.py b/bot/yomichan/terms/sankoku8.py
index 613f3bb..cff264f 100644
--- a/bot/yomichan/terms/sankoku8.py
+++ b/bot/yomichan/terms/sankoku8.py
@@ -1,4 +1,4 @@
-from bot.entries.sankoku8 import Sankoku8PhraseEntry as PhraseEntry
+from bot.entries.sankoku8.phrase_entry import PhraseEntry
from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.sankoku8 import make_glossary
@@ -6,9 +6,6 @@ from bot.yomichan.grammar import sudachi_rules, tags_to_rules
class Sankoku8Terminator(Terminator):
- def __init__(self, target):
- super().__init__(target)
-
def _definition_tags(self, entry):
return ""
diff --git a/bot/yomichan/terms/smk8.py b/bot/yomichan/terms/smk8.py
index d1e3ca7..766f4a0 100644
--- a/bot/yomichan/terms/smk8.py
+++ b/bot/yomichan/terms/smk8.py
@@ -1,5 +1,5 @@
-from bot.entries.smk8 import Smk8KanjiEntry as KanjiEntry
-from bot.entries.smk8 import Smk8PhraseEntry as PhraseEntry
+from bot.entries.smk8.kanji_entry import KanjiEntry
+from bot.entries.smk8.phrase_entry import PhraseEntry
from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.smk8 import make_glossary
diff --git a/tests/test_daijirin_phrases.py b/tests/test_daijirin_phrases.py
new file mode 100644
index 0000000..3ab02dd
--- /dev/null
+++ b/tests/test_daijirin_phrases.py
@@ -0,0 +1,21 @@
+import unittest
+from bot.entries.daijirin2.phrase_entry import parse_phrase
+
+
+class TestDaijirin2PhraseParse(unittest.TestCase):
+ def test1(self):
+        text = "同じ穴の＝狢（＝狐・狸）"
+ exps = parse_phrase(text)
+ self.assertEqual(len(exps), 3)
+ self.assertIn("同じ穴の狢", exps)
+ self.assertIn("同じ穴の狐", exps)
+ self.assertIn("同じ穴の狸", exps)
+
+ def test2(self):
+        text = "聞くは＝一時（＝一旦）の恥、聞かぬは＝末代（＝一生）の恥"
+ exps = parse_phrase(text)
+ self.assertEqual(len(exps), 4)
+ self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps)
+ self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps)
+ self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps)
+ self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps)
diff --git a/tests/test_expressions.py b/tests/test_expressions.py
index b2ebc26..5d90ce1 100644
--- a/tests/test_expressions.py
+++ b/tests/test_expressions.py
@@ -1,5 +1,5 @@
import unittest
-import bot.entries.expressions as Expressions
+import bot.entries.base.expressions as Expressions
class TestExpressions(unittest.TestCase):
@@ -69,28 +69,3 @@ class TestExpressions(unittest.TestCase):
self.assertIn("有合わせ", abbrs)
self.assertIn("有り合せ", abbrs)
self.assertIn("有合せ", abbrs)
-
- def test_smk_expand_alternatives(self):
-        text = "△金（時間・暇）に飽かして"
- exps = Expressions.expand_smk_alternatives(text)
- self.assertEqual(len(exps), 3)
- self.assertIn("金に飽かして", exps)
- self.assertIn("時間に飽かして", exps)
- self.assertIn("暇に飽かして", exps)
-
- def test_daijirin_expand_alternatives(self):
-        text = "同じ穴の＝狢（＝狐・狸）"
- exps = Expressions.expand_daijirin_alternatives(text)
- self.assertEqual(len(exps), 3)
- self.assertIn("同じ穴の狢", exps)
- self.assertIn("同じ穴の狐", exps)
- self.assertIn("同じ穴の狸", exps)
-
- def test_daijirin_expand_alternatives2(self):
-        text = "聞くは＝一時（＝一旦）の恥、聞かぬは＝末代（＝一生）の恥"
- exps = Expressions.expand_daijirin_alternatives(text)
- self.assertEqual(len(exps), 4)
- self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps)
- self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps)
- self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps)
- self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps)
diff --git a/tests/test_sankoku_phrases.py b/tests/test_sankoku_phrases.py
index 7faf289..c3894e9 100644
--- a/tests/test_sankoku_phrases.py
+++ b/tests/test_sankoku_phrases.py
@@ -1,16 +1,16 @@
import unittest
-from bot.entries.sankoku8 import parse_hyouki_pattern
+from bot.entries.sankoku8.parse import parse_hyouki_pattern
-class TestSankokuPhrases(unittest.TestCase):
- def test_sankoku_phrases1(self):
+class TestSankoku8PhraseParse(unittest.TestCase):
+ def test1(self):
pattern = '耳にたこ(ができる)'
exps = parse_hyouki_pattern(pattern)
self.assertEqual(len(exps), 2)
self.assertIn("耳にたこ", exps)
self.assertIn("耳にたこができる", exps)
- def test_sankoku_phrases2(self):
+ def test2(self):
pattern = '一斑を〈見て/もって〉全豹を〈卜す/推す〉'
exps = parse_hyouki_pattern(pattern)
self.assertEqual(len(exps), 4)
@@ -19,14 +19,14 @@ class TestSankokuPhrases(unittest.TestCase):
self.assertIn("一斑をもって全豹を卜す", exps)
self.assertIn("一斑をもって全豹を推す", exps)
- def test_sankoku_phrases3(self):
+ def test3(self):
pattern = '{かじ・舵}を切る'
exps = parse_hyouki_pattern(pattern)
self.assertEqual(len(exps), 2)
self.assertIn("かじを切る", exps)
self.assertIn("舵を切る", exps)
- def test_sankoku_phrases4(self):
+ def test4(self):
        pattern = '重箱の隅を(⦅ようじ\\楊枝⦆で)〈つつく/ほじくる〉'
exps = parse_hyouki_pattern(pattern)
self.assertEqual(len(exps), 6)
@@ -37,7 +37,7 @@ class TestSankokuPhrases(unittest.TestCase):
self.assertIn("重箱の隅をようじでほじくる", exps)
self.assertIn("重箱の隅を楊枝でほじくる", exps)
- def test_sankoku_phrases5(self):
+ def test5(self):
pattern = '群盲象を〈{な・撫}でる/評する〉'
exps = parse_hyouki_pattern(pattern)
self.assertEqual(len(exps), 3)
diff --git a/tests/test_smk_phrases.py b/tests/test_smk_phrases.py
new file mode 100644
index 0000000..e5ce231
--- /dev/null
+++ b/tests/test_smk_phrases.py
@@ -0,0 +1,19 @@
+import unittest
+from bot.entries.smk8.phrase_entry import parse_phrase
+
+
+class TestSmk8PhraseParse(unittest.TestCase):
+ def test1(self):
+        text = "目と鼻の△先（間）"
+ exps = parse_phrase(text)
+ self.assertEqual(len(exps), 2)
+ self.assertIn("目と鼻の先", exps)
+ self.assertIn("目と鼻の間", exps)
+
+ def test2(self):
+        text = "△金（時間・暇）に飽かして"
+ exps = parse_phrase(text)
+ self.assertEqual(len(exps), 3)
+ self.assertIn("金に飽かして", exps)
+ self.assertIn("時間に飽かして", exps)
+ self.assertIn("暇に飽かして", exps)