diff --git a/TODO.md b/TODO.md index 877a7ee..4aaadee 100644 --- a/TODO.md +++ b/TODO.md @@ -1,7 +1,7 @@ ### Todo - [x] Add factory classes to reduce the amount of class import statements -- [ ] Add dynamic import functionality to factory classes to reduce boilerplate +- [x] Add dynamic import functionality to factory classes to reduce boilerplate - [x] Support exporting to MDict (.MDX) dictionary format - [x] Validate JSON schema of Yomichan terms during export - [ ] Add support for monokakido search keys from index files diff --git a/bot/crawlers/base/crawler.py b/bot/crawlers/base/crawler.py new file mode 100644 index 0000000..bbbcb9b --- /dev/null +++ b/bot/crawlers/base/crawler.py @@ -0,0 +1,54 @@ +import re +from abc import ABC, abstractmethod + +from bot.factory import new_entry +from bot.factory import new_yomichan_exporter +from bot.factory import new_mdict_exporter + + +class BaseCrawler(ABC): + def __init__(self, target): + self._target = target + self._page_map = {} + self._entries = [] + self._page_id_pattern = None + + @abstractmethod + def collect_pages(self, page_dir): + raise NotImplementedError + + def read_pages(self): + pages_len = len(self._page_map) + items = self._page_map.items() + for idx, (page_id, page_path) in enumerate(items): + update = f"\tReading page {idx+1}/{pages_len}" + print(update, end='\r', flush=True) + entry = new_entry(self._target, page_id) + with open(page_path, "r", encoding="utf-8") as f: + page = f.read() + try: + entry.set_page(page) + except ValueError as err: + print(err) + print("Try deleting and redownloading file:") + print(f"\t{page_path}\n") + continue + self._entries.append(entry) + print() + + def make_yomichan_dictionary(self, media_dir, validate): + exporter = new_yomichan_exporter(self._target) + exporter.export(self._entries, media_dir, validate) + + def make_mdict_dictionary(self, media_dir, icon_file): + exporter = new_mdict_exporter(self._target) + exporter.export(self._entries, media_dir, icon_file) + + def _parse_page_id(self, page_link): + m = re.search(self._page_id_pattern, page_link) + if m is None: + return None + page_id = int(m.group(1)) + if page_id in self._page_map: + return None + return page_id diff --git a/bot/crawlers/base/jitenon.py b/bot/crawlers/base/jitenon.py new file mode 100644 index 0000000..49e4626 --- /dev/null +++ b/bot/crawlers/base/jitenon.py @@ -0,0 +1,30 @@ +from bs4 import BeautifulSoup + +from bot.time import timestamp +from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper +from bot.crawlers.base.crawler import BaseCrawler + + +class JitenonCrawler(BaseCrawler): + def __init__(self, target): + super().__init__(target) + self._gojuon_url = None + + def collect_pages(self, page_dir): + print(f"{timestamp()} Scraping {self._gojuon_url}") + jitenon = JitenonScraper() + gojuon_doc, _ = jitenon.scrape(self._gojuon_url) + gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") + for gojuon_a in gojuon_soup.select(".kana_area a", href=True): + gojuon_href = gojuon_a['href'] + kana_doc, _ = jitenon.scrape(gojuon_href) + kana_soup = BeautifulSoup(kana_doc, features="html.parser") + for kana_a in kana_soup.select(".word_box a", href=True): + page_link = kana_a['href'] + page_id = self._parse_page_id(page_link) + if page_id is None: + continue + _, page_path = jitenon.scrape(page_link) + self._page_map[page_id] = page_path + pages_len = len(self._page_map) + print(f"\n{timestamp()} Found {pages_len} entry pages") diff --git a/bot/crawlers/base/monokakido.py 
b/bot/crawlers/base/monokakido.py new file mode 100644 index 0000000..ca98545 --- /dev/null +++ b/bot/crawlers/base/monokakido.py @@ -0,0 +1,20 @@ +import os +from bot.time import timestamp +from bot.crawlers.base.crawler import BaseCrawler + + +class MonokakidoCrawler(BaseCrawler): + def __init__(self, target): + super().__init__(target) + self._page_id_pattern = r"^([0-9]+)\.xml$" + + def collect_pages(self, page_dir): + print(f"{timestamp()} Searching for page files in `{page_dir}`") + for pagefile in os.listdir(page_dir): + page_id = self._parse_page_id(pagefile) + if page_id is None or page_id == 0: + continue + path = os.path.join(page_dir, pagefile) + self._page_map[page_id] = path + pages_len = len(self._page_map) + print(f"{timestamp()} Found {pages_len} page files for processing") diff --git a/bot/crawlers/crawlers.py b/bot/crawlers/crawlers.py deleted file mode 100644 index 51e0552..0000000 --- a/bot/crawlers/crawlers.py +++ /dev/null @@ -1,158 +0,0 @@ -import os -import re -from abc import ABC, abstractmethod -from bs4 import BeautifulSoup - -import bot.crawlers.scraper as Scraper -from bot.entries.factory import new_entry -from bot.yomichan.exporters.factory import new_yomi_exporter -from bot.mdict.exporters.factory import new_mdict_exporter - - -class Crawler(ABC): - def __init__(self, target): - self._target = target - self._page_map = {} - self._entries = [] - self._page_id_pattern = None - - @abstractmethod - def collect_pages(self, page_dir): - pass - - def read_pages(self): - pages_len = len(self._page_map) - items = self._page_map.items() - for idx, (page_id, page_path) in enumerate(items): - update = f"Reading page {idx+1}/{pages_len}" - print(update, end='\r', flush=True) - entry = new_entry(self._target, page_id) - with open(page_path, "r", encoding="utf-8") as f: - page = f.read() - try: - entry.set_page(page) - except ValueError as err: - print(err) - print("Try deleting and redownloading file:") - print(f"\t{page_path}\n") - continue - self._entries.append(entry) - print() - - def make_yomichan_dictionary(self, media_dir, validate): - exporter = new_yomi_exporter(self._target) - exporter.export(self._entries, media_dir, validate) - - def make_mdict_dictionary(self, media_dir, icon_file): - exporter = new_mdict_exporter(self._target) - exporter.export(self._entries, media_dir, icon_file) - - def _parse_page_id(self, page_link): - m = re.search(self._page_id_pattern, page_link) - if m is None: - return None - page_id = int(m.group(1)) - if page_id in self._page_map: - return None - return page_id - - -class JitenonKokugoCrawler(Crawler): - def __init__(self, target): - super().__init__(target) - self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php" - self._page_id_pattern = r"word/p([0-9]+)$" - - def collect_pages(self, page_dir): - jitenon = Scraper.Jitenon() - gojuon_doc, _ = jitenon.scrape(self._gojuon_url) - gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") - for gojuon_a in gojuon_soup.select(".kana_area a", href=True): - gojuon_href = gojuon_a['href'] - max_kana_page = 1 - current_kana_page = 1 - while current_kana_page <= max_kana_page: - kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}") - current_kana_page += 1 - kana_soup = BeautifulSoup(kana_doc, features="html.parser") - page_total = kana_soup.find(class_="page_total").text - m = re.search(r"全([0-9]+)件", page_total) - if m: - max_kana_page = int(m.group(1)) - for kana_a in kana_soup.select(".word_box a", href=True): - page_link = kana_a['href'] - page_id = 
self._parse_page_id(page_link) - if page_id is None: - continue - _, page_path = jitenon.scrape(page_link) - self._page_map[page_id] = page_path - pages_len = len(self._page_map) - print(f"Finished scraping {pages_len} pages") - - -class _JitenonCrawler(Crawler): - def __init__(self, target): - super().__init__(target) - self._gojuon_url = None - - def collect_pages(self, page_dir): - print("Scraping jitenon.jp") - jitenon = Scraper.Jitenon() - gojuon_doc, _ = jitenon.scrape(self._gojuon_url) - gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") - for gojuon_a in gojuon_soup.select(".kana_area a", href=True): - gojuon_href = gojuon_a['href'] - kana_doc, _ = jitenon.scrape(gojuon_href) - kana_soup = BeautifulSoup(kana_doc, features="html.parser") - for kana_a in kana_soup.select(".word_box a", href=True): - page_link = kana_a['href'] - page_id = self._parse_page_id(page_link) - if page_id is None: - continue - _, page_path = jitenon.scrape(page_link) - self._page_map[page_id] = page_path - pages_len = len(self._page_map) - print(f"Finished scraping {pages_len} pages") - - -class JitenonYojiCrawler(_JitenonCrawler): - def __init__(self, target): - super().__init__(target) - self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html" - self._page_id_pattern = r"([0-9]+)\.html$" - - -class JitenonKotowazaCrawler(_JitenonCrawler): - def __init__(self, target): - super().__init__(target) - self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php" - self._page_id_pattern = r"([0-9]+)\.php$" - - -class _MonokakidoCrawler(Crawler): - def __init__(self, target): - super().__init__(target) - self._page_id_pattern = r"^([0-9]+)\.xml$" - - def collect_pages(self, page_dir): - print(f"Searching for page files in `{page_dir}`") - for pagefile in os.listdir(page_dir): - page_id = self._parse_page_id(pagefile) - if page_id is None or page_id == 0: - continue - path = os.path.join(page_dir, pagefile) - self._page_map[page_id] = path - pages_len = len(self._page_map) - print(f"Found {pages_len} page files for processing") - - -class Smk8Crawler(_MonokakidoCrawler): - pass - - -class Daijirin2Crawler(_MonokakidoCrawler): - pass - - -class Sankoku8Crawler(_MonokakidoCrawler): - pass diff --git a/bot/crawlers/daijirin2.py b/bot/crawlers/daijirin2.py new file mode 100644 index 0000000..a9c711b --- /dev/null +++ b/bot/crawlers/daijirin2.py @@ -0,0 +1,5 @@ +from bot.crawlers.base.monokakido import MonokakidoCrawler + + +class Crawler(MonokakidoCrawler): + pass diff --git a/bot/crawlers/factory.py b/bot/crawlers/factory.py deleted file mode 100644 index d7450ea..0000000 --- a/bot/crawlers/factory.py +++ /dev/null @@ -1,20 +0,0 @@ -from bot.targets import Targets - -from bot.crawlers.crawlers import JitenonKokugoCrawler -from bot.crawlers.crawlers import JitenonYojiCrawler -from bot.crawlers.crawlers import JitenonKotowazaCrawler -from bot.crawlers.crawlers import Smk8Crawler -from bot.crawlers.crawlers import Daijirin2Crawler -from bot.crawlers.crawlers import Sankoku8Crawler - - -def new_crawler(target): - crawler_map = { - Targets.JITENON_KOKUGO: JitenonKokugoCrawler, - Targets.JITENON_YOJI: JitenonYojiCrawler, - Targets.JITENON_KOTOWAZA: JitenonKotowazaCrawler, - Targets.SMK8: Smk8Crawler, - Targets.DAIJIRIN2: Daijirin2Crawler, - Targets.SANKOKU8: Sankoku8Crawler, - } - return crawler_map[target](target) diff --git a/bot/crawlers/jitenon_kokugo.py b/bot/crawlers/jitenon_kokugo.py new file mode 100644 index 0000000..e748ea1 --- /dev/null +++ b/bot/crawlers/jitenon_kokugo.py @@ -0,0 +1,40 @@ 
+import re +from bs4 import BeautifulSoup + +from bot.time import timestamp +from bot.crawlers.base.crawler import BaseCrawler +from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper + + +class Crawler(BaseCrawler): + def __init__(self, target): + super().__init__(target) + self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php" + self._page_id_pattern = r"word/p([0-9]+)$" + + def collect_pages(self, page_dir): + print(f"{timestamp()} Scraping {self._gojuon_url}") + jitenon = JitenonScraper() + gojuon_doc, _ = jitenon.scrape(self._gojuon_url) + gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") + for gojuon_a in gojuon_soup.select(".kana_area a", href=True): + gojuon_href = gojuon_a['href'] + max_kana_page = 1 + current_kana_page = 1 + while current_kana_page <= max_kana_page: + kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}") + current_kana_page += 1 + kana_soup = BeautifulSoup(kana_doc, features="html.parser") + page_total = kana_soup.find(class_="page_total").text + m = re.search(r"全([0-9]+)件", page_total) + if m: + max_kana_page = int(m.group(1)) + for kana_a in kana_soup.select(".word_box a", href=True): + page_link = kana_a['href'] + page_id = self._parse_page_id(page_link) + if page_id is None: + continue + _, page_path = jitenon.scrape(page_link) + self._page_map[page_id] = page_path + pages_len = len(self._page_map) + print(f"\n{timestamp()} Found {pages_len} entry pages") diff --git a/bot/crawlers/jitenon_kotowaza.py b/bot/crawlers/jitenon_kotowaza.py new file mode 100644 index 0000000..693fa52 --- /dev/null +++ b/bot/crawlers/jitenon_kotowaza.py @@ -0,0 +1,8 @@ +from bot.crawlers.base.jitenon import JitenonCrawler + + +class Crawler(JitenonCrawler): + def __init__(self, target): + super().__init__(target) + self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php" + self._page_id_pattern = r"([0-9]+)\.php$" diff --git a/bot/crawlers/jitenon_yoji.py b/bot/crawlers/jitenon_yoji.py new file mode 100644 index 0000000..5b89875 --- /dev/null +++ b/bot/crawlers/jitenon_yoji.py @@ -0,0 +1,8 @@ +from bot.crawlers.base.jitenon import JitenonCrawler + + +class Crawler(JitenonCrawler): + def __init__(self, target): + super().__init__(target) + self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html" + self._page_id_pattern = r"([0-9]+)\.html$" diff --git a/bot/crawlers/sankoku8.py b/bot/crawlers/sankoku8.py new file mode 100644 index 0000000..a9c711b --- /dev/null +++ b/bot/crawlers/sankoku8.py @@ -0,0 +1,5 @@ +from bot.crawlers.base.monokakido import MonokakidoCrawler + + +class Crawler(MonokakidoCrawler): + pass diff --git a/bot/crawlers/scrapers/jitenon.py b/bot/crawlers/scrapers/jitenon.py new file mode 100644 index 0000000..e4163d9 --- /dev/null +++ b/bot/crawlers/scrapers/jitenon.py @@ -0,0 +1,10 @@ +import re +from bot.crawlers.scrapers.scraper import BaseScraper + + +class Jitenon(BaseScraper): + def _get_netloc_re(self): + domain = r"jitenon\.jp" + pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" 
+ domain + r"$" + netloc_re = re.compile(pattern) + return netloc_re diff --git a/bot/crawlers/scraper.py b/bot/crawlers/scrapers/scraper.py similarity index 82% rename from bot/crawlers/scraper.py rename to bot/crawlers/scrapers/scraper.py index 577f602..eeb9534 100644 --- a/bot/crawlers/scraper.py +++ b/bot/crawlers/scrapers/scraper.py @@ -1,24 +1,28 @@ import time -import requests import re import os import hashlib +import random +import math from datetime import datetime -from pathlib import Path - -from platformdirs import user_cache_dir from urllib.parse import urlparse +from pathlib import Path +from abc import ABC, abstractmethod + +import requests from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry +from platformdirs import user_cache_dir +from bot.time import timestamp from bot.data import load_config -class Scraper(): +class BaseScraper(ABC): def __init__(self): + self.cache_count = 0 self._config = load_config() - pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$" - self.netloc_re = re.compile(pattern) + self.netloc_re = self._get_netloc_re() self.__set_session() def scrape(self, urlstring): @@ -31,9 +35,14 @@ class Scraper(): with open(cache_path, "w", encoding="utf-8") as f: f.write(html) else: - print("Discovering cached files...", end='\r', flush=True) + self.cache_count += 1 + print(f"\tDiscovering cached file {self.cache_count}", end='\r', flush=True) return html, cache_path + @abstractmethod + def _get_netloc_re(self): + raise NotImplementedError + def __set_session(self): retry_strategy = Retry( total=3, @@ -87,21 +96,14 @@ class Scraper(): def __get(self, urlstring): delay = 10 time.sleep(delay) - now = datetime.now().strftime("%H:%M:%S") - print(f"{now} scraping {urlstring} ...", end='') + print(f"{timestamp()} Scraping {urlstring} ...", end='') try: response = self.session.get(urlstring, timeout=10) - print("OK") + print(f"{timestamp()} OK") return response.text - except Exception: - print("failed") - print("resetting session and trying again") + except Exception as ex: + print(f"\tFailed: {str(ex)}") + print(f"{timestamp()} Resetting session and trying again") self.__set_session() response = self.session.get(urlstring, timeout=10) return response.text - - -class Jitenon(Scraper): - def __init__(self): - self.domain = r"jitenon\.jp" - super().__init__() diff --git a/bot/crawlers/smk8.py b/bot/crawlers/smk8.py new file mode 100644 index 0000000..a9c711b --- /dev/null +++ b/bot/crawlers/smk8.py @@ -0,0 +1,5 @@ +from bot.crawlers.base.monokakido import MonokakidoCrawler + + +class Crawler(MonokakidoCrawler): + pass diff --git a/bot/entries/entry.py b/bot/entries/base/entry.py similarity index 89% rename from bot/entries/entry.py rename to bot/entries/base/entry.py index 3811a77..60d4f16 100644 --- a/bot/entries/entry.py +++ b/bot/entries/base/entry.py @@ -18,15 +18,15 @@ class Entry(ABC): @abstractmethod def get_global_identifier(self): - pass + raise NotImplementedError @abstractmethod def set_page(self, page): - pass + raise NotImplementedError @abstractmethod def get_page_soup(self): - pass + raise NotImplementedError def get_headwords(self): if self._headwords is not None: @@ -38,15 +38,15 @@ class Entry(ABC): @abstractmethod def _get_headwords(self): - pass + raise NotImplementedError @abstractmethod def _add_variant_expressions(self, headwords): - pass + raise NotImplementedError @abstractmethod def get_part_of_speech_tags(self): - pass + raise NotImplementedError def get_parent(self): if self.entry_id in 
self.SUBENTRY_ID_TO_ENTRY_ID: diff --git a/bot/entries/expressions.py b/bot/entries/base/expressions.py similarity index 63% rename from bot/entries/expressions.py rename to bot/entries/base/expressions.py index 687a325..8049a99 100644 --- a/bot/entries/expressions.py +++ b/bot/entries/base/expressions.py @@ -31,11 +31,14 @@ def add_fullwidth(expressions): def add_variant_kanji(expressions): variant_kanji = load_variant_kanji() - for old_kanji, new_kanji in variant_kanji.items(): + for kyuuji, shinji in variant_kanji.items(): new_exps = [] for expression in expressions: - if old_kanji in expression: - new_exp = expression.replace(old_kanji, new_kanji) + if kyuuji in expression: + new_exp = expression.replace(kyuuji, shinji) + new_exps.append(new_exp) + if shinji in expression: + new_exp = expression.replace(shinji, kyuuji) new_exps.append(new_exp) for new_exp in new_exps: if new_exp not in expressions: @@ -85,40 +88,3 @@ def expand_abbreviation_list(expressions): if new_exp not in new_exps: new_exps.append(new_exp) return new_exps - - -def expand_smk_alternatives(text): - """Return a list of strings described by △ notation.""" - m = re.search(r"△([^（]+)（([^（]+)）", text) - if m is None: - return [text] - alt_parts = [m.group(1)] - for alt_part in m.group(2).split("・"): - alt_parts.append(alt_part) - alts = [] - for alt_part in alt_parts: - alt_exp = re.sub(r"△[^（]+（[^（]+）", alt_part, text) - alts.append(alt_exp) - return alts - - -def expand_daijirin_alternatives(text): - """Return a list of strings described by ＝ notation.""" - group_pattern = r"([^＝]+)(＝([^（]+)（＝([^（]+)）)?" - groups = re.findall(group_pattern, text) - expressions = [""] - for group in groups: - new_exps = [] - for expression in expressions: - new_exps.append(expression + group[0]) - expressions = new_exps.copy() - if group[1] == "": - continue - new_exps = [] - for expression in expressions: - new_exps.append(expression + group[2]) - for expression in expressions: - for alt in group[3].split("・"): - new_exps.append(expression + alt) - expressions = new_exps.copy() - return expressions diff --git a/bot/entries/jitenon.py b/bot/entries/base/jitenon_entry.py similarity index 58% rename from bot/entries/jitenon.py rename to bot/entries/base/jitenon_entry.py index 65c4d2e..43d6005 100644 --- a/bot/entries/jitenon.py +++ b/bot/entries/base/jitenon_entry.py @@ -3,11 +3,11 @@ from abc import abstractmethod from datetime import datetime, date from bs4 import BeautifulSoup -from bot.entries.entry import Entry -import bot.entries.expressions as Expressions +from bot.entries.base.entry import Entry +import bot.entries.base.expressions as Expressions -class _JitenonEntry(Entry): +class JitenonEntry(Entry): def __init__(self, target, entry_id): super().__init__(target, entry_id) self.expression = "" @@ -58,7 +58,7 @@ class _JitenonEntry(Entry): @abstractmethod def _get_column_map(self): - pass + raise NotImplementedError def __set_modified_date(self, page): m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page) @@ -140,104 +140,3 @@ class _JitenonEntry(Entry): elif isinstance(attr_val, list): colvals.append(";".join(attr_val)) return ",".join(colvals) - - - -class JitenonYojiEntry(_JitenonEntry): - def __init__(self, target, entry_id): - super().__init__(target, entry_id) - self.origin = "" - self.kanken_level = "" - self.category = "" - self.related_expressions = [] - - def _get_column_map(self): - return { - "四字熟語": "expression", - "読み方": "yomikata", - "意味": "definition", - "異形": "other_forms", - "出典": "origin", - "漢検級":
"kanken_level", - "場面用途": "category", - "類義語": "related_expressions", - } - - def _add_variant_expressions(self, headwords): - for expressions in headwords.values(): - Expressions.add_variant_kanji(expressions) - - -class JitenonKotowazaEntry(_JitenonEntry): - def __init__(self, target, entry_id): - super().__init__(target, entry_id) - self.origin = "" - self.example = "" - self.related_expressions = [] - - def _get_column_map(self): - return { - "言葉": "expression", - "読み方": "yomikata", - "意味": "definition", - "異形": "other_forms", - "出典": "origin", - "例文": "example", - "類句": "related_expressions", - } - - def _get_headwords(self): - if self.expression == "金棒引き・鉄棒引き": - headwords = { - "かなぼうひき": ["金棒引き", "鉄棒引き"] - } - else: - headwords = super()._get_headwords() - return headwords - - def _add_variant_expressions(self, headwords): - for expressions in headwords.values(): - Expressions.add_variant_kanji(expressions) - Expressions.add_fullwidth(expressions) - - -class JitenonKokugoEntry(_JitenonEntry): - def __init__(self, target, entry_id): - super().__init__(target, entry_id) - self.example = "" - self.alt_expression = "" - self.antonym = "" - self.attachments = "" - self.compounds = "" - self.related_words = "" - - def _get_column_map(self): - return { - "言葉": "expression", - "読み方": "yomikata", - "意味": "definition", - "例文": "example", - "別表記": "alt_expression", - "対義語": "antonym", - "活用": "attachments", - "用例": "compounds", - "類語": "related_words", - } - - def _get_headwords(self): - headwords = {} - for reading in self.yomikata.split("・"): - if reading not in headwords: - headwords[reading] = [] - for expression in self.expression.split("・"): - headwords[reading].append(expression) - if self.alt_expression.strip() != "": - for expression in self.alt_expression.split("・"): - headwords[reading].append(expression) - return headwords - - def _add_variant_expressions(self, headwords): - for expressions in headwords.values(): - Expressions.add_variant_kanji(expressions) - Expressions.add_fullwidth(expressions) - Expressions.remove_iteration_mark(expressions) - Expressions.add_iteration_mark(expressions) diff --git a/bot/entries/base/sanseido_entry.py b/bot/entries/base/sanseido_entry.py new file mode 100644 index 0000000..bb52431 --- /dev/null +++ b/bot/entries/base/sanseido_entry.py @@ -0,0 +1,60 @@ +from abc import abstractmethod +from bs4 import BeautifulSoup + +from bot.entries.base.entry import Entry +import bot.entries.base.expressions as Expressions + + +class SanseidoEntry(Entry): + def set_page(self, page): + page = self._decompose_subentries(page) + self._page = page + + def get_page_soup(self): + soup = BeautifulSoup(self._page, "xml") + return soup + + def get_global_identifier(self): + parent_part = format(self.entry_id[0], '06') + child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper() + return f"@{self.target.value}-{parent_part}-{child_part}" + + def _decompose_subentries(self, page): + soup = BeautifulSoup(page, features="xml") + for x in self._get_subentry_parameters(): + subentry_class, tags, subentry_list = x + for tag in tags: + tag_soup = soup.find(tag) + while tag_soup is not None: + tag_soup.name = "項目" + subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"]) + self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id + subentry = subentry_class(self.target, subentry_id) + page = tag_soup.decode() + subentry.set_page(page) + subentry_list.append(subentry) + tag_soup.decompose() + tag_soup = soup.find(tag) + return soup.decode() + + @abstractmethod + def 
_get_subentry_parameters(self): + raise NotImplementedError + + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): + Expressions.add_variant_kanji(expressions) + Expressions.add_fullwidth(expressions) + Expressions.remove_iteration_mark(expressions) + Expressions.add_iteration_mark(expressions) + + @staticmethod + def id_string_to_entry_id(id_string): + parts = id_string.split("-") + if len(parts) == 1: + return (int(parts[0]), 0) + elif len(parts) == 2: + # subentries have a hexadecimal part + return (int(parts[0]), int(parts[1], 16)) + else: + raise Exception(f"Invalid entry ID: {id_string}") diff --git a/bot/entries/daijirin2.py b/bot/entries/daijirin2.py deleted file mode 100644 index f7a629c..0000000 --- a/bot/entries/daijirin2.py +++ /dev/null @@ -1,231 +0,0 @@ -from bs4 import BeautifulSoup - -import bot.entries.expressions as Expressions -import bot.soup as Soup -from bot.data import load_phrase_readings -from bot.data import load_daijirin2_kana_abbreviations -from bot.entries.entry import Entry -from bot.entries.daijirin2_preprocess import preprocess_page - - -class _BaseDaijirin2Entry(Entry): - def __init__(self, target, entry_id): - super().__init__(target, entry_id) - self.children = [] - self.phrases = [] - self._kana_abbreviations = load_daijirin2_kana_abbreviations() - - def get_global_identifier(self): - parent_part = format(self.entry_id[0], '06') - child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper() - return f"@{self.target.value}-{parent_part}-{child_part}" - - def set_page(self, page): - page = self.__decompose_subentries(page) - self._page = page - - def get_page_soup(self): - soup = BeautifulSoup(self._page, "xml") - return soup - - def get_part_of_speech_tags(self): - if self._part_of_speech_tags is not None: - return self._part_of_speech_tags - self._part_of_speech_tags = [] - soup = self.get_page_soup() - for pos_group in soup.find_all("品詞G"): - if pos_group.parent.name == "大語義": - self._set_part_of_speech_tags(pos_group) - return self._part_of_speech_tags - - def _set_part_of_speech_tags(self, el): - pos_names = ["品詞", "品詞活用", "品詞行", "用法"] - for child in el.children: - if child.name is not None: - self._set_part_of_speech_tags(child) - continue - pos = str(child) - if el.name not in pos_names: - continue - elif pos in ["[", "]"]: - continue - elif pos in self._part_of_speech_tags: - continue - else: - self._part_of_speech_tags.append(pos) - - def _get_regular_headwords(self, soup): - self._fill_alts(soup) - reading = soup.find("見出仮名").text - expressions = [] - for el in soup.find_all("標準表記"): - expression = self._clean_expression(el.text) - if "—" in expression: - kana_abbrs = self._kana_abbreviations[self.entry_id] - for abbr in kana_abbrs: - expression = expression.replace("—", abbr, 1) - expressions.append(expression) - expressions = Expressions.expand_abbreviation_list(expressions) - if len(expressions) == 0: - expressions.append(reading) - headwords = {reading: expressions} - return headwords - - def _add_variant_expressions(self, headwords): - for expressions in headwords.values(): - Expressions.add_variant_kanji(expressions) - Expressions.add_fullwidth(expressions) - Expressions.remove_iteration_mark(expressions) - Expressions.add_iteration_mark(expressions) - - def __decompose_subentries(self, page): - soup = BeautifulSoup(page, features="xml") - subentry_parameters = [ - [Daijirin2ChildEntry, ["子項目"], self.children], - [Daijirin2PhraseEntry, ["句項目"], self.phrases], - ] - for x in subentry_parameters: - 
subentry_class, tags, subentry_list = x - for tag in tags: - tag_soup = soup.find(tag) - while tag_soup is not None: - tag_soup.name = "項目" - subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"]) - self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id - subentry = subentry_class(self.target, subentry_id) - page = tag_soup.decode() - subentry.set_page(page) - subentry_list.append(subentry) - tag_soup.decompose() - tag_soup = soup.find(tag) - return soup.decode() - - @staticmethod - def id_string_to_entry_id(id_string): - parts = id_string.split("-") - if len(parts) == 1: - return (int(parts[0]), 0) - elif len(parts) == 2: - # subentries have a hexadecimal part - return (int(parts[0]), int(parts[1], 16)) - else: - raise Exception(f"Invalid entry ID: {id_string}") - - @staticmethod - def _delete_unused_nodes(soup): - """Remove extra markup elements that appear in the entry - headword line which are not part of the entry headword""" - unused_nodes = [ - "漢字音logo", "活用分節", "連語句活用分節", "語構成", - "表外字マーク", "表外字マーク", "ルビG" - ] - for name in unused_nodes: - Soup.delete_soup_nodes(soup, name) - - @staticmethod - def _clean_expression(expression): - for x in ["〈", "〉", "《", "》", " "]: - expression = expression.replace(x, "") - return expression - - @staticmethod - def _fill_alts(soup): - for gaiji in soup.find_all(class_="gaiji"): - if gaiji.name == "img" and gaiji.has_attr("alt"): - gaiji.name = "span" - gaiji.string = gaiji.attrs["alt"] - - -class Daijirin2Entry(_BaseDaijirin2Entry): - def __init__(self, target, page_id): - entry_id = (page_id, 0) - super().__init__(target, entry_id) - - def set_page(self, page): - page = preprocess_page(page) - super().set_page(page) - - def _get_headwords(self): - soup = self.get_page_soup() - self._delete_unused_nodes(soup) - if soup.find("漢字見出") is not None: - headwords = self._get_kanji_headwords(soup) - elif soup.find("略語G") is not None: - headwords = self._get_acronym_headwords(soup) - else: - headwords = self._get_regular_headwords(soup) - return headwords - - def _get_kanji_headwords(self, soup): - readings = [] - for el in soup.find_all("漢字音"): - hira = Expressions.kata_to_hira(el.text) - readings.append(hira) - if soup.find("漢字音") is None: - readings.append("") - expressions = [] - for el in soup.find_all("漢字見出"): - expressions.append(el.text) - headwords = {} - for reading in readings: - headwords[reading] = expressions - return headwords - - def _get_acronym_headwords(self, soup): - expressions = [] - for el in soup.find_all("略語"): - expression_parts = [] - for part in el.find_all(["欧字", "和字"]): - expression_parts.append(part.text) - expression = "".join(expression_parts) - expressions.append(expression) - headwords = {"": expressions} - return headwords - - -class Daijirin2ChildEntry(_BaseDaijirin2Entry): - def _get_headwords(self): - soup = self.get_page_soup() - self._delete_unused_nodes(soup) - headwords = self._get_regular_headwords(soup) - return headwords - - -class Daijirin2PhraseEntry(_BaseDaijirin2Entry): - def get_part_of_speech_tags(self): - # phrases do not contain these tags - return [] - - def _get_headwords(self): - soup = self.get_page_soup() - headwords = {} - expressions = self._find_expressions(soup) - readings = self._find_readings() - for idx, expression in enumerate(expressions): - reading = readings[idx] - if reading in headwords: - headwords[reading].append(expression) - else: - headwords[reading] = [expression] - return headwords - - def _find_expressions(self, soup): - self._delete_unused_nodes(soup) - text = 
soup.find("句表記").text - text = self._clean_expression(text) - alternatives = Expressions.expand_daijirin_alternatives(text) - expressions = [] - for alt in alternatives: - for exp in Expressions.expand_abbreviation(alt): - expressions.append(exp) - return expressions - - def _find_readings(self): - phrase_readings = load_phrase_readings(self.target) - text = phrase_readings[self.entry_id] - alternatives = Expressions.expand_daijirin_alternatives(text) - readings = [] - for alt in alternatives: - for reading in Expressions.expand_abbreviation(alt): - readings.append(reading) - return readings diff --git a/bot/entries/daijirin2/base_entry.py b/bot/entries/daijirin2/base_entry.py new file mode 100644 index 0000000..1113404 --- /dev/null +++ b/bot/entries/daijirin2/base_entry.py @@ -0,0 +1,88 @@ +import bot.soup as Soup +from bot.data import load_daijirin2_kana_abbreviations +from bot.entries.base.sanseido_entry import SanseidoEntry +import bot.entries.base.expressions as Expressions + + +class BaseEntry(SanseidoEntry): + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.children = [] + self.phrases = [] + self._kana_abbreviations = load_daijirin2_kana_abbreviations() + + def get_part_of_speech_tags(self): + if self._part_of_speech_tags is not None: + return self._part_of_speech_tags + self._part_of_speech_tags = [] + soup = self.get_page_soup() + for pos_group in soup.find_all("品詞G"): + if pos_group.parent.name == "大語義": + self._set_part_of_speech_tags(pos_group) + return self._part_of_speech_tags + + def _set_part_of_speech_tags(self, el): + pos_names = ["品詞", "品詞活用", "品詞行", "用法"] + for child in el.children: + if child.name is not None: + self._set_part_of_speech_tags(child) + continue + pos = str(child) + if el.name not in pos_names: + continue + elif pos in ["[", "]"]: + continue + elif pos in self._part_of_speech_tags: + continue + else: + self._part_of_speech_tags.append(pos) + + def _get_regular_headwords(self, soup): + self._fill_alts(soup) + reading = soup.find("見出仮名").text + expressions = [] + for el in soup.find_all("標準表記"): + expression = self._clean_expression(el.text) + if "—" in expression: + kana_abbrs = self._kana_abbreviations[self.entry_id] + for abbr in kana_abbrs: + expression = expression.replace("—", abbr, 1) + expressions.append(expression) + expressions = Expressions.expand_abbreviation_list(expressions) + if len(expressions) == 0: + expressions.append(reading) + headwords = {reading: expressions} + return headwords + + def _get_subentry_parameters(self): + from bot.entries.daijirin2.child_entry import ChildEntry + from bot.entries.daijirin2.phrase_entry import PhraseEntry + subentry_parameters = [ + [ChildEntry, ["子項目"], self.children], + [PhraseEntry, ["句項目"], self.phrases], + ] + return subentry_parameters + + @staticmethod + def _delete_unused_nodes(soup): + """Remove extra markup elements that appear in the entry + headword line which are not part of the entry headword""" + unused_nodes = [ + "漢字音logo", "活用分節", "連語句活用分節", "語構成", + "表外字マーク", "表外字マーク", "ルビG" + ] + for name in unused_nodes: + Soup.delete_soup_nodes(soup, name) + + @staticmethod + def _clean_expression(expression): + for x in ["〈", "〉", "《", "》", " "]: + expression = expression.replace(x, "") + return expression + + @staticmethod + def _fill_alts(soup): + for gaiji in soup.find_all(class_="gaiji"): + if gaiji.name == "img" and gaiji.has_attr("alt"): + gaiji.name = "span" + gaiji.string = gaiji.attrs["alt"] diff --git a/bot/entries/daijirin2/child_entry.py 
b/bot/entries/daijirin2/child_entry.py new file mode 100644 index 0000000..42685a0 --- /dev/null +++ b/bot/entries/daijirin2/child_entry.py @@ -0,0 +1,9 @@ +from bot.entries.daijirin2.base_entry import BaseEntry + + +class ChildEntry(BaseEntry): + def _get_headwords(self): + soup = self.get_page_soup() + self._delete_unused_nodes(soup) + headwords = self._get_regular_headwords(soup) + return headwords diff --git a/bot/entries/daijirin2/entry.py b/bot/entries/daijirin2/entry.py new file mode 100644 index 0000000..0b6970f --- /dev/null +++ b/bot/entries/daijirin2/entry.py @@ -0,0 +1,50 @@ +import bot.entries.base.expressions as Expressions +from bot.entries.daijirin2.base_entry import BaseEntry +from bot.entries.daijirin2.preprocess import preprocess_page + + +class Entry(BaseEntry): + def __init__(self, target, page_id): + entry_id = (page_id, 0) + super().__init__(target, entry_id) + + def set_page(self, page): + page = preprocess_page(page) + super().set_page(page) + + def _get_headwords(self): + soup = self.get_page_soup() + self._delete_unused_nodes(soup) + if soup.find("漢字見出") is not None: + headwords = self._get_kanji_headwords(soup) + elif soup.find("略語G") is not None: + headwords = self._get_acronym_headwords(soup) + else: + headwords = self._get_regular_headwords(soup) + return headwords + + def _get_kanji_headwords(self, soup): + readings = [] + for el in soup.find_all("漢字音"): + hira = Expressions.kata_to_hira(el.text) + readings.append(hira) + if soup.find("漢字音") is None: + readings.append("") + expressions = [] + for el in soup.find_all("漢字見出"): + expressions.append(el.text) + headwords = {} + for reading in readings: + headwords[reading] = expressions + return headwords + + def _get_acronym_headwords(self, soup): + expressions = [] + for el in soup.find_all("略語"): + expression_parts = [] + for part in el.find_all(["欧字", "和字"]): + expression_parts.append(part.text) + expression = "".join(expression_parts) + expressions.append(expression) + headwords = {"": expressions} + return headwords diff --git a/bot/entries/daijirin2/phrase_entry.py b/bot/entries/daijirin2/phrase_entry.py new file mode 100644 index 0000000..0470d7d --- /dev/null +++ b/bot/entries/daijirin2/phrase_entry.py @@ -0,0 +1,67 @@ +import re + +import bot.entries.base.expressions as Expressions +from bot.data import load_phrase_readings +from bot.entries.daijirin2.base_entry import BaseEntry + + +class PhraseEntry(BaseEntry): + def get_part_of_speech_tags(self): + # phrases do not contain these tags + return [] + + def _get_headwords(self): + soup = self.get_page_soup() + headwords = {} + expressions = self._find_expressions(soup) + readings = self._find_readings() + for idx, expression in enumerate(expressions): + reading = readings[idx] + if reading in headwords: + headwords[reading].append(expression) + else: + headwords[reading] = [expression] + return headwords + + def _find_expressions(self, soup): + self._delete_unused_nodes(soup) + text = soup.find("句表記").text + text = self._clean_expression(text) + alternatives = parse_phrase(text) + expressions = [] + for alt in alternatives: + for exp in Expressions.expand_abbreviation(alt): + expressions.append(exp) + return expressions + + def _find_readings(self): + phrase_readings = load_phrase_readings(self.target) + text = phrase_readings[self.entry_id] + alternatives = parse_phrase(text) + readings = [] + for alt in alternatives: + for reading in Expressions.expand_abbreviation(alt): + readings.append(reading) + return readings + + +def parse_phrase(text): + 
"""Return a list of strings described by = notation.""" + group_pattern = r"([^=]+)(=([^(]+)(=([^(]+)))?" + groups = re.findall(group_pattern, text) + expressions = [""] + for group in groups: + new_exps = [] + for expression in expressions: + new_exps.append(expression + group[0]) + expressions = new_exps.copy() + if group[1] == "": + continue + new_exps = [] + for expression in expressions: + new_exps.append(expression + group[2]) + for expression in expressions: + for alt in group[3].split("・"): + new_exps.append(expression + alt) + expressions = new_exps.copy() + return expressions diff --git a/bot/entries/daijirin2_preprocess.py b/bot/entries/daijirin2/preprocess.py similarity index 100% rename from bot/entries/daijirin2_preprocess.py rename to bot/entries/daijirin2/preprocess.py diff --git a/bot/entries/factory.py b/bot/entries/factory.py deleted file mode 100644 index 162c102..0000000 --- a/bot/entries/factory.py +++ /dev/null @@ -1,20 +0,0 @@ -from bot.targets import Targets - -from bot.entries.jitenon import JitenonKokugoEntry -from bot.entries.jitenon import JitenonYojiEntry -from bot.entries.jitenon import JitenonKotowazaEntry -from bot.entries.smk8 import Smk8Entry -from bot.entries.daijirin2 import Daijirin2Entry -from bot.entries.sankoku8 import Sankoku8Entry - - -def new_entry(target, page_id): - entry_map = { - Targets.JITENON_KOKUGO: JitenonKokugoEntry, - Targets.JITENON_YOJI: JitenonYojiEntry, - Targets.JITENON_KOTOWAZA: JitenonKotowazaEntry, - Targets.SMK8: Smk8Entry, - Targets.DAIJIRIN2: Daijirin2Entry, - Targets.SANKOKU8: Sankoku8Entry, - } - return entry_map[target](target, page_id) diff --git a/bot/entries/jitenon_kokugo/entry.py b/bot/entries/jitenon_kokugo/entry.py new file mode 100644 index 0000000..523ac63 --- /dev/null +++ b/bot/entries/jitenon_kokugo/entry.py @@ -0,0 +1,45 @@ +from bot.entries.base.jitenon_entry import JitenonEntry +import bot.entries.base.expressions as Expressions + + +class Entry(JitenonEntry): + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.example = "" + self.alt_expression = "" + self.antonym = "" + self.attachments = "" + self.compounds = "" + self.related_words = "" + + def _get_column_map(self): + return { + "言葉": "expression", + "読み方": "yomikata", + "意味": "definition", + "例文": "example", + "別表記": "alt_expression", + "対義語": "antonym", + "活用": "attachments", + "用例": "compounds", + "類語": "related_words", + } + + def _get_headwords(self): + headwords = {} + for reading in self.yomikata.split("・"): + if reading not in headwords: + headwords[reading] = [] + for expression in self.expression.split("・"): + headwords[reading].append(expression) + if self.alt_expression.strip() != "": + for expression in self.alt_expression.split("・"): + headwords[reading].append(expression) + return headwords + + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): + Expressions.add_variant_kanji(expressions) + Expressions.add_fullwidth(expressions) + Expressions.remove_iteration_mark(expressions) + Expressions.add_iteration_mark(expressions) diff --git a/bot/entries/jitenon_kotowaza/entry.py b/bot/entries/jitenon_kotowaza/entry.py new file mode 100644 index 0000000..71dc35f --- /dev/null +++ b/bot/entries/jitenon_kotowaza/entry.py @@ -0,0 +1,35 @@ +from bot.entries.base.jitenon_entry import JitenonEntry +import bot.entries.base.expressions as Expressions + + +class Entry(JitenonEntry): + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.origin = "" + 
self.example = "" + self.related_expressions = [] + + def _get_column_map(self): + return { + "言葉": "expression", + "読み方": "yomikata", + "意味": "definition", + "異形": "other_forms", + "出典": "origin", + "例文": "example", + "類句": "related_expressions", + } + + def _get_headwords(self): + if self.expression == "金棒引き・鉄棒引き": + headwords = { + "かなぼうひき": ["金棒引き", "鉄棒引き"] + } + else: + headwords = super()._get_headwords() + return headwords + + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): + Expressions.add_variant_kanji(expressions) + Expressions.add_fullwidth(expressions) diff --git a/bot/entries/jitenon_yoji/entry.py b/bot/entries/jitenon_yoji/entry.py new file mode 100644 index 0000000..e0e8b13 --- /dev/null +++ b/bot/entries/jitenon_yoji/entry.py @@ -0,0 +1,27 @@ +import bot.entries.base.expressions as Expressions +from bot.entries.base.jitenon_entry import JitenonEntry + + +class Entry(JitenonEntry): + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.origin = "" + self.kanken_level = "" + self.category = "" + self.related_expressions = [] + + def _get_column_map(self): + return { + "四字熟語": "expression", + "読み方": "yomikata", + "意味": "definition", + "異形": "other_forms", + "出典": "origin", + "漢検級": "kanken_level", + "場面用途": "category", + "類義語": "related_expressions", + } + + def _add_variant_expressions(self, headwords): + for expressions in headwords.values(): + Expressions.add_variant_kanji(expressions) diff --git a/bot/entries/sankoku8.py b/bot/entries/sankoku8.py deleted file mode 100644 index 9653f68..0000000 --- a/bot/entries/sankoku8.py +++ /dev/null @@ -1,260 +0,0 @@ -from bs4 import BeautifulSoup -import bot.entries.expressions as Expressions -import bot.soup as Soup -from bot.entries.entry import Entry -from bot.data import load_phrase_readings -from bot.entries.sankoku8_preprocess import preprocess_page - - -class _BaseSankoku8Entry(Entry): - def __init__(self, target, entry_id): - super().__init__(target, entry_id) - self.children = [] - self.phrases = [] - self._hyouki_name = "表記" - self._midashi_name = None - self._midashi_kana_name = None - - def get_global_identifier(self): - parent_part = format(self.entry_id[0], '06') - child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper() - return f"@{self.target.value}-{parent_part}-{child_part}" - - def set_page(self, page): - page = self.__decompose_subentries(page) - self._page = page - - def get_page_soup(self): - soup = BeautifulSoup(self._page, "xml") - return soup - - def _get_headwords(self): - soup = self.get_page_soup() - self._delete_unused_nodes(soup) - readings = self._find_readings(soup) - expressions = self._find_expressions(soup) - headwords = {} - for reading in readings: - headwords[reading] = [] - if len(readings) == 1: - reading = readings[0] - if soup.find(self._midashi_name).find(self._hyouki_name) is None: - headwords[reading].append(reading) - for exp in expressions: - if exp not in headwords[reading]: - headwords[reading].append(exp) - elif len(readings) > 1 and len(expressions) == 0: - for reading in readings: - headwords[reading].append(reading) - elif len(readings) > 1 and len(expressions) == 1: - if soup.find(self._midashi_name).find(self._hyouki_name) is None: - for reading in readings: - headwords[reading].append(reading) - expression = expressions[0] - for reading in readings: - if expression not in headwords[reading]: - headwords[reading].append(expression) - elif len(readings) > 1 and len(expressions) == len(readings): - if 
soup.find(self._midashi_name).find(self._hyouki_name) is None: - for reading in readings: - headwords[reading].append(reading) - for idx, reading in enumerate(readings): - exp = expressions[idx] - if exp not in headwords[reading]: - headwords[reading].append(exp) - else: - raise Exception() # shouldn't happen - return headwords - - def _add_variant_expressions(self, headwords): - for expressions in headwords.values(): - Expressions.add_variant_kanji(expressions) - Expressions.add_fullwidth(expressions) - Expressions.remove_iteration_mark(expressions) - Expressions.add_iteration_mark(expressions) - - def get_part_of_speech_tags(self): - if self._part_of_speech_tags is not None: - return self._part_of_speech_tags - self._part_of_speech_tags = [] - soup = self.get_page_soup() - for midashi in soup.find_all([self._midashi_name, "見出部要素"]): - pos_group = midashi.find("品詞G") - if pos_group is None: - continue - for tag in pos_group.find_all("a"): - if tag.text not in self._part_of_speech_tags: - self._part_of_speech_tags.append(tag.text) - return self._part_of_speech_tags - - def _find_expressions(self, soup): - expressions = [] - for hyouki in soup.find_all(self._hyouki_name): - for expression in parse_hyouki_soup(hyouki, [""]): - expressions.append(expression) - return expressions - - def _find_readings(self, soup): - midasi_kana = soup.find(self._midashi_kana_name) - readings = parse_hyouki_soup(midasi_kana, [""]) - return readings - - def __decompose_subentries(self, page): - soup = BeautifulSoup(page, features="xml") - subentry_parameters = [ - [Sankoku8ChildEntry, ["子項目"], self.children], - [Sankoku8PhraseEntry, ["句項目"], self.phrases], - ] - for x in subentry_parameters: - subentry_class, tags, subentry_list = x - for tag in tags: - tag_soup = soup.find(tag) - while tag_soup is not None: - tag_soup.name = "項目" - subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"]) - self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id - subentry = subentry_class(self.target, subentry_id) - page = tag_soup.decode() - subentry.set_page(page) - subentry_list.append(subentry) - tag_soup.decompose() - tag_soup = soup.find(tag) - return soup.decode() - - @staticmethod - def id_string_to_entry_id(id_string): - parts = id_string.split("-") - if len(parts) == 1: - return (int(parts[0]), 0) - elif len(parts) == 2: - # subentries have a hexadecimal part - return (int(parts[0]), int(parts[1], 16)) - else: - raise Exception(f"Invalid entry ID: {id_string}") - - @staticmethod - def _delete_unused_nodes(soup): - """Remove extra markup elements that appear in the entry - headword line which are not part of the entry headword""" - unused_nodes = [ - "語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク", - "アクセント分節", "活用分節", "ルビG", "分書" - ] - for name in unused_nodes: - Soup.delete_soup_nodes(soup, name) - - -class Sankoku8Entry(_BaseSankoku8Entry): - def __init__(self, target, page_id): - entry_id = (page_id, 0) - super().__init__(target, entry_id) - self._midashi_name = "見出部" - self._midashi_kana_name = "見出仮名" - - def set_page(self, page): - page = preprocess_page(page) - super().set_page(page) - - -class Sankoku8ChildEntry(_BaseSankoku8Entry): - def __init__(self, target, page_id): - super().__init__(target, page_id) - self._midashi_name = "子見出部" - self._midashi_kana_name = "子見出仮名" - - -class Sankoku8PhraseEntry(_BaseSankoku8Entry): - def get_part_of_speech_tags(self): - # phrases do not contain these tags - return [] - - def _get_headwords(self): - soup = self.get_page_soup() - self._delete_unused_nodes(soup) - 
expressions = self._find_expressions(soup) - readings = self._find_readings(soup) - headwords = {} - if len(expressions) != len(readings): - raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}") - for idx, expression in enumerate(expressions): - reading = readings[idx] - if reading in headwords: - headwords[reading].append(expression) - else: - headwords[reading] = [expression] - return headwords - - def _find_expressions(self, soup): - phrase_soup = soup.find("句表記") - expressions = parse_hyouki_soup(phrase_soup, [""]) - return expressions - - def _find_readings(self, soup): - reading_patterns = load_phrase_readings(self.target) - reading_pattern = reading_patterns[self.entry_id] - readings = parse_hyouki_pattern(reading_pattern) - return readings - - -def parse_hyouki_soup(soup, base_exps): - omitted_characters = [ - "／", "〈", "〉", "（", "）", "⦅", "⦆", "：", "…" - ] - exps = base_exps.copy() - for child in soup.children: - new_exps = [] - if child.name == "言換G": - for alt in child.find_all("言換"): - parts = parse_hyouki_soup(alt, [""]) - for exp in exps: - for part in parts: - new_exps.append(exp + part) - elif child.name == "補足表記": - alt1 = child.find("表記対象") - alt2 = child.find("表記内容G") - parts1 = parse_hyouki_soup(alt1, [""]) - parts2 = parse_hyouki_soup(alt2, [""]) - for exp in exps: - for part in parts1: - new_exps.append(exp + part) - for part in parts2: - new_exps.append(exp + part) - elif child.name == "省略": - parts = parse_hyouki_soup(child, [""]) - for exp in exps: - new_exps.append(exp) - for part in parts: - new_exps.append(exp + part) - elif child.name is not None: - new_exps = parse_hyouki_soup(child, exps) - else: - text = child.text - for char in omitted_characters: - text = text.replace(char, "") - for exp in exps: - new_exps.append(exp + text) - exps = new_exps.copy() - return exps - - -def parse_hyouki_pattern(pattern): - replacements = { - "（": "<省略>（", - "）": "）</省略>", - "｛": "<補足表記><表記対象>", - "・": "</表記対象><表記内容G>（<表記内容>", - "｝": "</表記内容></表記内容G></補足表記>", - "〈": "<言換G>〈<言換>", - "／": "</言換>／<言換>", - "〉": "</言換></言換G>", - "⦅": "<補足表記><表記対象>", - "＼": "</表記対象><表記内容G>⦅<表記内容>", - "⦆": "</表記内容></表記内容G></補足表記>", - } - markup = f"<span>{pattern}</span>" - for key, val in replacements.items(): - markup = markup.replace(key, val) - soup = BeautifulSoup(markup, "xml") - hyouki_soup = soup.find("span") - exps = parse_hyouki_soup(hyouki_soup, [""]) - return exps diff --git a/bot/entries/sankoku8/base_entry.py b/bot/entries/sankoku8/base_entry.py new file mode 100644 index 0000000..8d7a394 --- /dev/null +++ b/bot/entries/sankoku8/base_entry.py @@ -0,0 +1,104 @@ +import bot.soup as Soup +from bot.entries.base.sanseido_entry import SanseidoEntry +from bot.entries.sankoku8.parse import parse_hyouki_soup + + +class BaseEntry(SanseidoEntry): + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.children = [] + self.phrases = [] + self._hyouki_name = "表記" + self._midashi_name = None + self._midashi_kana_name = None + + def _get_headwords(self): + soup = self.get_page_soup() + self._delete_unused_nodes(soup) + readings = self._find_readings(soup) + expressions = self._find_expressions(soup) + headwords = {} + for reading in readings: + headwords[reading] = [] + if len(readings) == 1: + reading = readings[0] + if soup.find(self._midashi_name).find(self._hyouki_name) is None: + headwords[reading].append(reading) + for exp in expressions: + if exp not in headwords[reading]: + headwords[reading].append(exp) + elif len(readings) > 1 and len(expressions) == 0: + for reading in readings: + headwords[reading].append(reading) + elif len(readings) > 1 and
len(expressions) == 1: + if soup.find(self._midashi_name).find(self._hyouki_name) is None: + for reading in readings: + headwords[reading].append(reading) + expression = expressions[0] + for reading in readings: + if expression not in headwords[reading]: + headwords[reading].append(expression) + elif len(readings) > 1 and len(expressions) == len(readings): + if soup.find(self._midashi_name).find(self._hyouki_name) is None: + for reading in readings: + headwords[reading].append(reading) + for idx, reading in enumerate(readings): + exp = expressions[idx] + if exp not in headwords[reading]: + headwords[reading].append(exp) + else: + raise Exception() # shouldn't happen + return headwords + + def get_part_of_speech_tags(self): + if self._part_of_speech_tags is not None: + return self._part_of_speech_tags + self._part_of_speech_tags = [] + soup = self.get_page_soup() + for midashi in soup.find_all([self._midashi_name, "見出部要素"]): + pos_group = midashi.find("品詞G") + if pos_group is None: + continue + for tag in pos_group.find_all("a"): + if tag.text not in self._part_of_speech_tags: + self._part_of_speech_tags.append(tag.text) + return self._part_of_speech_tags + + def _find_expressions(self, soup): + expressions = [] + for hyouki in soup.find_all(self._hyouki_name): + self._fill_alts(hyouki) + for expression in parse_hyouki_soup(hyouki, [""]): + expressions.append(expression) + return expressions + + def _find_readings(self, soup): + midasi_kana = soup.find(self._midashi_kana_name) + readings = parse_hyouki_soup(midasi_kana, [""]) + return readings + + def _get_subentry_parameters(self): + from bot.entries.sankoku8.child_entry import ChildEntry + from bot.entries.sankoku8.phrase_entry import PhraseEntry + subentry_parameters = [ + [ChildEntry, ["子項目"], self.children], + [PhraseEntry, ["句項目"], self.phrases], + ] + return subentry_parameters + + @staticmethod + def _delete_unused_nodes(soup): + """Remove extra markup elements that appear in the entry + headword line which are not part of the entry headword""" + unused_nodes = [ + "語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク", + "アクセント分節", "活用分節", "ルビG", "分書" + ] + for name in unused_nodes: + Soup.delete_soup_nodes(soup, name) + + @staticmethod + def _fill_alts(soup): + for img in soup.find_all("img"): + if img.has_attr("alt"): + img.string = img.attrs["alt"] diff --git a/bot/entries/sankoku8/child_entry.py b/bot/entries/sankoku8/child_entry.py new file mode 100644 index 0000000..9f6b1c1 --- /dev/null +++ b/bot/entries/sankoku8/child_entry.py @@ -0,0 +1,8 @@ +from bot.entries.sankoku8.base_entry import BaseEntry + + +class ChildEntry(BaseEntry): + def __init__(self, target, page_id): + super().__init__(target, page_id) + self._midashi_name = "子見出部" + self._midashi_kana_name = "子見出仮名" diff --git a/bot/entries/sankoku8/entry.py b/bot/entries/sankoku8/entry.py new file mode 100644 index 0000000..533ac66 --- /dev/null +++ b/bot/entries/sankoku8/entry.py @@ -0,0 +1,14 @@ +from bot.entries.sankoku8.base_entry import BaseEntry +from bot.entries.sankoku8.preprocess import preprocess_page + + +class Entry(BaseEntry): + def __init__(self, target, page_id): + entry_id = (page_id, 0) + super().__init__(target, entry_id) + self._midashi_name = "見出部" + self._midashi_kana_name = "見出仮名" + + def set_page(self, page): + page = preprocess_page(page) + super().set_page(page) diff --git a/bot/entries/sankoku8/parse.py b/bot/entries/sankoku8/parse.py new file mode 100644 index 0000000..a57574b --- /dev/null +++ b/bot/entries/sankoku8/parse.py @@ -0,0 +1,65 @@ +from bs4 
import BeautifulSoup + + +def parse_hyouki_soup(soup, base_exps): + omitted_characters = [ + "／", "〈", "〉", "（", "）", "⦅", "⦆", "：", "…" + ] + exps = base_exps.copy() + for child in soup.children: + new_exps = [] + if child.name == "言換G": + for alt in child.find_all("言換"): + parts = parse_hyouki_soup(alt, [""]) + for exp in exps: + for part in parts: + new_exps.append(exp + part) + elif child.name == "補足表記": + alt1 = child.find("表記対象") + alt2 = child.find("表記内容G") + parts1 = parse_hyouki_soup(alt1, [""]) + parts2 = parse_hyouki_soup(alt2, [""]) + for exp in exps: + for part in parts1: + new_exps.append(exp + part) + for part in parts2: + new_exps.append(exp + part) + elif child.name == "省略": + parts = parse_hyouki_soup(child, [""]) + for exp in exps: + new_exps.append(exp) + for part in parts: + new_exps.append(exp + part) + elif child.name is not None: + new_exps = parse_hyouki_soup(child, exps) + else: + text = child.text + for char in omitted_characters: + text = text.replace(char, "") + for exp in exps: + new_exps.append(exp + text) + exps = new_exps.copy() + return exps + + +def parse_hyouki_pattern(pattern): + replacements = { + "（": "<省略>（", + "）": "）</省略>", + "｛": "<補足表記><表記対象>", + "・": "</表記対象><表記内容G>（<表記内容>", + "｝": "</表記内容></表記内容G></補足表記>", + "〈": "<言換G>〈<言換>", + "／": "</言換>／<言換>", + "〉": "</言換></言換G>", + "⦅": "<補足表記><表記対象>", + "＼": "</表記対象><表記内容G>⦅<表記内容>", + "⦆": "</表記内容></表記内容G></補足表記>", + } + markup = f"<span>{pattern}</span>" + for key, val in replacements.items(): + markup = markup.replace(key, val) + soup = BeautifulSoup(markup, "xml") + hyouki_soup = soup.find("span") + exps = parse_hyouki_soup(hyouki_soup, [""]) + return exps diff --git a/bot/entries/sankoku8/phrase_entry.py b/bot/entries/sankoku8/phrase_entry.py new file mode 100644 index 0000000..e5da208 --- /dev/null +++ b/bot/entries/sankoku8/phrase_entry.py @@ -0,0 +1,37 @@ +from bot.data import load_phrase_readings +from bot.entries.sankoku8.base_entry import BaseEntry +from bot.entries.sankoku8.parse import parse_hyouki_soup +from bot.entries.sankoku8.parse import parse_hyouki_pattern + + +class PhraseEntry(BaseEntry): + def get_part_of_speech_tags(self): + # phrases do not contain these tags + return [] + + def _get_headwords(self): + soup = self.get_page_soup() + self._delete_unused_nodes(soup) + expressions = self._find_expressions(soup) + readings = self._find_readings(soup) + headwords = {} + if len(expressions) != len(readings): + raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}") + for idx, expression in enumerate(expressions): + reading = readings[idx] + if reading in headwords: + headwords[reading].append(expression) + else: + headwords[reading] = [expression] + return headwords + + def _find_expressions(self, soup): + phrase_soup = soup.find("句表記") + expressions = parse_hyouki_soup(phrase_soup, [""]) + return expressions + + def _find_readings(self, soup): + reading_patterns = load_phrase_readings(self.target) + reading_pattern = reading_patterns[self.entry_id] + readings = parse_hyouki_pattern(reading_pattern) + return readings diff --git a/bot/entries/sankoku8_preprocess.py b/bot/entries/sankoku8/preprocess.py similarity index 58% rename from bot/entries/sankoku8_preprocess.py rename to bot/entries/sankoku8/preprocess.py index 73fb31a..1eee32d 100644 --- a/bot/entries/sankoku8_preprocess.py +++ b/bot/entries/sankoku8/preprocess.py @@ -4,9 +4,17 @@ from bs4 import BeautifulSoup from bot.data import get_adobe_glyph +__GAIJI = { + "svg-gaiji/byan.svg": "𰻞", + "svg-gaiji/G16EF.svg": "篡", +} + + def preprocess_page(page): soup = BeautifulSoup(page, features="xml")
__replace_glyph_codes(soup) + __add_image_alt_text(soup) + __replace_tatehyphen(soup) page = __strip_page(soup) return page @@ -20,6 +28,21 @@ def __replace_glyph_codes(soup): geta.replace_with(glyph) +def __add_image_alt_text(soup): + for img in soup.find_all("img"): + if not img.has_attr("src"): + continue + src = img.attrs["src"] + if src in __GAIJI: + img.attrs["alt"] = __GAIJI[src] + + +def __replace_tatehyphen(soup): + for img in soup.find_all("img", {"src": "svg-gaiji/tatehyphen.svg"}): + img.string = "−" + img.unwrap() + + def __strip_page(soup): koumoku = soup.find(["項目"]) if koumoku is not None: diff --git a/bot/entries/smk8.py b/bot/entries/smk8.py deleted file mode 100644 index 2d43e4a..0000000 --- a/bot/entries/smk8.py +++ /dev/null @@ -1,221 +0,0 @@ -from bs4 import BeautifulSoup - -import bot.entries.expressions as Expressions -import bot.soup as Soup -from bot.data import load_phrase_readings -from bot.entries.entry import Entry -from bot.entries.smk8_preprocess import preprocess_page - - -class _BaseSmk8Entry(Entry): - def __init__(self, target, entry_id): - super().__init__(target, entry_id) - self.children = [] - self.phrases = [] - self.kanjis = [] - - def get_global_identifier(self): - parent_part = format(self.entry_id[0], '06') - child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper() - return f"@{self.target.value}-{parent_part}-{child_part}" - - def set_page(self, page): - page = self.__decompose_subentries(page) - self._page = page - - def get_page_soup(self): - soup = BeautifulSoup(self._page, "xml") - return soup - - def get_part_of_speech_tags(self): - if self._part_of_speech_tags is not None: - return self._part_of_speech_tags - self._part_of_speech_tags = [] - soup = self.get_page_soup() - headword_info = soup.find("見出要素") - if headword_info is None: - return self._part_of_speech_tags - for tag in headword_info.find_all("品詞M"): - if tag.text not in self._part_of_speech_tags: - self._part_of_speech_tags.append(tag.text) - return self._part_of_speech_tags - - def _add_variant_expressions(self, headwords): - for expressions in headwords.values(): - Expressions.add_variant_kanji(expressions) - Expressions.add_fullwidth(expressions) - Expressions.remove_iteration_mark(expressions) - Expressions.add_iteration_mark(expressions) - - def _find_reading(self, soup): - midasi_kana = soup.find("見出仮名") - reading = midasi_kana.text - for x in [" ", "・"]: - reading = reading.replace(x, "") - return reading - - def _find_expressions(self, soup): - clean_expressions = [] - for expression in soup.find_all("標準表記"): - clean_expression = self._clean_expression(expression.text) - clean_expressions.append(clean_expression) - expressions = Expressions.expand_abbreviation_list(clean_expressions) - return expressions - - def __decompose_subentries(self, page): - soup = BeautifulSoup(page, features="xml") - subentry_parameters = [ - [Smk8ChildEntry, ["子項目F", "子項目"], self.children], - [Smk8PhraseEntry, ["句項目F", "句項目"], self.phrases], - [Smk8KanjiEntry, ["造語成分項目"], self.kanjis], - ] - for x in subentry_parameters: - subentry_class, tags, subentry_list = x - for tag in tags: - tag_soup = soup.find(tag) - while tag_soup is not None: - tag_soup.name = "項目" - subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"]) - self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id - subentry = subentry_class(self.target, subentry_id) - page = tag_soup.decode() - subentry.set_page(page) - subentry_list.append(subentry) - tag_soup.decompose() - tag_soup = soup.find(tag) - return 
soup.decode() - - @staticmethod - def id_string_to_entry_id(id_string): - parts = id_string.split("-") - if len(parts) == 1: - return (int(parts[0]), 0) - elif len(parts) == 2: - # subentries have a hexadecimal part - return (int(parts[0]), int(parts[1], 16)) - else: - raise Exception(f"Invalid entry ID: {id_string}") - - @staticmethod - def _delete_unused_nodes(soup): - """Remove extra markup elements that appear in the entry - headword line which are not part of the entry headword""" - unused_nodes = [ - "表音表記", "表外音訓マーク", "表外字マーク", "ルビG" - ] - for name in unused_nodes: - Soup.delete_soup_nodes(soup, name) - - @staticmethod - def _clean_expression(expression): - for x in ["〈", "〉", "{", "}", "…", " "]: - expression = expression.replace(x, "") - return expression - - @staticmethod - def _fill_alts(soup): - for el in soup.find_all(["親見出仮名", "親見出表記"]): - el.string = el.attrs["alt"] - for gaiji in soup.find_all("外字"): - gaiji.string = gaiji.img.attrs["alt"] - - -class Smk8Entry(_BaseSmk8Entry): - def __init__(self, target, page_id): - entry_id = (page_id, 0) - super().__init__(target, entry_id) - - def set_page(self, page): - page = preprocess_page(page) - super().set_page(page) - - def _get_headwords(self): - soup = self.get_page_soup() - self._delete_unused_nodes(soup) - self._fill_alts(soup) - reading = self._find_reading(soup) - expressions = [] - if soup.find("見出部").find("標準表記") is None: - expressions.append(reading) - for expression in self._find_expressions(soup): - if expression not in expressions: - expressions.append(expression) - headwords = {reading: expressions} - return headwords - - -class Smk8ChildEntry(_BaseSmk8Entry): - def _get_headwords(self): - soup = self.get_page_soup() - self._delete_unused_nodes(soup) - self._fill_alts(soup) - reading = self._find_reading(soup) - expressions = [] - if soup.find("子見出部").find("標準表記") is None: - expressions.append(reading) - for expression in self._find_expressions(soup): - if expression not in expressions: - expressions.append(expression) - headwords = {reading: expressions} - return headwords - - -class Smk8PhraseEntry(_BaseSmk8Entry): - def __init__(self, target, entry_id): - super().__init__(target, entry_id) - self.__phrase_readings = load_phrase_readings(self.target) - - def get_part_of_speech_tags(self): - # phrases do not contain these tags - return [] - - def _get_headwords(self): - soup = self.get_page_soup() - headwords = {} - expressions = self._find_expressions(soup) - readings = self._find_readings() - for idx, expression in enumerate(expressions): - reading = readings[idx] - if reading in headwords: - headwords[reading].append(expression) - else: - headwords[reading] = [expression] - return headwords - - def _find_expressions(self, soup): - self._delete_unused_nodes(soup) - self._fill_alts(soup) - text = soup.find("標準表記").text - text = self._clean_expression(text) - alternatives = Expressions.expand_smk_alternatives(text) - expressions = [] - for alt in alternatives: - for exp in Expressions.expand_abbreviation(alt): - expressions.append(exp) - return expressions - - def _find_readings(self): - text = self.__phrase_readings[self.entry_id] - alternatives = Expressions.expand_smk_alternatives(text) - readings = [] - for alt in alternatives: - for reading in Expressions.expand_abbreviation(alt): - readings.append(reading) - return readings - - -class Smk8KanjiEntry(_BaseSmk8Entry): - def _get_headwords(self): - soup = self.get_page_soup() - self._delete_unused_nodes(soup) - self._fill_alts(soup) - reading = 
self.__get_parent_reading() - expressions = self._find_expressions(soup) - headwords = {reading: expressions} - return headwords - - def __get_parent_reading(self): - parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id] - parent = self.ID_TO_ENTRY[parent_id] - reading = parent.get_first_reading() - return reading diff --git a/bot/entries/smk8/base_entry.py b/bot/entries/smk8/base_entry.py new file mode 100644 index 0000000..7bf32c2 --- /dev/null +++ b/bot/entries/smk8/base_entry.py @@ -0,0 +1,73 @@ +import bot.soup as Soup +import bot.entries.base.expressions as Expressions +from bot.entries.base.sanseido_entry import SanseidoEntry + + +class BaseEntry(SanseidoEntry): + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.children = [] + self.phrases = [] + self.kanjis = [] + + def get_part_of_speech_tags(self): + if self._part_of_speech_tags is not None: + return self._part_of_speech_tags + self._part_of_speech_tags = [] + soup = self.get_page_soup() + headword_info = soup.find("見出要素") + if headword_info is None: + return self._part_of_speech_tags + for tag in headword_info.find_all("品詞M"): + if tag.text not in self._part_of_speech_tags: + self._part_of_speech_tags.append(tag.text) + return self._part_of_speech_tags + + def _find_reading(self, soup): + midasi_kana = soup.find("見出仮名") + reading = midasi_kana.text + for x in [" ", "・"]: + reading = reading.replace(x, "") + return reading + + def _find_expressions(self, soup): + clean_expressions = [] + for expression in soup.find_all("標準表記"): + clean_expression = self._clean_expression(expression.text) + clean_expressions.append(clean_expression) + expressions = Expressions.expand_abbreviation_list(clean_expressions) + return expressions + + def _get_subentry_parameters(self): + from bot.entries.smk8.child_entry import ChildEntry + from bot.entries.smk8.phrase_entry import PhraseEntry + from bot.entries.smk8.kanji_entry import KanjiEntry + subentry_parameters = [ + [ChildEntry, ["子項目F", "子項目"], self.children], + [PhraseEntry, ["句項目F", "句項目"], self.phrases], + [KanjiEntry, ["造語成分項目"], self.kanjis], + ] + return subentry_parameters + + @staticmethod + def _delete_unused_nodes(soup): + """Remove extra markup elements that appear in the entry + headword line which are not part of the entry headword""" + unused_nodes = [ + "表音表記", "表外音訓マーク", "表外字マーク", "ルビG" + ] + for name in unused_nodes: + Soup.delete_soup_nodes(soup, name) + + @staticmethod + def _clean_expression(expression): + for x in ["〈", "〉", "{", "}", "…", " "]: + expression = expression.replace(x, "") + return expression + + @staticmethod + def _fill_alts(soup): + for elm in soup.find_all(["親見出仮名", "親見出表記"]): + elm.string = elm.attrs["alt"] + for gaiji in soup.find_all("外字"): + gaiji.string = gaiji.img.attrs["alt"] diff --git a/bot/entries/smk8/child_entry.py b/bot/entries/smk8/child_entry.py new file mode 100644 index 0000000..0dbe375 --- /dev/null +++ b/bot/entries/smk8/child_entry.py @@ -0,0 +1,17 @@ +from bot.entries.smk8.base_entry import BaseEntry + + +class ChildEntry(BaseEntry): + def _get_headwords(self): + soup = self.get_page_soup() + self._delete_unused_nodes(soup) + self._fill_alts(soup) + reading = self._find_reading(soup) + expressions = [] + if soup.find("子見出部").find("標準表記") is None: + expressions.append(reading) + for expression in self._find_expressions(soup): + if expression not in expressions: + expressions.append(expression) + headwords = {reading: expressions} + return headwords diff --git a/bot/entries/smk8/entry.py 
b/bot/entries/smk8/entry.py new file mode 100644 index 0000000..4baed42 --- /dev/null +++ b/bot/entries/smk8/entry.py @@ -0,0 +1,26 @@ +from bot.entries.smk8.base_entry import BaseEntry +from bot.entries.smk8.preprocess import preprocess_page + + +class Entry(BaseEntry): + def __init__(self, target, page_id): + entry_id = (page_id, 0) + super().__init__(target, entry_id) + + def set_page(self, page): + page = preprocess_page(page) + super().set_page(page) + + def _get_headwords(self): + soup = self.get_page_soup() + self._delete_unused_nodes(soup) + self._fill_alts(soup) + reading = self._find_reading(soup) + expressions = [] + if soup.find("見出部").find("標準表記") is None: + expressions.append(reading) + for expression in self._find_expressions(soup): + if expression not in expressions: + expressions.append(expression) + headwords = {reading: expressions} + return headwords diff --git a/bot/entries/smk8/kanji_entry.py b/bot/entries/smk8/kanji_entry.py new file mode 100644 index 0000000..3e77faf --- /dev/null +++ b/bot/entries/smk8/kanji_entry.py @@ -0,0 +1,22 @@ +from bot.entries.smk8.base_entry import BaseEntry + + +class KanjiEntry(BaseEntry): + def get_part_of_speech_tags(self): + # kanji entries do not contain these tags + return [] + + def _get_headwords(self): + soup = self.get_page_soup() + self._delete_unused_nodes(soup) + self._fill_alts(soup) + reading = self.__get_parent_reading() + expressions = self._find_expressions(soup) + headwords = {reading: expressions} + return headwords + + def __get_parent_reading(self): + parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id] + parent = self.ID_TO_ENTRY[parent_id] + reading = parent.get_first_reading() + return reading diff --git a/bot/entries/smk8/phrase_entry.py b/bot/entries/smk8/phrase_entry.py new file mode 100644 index 0000000..aac9b84 --- /dev/null +++ b/bot/entries/smk8/phrase_entry.py @@ -0,0 +1,64 @@ +import re + +import bot.entries.base.expressions as Expressions +from bot.data import load_phrase_readings +from bot.entries.smk8.base_entry import BaseEntry + + +class PhraseEntry(BaseEntry): + def __init__(self, target, entry_id): + super().__init__(target, entry_id) + self.__phrase_readings = load_phrase_readings(self.target) + + def get_part_of_speech_tags(self): + # phrase entries do not contain these tags + return [] + + def _get_headwords(self): + soup = self.get_page_soup() + headwords = {} + expressions = self._find_expressions(soup) + readings = self._find_readings() + for idx, expression in enumerate(expressions): + reading = readings[idx] + if reading in headwords: + headwords[reading].append(expression) + else: + headwords[reading] = [expression] + return headwords + + def _find_expressions(self, soup): + self._delete_unused_nodes(soup) + self._fill_alts(soup) + text = soup.find("標準表記").text + text = self._clean_expression(text) + alternatives = parse_phrase(text) + expressions = [] + for alt in alternatives: + for exp in Expressions.expand_abbreviation(alt): + expressions.append(exp) + return expressions + + def _find_readings(self): + text = self.__phrase_readings[self.entry_id] + alternatives = parse_phrase(text) + readings = [] + for alt in alternatives: + for reading in Expressions.expand_abbreviation(alt): + readings.append(reading) + return readings + + +def parse_phrase(text): + """Return a list of strings described by △ notation.""" + match = re.search(r"△([^(]+)\(([^(]+)\)", text) + if match is None: + return [text] + alt_parts = [match.group(1)] + for alt_part in match.group(2).split("・"): +
alt_parts.append(alt_part) + alts = [] + for alt_part in alt_parts: + alt_exp = re.sub(r"△[^(]+\([^(]+\)", alt_part, text) + alts.append(alt_exp) + return alts diff --git a/bot/entries/smk8_preprocess.py b/bot/entries/smk8/preprocess.py similarity index 98% rename from bot/entries/smk8_preprocess.py rename to bot/entries/smk8/preprocess.py index 5c9b924..ebda252 100644 --- a/bot/entries/smk8_preprocess.py +++ b/bot/entries/smk8/preprocess.py @@ -6,8 +6,8 @@ from bot.data import get_adobe_glyph __GAIJI = { "gaiji/5350.svg": "卐", - "gaiji/62cb.svg": "抛", - "gaiji/7be1.svg": "簒", + "gaiji/62cb.svg": "拋", + "gaiji/7be1.svg": "篡", } diff --git a/bot/factory.py b/bot/factory.py new file mode 100644 index 0000000..7b025d4 --- /dev/null +++ b/bot/factory.py @@ -0,0 +1,37 @@ +import importlib + + +def new_crawler(target): + module_path = f"bot.crawlers.{target.name.lower()}" + module = importlib.import_module(module_path) + return module.Crawler(target) + + +def new_entry(target, page_id): + module_path = f"bot.entries.{target.name.lower()}.entry" + module = importlib.import_module(module_path) + return module.Entry(target, page_id) + + +def new_yomichan_exporter(target): + module_path = f"bot.yomichan.exporters.{target.name.lower()}" + module = importlib.import_module(module_path) + return module.Exporter(target) + + +def new_yomichan_terminator(target): + module_path = f"bot.yomichan.terms.{target.name.lower()}" + module = importlib.import_module(module_path) + return module.Terminator(target) + + +def new_mdict_exporter(target): + module_path = f"bot.mdict.exporters.{target.name.lower()}" + module = importlib.import_module(module_path) + return module.Exporter(target) + + +def new_mdict_terminator(target): + module_path = f"bot.mdict.terms.{target.name.lower()}" + module = importlib.import_module(module_path) + return module.Terminator(target) diff --git a/bot/mdict/exporters/export.py b/bot/mdict/exporters/base/exporter.py similarity index 74% rename from bot/mdict/exporters/export.py rename to bot/mdict/exporters/base/exporter.py index b8e8347..37ed376 100644 --- a/bot/mdict/exporters/export.py +++ b/bot/mdict/exporters/base/exporter.py @@ -1,20 +1,19 @@ -# pylint: disable=too-few-public-methods - -import subprocess import os import shutil +import subprocess from abc import ABC, abstractmethod from pathlib import Path -from datetime import datetime + from platformdirs import user_documents_dir, user_cache_dir -from bot.mdict.terms.factory import new_terminator +from bot.time import timestamp +from bot.factory import new_mdict_terminator -class Exporter(ABC): +class BaseExporter(ABC): def __init__(self, target): self._target = target - self._terminator = new_terminator(target) + self._terminator = new_mdict_terminator(target) self._build_dir = None self._build_media_dir = None self._description_file = None @@ -34,7 +33,7 @@ class Exporter(ABC): return self._build_dir cache_dir = user_cache_dir("jitenbot") build_directory = os.path.join(cache_dir, "mdict_build") - print(f"Initializing build directory `{build_directory}`") + print(f"{timestamp()} Initializing build directory `{build_directory}`") if Path(build_directory).is_dir(): shutil.rmtree(build_directory) os.makedirs(build_directory) @@ -45,7 +44,7 @@ class Exporter(ABC): build_dir = self._get_build_dir() build_media_dir = os.path.join(build_dir, self._target.value) if media_dir is not None: - print("Copying media files to build directory...") + print(f"{timestamp()} Copying media files to build directory...") shutil.copytree(media_dir,
build_media_dir) else: os.makedirs(build_media_dir) @@ -71,7 +70,7 @@ class Exporter(ABC): def _write_mdx_file(self, entries): terms = self._get_terms(entries) - print(f"Exporting {len(terms)} Mdict keys...") + print(f"{timestamp()} Exporting {len(terms)} Mdict keys...") out_dir = self._get_out_dir() out_file = os.path.join(out_dir, f"{self._target.value}.mdx") params = [ @@ -87,7 +86,7 @@ class Exporter(ABC): terms = [] entries_len = len(entries) for idx, entry in enumerate(entries): - update = f"Creating Mdict terms for entry {idx+1}/{entries_len}" + update = f"\tCreating MDict terms for entry {idx+1}/{entries_len}" print(update, end='\r', flush=True) new_terms = self._terminator.make_terms(entry) for term in new_terms: @@ -126,7 +125,7 @@ class Exporter(ABC): return self._out_dir out_dir = os.path.join( user_documents_dir(), "jitenbot", "mdict", self._target.value) - print(f"Initializing output directory `{out_dir}`") + print(f"{timestamp()} Initializing output directory `{out_dir}`") if Path(out_dir).is_dir(): shutil.rmtree(out_dir) os.makedirs(out_dir) @@ -168,58 +167,8 @@ class Exporter(ABC): @abstractmethod def _get_revision(self, entries): - pass + raise NotImplementedError @abstractmethod def _get_attribution(self, entries): - pass - - -class _JitenonExporter(Exporter): - def _get_revision(self, entries): - modified_date = None - for entry in entries: - if modified_date is None or entry.modified_date > modified_date: - modified_date = entry.modified_date - revision = modified_date.strftime("%Y年%m月%d日閲覧") - return revision - - def _get_attribution(self, entries): - modified_date = None - for entry in entries: - if modified_date is None or entry.modified_date > modified_date: - attribution = entry.attribution - return attribution - - -class JitenonKokugoExporter(_JitenonExporter): - pass - - -class JitenonYojiExporter(_JitenonExporter): - pass - - -class JitenonKotowazaExporter(_JitenonExporter): - pass - - -class _MonokakidoExporter(Exporter): - def _get_revision(self, entries): - timestamp = datetime.now().strftime("%Y年%m月%d日作成") - return timestamp - - -class Smk8Exporter(_MonokakidoExporter): - def _get_attribution(self, entries): - return "© Sanseido Co., LTD. 2020" - - -class Daijirin2Exporter(_MonokakidoExporter): - def _get_attribution(self, entries): - return "© Sanseido Co., LTD. 2019" - - -class Sankoku8Exporter(_MonokakidoExporter): - def _get_attribution(self, entries): - return "© Sanseido Co., LTD. 
2021" + raise NotImplementedError diff --git a/bot/mdict/exporters/base/jitenon.py b/bot/mdict/exporters/base/jitenon.py new file mode 100644 index 0000000..2e6b1df --- /dev/null +++ b/bot/mdict/exporters/base/jitenon.py @@ -0,0 +1,18 @@ +from bot.mdict.exporters.base.exporter import BaseExporter + + +class JitenonExporter(BaseExporter): + def _get_revision(self, entries): + modified_date = None + for entry in entries: + if modified_date is None or entry.modified_date > modified_date: + modified_date = entry.modified_date + revision = modified_date.strftime("%Y年%m月%d日閲覧") + return revision + + def _get_attribution(self, entries): + modified_date = None + for entry in entries: + if modified_date is None or entry.modified_date > modified_date: + attribution = entry.attribution + return attribution diff --git a/bot/mdict/exporters/base/monokakido.py b/bot/mdict/exporters/base/monokakido.py new file mode 100644 index 0000000..b9b9629 --- /dev/null +++ b/bot/mdict/exporters/base/monokakido.py @@ -0,0 +1,8 @@ +from datetime import datetime +from bot.mdict.exporters.base.exporter import BaseExporter + + +class MonokakidoExporter(BaseExporter): + def _get_revision(self, entries): + timestamp = datetime.now().strftime("%Y年%m月%d日作成") + return timestamp diff --git a/bot/mdict/exporters/daijirin2.py b/bot/mdict/exporters/daijirin2.py new file mode 100644 index 0000000..4692470 --- /dev/null +++ b/bot/mdict/exporters/daijirin2.py @@ -0,0 +1,6 @@ +from bot.mdict.exporters.base.monokakido import MonokakidoExporter + + +class Exporter(MonokakidoExporter): + def _get_attribution(self, entries): + return "© Sanseido Co., LTD. 2019" diff --git a/bot/mdict/exporters/factory.py b/bot/mdict/exporters/factory.py deleted file mode 100644 index 5417493..0000000 --- a/bot/mdict/exporters/factory.py +++ /dev/null @@ -1,20 +0,0 @@ -from bot.targets import Targets - -from bot.mdict.exporters.export import JitenonKokugoExporter -from bot.mdict.exporters.export import JitenonYojiExporter -from bot.mdict.exporters.export import JitenonKotowazaExporter -from bot.mdict.exporters.export import Smk8Exporter -from bot.mdict.exporters.export import Daijirin2Exporter -from bot.mdict.exporters.export import Sankoku8Exporter - - -def new_mdict_exporter(target): - exporter_map = { - Targets.JITENON_KOKUGO: JitenonKokugoExporter, - Targets.JITENON_YOJI: JitenonYojiExporter, - Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter, - Targets.SMK8: Smk8Exporter, - Targets.DAIJIRIN2: Daijirin2Exporter, - Targets.SANKOKU8: Sankoku8Exporter, - } - return exporter_map[target](target) diff --git a/bot/mdict/exporters/jitenon_kokugo.py b/bot/mdict/exporters/jitenon_kokugo.py new file mode 100644 index 0000000..5689fa8 --- /dev/null +++ b/bot/mdict/exporters/jitenon_kokugo.py @@ -0,0 +1,5 @@ +from bot.mdict.exporters.base.jitenon import JitenonExporter + + +class Exporter(JitenonExporter): + pass diff --git a/bot/mdict/exporters/jitenon_kotowaza.py b/bot/mdict/exporters/jitenon_kotowaza.py new file mode 100644 index 0000000..5689fa8 --- /dev/null +++ b/bot/mdict/exporters/jitenon_kotowaza.py @@ -0,0 +1,5 @@ +from bot.mdict.exporters.base.jitenon import JitenonExporter + + +class Exporter(JitenonExporter): + pass diff --git a/bot/mdict/exporters/jitenon_yoji.py b/bot/mdict/exporters/jitenon_yoji.py new file mode 100644 index 0000000..5689fa8 --- /dev/null +++ b/bot/mdict/exporters/jitenon_yoji.py @@ -0,0 +1,5 @@ +from bot.mdict.exporters.base.jitenon import JitenonExporter + + +class Exporter(JitenonExporter): + pass diff --git 
a/bot/mdict/exporters/sankoku8.py b/bot/mdict/exporters/sankoku8.py new file mode 100644 index 0000000..6063864 --- /dev/null +++ b/bot/mdict/exporters/sankoku8.py @@ -0,0 +1,6 @@ +from bot.mdict.exporters.base.monokakido import MonokakidoExporter + + +class Exporter(MonokakidoExporter): + def _get_attribution(self, entries): + return "© Sanseido Co., LTD. 2021" diff --git a/bot/mdict/exporters/smk8.py b/bot/mdict/exporters/smk8.py new file mode 100644 index 0000000..a030b4b --- /dev/null +++ b/bot/mdict/exporters/smk8.py @@ -0,0 +1,6 @@ +from bot.mdict.exporters.base.monokakido import MonokakidoExporter + + +class Exporter(MonokakidoExporter): + def _get_attribution(self, entries): + return "© Sanseido Co., LTD. 2020" diff --git a/bot/mdict/terms/base/jitenon.py b/bot/mdict/terms/base/jitenon.py new file mode 100644 index 0000000..4f255bf --- /dev/null +++ b/bot/mdict/terms/base/jitenon.py @@ -0,0 +1,20 @@ +from bot.mdict.terms.base.terminator import BaseTerminator + + +class JitenonTerminator(BaseTerminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = None + + def _glossary(self, entry): + if entry.entry_id in self._glossary_cache: + return self._glossary_cache[entry.entry_id] + glossary = self._glossary_maker.make_glossary(entry, self._media_dir) + self._glossary_cache[entry.entry_id] = glossary + return glossary + + def _link_glossary_parameters(self, entry): + return [] + + def _subentry_lists(self, entry): + return [] diff --git a/bot/mdict/terms/terminator.py b/bot/mdict/terms/base/terminator.py similarity index 95% rename from bot/mdict/terms/terminator.py rename to bot/mdict/terms/base/terminator.py index ee62411..945a65b 100644 --- a/bot/mdict/terms/terminator.py +++ b/bot/mdict/terms/base/terminator.py @@ -2,7 +2,7 @@ import re from abc import abstractmethod, ABC -class Terminator(ABC): +class BaseTerminator(ABC): def __init__(self, target): self._target = target self._glossary_cache = {} @@ -72,12 +72,12 @@ class Terminator(ABC): @abstractmethod def _glossary(self, entry): - pass + raise NotImplementedError @abstractmethod def _link_glossary_parameters(self, entry): - pass + raise NotImplementedError @abstractmethod def _subentry_lists(self, entry): - pass + raise NotImplementedError diff --git a/bot/mdict/terms/daijirin2.py b/bot/mdict/terms/daijirin2.py index 3b5ce68..640b520 100644 --- a/bot/mdict/terms/daijirin2.py +++ b/bot/mdict/terms/daijirin2.py @@ -1,8 +1,8 @@ -from bot.mdict.terms.terminator import Terminator +from bot.mdict.terms.base.terminator import BaseTerminator from bot.mdict.glossary.daijirin2 import make_glossary -class Daijirin2Terminator(Terminator): +class Terminator(BaseTerminator): def _glossary(self, entry): if entry.entry_id in self._glossary_cache: return self._glossary_cache[entry.entry_id] diff --git a/bot/mdict/terms/factory.py b/bot/mdict/terms/factory.py deleted file mode 100644 index 8cee8e7..0000000 --- a/bot/mdict/terms/factory.py +++ /dev/null @@ -1,20 +0,0 @@ -from bot.targets import Targets - -from bot.mdict.terms.jitenon import JitenonKokugoTerminator -from bot.mdict.terms.jitenon import JitenonYojiTerminator -from bot.mdict.terms.jitenon import JitenonKotowazaTerminator -from bot.mdict.terms.smk8 import Smk8Terminator -from bot.mdict.terms.daijirin2 import Daijirin2Terminator -from bot.mdict.terms.sankoku8 import Sankoku8Terminator - - -def new_terminator(target): - terminator_map = { - Targets.JITENON_KOKUGO: JitenonKokugoTerminator, - Targets.JITENON_YOJI: JitenonYojiTerminator, - 
Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator, - Targets.SMK8: Smk8Terminator, - Targets.DAIJIRIN2: Daijirin2Terminator, - Targets.SANKOKU8: Sankoku8Terminator, - } - return terminator_map[target](target) diff --git a/bot/mdict/terms/jitenon.py b/bot/mdict/terms/jitenon.py deleted file mode 100644 index 3f9cfc1..0000000 --- a/bot/mdict/terms/jitenon.py +++ /dev/null @@ -1,42 +0,0 @@ -from bot.mdict.terms.terminator import Terminator - -from bot.mdict.glossary.jitenon import JitenonKokugoGlossary -from bot.mdict.glossary.jitenon import JitenonYojiGlossary -from bot.mdict.glossary.jitenon import JitenonKotowazaGlossary - - -class JitenonTerminator(Terminator): - def __init__(self, target): - super().__init__(target) - self._glossary_maker = None - - def _glossary(self, entry): - if entry.entry_id in self._glossary_cache: - return self._glossary_cache[entry.entry_id] - glossary = self._glossary_maker.make_glossary(entry, self._media_dir) - self._glossary_cache[entry.entry_id] = glossary - return glossary - - def _link_glossary_parameters(self, entry): - return [] - - def _subentry_lists(self, entry): - return [] - - -class JitenonKokugoTerminator(JitenonTerminator): - def __init__(self, target): - super().__init__(target) - self._glossary_maker = JitenonKokugoGlossary() - - -class JitenonYojiTerminator(JitenonTerminator): - def __init__(self, target): - super().__init__(target) - self._glossary_maker = JitenonYojiGlossary() - - -class JitenonKotowazaTerminator(JitenonTerminator): - def __init__(self, target): - super().__init__(target) - self._glossary_maker = JitenonKotowazaGlossary() diff --git a/bot/mdict/terms/jitenon_kokugo.py b/bot/mdict/terms/jitenon_kokugo.py new file mode 100644 index 0000000..2a44b7b --- /dev/null +++ b/bot/mdict/terms/jitenon_kokugo.py @@ -0,0 +1,8 @@ +from bot.mdict.terms.base.jitenon import JitenonTerminator +from bot.mdict.glossary.jitenon import JitenonKokugoGlossary + + +class Terminator(JitenonTerminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = JitenonKokugoGlossary() diff --git a/bot/mdict/terms/jitenon_kotowaza.py b/bot/mdict/terms/jitenon_kotowaza.py new file mode 100644 index 0000000..3492a49 --- /dev/null +++ b/bot/mdict/terms/jitenon_kotowaza.py @@ -0,0 +1,8 @@ +from bot.mdict.terms.base.jitenon import JitenonTerminator +from bot.mdict.glossary.jitenon import JitenonKotowazaGlossary + + +class Terminator(JitenonTerminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = JitenonKotowazaGlossary() diff --git a/bot/mdict/terms/jitenon_yoji.py b/bot/mdict/terms/jitenon_yoji.py new file mode 100644 index 0000000..a4175a1 --- /dev/null +++ b/bot/mdict/terms/jitenon_yoji.py @@ -0,0 +1,8 @@ +from bot.mdict.terms.base.jitenon import JitenonTerminator +from bot.mdict.glossary.jitenon import JitenonYojiGlossary + + +class Terminator(JitenonTerminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = JitenonYojiGlossary() diff --git a/bot/mdict/terms/sankoku8.py b/bot/mdict/terms/sankoku8.py index 5c1bfb7..71a3b8f 100644 --- a/bot/mdict/terms/sankoku8.py +++ b/bot/mdict/terms/sankoku8.py @@ -1,8 +1,8 @@ -from bot.mdict.terms.terminator import Terminator +from bot.mdict.terms.base.terminator import BaseTerminator from bot.mdict.glossary.sankoku8 import make_glossary -class Sankoku8Terminator(Terminator): +class Terminator(BaseTerminator): def _glossary(self, entry): if entry.entry_id in self._glossary_cache: return 
self._glossary_cache[entry.entry_id] diff --git a/bot/mdict/terms/smk8.py b/bot/mdict/terms/smk8.py index 22275d5..ef2b7a2 100644 --- a/bot/mdict/terms/smk8.py +++ b/bot/mdict/terms/smk8.py @@ -1,8 +1,8 @@ -from bot.mdict.terms.terminator import Terminator +from bot.mdict.terms.base.terminator import BaseTerminator from bot.mdict.glossary.smk8 import make_glossary -class Smk8Terminator(Terminator): +class Terminator(BaseTerminator): def _glossary(self, entry): if entry.entry_id in self._glossary_cache: return self._glossary_cache[entry.entry_id] diff --git a/bot/time.py b/bot/time.py new file mode 100644 index 0000000..f8dae94 --- /dev/null +++ b/bot/time.py @@ -0,0 +1,5 @@ +import time + + +def timestamp(): + return time.strftime('%X') diff --git a/bot/yomichan/exporters/export.py b/bot/yomichan/exporters/base/exporter.py similarity index 69% rename from bot/yomichan/exporters/export.py rename to bot/yomichan/exporters/base/exporter.py index d348fed..5e4e870 100644 --- a/bot/yomichan/exporters/export.py +++ b/bot/yomichan/exporters/base/exporter.py @@ -1,24 +1,23 @@ -# pylint: disable=too-few-public-methods - import json import os import shutil import copy from pathlib import Path -from datetime import datetime from abc import ABC, abstractmethod -from platformdirs import user_documents_dir, user_cache_dir import fastjsonschema +from platformdirs import user_documents_dir, user_cache_dir + +from bot.time import timestamp from bot.data import load_yomichan_metadata -from bot.yomichan.terms.factory import new_terminator from bot.data import load_yomichan_term_schema +from bot.factory import new_yomichan_terminator -class Exporter(ABC): +class BaseExporter(ABC): def __init__(self, target): self._target = target - self._terminator = new_terminator(target) + self._terminator = new_yomichan_terminator(target) self._build_dir = None self._terms_per_file = 2000 @@ -36,18 +35,18 @@ class Exporter(ABC): @abstractmethod def _get_revision(self, entries): - pass + raise NotImplementedError @abstractmethod def _get_attribution(self, entries): - pass + raise NotImplementedError def _get_build_dir(self): if self._build_dir is not None: return self._build_dir cache_dir = user_cache_dir("jitenbot") build_directory = os.path.join(cache_dir, "yomichan_build") - print(f"Initializing build directory `{build_directory}`") + print(f"{timestamp()} Initializing build directory `{build_directory}`") if Path(build_directory).is_dir(): shutil.rmtree(build_directory) os.makedirs(build_directory) @@ -66,8 +65,9 @@ class Exporter(ABC): build_dir = self._get_build_dir() build_img_dir = os.path.join(build_dir, self._target.value) if image_dir is not None: - print("Copying media files to build directory...") + print(f"{timestamp()} Copying media files to build directory...") shutil.copytree(image_dir, build_img_dir) + print(f"{timestamp()} Finished copying files") else: os.makedirs(build_img_dir) self._terminator.set_image_dir(build_img_dir) @@ -76,7 +76,7 @@ class Exporter(ABC): terms = [] entries_len = len(entries) for idx, entry in enumerate(entries): - update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}" + update = f"\tCreating Yomichan terms for entry {idx+1}/{entries_len}" print(update, end='\r', flush=True) new_terms = self._terminator.make_terms(entry) for term in new_terms: @@ -85,7 +85,7 @@ class Exporter(ABC): return terms def __validate_terms(self, terms): - print("Making a copy of term data for validation...") + print(f"{timestamp()} Making a copy of term data for validation...") terms_copy = 
copy.deepcopy(terms) # because validator will alter data! term_count = len(terms_copy) log_dir = self.__get_invalid_term_dir() @@ -93,7 +93,7 @@ class Exporter(ABC): validator = fastjsonschema.compile(schema) failure_count = 0 for idx, term in enumerate(terms_copy): - update = f"Validating term {idx+1}/{term_count}" + update = f"\tValidating term {idx+1}/{term_count}" print(update, end='\r', flush=True) try: validator([term]) @@ -102,9 +102,9 @@ class Exporter(ABC): term_file = os.path.join(log_dir, f"{idx}.json") with open(term_file, "w", encoding='utf8') as f: json.dump([term], f, indent=4, ensure_ascii=False) - print(f"\nFinished validating with {failure_count} error{'' if failure_count == 1 else 's'}") + print(f"\n{timestamp()} Finished validating with {failure_count} error{'' if failure_count == 1 else 's'}") if failure_count > 0: - print(f"Invalid terms saved to `{log_dir}` for debugging") + print(f"{timestamp()} Invalid terms saved to `{log_dir}` for debugging") def __make_dictionary(self, terms, index, tags): self.__write_term_banks(terms) @@ -114,14 +114,14 @@ class Exporter(ABC): self.__rm_build_dir() def __write_term_banks(self, terms): - print(f"Exporting {len(terms)} JSON terms") + print(f"{timestamp()} Exporting {len(terms)} JSON terms") build_dir = self._get_build_dir() max_i = int(len(terms) / self._terms_per_file) + 1 for i in range(max_i): + update = f"\tWriting terms to term bank {i+1}/{max_i}" + print(update, end='\r', flush=True) start = self._terms_per_file * i end = self._terms_per_file * (i + 1) - update = f"Writing terms to term banks {start} - {end}" - print(update, end='\r', flush=True) term_file = os.path.join(build_dir, f"term_bank_{i+1}.json") with open(term_file, "w", encoding='utf8') as f: json.dump(terms[start:end], f, indent=4, ensure_ascii=False) @@ -142,8 +142,8 @@ class Exporter(ABC): json.dump(tags, f, indent=4, ensure_ascii=False) def __write_archive(self, filename): - print("Archiving data to ZIP file...") archive_format = "zip" + print(f"{timestamp()} Archiving data to {archive_format.upper()} file...") out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan") if not Path(out_dir).is_dir(): os.makedirs(out_dir) @@ -154,58 +154,8 @@ class Exporter(ABC): base_filename = os.path.join(out_dir, filename) build_dir = self._get_build_dir() shutil.make_archive(base_filename, archive_format, build_dir) - print(f"Dictionary file saved to {out_filepath}") + print(f"{timestamp()} Dictionary file saved to `{out_filepath}`") def __rm_build_dir(self): build_dir = self._get_build_dir() shutil.rmtree(build_dir) - - -class _JitenonExporter(Exporter): - def _get_revision(self, entries): - modified_date = None - for entry in entries: - if modified_date is None or entry.modified_date > modified_date: - modified_date = entry.modified_date - revision = f"{self._target.value};{modified_date}" - return revision - - def _get_attribution(self, entries): - modified_date = None - for entry in entries: - if modified_date is None or entry.modified_date > modified_date: - attribution = entry.attribution - return attribution - - -class JitenonKokugoExporter(_JitenonExporter): - pass - - -class JitenonYojiExporter(_JitenonExporter): - pass - - -class JitenonKotowazaExporter(_JitenonExporter): - pass - - -class _MonokakidoExporter(Exporter): - def _get_revision(self, entries): - timestamp = datetime.now().strftime("%Y-%m-%d") - return f"{self._target.value};{timestamp}" - - -class Smk8Exporter(_MonokakidoExporter): - def _get_attribution(self, entries): - return "© 
Sanseido Co., LTD. 2020" - - -class Daijirin2Exporter(_MonokakidoExporter): - def _get_attribution(self, entries): - return "© Sanseido Co., LTD. 2019" - - -class Sankoku8Exporter(_MonokakidoExporter): - def _get_attribution(self, entries): - return "© Sanseido Co., LTD. 2021" diff --git a/bot/yomichan/exporters/base/jitenon.py b/bot/yomichan/exporters/base/jitenon.py new file mode 100644 index 0000000..80f0175 --- /dev/null +++ b/bot/yomichan/exporters/base/jitenon.py @@ -0,0 +1,18 @@ +from bot.yomichan.exporters.base.exporter import BaseExporter + + +class JitenonExporter(BaseExporter): + def _get_revision(self, entries): + modified_date = None + for entry in entries: + if modified_date is None or entry.modified_date > modified_date: + modified_date = entry.modified_date + revision = f"{self._target.value};{modified_date}" + return revision + + def _get_attribution(self, entries): + modified_date = None + for entry in entries: + if modified_date is None or entry.modified_date > modified_date: + attribution = entry.attribution + return attribution diff --git a/bot/yomichan/exporters/base/monokakido.py b/bot/yomichan/exporters/base/monokakido.py new file mode 100644 index 0000000..5c5f3fa --- /dev/null +++ b/bot/yomichan/exporters/base/monokakido.py @@ -0,0 +1,8 @@ +from datetime import datetime +from bot.yomichan.exporters.base.exporter import BaseExporter + + +class MonokakidoExporter(BaseExporter): + def _get_revision(self, entries): + timestamp = datetime.now().strftime("%Y-%m-%d") + return f"{self._target.value};{timestamp}" diff --git a/bot/yomichan/exporters/daijirin2.py b/bot/yomichan/exporters/daijirin2.py new file mode 100644 index 0000000..7115342 --- /dev/null +++ b/bot/yomichan/exporters/daijirin2.py @@ -0,0 +1,6 @@ +from bot.yomichan.exporters.base.monokakido import MonokakidoExporter + + +class Exporter(MonokakidoExporter): + def _get_attribution(self, entries): + return "© Sanseido Co., LTD. 
2019" diff --git a/bot/yomichan/exporters/factory.py b/bot/yomichan/exporters/factory.py deleted file mode 100644 index afed7fd..0000000 --- a/bot/yomichan/exporters/factory.py +++ /dev/null @@ -1,20 +0,0 @@ -from bot.targets import Targets - -from bot.yomichan.exporters.export import JitenonKokugoExporter -from bot.yomichan.exporters.export import JitenonYojiExporter -from bot.yomichan.exporters.export import JitenonKotowazaExporter -from bot.yomichan.exporters.export import Smk8Exporter -from bot.yomichan.exporters.export import Daijirin2Exporter -from bot.yomichan.exporters.export import Sankoku8Exporter - - -def new_yomi_exporter(target): - exporter_map = { - Targets.JITENON_KOKUGO: JitenonKokugoExporter, - Targets.JITENON_YOJI: JitenonYojiExporter, - Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter, - Targets.SMK8: Smk8Exporter, - Targets.DAIJIRIN2: Daijirin2Exporter, - Targets.SANKOKU8: Sankoku8Exporter, - } - return exporter_map[target](target) diff --git a/bot/yomichan/exporters/jitenon_kokugo.py b/bot/yomichan/exporters/jitenon_kokugo.py new file mode 100644 index 0000000..0a3ef7a --- /dev/null +++ b/bot/yomichan/exporters/jitenon_kokugo.py @@ -0,0 +1,5 @@ +from bot.yomichan.exporters.base.jitenon import JitenonExporter + + +class Exporter(JitenonExporter): + pass diff --git a/bot/yomichan/exporters/jitenon_kotowaza.py b/bot/yomichan/exporters/jitenon_kotowaza.py new file mode 100644 index 0000000..0a3ef7a --- /dev/null +++ b/bot/yomichan/exporters/jitenon_kotowaza.py @@ -0,0 +1,5 @@ +from bot.yomichan.exporters.base.jitenon import JitenonExporter + + +class Exporter(JitenonExporter): + pass diff --git a/bot/yomichan/exporters/jitenon_yoji.py b/bot/yomichan/exporters/jitenon_yoji.py new file mode 100644 index 0000000..0a3ef7a --- /dev/null +++ b/bot/yomichan/exporters/jitenon_yoji.py @@ -0,0 +1,5 @@ +from bot.yomichan.exporters.base.jitenon import JitenonExporter + + +class Exporter(JitenonExporter): + pass diff --git a/bot/yomichan/exporters/sankoku8.py b/bot/yomichan/exporters/sankoku8.py new file mode 100644 index 0000000..b33c389 --- /dev/null +++ b/bot/yomichan/exporters/sankoku8.py @@ -0,0 +1,6 @@ +from bot.yomichan.exporters.base.monokakido import MonokakidoExporter + + +class Exporter(MonokakidoExporter): + def _get_attribution(self, entries): + return "© Sanseido Co., LTD. 2021" diff --git a/bot/yomichan/exporters/smk8.py b/bot/yomichan/exporters/smk8.py new file mode 100644 index 0000000..7f71aa3 --- /dev/null +++ b/bot/yomichan/exporters/smk8.py @@ -0,0 +1,6 @@ +from bot.yomichan.exporters.base.monokakido import MonokakidoExporter + + +class Exporter(MonokakidoExporter): + def _get_attribution(self, entries): + return "© Sanseido Co., LTD. 
2020" diff --git a/bot/yomichan/glossary/daijirin2.py b/bot/yomichan/glossary/daijirin2.py index 0adaa96..178de00 100644 --- a/bot/yomichan/glossary/daijirin2.py +++ b/bot/yomichan/glossary/daijirin2.py @@ -1,9 +1,10 @@ import re import os -from bs4 import BeautifulSoup from functools import cache from pathlib import Path +from bs4 import BeautifulSoup + import bot.yomichan.glossary.icons as Icons from bot.soup import delete_soup_nodes from bot.data import load_yomichan_name_conversion diff --git a/bot/yomichan/terms/base/jitenon.py b/bot/yomichan/terms/base/jitenon.py new file mode 100644 index 0000000..d0d5388 --- /dev/null +++ b/bot/yomichan/terms/base/jitenon.py @@ -0,0 +1,26 @@ +from bot.yomichan.terms.base.terminator import BaseTerminator + + +class JitenonTerminator(BaseTerminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = None + + def _definition_tags(self, entry): + return None + + def _glossary(self, entry): + if entry.entry_id in self._glossary_cache: + return self._glossary_cache[entry.entry_id] + glossary = self._glossary_maker.make_glossary(entry, self._image_dir) + self._glossary_cache[entry.entry_id] = glossary + return glossary + + def _sequence(self, entry): + return entry.entry_id + + def _link_glossary_parameters(self, entry): + return [] + + def _subentry_lists(self, entry): + return [] diff --git a/bot/yomichan/terms/terminator.py b/bot/yomichan/terms/base/terminator.py similarity index 91% rename from bot/yomichan/terms/terminator.py rename to bot/yomichan/terms/base/terminator.py index dd0c02d..f57c4cc 100644 --- a/bot/yomichan/terms/terminator.py +++ b/bot/yomichan/terms/base/terminator.py @@ -2,7 +2,7 @@ from abc import abstractmethod, ABC from bot.data import load_yomichan_inflection_categories -class Terminator(ABC): +class BaseTerminator(ABC): def __init__(self, target): self._target = target self._glossary_cache = {} @@ -66,28 +66,28 @@ class Terminator(ABC): @abstractmethod def _definition_tags(self, entry): - pass + raise NotImplementedError @abstractmethod def _inflection_rules(self, entry, expression): - pass + raise NotImplementedError @abstractmethod def _glossary(self, entry): - pass + raise NotImplementedError @abstractmethod def _sequence(self, entry): - pass + raise NotImplementedError @abstractmethod def _term_tags(self, entry): - pass + raise NotImplementedError @abstractmethod def _link_glossary_parameters(self, entry): - pass + raise NotImplementedError @abstractmethod def _subentry_lists(self, entry): - pass + raise NotImplementedError diff --git a/bot/yomichan/terms/daijirin2.py b/bot/yomichan/terms/daijirin2.py index 10aaa76..7cf06fb 100644 --- a/bot/yomichan/terms/daijirin2.py +++ b/bot/yomichan/terms/daijirin2.py @@ -1,14 +1,10 @@ -from bot.entries.daijirin2 import Daijirin2PhraseEntry as PhraseEntry - -from bot.yomichan.terms.terminator import Terminator +from bot.entries.daijirin2.phrase_entry import PhraseEntry +from bot.yomichan.terms.base.terminator import BaseTerminator from bot.yomichan.glossary.daijirin2 import make_glossary from bot.yomichan.grammar import sudachi_rules, tags_to_rules -class Daijirin2Terminator(Terminator): - def __init__(self, target): - super().__init__(target) - +class Terminator(BaseTerminator): def _definition_tags(self, entry): return "" diff --git a/bot/yomichan/terms/factory.py b/bot/yomichan/terms/factory.py deleted file mode 100644 index 8c596cb..0000000 --- a/bot/yomichan/terms/factory.py +++ /dev/null @@ -1,20 +0,0 @@ -from bot.targets import Targets - -from 
bot.yomichan.terms.jitenon import JitenonKokugoTerminator -from bot.yomichan.terms.jitenon import JitenonYojiTerminator -from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator -from bot.yomichan.terms.smk8 import Smk8Terminator -from bot.yomichan.terms.daijirin2 import Daijirin2Terminator -from bot.yomichan.terms.sankoku8 import Sankoku8Terminator - - -def new_terminator(target): - terminator_map = { - Targets.JITENON_KOKUGO: JitenonKokugoTerminator, - Targets.JITENON_YOJI: JitenonYojiTerminator, - Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator, - Targets.SMK8: Smk8Terminator, - Targets.DAIJIRIN2: Daijirin2Terminator, - Targets.SANKOKU8: Sankoku8Terminator, - } - return terminator_map[target](target) diff --git a/bot/yomichan/terms/jitenon.py b/bot/yomichan/terms/jitenon.py deleted file mode 100644 index 66bbed7..0000000 --- a/bot/yomichan/terms/jitenon.py +++ /dev/null @@ -1,68 +0,0 @@ -from bot.yomichan.grammar import sudachi_rules -from bot.yomichan.terms.terminator import Terminator - -from bot.yomichan.glossary.jitenon import JitenonKokugoGlossary -from bot.yomichan.glossary.jitenon import JitenonYojiGlossary -from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary - - -class JitenonTerminator(Terminator): - def __init__(self, target): - super().__init__(target) - self._glossary_maker = None - - def _definition_tags(self, entry): - return None - - def _glossary(self, entry): - if entry.entry_id in self._glossary_cache: - return self._glossary_cache[entry.entry_id] - glossary = self._glossary_maker.make_glossary(entry, self._image_dir) - self._glossary_cache[entry.entry_id] = glossary - return glossary - - def _sequence(self, entry): - return entry.entry_id - - def _link_glossary_parameters(self, entry): - return [] - - def _subentry_lists(self, entry): - return [] - - -class JitenonKokugoTerminator(JitenonTerminator): - def __init__(self, target): - super().__init__(target) - self._glossary_maker = JitenonKokugoGlossary() - - def _inflection_rules(self, entry, expression): - return sudachi_rules(expression) - - def _term_tags(self, entry): - return "" - - -class JitenonYojiTerminator(JitenonTerminator): - def __init__(self, target): - super().__init__(target) - self._glossary_maker = JitenonYojiGlossary() - - def _inflection_rules(self, entry, expression): - return "" - - def _term_tags(self, entry): - tags = entry.kanken_level.split("/") - return " ".join(tags) - - -class JitenonKotowazaTerminator(JitenonTerminator): - def __init__(self, target): - super().__init__(target) - self._glossary_maker = JitenonKotowazaGlossary() - - def _inflection_rules(self, entry, expression): - return sudachi_rules(expression) - - def _term_tags(self, entry): - return "" diff --git a/bot/yomichan/terms/jitenon_kokugo.py b/bot/yomichan/terms/jitenon_kokugo.py new file mode 100644 index 0000000..3e33b77 --- /dev/null +++ b/bot/yomichan/terms/jitenon_kokugo.py @@ -0,0 +1,15 @@ +from bot.yomichan.grammar import sudachi_rules +from bot.yomichan.glossary.jitenon import JitenonKokugoGlossary +from bot.yomichan.terms.base.jitenon import JitenonTerminator + + +class Terminator(JitenonTerminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = JitenonKokugoGlossary() + + def _inflection_rules(self, entry, expression): + return sudachi_rules(expression) + + def _term_tags(self, entry): + return "" diff --git a/bot/yomichan/terms/jitenon_kotowaza.py b/bot/yomichan/terms/jitenon_kotowaza.py new file mode 100644 index 0000000..a0651b9 --- /dev/null +++ 
b/bot/yomichan/terms/jitenon_kotowaza.py @@ -0,0 +1,15 @@ +from bot.yomichan.grammar import sudachi_rules +from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary +from bot.yomichan.terms.base.jitenon import JitenonTerminator + + +class Terminator(JitenonTerminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = JitenonKotowazaGlossary() + + def _inflection_rules(self, entry, expression): + return sudachi_rules(expression) + + def _term_tags(self, entry): + return "" diff --git a/bot/yomichan/terms/jitenon_yoji.py b/bot/yomichan/terms/jitenon_yoji.py new file mode 100644 index 0000000..5087539 --- /dev/null +++ b/bot/yomichan/terms/jitenon_yoji.py @@ -0,0 +1,15 @@ +from bot.yomichan.glossary.jitenon import JitenonYojiGlossary +from bot.yomichan.terms.base.jitenon import JitenonTerminator + + +class Terminator(JitenonTerminator): + def __init__(self, target): + super().__init__(target) + self._glossary_maker = JitenonYojiGlossary() + + def _inflection_rules(self, entry, expression): + return "" + + def _term_tags(self, entry): + tags = entry.kanken_level.split("/") + return " ".join(tags) diff --git a/bot/yomichan/terms/sankoku8.py b/bot/yomichan/terms/sankoku8.py index 613f3bb..d6e6afd 100644 --- a/bot/yomichan/terms/sankoku8.py +++ b/bot/yomichan/terms/sankoku8.py @@ -1,14 +1,10 @@ -from bot.entries.sankoku8 import Sankoku8PhraseEntry as PhraseEntry - -from bot.yomichan.terms.terminator import Terminator +from bot.entries.sankoku8.phrase_entry import PhraseEntry +from bot.yomichan.terms.base.terminator import BaseTerminator from bot.yomichan.glossary.sankoku8 import make_glossary from bot.yomichan.grammar import sudachi_rules, tags_to_rules -class Sankoku8Terminator(Terminator): - def __init__(self, target): - super().__init__(target) - +class Terminator(BaseTerminator): def _definition_tags(self, entry): return "" diff --git a/bot/yomichan/terms/smk8.py b/bot/yomichan/terms/smk8.py index d1e3ca7..9e85c17 100644 --- a/bot/yomichan/terms/smk8.py +++ b/bot/yomichan/terms/smk8.py @@ -1,12 +1,11 @@ -from bot.entries.smk8 import Smk8KanjiEntry as KanjiEntry -from bot.entries.smk8 import Smk8PhraseEntry as PhraseEntry - -from bot.yomichan.terms.terminator import Terminator +from bot.entries.smk8.kanji_entry import KanjiEntry +from bot.entries.smk8.phrase_entry import PhraseEntry +from bot.yomichan.terms.base.terminator import BaseTerminator from bot.yomichan.glossary.smk8 import make_glossary from bot.yomichan.grammar import sudachi_rules, tags_to_rules -class Smk8Terminator(Terminator): +class Terminator(BaseTerminator): def __init__(self, target): super().__init__(target) diff --git a/data/entries/variant_kanji.csv b/data/entries/variant_kanji.csv index 849eec3..0272164 100644 --- a/data/entries/variant_kanji.csv +++ b/data/entries/variant_kanji.csv @@ -1,65 +1,61 @@ -亙,亘 -俠,侠 -俱,倶 -儘,侭 -凜,凛 -剝,剥 +𠮟,叱 吞,呑 +靭,靱 +臈,﨟 啞,唖 -噓,嘘 嚙,噛 -囊,嚢 -塡,填 -壺,壷 屛,屏 -屢,屡 幷,并 彎,弯 搔,掻 -摑,掴 攪,撹 -曾,曽 枡,桝 -檜,桧 -檮,梼 -潑,溌 -濤,涛 濾,沪 -瀆,涜 -灌,潅 -焰,焔 -瘦,痩 -禰,祢 -禱,祷 -穎,頴 -竈,竃 -簞,箪 -籠,篭 繡,繍 -繫,繋 -萊,莱 蔣,蒋 -藪,薮 -蘆,芦 -蟬,蝉 -蠅,蝿 蠟,蝋 -蠣,蛎 -賤,賎 -軀,躯 -邇,迩 醬,醤 -醱,醗 -靱,靭 -頰,頬 -頸,頚 -顚,顛 -驒,騨 -鰺,鯵 -鶯,鴬 +穎,頴 鷗,鴎 鹼,鹸 麴,麹 -麵,麺 -﨟,臈 -𠮟,叱 +俠,侠 +俱,倶 +剝,剥 +噓,嘘 +囊,嚢 +塡,填 +屢,屡 +摑,掴 +瀆,涜 +潑,溌 +焰,焔 +簞,箪 +繫,繋 +萊,莱 +蟬,蝉 +軀,躯 +醱,醗 +頰,頬 +顚,顛 +驒,騨 +姸,妍 +攢,攅 +𣜜,杤 +檔,档 +槶,椢 +櫳,槞 +纊,絋 +纘,纉 +隯,陦 +筓,笄 +逬,迸 +腁,胼 +騈,駢 +拋,抛 +篡,簒 +檜,桧 +禰,祢 +禱,祷 +蘆,芦 +凜,凛 \ No newline at end of file diff --git a/jitenbot.py b/jitenbot.py index da44905..f0a2719 100644 --- a/jitenbot.py +++ b/jitenbot.py @@ -21,7 +21,7 @@ import sys 
import argparse import subprocess from bot.targets import Targets -from bot.crawlers.factory import new_crawler +from bot.factory import new_crawler def filename(f): diff --git a/run_all.sh b/run_all.sh index 706a911..9dcdfda 100755 --- a/run_all.sh +++ b/run_all.sh @@ -1,5 +1,7 @@ #!/bin/sh +export PYTHONPYCACHEPREFIX=/tmp/pycache + python -m unittest discover -s tests python jitenbot.py jitenon-kokugo diff --git a/tests/test_daijirin_phrases.py b/tests/test_daijirin_phrases.py new file mode 100644 index 0000000..3ab02dd --- /dev/null +++ b/tests/test_daijirin_phrases.py @@ -0,0 +1,21 @@ +import unittest +from bot.entries.daijirin2.phrase_entry import parse_phrase + + +class TestDaijirin2PhraseParse(unittest.TestCase): + def test1(self): + text = "同じ穴の=狢(=狐・狸)" + exps = parse_phrase(text) + self.assertEqual(len(exps), 3) + self.assertIn("同じ穴の狢", exps) + self.assertIn("同じ穴の狐", exps) + self.assertIn("同じ穴の狸", exps) + + def test2(self): + text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥" + exps = parse_phrase(text) + self.assertEqual(len(exps), 4) + self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps) + self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps) + self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps) + self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps) diff --git a/tests/test_expressions.py b/tests/test_expressions.py index b2ebc26..9091dda 100644 --- a/tests/test_expressions.py +++ b/tests/test_expressions.py @@ -1,5 +1,5 @@ import unittest -import bot.entries.expressions as Expressions +import bot.entries.base.expressions as Expressions class TestExpressions(unittest.TestCase): @@ -34,8 +34,8 @@ class TestExpressions(unittest.TestCase): self.assertIn("凶々しい", exps) self.assertIn("凶凶しい", exps) - def test_add_variant_kanji(self): - exps = ["剝く", "掴む", "摑む"] + def test_add_variant_kanji1(self): + exps = ["剥く", "摑む"] Expressions.add_variant_kanji(exps) self.assertEqual(len(exps), 4) self.assertIn("剥く", exps) @@ -44,6 +44,15 @@ class TestExpressions(unittest.TestCase): self.assertIn("摑む", exps) def test_add_variant_kanji2(self): + exps = ["剝く", "掴む", "摑む"] + Expressions.add_variant_kanji(exps) + self.assertEqual(len(exps), 4) + self.assertIn("剥く", exps) + self.assertIn("剝く", exps) + self.assertIn("掴む", exps) + self.assertIn("摑む", exps) + + def test_add_variant_kanji3(self): exps = ["剝摑"] Expressions.add_variant_kanji(exps) self.assertEqual(len(exps), 4) @@ -52,6 +61,15 @@ class TestExpressions(unittest.TestCase): self.assertIn("剥掴", exps) self.assertIn("剥摑", exps) + def test_add_variant_kanji4(self): + exps = ["剥掴"] + Expressions.add_variant_kanji(exps) + self.assertEqual(len(exps), 4) + self.assertIn("剝摑", exps) + self.assertIn("剝掴", exps) + self.assertIn("剥掴", exps) + self.assertIn("剥摑", exps) + def test_expand_abbreviation(self): text = "有(り)合(わ)せ" abbrs = Expressions.expand_abbreviation(text) @@ -69,28 +87,3 @@ class TestExpressions(unittest.TestCase): self.assertIn("有合わせ", abbrs) self.assertIn("有り合せ", abbrs) self.assertIn("有合せ", abbrs) - - def test_smk_expand_alternatives(self): - text = "△金(時間・暇)に飽かして" - exps = Expressions.expand_smk_alternatives(text) - self.assertEqual(len(exps), 3) - self.assertIn("金に飽かして", exps) - self.assertIn("時間に飽かして", exps) - self.assertIn("暇に飽かして", exps) - - def test_daijirin_expand_alternatives(self): - text = "同じ穴の=狢(=狐・狸)" - exps = Expressions.expand_daijirin_alternatives(text) - self.assertEqual(len(exps), 3) - self.assertIn("同じ穴の狢", exps) - self.assertIn("同じ穴の狐", exps) - self.assertIn("同じ穴の狸", exps) - - def test_daijirin_expand_alternatives2(self): - text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥" - exps = 
Expressions.expand_daijirin_alternatives(text) - self.assertEqual(len(exps), 4) - self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps) - self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps) - self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps) - self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps) diff --git a/tests/test_sankoku_phrases.py b/tests/test_sankoku_phrases.py index 7faf289..c3894e9 100644 --- a/tests/test_sankoku_phrases.py +++ b/tests/test_sankoku_phrases.py @@ -1,16 +1,16 @@ import unittest -from bot.entries.sankoku8 import parse_hyouki_pattern +from bot.entries.sankoku8.parse import parse_hyouki_pattern -class TestSankokuPhrases(unittest.TestCase): - def test_sankoku_phrases1(self): +class TestSankoku8PhraseParse(unittest.TestCase): + def test1(self): pattern = '耳にたこ(ができる)' exps = parse_hyouki_pattern(pattern) self.assertEqual(len(exps), 2) self.assertIn("耳にたこ", exps) self.assertIn("耳にたこができる", exps) - def test_sankoku_phrases2(self): + def test2(self): pattern = '一斑を〈見て/もって〉全豹を〈卜す/推す〉' exps = parse_hyouki_pattern(pattern) self.assertEqual(len(exps), 4) @@ -19,14 +19,14 @@ class TestSankokuPhrases(unittest.TestCase): self.assertIn("一斑をもって全豹を卜す", exps) self.assertIn("一斑をもって全豹を推す", exps) - def test_sankoku_phrases3(self): + def test3(self): pattern = '{かじ・舵}を切る' exps = parse_hyouki_pattern(pattern) self.assertEqual(len(exps), 2) self.assertIn("かじを切る", exps) self.assertIn("舵を切る", exps) - def test_sankoku_phrases4(self): + def test4(self): pattern = '重箱の隅を(⦅ようじ\楊枝⦆で)〈つつく/ほじくる〉' exps = parse_hyouki_pattern(pattern) self.assertEqual(len(exps), 6) @@ -37,7 +37,7 @@ class TestSankokuPhrases(unittest.TestCase): self.assertIn("重箱の隅をようじでほじくる", exps) self.assertIn("重箱の隅を楊枝でほじくる", exps) - def test_sankoku_phrases5(self): + def test5(self): pattern = '群盲象を〈{な・撫}でる/評する〉' exps = parse_hyouki_pattern(pattern) self.assertEqual(len(exps), 3) diff --git a/tests/test_smk_phrases.py b/tests/test_smk_phrases.py new file mode 100644 index 0000000..e5ce231 --- /dev/null +++ b/tests/test_smk_phrases.py @@ -0,0 +1,19 @@ +import unittest +from bot.entries.smk8.phrase_entry import parse_phrase + + +class TestSmk8PhraseParse(unittest.TestCase): + def test1(self): + text = "目と鼻の△先(間)" + exps = parse_phrase(text) + self.assertEqual(len(exps), 2) + self.assertIn("目と鼻の先", exps) + self.assertIn("目と鼻の間", exps) + + def test2(self): + text = "△金(時間・暇)に飽かして" + exps = parse_phrase(text) + self.assertEqual(len(exps), 3) + self.assertIn("金に飽かして", exps) + self.assertIn("時間に飽かして", exps) + self.assertIn("暇に飽かして", exps)
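
Notes on the mechanisms introduced above (illustrative sketches, not part of the patch itself).

The new bot/factory.py removes the hand-maintained target-to-class maps by deriving a module path from the Targets member name and importing it on demand. All six constructors follow the same convention, so their generic form looks like the sketch below (the helper name `resolve` is invented for this note; the module paths and class names are the ones the patch uses):

```python
import importlib


def resolve(module_path, class_name):
    # Generic form of the bot/factory.py helpers: import the module named
    # after the target, then look up a class with a fixed name
    # (Entry, Crawler, Exporter, or Terminator).
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


# Equivalent to bot.factory.new_entry(target, page_id) for the smk8
# target (the page id 278 is an arbitrary illustrative value):
#   entry_class = resolve("bot.entries.smk8.entry", "Entry")
#   entry = entry_class(target, 278)
```

Adding a new dictionary target therefore no longer touches any factory code; it only requires new modules that follow the naming scheme.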
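Each base entry class now supplies only `_get_subentry_parameters()`, a list of (subentry class, tag names, destination list) triples; the extraction loop itself lives in the shared bot/entries/base/sanseido_entry.py, which is not part of this hunk. Reconstructed from the `__decompose_subentries` method deleted from bot/entries/smk8.py, the shared loop presumably works roughly like this (a sketch under that assumption, not the shipped implementation):

```python
from bs4 import BeautifulSoup


def decompose_subentries(self, page):
    # For each configured tag (e.g. 子項目, 句項目, 造語成分項目), split the
    # matching elements out of the page, wrap each one as a standalone
    # entry, and collect it on the parent.
    soup = BeautifulSoup(page, features="xml")
    for subentry_class, tags, subentry_list in self._get_subentry_parameters():
        for tag in tags:
            tag_soup = soup.find(tag)
            while tag_soup is not None:
                tag_soup.name = "項目"  # normalize so entry parsing applies
                subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                # Recorded so kanji subentries can find their parent's
                # reading later via SUBENTRY_ID_TO_ENTRY_ID.
                self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                subentry = subentry_class(self.target, subentry_id)
                subentry.set_page(tag_soup.decode())
                subentry_list.append(subentry)
                tag_soup.decompose()
                tag_soup = soup.find(tag)
    return soup.decode()
```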
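The reading patterns consumed by parse_hyouki_pattern in bot/entries/sankoku8/parse.py use three notations: (…) marks an omissible part (省略), {a・b} marks alternative surfaces (補足表記), and 〈a/b〉 marks interchangeable phrasings (言換G); ⦅a\b⦆ is a bracketed variant of the {…} form. The function rewrites the pattern into the dictionary's own XML vocabulary and hands it to parse_hyouki_soup, which forks the running expression list at each branching node. The expansions below are the ones asserted in tests/test_sankoku_phrases.py:

```python
from bot.entries.sankoku8.parse import parse_hyouki_pattern

parse_hyouki_pattern("耳にたこ(ができる)")
# -> ["耳にたこ", "耳にたこができる"]   (省略: without and with the part)

parse_hyouki_pattern("{かじ・舵}を切る")
# -> ["かじを切る", "舵を切る"]         (補足表記: both surfaces)

parse_hyouki_pattern("一斑を〈見て/もって〉全豹を〈卜す/推す〉")
# -> four expressions: the product of the two 言換G alternations
```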
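For smk8, parse_phrase in bot/entries/smk8/phrase_entry.py expands the △ notation: △ marks a replaceable segment, and the parenthesized list immediately after it holds the alternatives, separated by ・. The parentheses in its two regular expressions are escaped (`\(` and `\)`) so they match these literal delimiters rather than opening capture groups. The expansions asserted in tests/test_smk_phrases.py:

```python
from bot.entries.smk8.phrase_entry import parse_phrase

parse_phrase("目と鼻の△先(間)")
# -> ["目と鼻の先", "目と鼻の間"]

parse_phrase("△金(時間・暇)に飽かして")
# -> ["金に飽かして", "時間に飽かして", "暇に飽かして"]

parse_phrase("ただの文")  # no △ notation present
# -> ["ただの文"]
```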
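Finally, the gaiji handling for sankoku8 is a two-step hand-off: __add_image_alt_text in bot/entries/sankoku8/preprocess.py stamps a real Unicode character onto each known <img> as an alt attribute, and BaseEntry._fill_alts later copies that alt text into the tag's content so the glyph survives into plain-text headwords. A self-contained sketch of the pair (the sample markup and the 見出部 wrapper are invented for illustration):

```python
from bs4 import BeautifulSoup

GAIJI = {"svg-gaiji/byan.svg": "𰻞"}  # same mapping as __GAIJI above

soup = BeautifulSoup('<見出部><img src="svg-gaiji/byan.svg"/>麺</見出部>', "xml")

# Step 1: preprocessing adds alt text to recognized gaiji images.
for img in soup.find_all("img"):
    if img.has_attr("src") and img.attrs["src"] in GAIJI:
        img.attrs["alt"] = GAIJI[img.attrs["src"]]

# Step 2: _fill_alts replaces each tagged image's content with its alt
# text, so the full headword is recoverable as plain text.
for img in soup.find_all("img"):
    if img.has_attr("alt"):
        img.string = img.attrs["alt"]

print(soup.find("見出部").text)  # 𰻞麺
```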