diff --git a/TODO.md b/TODO.md
index 877a7ee..4aaadee 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,7 +1,7 @@
### Todo
- [x] Add factory classes to reduce the number of class import statements
-- [ ] Add dynamic import functionality to factory classes to reduce boilerplate
+- [x] Add dynamic import functionality to factory classes to reduce boilerplate
- [x] Support exporting to MDict (.MDX) dictionary format
- [x] Validate JSON schema of Yomichan terms during export
- [ ] Add support for monokakido search keys from index files
diff --git a/bot/crawlers/base/crawler.py b/bot/crawlers/base/crawler.py
new file mode 100644
index 0000000..bbbcb9b
--- /dev/null
+++ b/bot/crawlers/base/crawler.py
@@ -0,0 +1,54 @@
+import re
+from abc import ABC, abstractmethod
+
+from bot.factory import new_entry
+from bot.factory import new_yomichan_exporter
+from bot.factory import new_mdict_exporter
+
+
+class BaseCrawler(ABC):
+ def __init__(self, target):
+ self._target = target
+ self._page_map = {}
+ self._entries = []
+ self._page_id_pattern = None
+
+ @abstractmethod
+ def collect_pages(self, page_dir):
+ raise NotImplementedError
+
+ def read_pages(self):
+ pages_len = len(self._page_map)
+ items = self._page_map.items()
+ for idx, (page_id, page_path) in enumerate(items):
+ update = f"\tReading page {idx+1}/{pages_len}"
+ print(update, end='\r', flush=True)
+ entry = new_entry(self._target, page_id)
+ with open(page_path, "r", encoding="utf-8") as f:
+ page = f.read()
+ try:
+ entry.set_page(page)
+ except ValueError as err:
+ print(err)
+ print("Try deleting and redownloading file:")
+ print(f"\t{page_path}\n")
+ continue
+ self._entries.append(entry)
+ print()
+
+ def make_yomichan_dictionary(self, media_dir, validate):
+ exporter = new_yomichan_exporter(self._target)
+ exporter.export(self._entries, media_dir, validate)
+
+ def make_mdict_dictionary(self, media_dir, icon_file):
+ exporter = new_mdict_exporter(self._target)
+ exporter.export(self._entries, media_dir, icon_file)
+
+ def _parse_page_id(self, page_link):
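+        # The pattern is set by each subclass and captures the numeric
+        # page ID in group 1; IDs already present in the page map are
+        # skipped as duplicates.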
+ m = re.search(self._page_id_pattern, page_link)
+ if m is None:
+ return None
+ page_id = int(m.group(1))
+ if page_id in self._page_map:
+ return None
+ return page_id
diff --git a/bot/crawlers/base/jitenon.py b/bot/crawlers/base/jitenon.py
new file mode 100644
index 0000000..49e4626
--- /dev/null
+++ b/bot/crawlers/base/jitenon.py
@@ -0,0 +1,30 @@
+from bs4 import BeautifulSoup
+
+from bot.time import timestamp
+from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
+from bot.crawlers.base.crawler import BaseCrawler
+
+
+class JitenonCrawler(BaseCrawler):
+ def __init__(self, target):
+ super().__init__(target)
+ self._gojuon_url = None
+
+ def collect_pages(self, page_dir):
+ print(f"{timestamp()} Scraping {self._gojuon_url}")
+ jitenon = JitenonScraper()
+ gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
+ gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+ for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
+ gojuon_href = gojuon_a['href']
+ kana_doc, _ = jitenon.scrape(gojuon_href)
+ kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+ for kana_a in kana_soup.select(".word_box a", href=True):
+ page_link = kana_a['href']
+ page_id = self._parse_page_id(page_link)
+ if page_id is None:
+ continue
+ _, page_path = jitenon.scrape(page_link)
+ self._page_map[page_id] = page_path
+ pages_len = len(self._page_map)
+ print(f"\n{timestamp()} Found {pages_len} entry pages")
diff --git a/bot/crawlers/base/monokakido.py b/bot/crawlers/base/monokakido.py
new file mode 100644
index 0000000..ca98545
--- /dev/null
+++ b/bot/crawlers/base/monokakido.py
@@ -0,0 +1,20 @@
+import os
+from bot.time import timestamp
+from bot.crawlers.base.crawler import BaseCrawler
+
+
+class MonokakidoCrawler(BaseCrawler):
+ def __init__(self, target):
+ super().__init__(target)
+ self._page_id_pattern = r"^([0-9]+)\.xml$"
+
+ def collect_pages(self, page_dir):
+ print(f"{timestamp()} Searching for page files in `{page_dir}`")
+ for pagefile in os.listdir(page_dir):
+ page_id = self._parse_page_id(pagefile)
+ if page_id is None or page_id == 0:
+ continue
+ path = os.path.join(page_dir, pagefile)
+ self._page_map[page_id] = path
+ pages_len = len(self._page_map)
+ print(f"{timestamp()} Found {pages_len} page files for processing")
diff --git a/bot/crawlers/crawlers.py b/bot/crawlers/crawlers.py
deleted file mode 100644
index 51e0552..0000000
--- a/bot/crawlers/crawlers.py
+++ /dev/null
@@ -1,158 +0,0 @@
-import os
-import re
-from abc import ABC, abstractmethod
-from bs4 import BeautifulSoup
-
-import bot.crawlers.scraper as Scraper
-from bot.entries.factory import new_entry
-from bot.yomichan.exporters.factory import new_yomi_exporter
-from bot.mdict.exporters.factory import new_mdict_exporter
-
-
-class Crawler(ABC):
- def __init__(self, target):
- self._target = target
- self._page_map = {}
- self._entries = []
- self._page_id_pattern = None
-
- @abstractmethod
- def collect_pages(self, page_dir):
- pass
-
- def read_pages(self):
- pages_len = len(self._page_map)
- items = self._page_map.items()
- for idx, (page_id, page_path) in enumerate(items):
- update = f"Reading page {idx+1}/{pages_len}"
- print(update, end='\r', flush=True)
- entry = new_entry(self._target, page_id)
- with open(page_path, "r", encoding="utf-8") as f:
- page = f.read()
- try:
- entry.set_page(page)
- except ValueError as err:
- print(err)
- print("Try deleting and redownloading file:")
- print(f"\t{page_path}\n")
- continue
- self._entries.append(entry)
- print()
-
- def make_yomichan_dictionary(self, media_dir, validate):
- exporter = new_yomi_exporter(self._target)
- exporter.export(self._entries, media_dir, validate)
-
- def make_mdict_dictionary(self, media_dir, icon_file):
- exporter = new_mdict_exporter(self._target)
- exporter.export(self._entries, media_dir, icon_file)
-
- def _parse_page_id(self, page_link):
- m = re.search(self._page_id_pattern, page_link)
- if m is None:
- return None
- page_id = int(m.group(1))
- if page_id in self._page_map:
- return None
- return page_id
-
-
-class JitenonKokugoCrawler(Crawler):
- def __init__(self, target):
- super().__init__(target)
- self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
- self._page_id_pattern = r"word/p([0-9]+)$"
-
- def collect_pages(self, page_dir):
- jitenon = Scraper.Jitenon()
- gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
- gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
- for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
- gojuon_href = gojuon_a['href']
- max_kana_page = 1
- current_kana_page = 1
- while current_kana_page <= max_kana_page:
- kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}")
- current_kana_page += 1
- kana_soup = BeautifulSoup(kana_doc, features="html.parser")
- page_total = kana_soup.find(class_="page_total").text
- m = re.search(r"全([0-9]+)件", page_total)
- if m:
- max_kana_page = int(m.group(1))
- for kana_a in kana_soup.select(".word_box a", href=True):
- page_link = kana_a['href']
- page_id = self._parse_page_id(page_link)
- if page_id is None:
- continue
- _, page_path = jitenon.scrape(page_link)
- self._page_map[page_id] = page_path
- pages_len = len(self._page_map)
- print(f"Finished scraping {pages_len} pages")
-
-
-class _JitenonCrawler(Crawler):
- def __init__(self, target):
- super().__init__(target)
- self._gojuon_url = None
-
- def collect_pages(self, page_dir):
- print("Scraping jitenon.jp")
- jitenon = Scraper.Jitenon()
- gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
- gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
- for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
- gojuon_href = gojuon_a['href']
- kana_doc, _ = jitenon.scrape(gojuon_href)
- kana_soup = BeautifulSoup(kana_doc, features="html.parser")
- for kana_a in kana_soup.select(".word_box a", href=True):
- page_link = kana_a['href']
- page_id = self._parse_page_id(page_link)
- if page_id is None:
- continue
- _, page_path = jitenon.scrape(page_link)
- self._page_map[page_id] = page_path
- pages_len = len(self._page_map)
- print(f"Finished scraping {pages_len} pages")
-
-
-class JitenonYojiCrawler(_JitenonCrawler):
- def __init__(self, target):
- super().__init__(target)
- self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
- self._page_id_pattern = r"([0-9]+)\.html$"
-
-
-class JitenonKotowazaCrawler(_JitenonCrawler):
- def __init__(self, target):
- super().__init__(target)
- self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
- self._page_id_pattern = r"([0-9]+)\.php$"
-
-
-class _MonokakidoCrawler(Crawler):
- def __init__(self, target):
- super().__init__(target)
- self._page_id_pattern = r"^([0-9]+)\.xml$"
-
- def collect_pages(self, page_dir):
- print(f"Searching for page files in `{page_dir}`")
- for pagefile in os.listdir(page_dir):
- page_id = self._parse_page_id(pagefile)
- if page_id is None or page_id == 0:
- continue
- path = os.path.join(page_dir, pagefile)
- self._page_map[page_id] = path
- pages_len = len(self._page_map)
- print(f"Found {pages_len} page files for processing")
-
-
-class Smk8Crawler(_MonokakidoCrawler):
- pass
-
-
-class Daijirin2Crawler(_MonokakidoCrawler):
- pass
-
-
-class Sankoku8Crawler(_MonokakidoCrawler):
- pass
diff --git a/bot/crawlers/daijirin2.py b/bot/crawlers/daijirin2.py
new file mode 100644
index 0000000..a9c711b
--- /dev/null
+++ b/bot/crawlers/daijirin2.py
@@ -0,0 +1,5 @@
+from bot.crawlers.base.monokakido import MonokakidoCrawler
+
+
+class Crawler(MonokakidoCrawler):
+ pass
diff --git a/bot/crawlers/factory.py b/bot/crawlers/factory.py
deleted file mode 100644
index d7450ea..0000000
--- a/bot/crawlers/factory.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from bot.targets import Targets
-
-from bot.crawlers.crawlers import JitenonKokugoCrawler
-from bot.crawlers.crawlers import JitenonYojiCrawler
-from bot.crawlers.crawlers import JitenonKotowazaCrawler
-from bot.crawlers.crawlers import Smk8Crawler
-from bot.crawlers.crawlers import Daijirin2Crawler
-from bot.crawlers.crawlers import Sankoku8Crawler
-
-
-def new_crawler(target):
- crawler_map = {
- Targets.JITENON_KOKUGO: JitenonKokugoCrawler,
- Targets.JITENON_YOJI: JitenonYojiCrawler,
- Targets.JITENON_KOTOWAZA: JitenonKotowazaCrawler,
- Targets.SMK8: Smk8Crawler,
- Targets.DAIJIRIN2: Daijirin2Crawler,
- Targets.SANKOKU8: Sankoku8Crawler,
- }
- return crawler_map[target](target)
diff --git a/bot/crawlers/jitenon_kokugo.py b/bot/crawlers/jitenon_kokugo.py
new file mode 100644
index 0000000..e748ea1
--- /dev/null
+++ b/bot/crawlers/jitenon_kokugo.py
@@ -0,0 +1,40 @@
+import re
+from bs4 import BeautifulSoup
+
+from bot.time import timestamp
+from bot.crawlers.base.crawler import BaseCrawler
+from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
+
+
+class Crawler(BaseCrawler):
+ def __init__(self, target):
+ super().__init__(target)
+ self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
+ self._page_id_pattern = r"word/p([0-9]+)$"
+
+ def collect_pages(self, page_dir):
+ print(f"{timestamp()} Scraping {self._gojuon_url}")
+ jitenon = JitenonScraper()
+ gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
+ gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+ for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
+ gojuon_href = gojuon_a['href']
+ max_kana_page = 1
+ current_kana_page = 1
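+            # Listing pages are paginated; the loop's upper bound is
+            # read from the 全◯件 counter on each fetched page.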
+ while current_kana_page <= max_kana_page:
+ kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}")
+ current_kana_page += 1
+ kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+ page_total = kana_soup.find(class_="page_total").text
+ m = re.search(r"全([0-9]+)件", page_total)
+ if m:
+ max_kana_page = int(m.group(1))
+ for kana_a in kana_soup.select(".word_box a", href=True):
+ page_link = kana_a['href']
+ page_id = self._parse_page_id(page_link)
+ if page_id is None:
+ continue
+ _, page_path = jitenon.scrape(page_link)
+ self._page_map[page_id] = page_path
+ pages_len = len(self._page_map)
+ print(f"\n{timestamp()} Found {pages_len} entry pages")
diff --git a/bot/crawlers/jitenon_kotowaza.py b/bot/crawlers/jitenon_kotowaza.py
new file mode 100644
index 0000000..693fa52
--- /dev/null
+++ b/bot/crawlers/jitenon_kotowaza.py
@@ -0,0 +1,8 @@
+from bot.crawlers.base.jitenon import JitenonCrawler
+
+
+class Crawler(JitenonCrawler):
+ def __init__(self, target):
+ super().__init__(target)
+ self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
+ self._page_id_pattern = r"([0-9]+)\.php$"
diff --git a/bot/crawlers/jitenon_yoji.py b/bot/crawlers/jitenon_yoji.py
new file mode 100644
index 0000000..5b89875
--- /dev/null
+++ b/bot/crawlers/jitenon_yoji.py
@@ -0,0 +1,8 @@
+from bot.crawlers.base.jitenon import JitenonCrawler
+
+
+class Crawler(JitenonCrawler):
+ def __init__(self, target):
+ super().__init__(target)
+ self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
+ self._page_id_pattern = r"([0-9]+)\.html$"
diff --git a/bot/crawlers/sankoku8.py b/bot/crawlers/sankoku8.py
new file mode 100644
index 0000000..a9c711b
--- /dev/null
+++ b/bot/crawlers/sankoku8.py
@@ -0,0 +1,5 @@
+from bot.crawlers.base.monokakido import MonokakidoCrawler
+
+
+class Crawler(MonokakidoCrawler):
+ pass
diff --git a/bot/crawlers/scrapers/jitenon.py b/bot/crawlers/scrapers/jitenon.py
new file mode 100644
index 0000000..e4163d9
--- /dev/null
+++ b/bot/crawlers/scrapers/jitenon.py
@@ -0,0 +1,10 @@
+import re
+from bot.crawlers.scrapers.scraper import BaseScraper
+
+
+class Jitenon(BaseScraper):
+ def _get_netloc_re(self):
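+        # Match jitenon.jp itself as well as any of its subdomains
+        # (e.g. yoji.jitenon.jp, kotowaza.jitenon.jp).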
+ domain = r"jitenon\.jp"
+ pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + domain + r"$"
+ netloc_re = re.compile(pattern)
+ return netloc_re
diff --git a/bot/crawlers/scraper.py b/bot/crawlers/scrapers/scraper.py
similarity index 82%
rename from bot/crawlers/scraper.py
rename to bot/crawlers/scrapers/scraper.py
index 577f602..eeb9534 100644
--- a/bot/crawlers/scraper.py
+++ b/bot/crawlers/scrapers/scraper.py
@@ -1,24 +1,28 @@
import time
-import requests
import re
import os
import hashlib
+import random
+import math
from datetime import datetime
-from pathlib import Path
-
-from platformdirs import user_cache_dir
from urllib.parse import urlparse
+from pathlib import Path
+from abc import ABC, abstractmethod
+
+import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
+from platformdirs import user_cache_dir
+from bot.time import timestamp
from bot.data import load_config
-class Scraper():
+class BaseScraper(ABC):
def __init__(self):
+ self.cache_count = 0
self._config = load_config()
- pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$"
- self.netloc_re = re.compile(pattern)
+ self.netloc_re = self._get_netloc_re()
self.__set_session()
def scrape(self, urlstring):
@@ -31,9 +35,14 @@ class Scraper():
with open(cache_path, "w", encoding="utf-8") as f:
f.write(html)
else:
- print("Discovering cached files...", end='\r', flush=True)
+ self.cache_count += 1
+ print(f"\tDiscovering cached file {self.cache_count}", end='\r', flush=True)
return html, cache_path
+ @abstractmethod
+ def _get_netloc_re(self):
+ raise NotImplementedError
+
def __set_session(self):
retry_strategy = Retry(
total=3,
@@ -87,21 +96,14 @@ class Scraper():
def __get(self, urlstring):
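+        # Wait a fixed delay between requests to avoid hammering the
+        # remote server.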
delay = 10
time.sleep(delay)
- now = datetime.now().strftime("%H:%M:%S")
- print(f"{now} scraping {urlstring} ...", end='')
+ print(f"{timestamp()} Scraping {urlstring} ...", end='')
try:
response = self.session.get(urlstring, timeout=10)
- print("OK")
+ print(f"{timestamp()} OK")
return response.text
- except Exception:
- print("failed")
- print("resetting session and trying again")
+ except Exception as ex:
+ print(f"\tFailed: {str(ex)}")
+ print(f"{timestamp()} Resetting session and trying again")
self.__set_session()
response = self.session.get(urlstring, timeout=10)
return response.text
-
-
-class Jitenon(Scraper):
- def __init__(self):
- self.domain = r"jitenon\.jp"
- super().__init__()
diff --git a/bot/crawlers/smk8.py b/bot/crawlers/smk8.py
new file mode 100644
index 0000000..a9c711b
--- /dev/null
+++ b/bot/crawlers/smk8.py
@@ -0,0 +1,5 @@
+from bot.crawlers.base.monokakido import MonokakidoCrawler
+
+
+class Crawler(MonokakidoCrawler):
+ pass
diff --git a/bot/entries/entry.py b/bot/entries/base/entry.py
similarity index 89%
rename from bot/entries/entry.py
rename to bot/entries/base/entry.py
index 3811a77..60d4f16 100644
--- a/bot/entries/entry.py
+++ b/bot/entries/base/entry.py
@@ -18,15 +18,15 @@ class Entry(ABC):
@abstractmethod
def get_global_identifier(self):
- pass
+ raise NotImplementedError
@abstractmethod
def set_page(self, page):
- pass
+ raise NotImplementedError
@abstractmethod
def get_page_soup(self):
- pass
+ raise NotImplementedError
def get_headwords(self):
if self._headwords is not None:
@@ -38,15 +38,15 @@ class Entry(ABC):
@abstractmethod
def _get_headwords(self):
- pass
+ raise NotImplementedError
@abstractmethod
def _add_variant_expressions(self, headwords):
- pass
+ raise NotImplementedError
@abstractmethod
def get_part_of_speech_tags(self):
- pass
+ raise NotImplementedError
def get_parent(self):
if self.entry_id in self.SUBENTRY_ID_TO_ENTRY_ID:
diff --git a/bot/entries/expressions.py b/bot/entries/base/expressions.py
similarity index 63%
rename from bot/entries/expressions.py
rename to bot/entries/base/expressions.py
index 687a325..8049a99 100644
--- a/bot/entries/expressions.py
+++ b/bot/entries/base/expressions.py
@@ -31,11 +31,14 @@ def add_fullwidth(expressions):
def add_variant_kanji(expressions):
variant_kanji = load_variant_kanji()
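+    # Substitute in both directions so that an old-form spelling (kyūji)
+    # also yields its new form (shinji) as a variant, and vice versa.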
- for old_kanji, new_kanji in variant_kanji.items():
+ for kyuuji, shinji in variant_kanji.items():
new_exps = []
for expression in expressions:
- if old_kanji in expression:
- new_exp = expression.replace(old_kanji, new_kanji)
+ if kyuuji in expression:
+ new_exp = expression.replace(kyuuji, shinji)
+ new_exps.append(new_exp)
+ if shinji in expression:
+ new_exp = expression.replace(shinji, kyuuji)
new_exps.append(new_exp)
for new_exp in new_exps:
if new_exp not in expressions:
@@ -85,40 +88,3 @@ def expand_abbreviation_list(expressions):
if new_exp not in new_exps:
new_exps.append(new_exp)
return new_exps
-
-
-def expand_smk_alternatives(text):
- """Return a list of strings described by △ notation."""
- m = re.search(r"△([^(]+)(([^(]+))", text)
- if m is None:
- return [text]
- alt_parts = [m.group(1)]
- for alt_part in m.group(2).split("・"):
- alt_parts.append(alt_part)
- alts = []
- for alt_part in alt_parts:
- alt_exp = re.sub(r"△[^(]+([^(]+)", alt_part, text)
- alts.append(alt_exp)
- return alts
-
-
-def expand_daijirin_alternatives(text):
- """Return a list of strings described by = notation."""
- group_pattern = r"([^=]+)(=([^(]+)(=([^(]+)))?"
- groups = re.findall(group_pattern, text)
- expressions = [""]
- for group in groups:
- new_exps = []
- for expression in expressions:
- new_exps.append(expression + group[0])
- expressions = new_exps.copy()
- if group[1] == "":
- continue
- new_exps = []
- for expression in expressions:
- new_exps.append(expression + group[2])
- for expression in expressions:
- for alt in group[3].split("・"):
- new_exps.append(expression + alt)
- expressions = new_exps.copy()
- return expressions
diff --git a/bot/entries/jitenon.py b/bot/entries/base/jitenon_entry.py
similarity index 58%
rename from bot/entries/jitenon.py
rename to bot/entries/base/jitenon_entry.py
index 65c4d2e..43d6005 100644
--- a/bot/entries/jitenon.py
+++ b/bot/entries/base/jitenon_entry.py
@@ -3,11 +3,11 @@ from abc import abstractmethod
from datetime import datetime, date
from bs4 import BeautifulSoup
-from bot.entries.entry import Entry
-import bot.entries.expressions as Expressions
+from bot.entries.base.entry import Entry
+import bot.entries.base.expressions as Expressions
-class _JitenonEntry(Entry):
+class JitenonEntry(Entry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.expression = ""
@@ -58,7 +58,7 @@ class _JitenonEntry(Entry):
@abstractmethod
def _get_column_map(self):
- pass
+ raise NotImplementedError
def __set_modified_date(self, page):
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
@@ -140,104 +140,3 @@ class _JitenonEntry(Entry):
elif isinstance(attr_val, list):
colvals.append(";".join(attr_val))
return ",".join(colvals)
-
-
-class JitenonYojiEntry(_JitenonEntry):
- def __init__(self, target, entry_id):
- super().__init__(target, entry_id)
- self.origin = ""
- self.kanken_level = ""
- self.category = ""
- self.related_expressions = []
-
- def _get_column_map(self):
- return {
- "四字熟語": "expression",
- "読み方": "yomikata",
- "意味": "definition",
- "異形": "other_forms",
- "出典": "origin",
- "漢検級": "kanken_level",
- "場面用途": "category",
- "類義語": "related_expressions",
- }
-
- def _add_variant_expressions(self, headwords):
- for expressions in headwords.values():
- Expressions.add_variant_kanji(expressions)
-
-
-class JitenonKotowazaEntry(_JitenonEntry):
- def __init__(self, target, entry_id):
- super().__init__(target, entry_id)
- self.origin = ""
- self.example = ""
- self.related_expressions = []
-
- def _get_column_map(self):
- return {
- "言葉": "expression",
- "読み方": "yomikata",
- "意味": "definition",
- "異形": "other_forms",
- "出典": "origin",
- "例文": "example",
- "類句": "related_expressions",
- }
-
- def _get_headwords(self):
- if self.expression == "金棒引き・鉄棒引き":
- headwords = {
- "かなぼうひき": ["金棒引き", "鉄棒引き"]
- }
- else:
- headwords = super()._get_headwords()
- return headwords
-
- def _add_variant_expressions(self, headwords):
- for expressions in headwords.values():
- Expressions.add_variant_kanji(expressions)
- Expressions.add_fullwidth(expressions)
-
-
-class JitenonKokugoEntry(_JitenonEntry):
- def __init__(self, target, entry_id):
- super().__init__(target, entry_id)
- self.example = ""
- self.alt_expression = ""
- self.antonym = ""
- self.attachments = ""
- self.compounds = ""
- self.related_words = ""
-
- def _get_column_map(self):
- return {
- "言葉": "expression",
- "読み方": "yomikata",
- "意味": "definition",
- "例文": "example",
- "別表記": "alt_expression",
- "対義語": "antonym",
- "活用": "attachments",
- "用例": "compounds",
- "類語": "related_words",
- }
-
- def _get_headwords(self):
- headwords = {}
- for reading in self.yomikata.split("・"):
- if reading not in headwords:
- headwords[reading] = []
- for expression in self.expression.split("・"):
- headwords[reading].append(expression)
- if self.alt_expression.strip() != "":
- for expression in self.alt_expression.split("・"):
- headwords[reading].append(expression)
- return headwords
-
- def _add_variant_expressions(self, headwords):
- for expressions in headwords.values():
- Expressions.add_variant_kanji(expressions)
- Expressions.add_fullwidth(expressions)
- Expressions.remove_iteration_mark(expressions)
- Expressions.add_iteration_mark(expressions)
diff --git a/bot/entries/base/sanseido_entry.py b/bot/entries/base/sanseido_entry.py
new file mode 100644
index 0000000..bb52431
--- /dev/null
+++ b/bot/entries/base/sanseido_entry.py
@@ -0,0 +1,60 @@
+from abc import abstractmethod
+from bs4 import BeautifulSoup
+
+from bot.entries.base.entry import Entry
+import bot.entries.base.expressions as Expressions
+
+
+class SanseidoEntry(Entry):
+ def set_page(self, page):
+ page = self._decompose_subentries(page)
+ self._page = page
+
+ def get_page_soup(self):
+ soup = BeautifulSoup(self._page, "xml")
+ return soup
+
+ def get_global_identifier(self):
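+        # Combine the target name, the zero-padded decimal page ID, and
+        # the subentry ID in uppercase hexadecimal.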
+ parent_part = format(self.entry_id[0], '06')
+ child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
+ return f"@{self.target.value}-{parent_part}-{child_part}"
+
+ def _decompose_subentries(self, page):
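+        # Detach subentry elements from the parent page, rename them to
+        # generic 項目 tags, and parse each one as a standalone entry,
+        # recording its parent in SUBENTRY_ID_TO_ENTRY_ID.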
+ soup = BeautifulSoup(page, features="xml")
+ for x in self._get_subentry_parameters():
+ subentry_class, tags, subentry_list = x
+ for tag in tags:
+ tag_soup = soup.find(tag)
+ while tag_soup is not None:
+ tag_soup.name = "項目"
+ subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
+ self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
+ subentry = subentry_class(self.target, subentry_id)
+ page = tag_soup.decode()
+ subentry.set_page(page)
+ subentry_list.append(subentry)
+ tag_soup.decompose()
+ tag_soup = soup.find(tag)
+ return soup.decode()
+
+ @abstractmethod
+ def _get_subentry_parameters(self):
+ raise NotImplementedError
+
+ def _add_variant_expressions(self, headwords):
+ for expressions in headwords.values():
+ Expressions.add_variant_kanji(expressions)
+ Expressions.add_fullwidth(expressions)
+ Expressions.remove_iteration_mark(expressions)
+ Expressions.add_iteration_mark(expressions)
+
+ @staticmethod
+ def id_string_to_entry_id(id_string):
+ parts = id_string.split("-")
+ if len(parts) == 1:
+ return (int(parts[0]), 0)
+ elif len(parts) == 2:
+ # subentries have a hexadecimal part
+ return (int(parts[0]), int(parts[1], 16))
+ else:
+ raise Exception(f"Invalid entry ID: {id_string}")
diff --git a/bot/entries/daijirin2.py b/bot/entries/daijirin2.py
deleted file mode 100644
index f7a629c..0000000
--- a/bot/entries/daijirin2.py
+++ /dev/null
@@ -1,231 +0,0 @@
-from bs4 import BeautifulSoup
-
-import bot.entries.expressions as Expressions
-import bot.soup as Soup
-from bot.data import load_phrase_readings
-from bot.data import load_daijirin2_kana_abbreviations
-from bot.entries.entry import Entry
-from bot.entries.daijirin2_preprocess import preprocess_page
-
-
-class _BaseDaijirin2Entry(Entry):
- def __init__(self, target, entry_id):
- super().__init__(target, entry_id)
- self.children = []
- self.phrases = []
- self._kana_abbreviations = load_daijirin2_kana_abbreviations()
-
- def get_global_identifier(self):
- parent_part = format(self.entry_id[0], '06')
- child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
- return f"@{self.target.value}-{parent_part}-{child_part}"
-
- def set_page(self, page):
- page = self.__decompose_subentries(page)
- self._page = page
-
- def get_page_soup(self):
- soup = BeautifulSoup(self._page, "xml")
- return soup
-
- def get_part_of_speech_tags(self):
- if self._part_of_speech_tags is not None:
- return self._part_of_speech_tags
- self._part_of_speech_tags = []
- soup = self.get_page_soup()
- for pos_group in soup.find_all("品詞G"):
- if pos_group.parent.name == "大語義":
- self._set_part_of_speech_tags(pos_group)
- return self._part_of_speech_tags
-
- def _set_part_of_speech_tags(self, el):
- pos_names = ["品詞", "品詞活用", "品詞行", "用法"]
- for child in el.children:
- if child.name is not None:
- self._set_part_of_speech_tags(child)
- continue
- pos = str(child)
- if el.name not in pos_names:
- continue
- elif pos in ["[", "]"]:
- continue
- elif pos in self._part_of_speech_tags:
- continue
- else:
- self._part_of_speech_tags.append(pos)
-
- def _get_regular_headwords(self, soup):
- self._fill_alts(soup)
- reading = soup.find("見出仮名").text
- expressions = []
- for el in soup.find_all("標準表記"):
- expression = self._clean_expression(el.text)
- if "—" in expression:
- kana_abbrs = self._kana_abbreviations[self.entry_id]
- for abbr in kana_abbrs:
- expression = expression.replace("—", abbr, 1)
- expressions.append(expression)
- expressions = Expressions.expand_abbreviation_list(expressions)
- if len(expressions) == 0:
- expressions.append(reading)
- headwords = {reading: expressions}
- return headwords
-
- def _add_variant_expressions(self, headwords):
- for expressions in headwords.values():
- Expressions.add_variant_kanji(expressions)
- Expressions.add_fullwidth(expressions)
- Expressions.remove_iteration_mark(expressions)
- Expressions.add_iteration_mark(expressions)
-
- def __decompose_subentries(self, page):
- soup = BeautifulSoup(page, features="xml")
- subentry_parameters = [
- [Daijirin2ChildEntry, ["子項目"], self.children],
- [Daijirin2PhraseEntry, ["句項目"], self.phrases],
- ]
- for x in subentry_parameters:
- subentry_class, tags, subentry_list = x
- for tag in tags:
- tag_soup = soup.find(tag)
- while tag_soup is not None:
- tag_soup.name = "項目"
- subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
- self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
- subentry = subentry_class(self.target, subentry_id)
- page = tag_soup.decode()
- subentry.set_page(page)
- subentry_list.append(subentry)
- tag_soup.decompose()
- tag_soup = soup.find(tag)
- return soup.decode()
-
- @staticmethod
- def id_string_to_entry_id(id_string):
- parts = id_string.split("-")
- if len(parts) == 1:
- return (int(parts[0]), 0)
- elif len(parts) == 2:
- # subentries have a hexadecimal part
- return (int(parts[0]), int(parts[1], 16))
- else:
- raise Exception(f"Invalid entry ID: {id_string}")
-
- @staticmethod
- def _delete_unused_nodes(soup):
- """Remove extra markup elements that appear in the entry
- headword line which are not part of the entry headword"""
- unused_nodes = [
- "漢字音logo", "活用分節", "連語句活用分節", "語構成",
- "表外字マーク", "表外字マーク", "ルビG"
- ]
- for name in unused_nodes:
- Soup.delete_soup_nodes(soup, name)
-
- @staticmethod
- def _clean_expression(expression):
- for x in ["〈", "〉", "《", "》", " "]:
- expression = expression.replace(x, "")
- return expression
-
- @staticmethod
- def _fill_alts(soup):
- for gaiji in soup.find_all(class_="gaiji"):
- if gaiji.name == "img" and gaiji.has_attr("alt"):
- gaiji.name = "span"
- gaiji.string = gaiji.attrs["alt"]
-
-
-class Daijirin2Entry(_BaseDaijirin2Entry):
- def __init__(self, target, page_id):
- entry_id = (page_id, 0)
- super().__init__(target, entry_id)
-
- def set_page(self, page):
- page = preprocess_page(page)
- super().set_page(page)
-
- def _get_headwords(self):
- soup = self.get_page_soup()
- self._delete_unused_nodes(soup)
- if soup.find("漢字見出") is not None:
- headwords = self._get_kanji_headwords(soup)
- elif soup.find("略語G") is not None:
- headwords = self._get_acronym_headwords(soup)
- else:
- headwords = self._get_regular_headwords(soup)
- return headwords
-
- def _get_kanji_headwords(self, soup):
- readings = []
- for el in soup.find_all("漢字音"):
- hira = Expressions.kata_to_hira(el.text)
- readings.append(hira)
- if soup.find("漢字音") is None:
- readings.append("")
- expressions = []
- for el in soup.find_all("漢字見出"):
- expressions.append(el.text)
- headwords = {}
- for reading in readings:
- headwords[reading] = expressions
- return headwords
-
- def _get_acronym_headwords(self, soup):
- expressions = []
- for el in soup.find_all("略語"):
- expression_parts = []
- for part in el.find_all(["欧字", "和字"]):
- expression_parts.append(part.text)
- expression = "".join(expression_parts)
- expressions.append(expression)
- headwords = {"": expressions}
- return headwords
-
-
-class Daijirin2ChildEntry(_BaseDaijirin2Entry):
- def _get_headwords(self):
- soup = self.get_page_soup()
- self._delete_unused_nodes(soup)
- headwords = self._get_regular_headwords(soup)
- return headwords
-
-
-class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
- def get_part_of_speech_tags(self):
- # phrases do not contain these tags
- return []
-
- def _get_headwords(self):
- soup = self.get_page_soup()
- headwords = {}
- expressions = self._find_expressions(soup)
- readings = self._find_readings()
- for idx, expression in enumerate(expressions):
- reading = readings[idx]
- if reading in headwords:
- headwords[reading].append(expression)
- else:
- headwords[reading] = [expression]
- return headwords
-
- def _find_expressions(self, soup):
- self._delete_unused_nodes(soup)
- text = soup.find("句表記").text
- text = self._clean_expression(text)
- alternatives = Expressions.expand_daijirin_alternatives(text)
- expressions = []
- for alt in alternatives:
- for exp in Expressions.expand_abbreviation(alt):
- expressions.append(exp)
- return expressions
-
- def _find_readings(self):
- phrase_readings = load_phrase_readings(self.target)
- text = phrase_readings[self.entry_id]
- alternatives = Expressions.expand_daijirin_alternatives(text)
- readings = []
- for alt in alternatives:
- for reading in Expressions.expand_abbreviation(alt):
- readings.append(reading)
- return readings
diff --git a/bot/entries/daijirin2/base_entry.py b/bot/entries/daijirin2/base_entry.py
new file mode 100644
index 0000000..1113404
--- /dev/null
+++ b/bot/entries/daijirin2/base_entry.py
@@ -0,0 +1,88 @@
+import bot.soup as Soup
+from bot.data import load_daijirin2_kana_abbreviations
+from bot.entries.base.sanseido_entry import SanseidoEntry
+import bot.entries.base.expressions as Expressions
+
+
+class BaseEntry(SanseidoEntry):
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.children = []
+ self.phrases = []
+ self._kana_abbreviations = load_daijirin2_kana_abbreviations()
+
+ def get_part_of_speech_tags(self):
+ if self._part_of_speech_tags is not None:
+ return self._part_of_speech_tags
+ self._part_of_speech_tags = []
+ soup = self.get_page_soup()
+ for pos_group in soup.find_all("品詞G"):
+ if pos_group.parent.name == "大語義":
+ self._set_part_of_speech_tags(pos_group)
+ return self._part_of_speech_tags
+
+ def _set_part_of_speech_tags(self, el):
+ pos_names = ["品詞", "品詞活用", "品詞行", "用法"]
+ for child in el.children:
+ if child.name is not None:
+ self._set_part_of_speech_tags(child)
+ continue
+ pos = str(child)
+ if el.name not in pos_names:
+ continue
+ elif pos in ["[", "]"]:
+ continue
+ elif pos in self._part_of_speech_tags:
+ continue
+ else:
+ self._part_of_speech_tags.append(pos)
+
+ def _get_regular_headwords(self, soup):
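+        # A dash in a 標準表記 form abbreviates kana; restore each dash
+        # from the per-entry abbreviation table before expansion.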
+ self._fill_alts(soup)
+ reading = soup.find("見出仮名").text
+ expressions = []
+ for el in soup.find_all("標準表記"):
+ expression = self._clean_expression(el.text)
+ if "—" in expression:
+ kana_abbrs = self._kana_abbreviations[self.entry_id]
+ for abbr in kana_abbrs:
+ expression = expression.replace("—", abbr, 1)
+ expressions.append(expression)
+ expressions = Expressions.expand_abbreviation_list(expressions)
+ if len(expressions) == 0:
+ expressions.append(reading)
+ headwords = {reading: expressions}
+ return headwords
+
+ def _get_subentry_parameters(self):
+ from bot.entries.daijirin2.child_entry import ChildEntry
+ from bot.entries.daijirin2.phrase_entry import PhraseEntry
+ subentry_parameters = [
+ [ChildEntry, ["子項目"], self.children],
+ [PhraseEntry, ["句項目"], self.phrases],
+ ]
+ return subentry_parameters
+
+ @staticmethod
+ def _delete_unused_nodes(soup):
+ """Remove extra markup elements that appear in the entry
+ headword line which are not part of the entry headword"""
+ unused_nodes = [
+ "漢字音logo", "活用分節", "連語句活用分節", "語構成",
+ "表外字マーク", "表外字マーク", "ルビG"
+ ]
+ for name in unused_nodes:
+ Soup.delete_soup_nodes(soup, name)
+
+ @staticmethod
+ def _clean_expression(expression):
+ for x in ["〈", "〉", "《", "》", " "]:
+ expression = expression.replace(x, "")
+ return expression
+
+ @staticmethod
+ def _fill_alts(soup):
+ for gaiji in soup.find_all(class_="gaiji"):
+ if gaiji.name == "img" and gaiji.has_attr("alt"):
+ gaiji.name = "span"
+ gaiji.string = gaiji.attrs["alt"]
diff --git a/bot/entries/daijirin2/child_entry.py b/bot/entries/daijirin2/child_entry.py
new file mode 100644
index 0000000..42685a0
--- /dev/null
+++ b/bot/entries/daijirin2/child_entry.py
@@ -0,0 +1,9 @@
+from bot.entries.daijirin2.base_entry import BaseEntry
+
+
+class ChildEntry(BaseEntry):
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ self._delete_unused_nodes(soup)
+ headwords = self._get_regular_headwords(soup)
+ return headwords
diff --git a/bot/entries/daijirin2/entry.py b/bot/entries/daijirin2/entry.py
new file mode 100644
index 0000000..0b6970f
--- /dev/null
+++ b/bot/entries/daijirin2/entry.py
@@ -0,0 +1,50 @@
+import bot.entries.base.expressions as Expressions
+from bot.entries.daijirin2.base_entry import BaseEntry
+from bot.entries.daijirin2.preprocess import preprocess_page
+
+
+class Entry(BaseEntry):
+ def __init__(self, target, page_id):
+ entry_id = (page_id, 0)
+ super().__init__(target, entry_id)
+
+ def set_page(self, page):
+ page = preprocess_page(page)
+ super().set_page(page)
+
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ self._delete_unused_nodes(soup)
+ if soup.find("漢字見出") is not None:
+ headwords = self._get_kanji_headwords(soup)
+ elif soup.find("略語G") is not None:
+ headwords = self._get_acronym_headwords(soup)
+ else:
+ headwords = self._get_regular_headwords(soup)
+ return headwords
+
+ def _get_kanji_headwords(self, soup):
+ readings = []
+ for el in soup.find_all("漢字音"):
+ hira = Expressions.kata_to_hira(el.text)
+ readings.append(hira)
+ if soup.find("漢字音") is None:
+ readings.append("")
+ expressions = []
+ for el in soup.find_all("漢字見出"):
+ expressions.append(el.text)
+ headwords = {}
+ for reading in readings:
+ headwords[reading] = expressions
+ return headwords
+
+ def _get_acronym_headwords(self, soup):
+ expressions = []
+ for el in soup.find_all("略語"):
+ expression_parts = []
+ for part in el.find_all(["欧字", "和字"]):
+ expression_parts.append(part.text)
+ expression = "".join(expression_parts)
+ expressions.append(expression)
+ headwords = {"": expressions}
+ return headwords
diff --git a/bot/entries/daijirin2/phrase_entry.py b/bot/entries/daijirin2/phrase_entry.py
new file mode 100644
index 0000000..0470d7d
--- /dev/null
+++ b/bot/entries/daijirin2/phrase_entry.py
@@ -0,0 +1,67 @@
+import re
+
+import bot.entries.base.expressions as Expressions
+from bot.data import load_phrase_readings
+from bot.entries.daijirin2.base_entry import BaseEntry
+
+
+class PhraseEntry(BaseEntry):
+ def get_part_of_speech_tags(self):
+ # phrases do not contain these tags
+ return []
+
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ headwords = {}
+ expressions = self._find_expressions(soup)
+ readings = self._find_readings()
+ for idx, expression in enumerate(expressions):
+ reading = readings[idx]
+ if reading in headwords:
+ headwords[reading].append(expression)
+ else:
+ headwords[reading] = [expression]
+ return headwords
+
+ def _find_expressions(self, soup):
+ self._delete_unused_nodes(soup)
+ text = soup.find("句表記").text
+ text = self._clean_expression(text)
+ alternatives = parse_phrase(text)
+ expressions = []
+ for alt in alternatives:
+ for exp in Expressions.expand_abbreviation(alt):
+ expressions.append(exp)
+ return expressions
+
+ def _find_readings(self):
+ phrase_readings = load_phrase_readings(self.target)
+ text = phrase_readings[self.entry_id]
+ alternatives = parse_phrase(text)
+ readings = []
+ for alt in alternatives:
+ for reading in Expressions.expand_abbreviation(alt):
+ readings.append(reading)
+ return readings
+
+
+def parse_phrase(text):
+ """Return a list of strings described by = notation."""
+ group_pattern = r"([^=]+)(=([^(]+)(=([^(]+)))?"
+ groups = re.findall(group_pattern, text)
+ expressions = [""]
+ for group in groups:
+ new_exps = []
+ for expression in expressions:
+ new_exps.append(expression + group[0])
+ expressions = new_exps.copy()
+ if group[1] == "":
+ continue
+ new_exps = []
+ for expression in expressions:
+ new_exps.append(expression + group[2])
+ for expression in expressions:
+ for alt in group[3].split("・"):
+ new_exps.append(expression + alt)
+ expressions = new_exps.copy()
+ return expressions
diff --git a/bot/entries/daijirin2_preprocess.py b/bot/entries/daijirin2/preprocess.py
similarity index 100%
rename from bot/entries/daijirin2_preprocess.py
rename to bot/entries/daijirin2/preprocess.py
diff --git a/bot/entries/factory.py b/bot/entries/factory.py
deleted file mode 100644
index 162c102..0000000
--- a/bot/entries/factory.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from bot.targets import Targets
-
-from bot.entries.jitenon import JitenonKokugoEntry
-from bot.entries.jitenon import JitenonYojiEntry
-from bot.entries.jitenon import JitenonKotowazaEntry
-from bot.entries.smk8 import Smk8Entry
-from bot.entries.daijirin2 import Daijirin2Entry
-from bot.entries.sankoku8 import Sankoku8Entry
-
-
-def new_entry(target, page_id):
- entry_map = {
- Targets.JITENON_KOKUGO: JitenonKokugoEntry,
- Targets.JITENON_YOJI: JitenonYojiEntry,
- Targets.JITENON_KOTOWAZA: JitenonKotowazaEntry,
- Targets.SMK8: Smk8Entry,
- Targets.DAIJIRIN2: Daijirin2Entry,
- Targets.SANKOKU8: Sankoku8Entry,
- }
- return entry_map[target](target, page_id)
diff --git a/bot/entries/jitenon_kokugo/entry.py b/bot/entries/jitenon_kokugo/entry.py
new file mode 100644
index 0000000..523ac63
--- /dev/null
+++ b/bot/entries/jitenon_kokugo/entry.py
@@ -0,0 +1,45 @@
+from bot.entries.base.jitenon_entry import JitenonEntry
+import bot.entries.base.expressions as Expressions
+
+
+class Entry(JitenonEntry):
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.example = ""
+ self.alt_expression = ""
+ self.antonym = ""
+ self.attachments = ""
+ self.compounds = ""
+ self.related_words = ""
+
+ def _get_column_map(self):
+ return {
+ "言葉": "expression",
+ "読み方": "yomikata",
+ "意味": "definition",
+ "例文": "example",
+ "別表記": "alt_expression",
+ "対義語": "antonym",
+ "活用": "attachments",
+ "用例": "compounds",
+ "類語": "related_words",
+ }
+
+ def _get_headwords(self):
+ headwords = {}
+ for reading in self.yomikata.split("・"):
+ if reading not in headwords:
+ headwords[reading] = []
+ for expression in self.expression.split("・"):
+ headwords[reading].append(expression)
+ if self.alt_expression.strip() != "":
+ for expression in self.alt_expression.split("・"):
+ headwords[reading].append(expression)
+ return headwords
+
+ def _add_variant_expressions(self, headwords):
+ for expressions in headwords.values():
+ Expressions.add_variant_kanji(expressions)
+ Expressions.add_fullwidth(expressions)
+ Expressions.remove_iteration_mark(expressions)
+ Expressions.add_iteration_mark(expressions)
diff --git a/bot/entries/jitenon_kotowaza/entry.py b/bot/entries/jitenon_kotowaza/entry.py
new file mode 100644
index 0000000..71dc35f
--- /dev/null
+++ b/bot/entries/jitenon_kotowaza/entry.py
@@ -0,0 +1,35 @@
+from bot.entries.base.jitenon_entry import JitenonEntry
+import bot.entries.base.expressions as Expressions
+
+
+class Entry(JitenonEntry):
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.origin = ""
+ self.example = ""
+ self.related_expressions = []
+
+ def _get_column_map(self):
+ return {
+ "言葉": "expression",
+ "読み方": "yomikata",
+ "意味": "definition",
+ "異形": "other_forms",
+ "出典": "origin",
+ "例文": "example",
+ "類句": "related_expressions",
+ }
+
+ def _get_headwords(self):
+ if self.expression == "金棒引き・鉄棒引き":
+ headwords = {
+ "かなぼうひき": ["金棒引き", "鉄棒引き"]
+ }
+ else:
+ headwords = super()._get_headwords()
+ return headwords
+
+ def _add_variant_expressions(self, headwords):
+ for expressions in headwords.values():
+ Expressions.add_variant_kanji(expressions)
+ Expressions.add_fullwidth(expressions)
diff --git a/bot/entries/jitenon_yoji/entry.py b/bot/entries/jitenon_yoji/entry.py
new file mode 100644
index 0000000..e0e8b13
--- /dev/null
+++ b/bot/entries/jitenon_yoji/entry.py
@@ -0,0 +1,27 @@
+import bot.entries.base.expressions as Expressions
+from bot.entries.base.jitenon_entry import JitenonEntry
+
+
+class Entry(JitenonEntry):
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.origin = ""
+ self.kanken_level = ""
+ self.category = ""
+ self.related_expressions = []
+
+ def _get_column_map(self):
+ return {
+ "四字熟語": "expression",
+ "読み方": "yomikata",
+ "意味": "definition",
+ "異形": "other_forms",
+ "出典": "origin",
+ "漢検級": "kanken_level",
+ "場面用途": "category",
+ "類義語": "related_expressions",
+ }
+
+ def _add_variant_expressions(self, headwords):
+ for expressions in headwords.values():
+ Expressions.add_variant_kanji(expressions)
diff --git a/bot/entries/sankoku8.py b/bot/entries/sankoku8.py
deleted file mode 100644
index 9653f68..0000000
--- a/bot/entries/sankoku8.py
+++ /dev/null
@@ -1,260 +0,0 @@
-from bs4 import BeautifulSoup
-import bot.entries.expressions as Expressions
-import bot.soup as Soup
-from bot.entries.entry import Entry
-from bot.data import load_phrase_readings
-from bot.entries.sankoku8_preprocess import preprocess_page
-
-
-class _BaseSankoku8Entry(Entry):
- def __init__(self, target, entry_id):
- super().__init__(target, entry_id)
- self.children = []
- self.phrases = []
- self._hyouki_name = "表記"
- self._midashi_name = None
- self._midashi_kana_name = None
-
- def get_global_identifier(self):
- parent_part = format(self.entry_id[0], '06')
- child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
- return f"@{self.target.value}-{parent_part}-{child_part}"
-
- def set_page(self, page):
- page = self.__decompose_subentries(page)
- self._page = page
-
- def get_page_soup(self):
- soup = BeautifulSoup(self._page, "xml")
- return soup
-
- def _get_headwords(self):
- soup = self.get_page_soup()
- self._delete_unused_nodes(soup)
- readings = self._find_readings(soup)
- expressions = self._find_expressions(soup)
- headwords = {}
- for reading in readings:
- headwords[reading] = []
- if len(readings) == 1:
- reading = readings[0]
- if soup.find(self._midashi_name).find(self._hyouki_name) is None:
- headwords[reading].append(reading)
- for exp in expressions:
- if exp not in headwords[reading]:
- headwords[reading].append(exp)
- elif len(readings) > 1 and len(expressions) == 0:
- for reading in readings:
- headwords[reading].append(reading)
- elif len(readings) > 1 and len(expressions) == 1:
- if soup.find(self._midashi_name).find(self._hyouki_name) is None:
- for reading in readings:
- headwords[reading].append(reading)
- expression = expressions[0]
- for reading in readings:
- if expression not in headwords[reading]:
- headwords[reading].append(expression)
- elif len(readings) > 1 and len(expressions) == len(readings):
- if soup.find(self._midashi_name).find(self._hyouki_name) is None:
- for reading in readings:
- headwords[reading].append(reading)
- for idx, reading in enumerate(readings):
- exp = expressions[idx]
- if exp not in headwords[reading]:
- headwords[reading].append(exp)
- else:
- raise Exception() # shouldn't happen
- return headwords
-
- def _add_variant_expressions(self, headwords):
- for expressions in headwords.values():
- Expressions.add_variant_kanji(expressions)
- Expressions.add_fullwidth(expressions)
- Expressions.remove_iteration_mark(expressions)
- Expressions.add_iteration_mark(expressions)
-
- def get_part_of_speech_tags(self):
- if self._part_of_speech_tags is not None:
- return self._part_of_speech_tags
- self._part_of_speech_tags = []
- soup = self.get_page_soup()
- for midashi in soup.find_all([self._midashi_name, "見出部要素"]):
- pos_group = midashi.find("品詞G")
- if pos_group is None:
- continue
- for tag in pos_group.find_all("a"):
- if tag.text not in self._part_of_speech_tags:
- self._part_of_speech_tags.append(tag.text)
- return self._part_of_speech_tags
-
- def _find_expressions(self, soup):
- expressions = []
- for hyouki in soup.find_all(self._hyouki_name):
- for expression in parse_hyouki_soup(hyouki, [""]):
- expressions.append(expression)
- return expressions
-
- def _find_readings(self, soup):
- midasi_kana = soup.find(self._midashi_kana_name)
- readings = parse_hyouki_soup(midasi_kana, [""])
- return readings
-
- def __decompose_subentries(self, page):
- soup = BeautifulSoup(page, features="xml")
- subentry_parameters = [
- [Sankoku8ChildEntry, ["子項目"], self.children],
- [Sankoku8PhraseEntry, ["句項目"], self.phrases],
- ]
- for x in subentry_parameters:
- subentry_class, tags, subentry_list = x
- for tag in tags:
- tag_soup = soup.find(tag)
- while tag_soup is not None:
- tag_soup.name = "項目"
- subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
- self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
- subentry = subentry_class(self.target, subentry_id)
- page = tag_soup.decode()
- subentry.set_page(page)
- subentry_list.append(subentry)
- tag_soup.decompose()
- tag_soup = soup.find(tag)
- return soup.decode()
-
- @staticmethod
- def id_string_to_entry_id(id_string):
- parts = id_string.split("-")
- if len(parts) == 1:
- return (int(parts[0]), 0)
- elif len(parts) == 2:
- # subentries have a hexadecimal part
- return (int(parts[0]), int(parts[1], 16))
- else:
- raise Exception(f"Invalid entry ID: {id_string}")
-
- @staticmethod
- def _delete_unused_nodes(soup):
- """Remove extra markup elements that appear in the entry
- headword line which are not part of the entry headword"""
- unused_nodes = [
- "語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク",
- "アクセント分節", "活用分節", "ルビG", "分書"
- ]
- for name in unused_nodes:
- Soup.delete_soup_nodes(soup, name)
-
-
-class Sankoku8Entry(_BaseSankoku8Entry):
- def __init__(self, target, page_id):
- entry_id = (page_id, 0)
- super().__init__(target, entry_id)
- self._midashi_name = "見出部"
- self._midashi_kana_name = "見出仮名"
-
- def set_page(self, page):
- page = preprocess_page(page)
- super().set_page(page)
-
-
-class Sankoku8ChildEntry(_BaseSankoku8Entry):
- def __init__(self, target, page_id):
- super().__init__(target, page_id)
- self._midashi_name = "子見出部"
- self._midashi_kana_name = "子見出仮名"
-
-
-class Sankoku8PhraseEntry(_BaseSankoku8Entry):
- def get_part_of_speech_tags(self):
- # phrases do not contain these tags
- return []
-
- def _get_headwords(self):
- soup = self.get_page_soup()
- self._delete_unused_nodes(soup)
- expressions = self._find_expressions(soup)
- readings = self._find_readings(soup)
- headwords = {}
- if len(expressions) != len(readings):
- raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}")
- for idx, expression in enumerate(expressions):
- reading = readings[idx]
- if reading in headwords:
- headwords[reading].append(expression)
- else:
- headwords[reading] = [expression]
- return headwords
-
- def _find_expressions(self, soup):
- phrase_soup = soup.find("句表記")
- expressions = parse_hyouki_soup(phrase_soup, [""])
- return expressions
-
- def _find_readings(self, soup):
- reading_patterns = load_phrase_readings(self.target)
- reading_pattern = reading_patterns[self.entry_id]
- readings = parse_hyouki_pattern(reading_pattern)
- return readings
-
-
-def parse_hyouki_soup(soup, base_exps):
- omitted_characters = [
- "/", "〈", "〉", "(", ")", "⦅", "⦆", ":", "…"
- ]
- exps = base_exps.copy()
- for child in soup.children:
- new_exps = []
- if child.name == "言換G":
- for alt in child.find_all("言換"):
- parts = parse_hyouki_soup(alt, [""])
- for exp in exps:
- for part in parts:
- new_exps.append(exp + part)
- elif child.name == "補足表記":
- alt1 = child.find("表記対象")
- alt2 = child.find("表記内容G")
- parts1 = parse_hyouki_soup(alt1, [""])
- parts2 = parse_hyouki_soup(alt2, [""])
- for exp in exps:
- for part in parts1:
- new_exps.append(exp + part)
- for part in parts2:
- new_exps.append(exp + part)
- elif child.name == "省略":
- parts = parse_hyouki_soup(child, [""])
- for exp in exps:
- new_exps.append(exp)
- for part in parts:
- new_exps.append(exp + part)
- elif child.name is not None:
- new_exps = parse_hyouki_soup(child, exps)
- else:
- text = child.text
- for char in omitted_characters:
- text = text.replace(char, "")
- for exp in exps:
- new_exps.append(exp + text)
- exps = new_exps.copy()
- return exps
-
-
-def parse_hyouki_pattern(pattern):
-    replacements = {
-        "(": "<省略>(",
-        ")": ")</省略>",
-        "{": "<補足表記><表記対象>",
-        "・": "</表記対象><表記内容G>(<表記内容>",
-        "}": "</表記内容>)</表記内容G></補足表記>",
-        "〈": "<言換G>〈<言換>",
-        "/": "</言換>/<言換>",
-        "〉": "</言換>〉</言換G>",
-        "⦅": "<補足表記><表記対象>",
-        "\\": "</表記対象><表記内容G>⦅<表記内容>",
-        "⦆": "</表記内容>⦆</表記内容G></補足表記>",
-    }
-    markup = f"<span>{pattern}</span>"
- for key, val in replacements.items():
- markup = markup.replace(key, val)
- soup = BeautifulSoup(markup, "xml")
- hyouki_soup = soup.find("span")
- exps = parse_hyouki_soup(hyouki_soup, [""])
- return exps
diff --git a/bot/entries/sankoku8/base_entry.py b/bot/entries/sankoku8/base_entry.py
new file mode 100644
index 0000000..8d7a394
--- /dev/null
+++ b/bot/entries/sankoku8/base_entry.py
@@ -0,0 +1,104 @@
+import bot.soup as Soup
+from bot.entries.base.sanseido_entry import SanseidoEntry
+from bot.entries.sankoku8.parse import parse_hyouki_soup
+
+
+class BaseEntry(SanseidoEntry):
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.children = []
+ self.phrases = []
+ self._hyouki_name = "表記"
+ self._midashi_name = None
+ self._midashi_kana_name = None
+
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ self._delete_unused_nodes(soup)
+ readings = self._find_readings(soup)
+ expressions = self._find_expressions(soup)
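+        # Pair each reading with its written forms; when the headword
+        # section contains no 表記 element, the reading itself also
+        # serves as a written form.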
+ headwords = {}
+ for reading in readings:
+ headwords[reading] = []
+ if len(readings) == 1:
+ reading = readings[0]
+ if soup.find(self._midashi_name).find(self._hyouki_name) is None:
+ headwords[reading].append(reading)
+ for exp in expressions:
+ if exp not in headwords[reading]:
+ headwords[reading].append(exp)
+ elif len(readings) > 1 and len(expressions) == 0:
+ for reading in readings:
+ headwords[reading].append(reading)
+ elif len(readings) > 1 and len(expressions) == 1:
+ if soup.find(self._midashi_name).find(self._hyouki_name) is None:
+ for reading in readings:
+ headwords[reading].append(reading)
+ expression = expressions[0]
+ for reading in readings:
+ if expression not in headwords[reading]:
+ headwords[reading].append(expression)
+ elif len(readings) > 1 and len(expressions) == len(readings):
+ if soup.find(self._midashi_name).find(self._hyouki_name) is None:
+ for reading in readings:
+ headwords[reading].append(reading)
+ for idx, reading in enumerate(readings):
+ exp = expressions[idx]
+ if exp not in headwords[reading]:
+ headwords[reading].append(exp)
+ else:
+ raise Exception() # shouldn't happen
+ return headwords
+
+ def get_part_of_speech_tags(self):
+ if self._part_of_speech_tags is not None:
+ return self._part_of_speech_tags
+ self._part_of_speech_tags = []
+ soup = self.get_page_soup()
+ for midashi in soup.find_all([self._midashi_name, "見出部要素"]):
+ pos_group = midashi.find("品詞G")
+ if pos_group is None:
+ continue
+ for tag in pos_group.find_all("a"):
+ if tag.text not in self._part_of_speech_tags:
+ self._part_of_speech_tags.append(tag.text)
+ return self._part_of_speech_tags
+
+ def _find_expressions(self, soup):
+ expressions = []
+ for hyouki in soup.find_all(self._hyouki_name):
+ self._fill_alts(hyouki)
+ for expression in parse_hyouki_soup(hyouki, [""]):
+ expressions.append(expression)
+ return expressions
+
+ def _find_readings(self, soup):
+        midashi_kana = soup.find(self._midashi_kana_name)
+        readings = parse_hyouki_soup(midashi_kana, [""])
+ return readings
+
+ def _get_subentry_parameters(self):
+ from bot.entries.sankoku8.child_entry import ChildEntry
+ from bot.entries.sankoku8.phrase_entry import PhraseEntry
+ subentry_parameters = [
+ [ChildEntry, ["子項目"], self.children],
+ [PhraseEntry, ["句項目"], self.phrases],
+ ]
+ return subentry_parameters
+
+ @staticmethod
+ def _delete_unused_nodes(soup):
+ """Remove extra markup elements that appear in the entry
+ headword line which are not part of the entry headword"""
+ unused_nodes = [
+ "語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク",
+ "アクセント分節", "活用分節", "ルビG", "分書"
+ ]
+ for name in unused_nodes:
+ Soup.delete_soup_nodes(soup, name)
+
+ @staticmethod
+ def _fill_alts(soup):
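+        # Gaiji images carry their Unicode text in the alt attribute
+        # (added during preprocessing); use it as the element's text.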
+ for img in soup.find_all("img"):
+ if img.has_attr("alt"):
+ img.string = img.attrs["alt"]
diff --git a/bot/entries/sankoku8/child_entry.py b/bot/entries/sankoku8/child_entry.py
new file mode 100644
index 0000000..9f6b1c1
--- /dev/null
+++ b/bot/entries/sankoku8/child_entry.py
@@ -0,0 +1,8 @@
+from bot.entries.sankoku8.base_entry import BaseEntry
+
+
+class ChildEntry(BaseEntry):
+ def __init__(self, target, page_id):
+ super().__init__(target, page_id)
+ self._midashi_name = "子見出部"
+ self._midashi_kana_name = "子見出仮名"
diff --git a/bot/entries/sankoku8/entry.py b/bot/entries/sankoku8/entry.py
new file mode 100644
index 0000000..533ac66
--- /dev/null
+++ b/bot/entries/sankoku8/entry.py
@@ -0,0 +1,14 @@
+from bot.entries.sankoku8.base_entry import BaseEntry
+from bot.entries.sankoku8.preprocess import preprocess_page
+
+
+class Entry(BaseEntry):
+ def __init__(self, target, page_id):
+ entry_id = (page_id, 0)
+ super().__init__(target, entry_id)
+ self._midashi_name = "見出部"
+ self._midashi_kana_name = "見出仮名"
+
+ def set_page(self, page):
+ page = preprocess_page(page)
+ super().set_page(page)
diff --git a/bot/entries/sankoku8/parse.py b/bot/entries/sankoku8/parse.py
new file mode 100644
index 0000000..a57574b
--- /dev/null
+++ b/bot/entries/sankoku8/parse.py
@@ -0,0 +1,65 @@
+from bs4 import BeautifulSoup
+
+
+def parse_hyouki_soup(soup, base_exps):
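+    # Recursively expand marked-up headword text into all surface forms:
+    # 言換G wraps interchangeable alternatives, 補足表記 pairs a base
+    # form with supplementary renderings, and 省略 marks an omissible
+    # segment; plain text is appended to every expression in progress.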
+ omitted_characters = [
+ "/", "〈", "〉", "(", ")", "⦅", "⦆", ":", "…"
+ ]
+ exps = base_exps.copy()
+ for child in soup.children:
+ new_exps = []
+ if child.name == "言換G":
+ for alt in child.find_all("言換"):
+ parts = parse_hyouki_soup(alt, [""])
+ for exp in exps:
+ for part in parts:
+ new_exps.append(exp + part)
+ elif child.name == "補足表記":
+ alt1 = child.find("表記対象")
+ alt2 = child.find("表記内容G")
+ parts1 = parse_hyouki_soup(alt1, [""])
+ parts2 = parse_hyouki_soup(alt2, [""])
+ for exp in exps:
+ for part in parts1:
+ new_exps.append(exp + part)
+ for part in parts2:
+ new_exps.append(exp + part)
+ elif child.name == "省略":
+ parts = parse_hyouki_soup(child, [""])
+ for exp in exps:
+ new_exps.append(exp)
+ for part in parts:
+ new_exps.append(exp + part)
+ elif child.name is not None:
+ new_exps = parse_hyouki_soup(child, exps)
+ else:
+ text = child.text
+ for char in omitted_characters:
+ text = text.replace(char, "")
+ for exp in exps:
+ new_exps.append(exp + text)
+ exps = new_exps.copy()
+ return exps
+
+
+def parse_hyouki_pattern(pattern):
+ replacements = {
+ "(": "<省略>(",
+ ")": ")</省略>",
+ "{": "<補足表記><表記対象>",
+ "・": "</表記対象><表記内容G>(<表記内容>",
+ "}": "</表記内容>)</表記内容G></補足表記>",
+ "〈": "<言換G>〈<言換>",
+ "/": "</言換>/<言換>",
+ "〉": "</言換>〉</言換G>",
+ "⦅": "<補足表記><表記対象>",
+ "\\": "</表記対象><表記内容G>⦅<表記内容>",
+ "⦆": "</表記内容>⦆</表記内容G></補足表記>",
+ }
+ markup = f"<span>{pattern}</span>"
+ for key, val in replacements.items():
+ markup = markup.replace(key, val)
+ soup = BeautifulSoup(markup, "xml")
+ hyouki_soup = soup.find("span")
+ exps = parse_hyouki_soup(hyouki_soup, [""])
+ return exps
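
Note: `parse_hyouki_pattern` turns the compact headword notation into the same XML markup that `parse_hyouki_soup` handles (省略, 補足表記, 言換G) and then expands every variant. Usage, matching the expectation in tests/test_sankoku_phrases.py at the end of this patch, where ( ) marks an omissible segment:

    from bot.entries.sankoku8.parse import parse_hyouki_pattern

    exps = parse_hyouki_pattern("耳にたこ(ができる)")
    print(exps)  # -> ['耳にたこ', '耳にたこができる']
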
diff --git a/bot/entries/sankoku8/phrase_entry.py b/bot/entries/sankoku8/phrase_entry.py
new file mode 100644
index 0000000..e5da208
--- /dev/null
+++ b/bot/entries/sankoku8/phrase_entry.py
@@ -0,0 +1,37 @@
+from bot.data import load_phrase_readings
+from bot.entries.sankoku8.base_entry import BaseEntry
+from bot.entries.sankoku8.parse import parse_hyouki_soup
+from bot.entries.sankoku8.parse import parse_hyouki_pattern
+
+
+class PhraseEntry(BaseEntry):
+ def get_part_of_speech_tags(self):
+ # phrases do not contain these tags
+ return []
+
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ self._delete_unused_nodes(soup)
+ expressions = self._find_expressions(soup)
+ readings = self._find_readings(soup)
+ headwords = {}
+ if len(expressions) != len(readings):
+ raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}")
+ for idx, expression in enumerate(expressions):
+ reading = readings[idx]
+ if reading in headwords:
+ headwords[reading].append(expression)
+ else:
+ headwords[reading] = [expression]
+ return headwords
+
+ def _find_expressions(self, soup):
+ phrase_soup = soup.find("句表記")
+ expressions = parse_hyouki_soup(phrase_soup, [""])
+ return expressions
+
+ def _find_readings(self, soup):
+ reading_patterns = load_phrase_readings(self.target)
+ reading_pattern = reading_patterns[self.entry_id]
+ readings = parse_hyouki_pattern(reading_pattern)
+ return readings
diff --git a/bot/entries/sankoku8_preprocess.py b/bot/entries/sankoku8/preprocess.py
similarity index 58%
rename from bot/entries/sankoku8_preprocess.py
rename to bot/entries/sankoku8/preprocess.py
index 73fb31a..1eee32d 100644
--- a/bot/entries/sankoku8_preprocess.py
+++ b/bot/entries/sankoku8/preprocess.py
@@ -4,9 +4,17 @@ from bs4 import BeautifulSoup
from bot.data import get_adobe_glyph
+__GAIJI = {
+ "svg-gaiji/byan.svg": "𰻞",
+ "svg-gaiji/G16EF.svg": "篡",
+}
+
+
def preprocess_page(page):
soup = BeautifulSoup(page, features="xml")
__replace_glyph_codes(soup)
+ __add_image_alt_text(soup)
+ __replace_tatehyphen(soup)
page = __strip_page(soup)
return page
@@ -20,6 +28,21 @@ def __replace_glyph_codes(soup):
geta.replace_with(glyph)
+def __add_image_alt_text(soup):
+ for img in soup.find_all("img"):
+ if not img.has_attr("src"):
+ continue
+ src = img.attrs["src"]
+ if src in __GAIJI:
+ img.attrs["alt"] = __GAIJI[src]
+
+
+def __replace_tatehyphen(soup):
+ for img in soup.find_all("img", {"src": "svg-gaiji/tatehyphen.svg"}):
+ img.string = "−"
+ img.unwrap()
+
+
def __strip_page(soup):
koumoku = soup.find(["項目"])
if koumoku is not None:
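
Note: in `__replace_tatehyphen`, assigning `img.string` gives the `<img>` element a text child, and the following `unwrap()` removes the element while keeping its children, so each vertical-hyphen image collapses to a bare U+2212 minus sign in the page text.
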
diff --git a/bot/entries/smk8.py b/bot/entries/smk8.py
deleted file mode 100644
index 2d43e4a..0000000
--- a/bot/entries/smk8.py
+++ /dev/null
@@ -1,221 +0,0 @@
-from bs4 import BeautifulSoup
-
-import bot.entries.expressions as Expressions
-import bot.soup as Soup
-from bot.data import load_phrase_readings
-from bot.entries.entry import Entry
-from bot.entries.smk8_preprocess import preprocess_page
-
-
-class _BaseSmk8Entry(Entry):
- def __init__(self, target, entry_id):
- super().__init__(target, entry_id)
- self.children = []
- self.phrases = []
- self.kanjis = []
-
- def get_global_identifier(self):
- parent_part = format(self.entry_id[0], '06')
- child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
- return f"@{self.target.value}-{parent_part}-{child_part}"
-
- def set_page(self, page):
- page = self.__decompose_subentries(page)
- self._page = page
-
- def get_page_soup(self):
- soup = BeautifulSoup(self._page, "xml")
- return soup
-
- def get_part_of_speech_tags(self):
- if self._part_of_speech_tags is not None:
- return self._part_of_speech_tags
- self._part_of_speech_tags = []
- soup = self.get_page_soup()
- headword_info = soup.find("見出要素")
- if headword_info is None:
- return self._part_of_speech_tags
- for tag in headword_info.find_all("品詞M"):
- if tag.text not in self._part_of_speech_tags:
- self._part_of_speech_tags.append(tag.text)
- return self._part_of_speech_tags
-
- def _add_variant_expressions(self, headwords):
- for expressions in headwords.values():
- Expressions.add_variant_kanji(expressions)
- Expressions.add_fullwidth(expressions)
- Expressions.remove_iteration_mark(expressions)
- Expressions.add_iteration_mark(expressions)
-
- def _find_reading(self, soup):
- midasi_kana = soup.find("見出仮名")
- reading = midasi_kana.text
- for x in [" ", "・"]:
- reading = reading.replace(x, "")
- return reading
-
- def _find_expressions(self, soup):
- clean_expressions = []
- for expression in soup.find_all("標準表記"):
- clean_expression = self._clean_expression(expression.text)
- clean_expressions.append(clean_expression)
- expressions = Expressions.expand_abbreviation_list(clean_expressions)
- return expressions
-
- def __decompose_subentries(self, page):
- soup = BeautifulSoup(page, features="xml")
- subentry_parameters = [
- [Smk8ChildEntry, ["子項目F", "子項目"], self.children],
- [Smk8PhraseEntry, ["句項目F", "句項目"], self.phrases],
- [Smk8KanjiEntry, ["造語成分項目"], self.kanjis],
- ]
- for x in subentry_parameters:
- subentry_class, tags, subentry_list = x
- for tag in tags:
- tag_soup = soup.find(tag)
- while tag_soup is not None:
- tag_soup.name = "項目"
- subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
- self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
- subentry = subentry_class(self.target, subentry_id)
- page = tag_soup.decode()
- subentry.set_page(page)
- subentry_list.append(subentry)
- tag_soup.decompose()
- tag_soup = soup.find(tag)
- return soup.decode()
-
- @staticmethod
- def id_string_to_entry_id(id_string):
- parts = id_string.split("-")
- if len(parts) == 1:
- return (int(parts[0]), 0)
- elif len(parts) == 2:
- # subentries have a hexadecimal part
- return (int(parts[0]), int(parts[1], 16))
- else:
- raise Exception(f"Invalid entry ID: {id_string}")
-
- @staticmethod
- def _delete_unused_nodes(soup):
- """Remove extra markup elements that appear in the entry
- headword line which are not part of the entry headword"""
- unused_nodes = [
- "表音表記", "表外音訓マーク", "表外字マーク", "ルビG"
- ]
- for name in unused_nodes:
- Soup.delete_soup_nodes(soup, name)
-
- @staticmethod
- def _clean_expression(expression):
- for x in ["〈", "〉", "{", "}", "…", " "]:
- expression = expression.replace(x, "")
- return expression
-
- @staticmethod
- def _fill_alts(soup):
- for el in soup.find_all(["親見出仮名", "親見出表記"]):
- el.string = el.attrs["alt"]
- for gaiji in soup.find_all("外字"):
- gaiji.string = gaiji.img.attrs["alt"]
-
-
-class Smk8Entry(_BaseSmk8Entry):
- def __init__(self, target, page_id):
- entry_id = (page_id, 0)
- super().__init__(target, entry_id)
-
- def set_page(self, page):
- page = preprocess_page(page)
- super().set_page(page)
-
- def _get_headwords(self):
- soup = self.get_page_soup()
- self._delete_unused_nodes(soup)
- self._fill_alts(soup)
- reading = self._find_reading(soup)
- expressions = []
- if soup.find("見出部").find("標準表記") is None:
- expressions.append(reading)
- for expression in self._find_expressions(soup):
- if expression not in expressions:
- expressions.append(expression)
- headwords = {reading: expressions}
- return headwords
-
-
-class Smk8ChildEntry(_BaseSmk8Entry):
- def _get_headwords(self):
- soup = self.get_page_soup()
- self._delete_unused_nodes(soup)
- self._fill_alts(soup)
- reading = self._find_reading(soup)
- expressions = []
- if soup.find("子見出部").find("標準表記") is None:
- expressions.append(reading)
- for expression in self._find_expressions(soup):
- if expression not in expressions:
- expressions.append(expression)
- headwords = {reading: expressions}
- return headwords
-
-
-class Smk8PhraseEntry(_BaseSmk8Entry):
- def __init__(self, target, entry_id):
- super().__init__(target, entry_id)
- self.__phrase_readings = load_phrase_readings(self.target)
-
- def get_part_of_speech_tags(self):
- # phrases do not contain these tags
- return []
-
- def _get_headwords(self):
- soup = self.get_page_soup()
- headwords = {}
- expressions = self._find_expressions(soup)
- readings = self._find_readings()
- for idx, expression in enumerate(expressions):
- reading = readings[idx]
- if reading in headwords:
- headwords[reading].append(expression)
- else:
- headwords[reading] = [expression]
- return headwords
-
- def _find_expressions(self, soup):
- self._delete_unused_nodes(soup)
- self._fill_alts(soup)
- text = soup.find("標準表記").text
- text = self._clean_expression(text)
- alternatives = Expressions.expand_smk_alternatives(text)
- expressions = []
- for alt in alternatives:
- for exp in Expressions.expand_abbreviation(alt):
- expressions.append(exp)
- return expressions
-
- def _find_readings(self):
- text = self.__phrase_readings[self.entry_id]
- alternatives = Expressions.expand_smk_alternatives(text)
- readings = []
- for alt in alternatives:
- for reading in Expressions.expand_abbreviation(alt):
- readings.append(reading)
- return readings
-
-
-class Smk8KanjiEntry(_BaseSmk8Entry):
- def _get_headwords(self):
- soup = self.get_page_soup()
- self._delete_unused_nodes(soup)
- self._fill_alts(soup)
- reading = self.__get_parent_reading()
- expressions = self._find_expressions(soup)
- headwords = {reading: expressions}
- return headwords
-
- def __get_parent_reading(self):
- parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
- parent = self.ID_TO_ENTRY[parent_id]
- reading = parent.get_first_reading()
- return reading
diff --git a/bot/entries/smk8/base_entry.py b/bot/entries/smk8/base_entry.py
new file mode 100644
index 0000000..7bf32c2
--- /dev/null
+++ b/bot/entries/smk8/base_entry.py
@@ -0,0 +1,73 @@
+import bot.soup as Soup
+import bot.entries.base.expressions as Expressions
+from bot.entries.base.sanseido_entry import SanseidoEntry
+
+
+class BaseEntry(SanseidoEntry):
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.children = []
+ self.phrases = []
+ self.kanjis = []
+
+ def get_part_of_speech_tags(self):
+ if self._part_of_speech_tags is not None:
+ return self._part_of_speech_tags
+ self._part_of_speech_tags = []
+ soup = self.get_page_soup()
+ headword_info = soup.find("見出要素")
+ if headword_info is None:
+ return self._part_of_speech_tags
+ for tag in headword_info.find_all("品詞M"):
+ if tag.text not in self._part_of_speech_tags:
+ self._part_of_speech_tags.append(tag.text)
+ return self._part_of_speech_tags
+
+ def _find_reading(self, soup):
+ midasi_kana = soup.find("見出仮名")
+ reading = midasi_kana.text
+ for x in [" ", "・"]:
+ reading = reading.replace(x, "")
+ return reading
+
+ def _find_expressions(self, soup):
+ clean_expressions = []
+ for expression in soup.find_all("標準表記"):
+ clean_expression = self._clean_expression(expression.text)
+ clean_expressions.append(clean_expression)
+ expressions = Expressions.expand_abbreviation_list(clean_expressions)
+ return expressions
+
+ def _get_subentry_parameters(self):
+ from bot.entries.smk8.child_entry import ChildEntry
+ from bot.entries.smk8.phrase_entry import PhraseEntry
+ from bot.entries.smk8.kanji_entry import KanjiEntry
+ subentry_parameters = [
+ [ChildEntry, ["子項目F", "子項目"], self.children],
+ [PhraseEntry, ["句項目F", "句項目"], self.phrases],
+ [KanjiEntry, ["造語成分項目"], self.kanjis],
+ ]
+ return subentry_parameters
+
+ @staticmethod
+ def _delete_unused_nodes(soup):
+ """Remove extra markup elements that appear in the entry
+ headword line which are not part of the entry headword"""
+ unused_nodes = [
+ "表音表記", "表外音訓マーク", "表外字マーク", "ルビG"
+ ]
+ for name in unused_nodes:
+ Soup.delete_soup_nodes(soup, name)
+
+ @staticmethod
+ def _clean_expression(expression):
+ for x in ["〈", "〉", "{", "}", "…", " "]:
+ expression = expression.replace(x, "")
+ return expression
+
+ @staticmethod
+ def _fill_alts(soup):
+ for elm in soup.find_all(["親見出仮名", "親見出表記"]):
+ elm.string = elm.attrs["alt"]
+ for gaiji in soup.find_all("外字"):
+ gaiji.string = gaiji.img.attrs["alt"]
diff --git a/bot/entries/smk8/child_entry.py b/bot/entries/smk8/child_entry.py
new file mode 100644
index 0000000..0dbe375
--- /dev/null
+++ b/bot/entries/smk8/child_entry.py
@@ -0,0 +1,17 @@
+from bot.entries.smk8.base_entry import BaseEntry
+
+
+class ChildEntry(BaseEntry):
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ self._delete_unused_nodes(soup)
+ self._fill_alts(soup)
+ reading = self._find_reading(soup)
+ expressions = []
+ if soup.find("子見出部").find("標準表記") is None:
+ expressions.append(reading)
+ for expression in self._find_expressions(soup):
+ if expression not in expressions:
+ expressions.append(expression)
+ headwords = {reading: expressions}
+ return headwords
diff --git a/bot/entries/smk8/entry.py b/bot/entries/smk8/entry.py
new file mode 100644
index 0000000..4baed42
--- /dev/null
+++ b/bot/entries/smk8/entry.py
@@ -0,0 +1,26 @@
+from bot.entries.smk8.base_entry import BaseEntry
+from bot.entries.smk8.preprocess import preprocess_page
+
+
+class Entry(BaseEntry):
+ def __init__(self, target, page_id):
+ entry_id = (page_id, 0)
+ super().__init__(target, entry_id)
+
+ def set_page(self, page):
+ page = preprocess_page(page)
+ super().set_page(page)
+
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ self._delete_unused_nodes(soup)
+ self._fill_alts(soup)
+ reading = self._find_reading(soup)
+ expressions = []
+ if soup.find("見出部").find("標準表記") is None:
+ expressions.append(reading)
+ for expression in self._find_expressions(soup):
+ if expression not in expressions:
+ expressions.append(expression)
+ headwords = {reading: expressions}
+ return headwords
diff --git a/bot/entries/smk8/kanji_entry.py b/bot/entries/smk8/kanji_entry.py
new file mode 100644
index 0000000..3e77faf
--- /dev/null
+++ b/bot/entries/smk8/kanji_entry.py
@@ -0,0 +1,22 @@
+from bot.entries.smk8.base_entry import BaseEntry
+
+
+class KanjiEntry(BaseEntry):
+ def get_part_of_speech_tags(self):
+ # kanji entries do not contain these tags
+ return []
+
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ self._delete_unused_nodes(soup)
+ self._fill_alts(soup)
+ reading = self.__get_parent_reading()
+ expressions = self._find_expressions(soup)
+ headwords = {reading: expressions}
+ return headwords
+
+ def __get_parent_reading(self):
+ parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
+ parent = self.ID_TO_ENTRY[parent_id]
+ reading = parent.get_first_reading()
+ return reading
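
Note: `__get_parent_reading` relies on the `SUBENTRY_ID_TO_ENTRY_ID` and `ID_TO_ENTRY` registries that are populated while a parent entry decomposes its subentries (this logic lived in `__decompose_subentries` in the deleted smk8.py above and presumably moved into the shared `SanseidoEntry` base class), so a kanji entry can only resolve its reading after its parent page has been read.
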
diff --git a/bot/entries/smk8/phrase_entry.py b/bot/entries/smk8/phrase_entry.py
new file mode 100644
index 0000000..aac9b84
--- /dev/null
+++ b/bot/entries/smk8/phrase_entry.py
@@ -0,0 +1,64 @@
+import re
+
+import bot.entries.base.expressions as Expressions
+from bot.data import load_phrase_readings
+from bot.entries.smk8.base_entry import BaseEntry
+
+
+class PhraseEntry(BaseEntry):
+ def __init__(self, target, entry_id):
+ super().__init__(target, entry_id)
+ self.__phrase_readings = load_phrase_readings(self.target)
+
+ def get_part_of_speech_tags(self):
+ # phrase entries do not contain these tags
+ return []
+
+ def _get_headwords(self):
+ soup = self.get_page_soup()
+ headwords = {}
+ expressions = self._find_expressions(soup)
+ readings = self._find_readings()
+ for idx, expression in enumerate(expressions):
+ reading = readings[idx]
+ if reading in headwords:
+ headwords[reading].append(expression)
+ else:
+ headwords[reading] = [expression]
+ return headwords
+
+ def _find_expressions(self, soup):
+ self._delete_unused_nodes(soup)
+ self._fill_alts(soup)
+ text = soup.find("標準表記").text
+ text = self._clean_expression(text)
+ alternatives = parse_phrase(text)
+ expressions = []
+ for alt in alternatives:
+ for exp in Expressions.expand_abbreviation(alt):
+ expressions.append(exp)
+ return expressions
+
+ def _find_readings(self):
+ text = self.__phrase_readings[self.entry_id]
+ alternatives = parse_phrase(text)
+ readings = []
+ for alt in alternatives:
+ for reading in Expressions.expand_abbreviation(alt):
+ readings.append(reading)
+ return readings
+
+
+def parse_phrase(text):
+ """Return a list of strings described by △ notation."""
+ match = re.search(r"△([^(]+)(([^(]+))", text)
+ if match is None:
+ return [text]
+ alt_parts = [match.group(1)]
+ for alt_part in match.group(2).split("・"):
+ alt_parts.append(alt_part)
+ alts = []
+ for alt_part in alt_parts:
+ alt_exp = re.sub(r"△[^(]+([^(]+)", alt_part, text)
+ alts.append(alt_exp)
+ return alts
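
Note: `parse_phrase` takes over from the `expand_smk_alternatives` helper whose test is removed from tests/test_expressions.py below; △ marks a segment followed by parenthesized alternatives. Usage, with the data from that removed test:

    from bot.entries.smk8.phrase_entry import parse_phrase

    exps = parse_phrase("△金(時間・暇)に飽かして")
    print(exps)  # -> ['金に飽かして', '時間に飽かして', '暇に飽かして']
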
diff --git a/bot/entries/smk8_preprocess.py b/bot/entries/smk8/preprocess.py
similarity index 98%
rename from bot/entries/smk8_preprocess.py
rename to bot/entries/smk8/preprocess.py
index 5c9b924..ebda252 100644
--- a/bot/entries/smk8_preprocess.py
+++ b/bot/entries/smk8/preprocess.py
@@ -6,8 +6,8 @@ from bot.data import get_adobe_glyph
__GAIJI = {
"gaiji/5350.svg": "卐",
- "gaiji/62cb.svg": "抛",
- "gaiji/7be1.svg": "簒",
+ "gaiji/62cb.svg": "拋",
+ "gaiji/7be1.svg": "篡",
}
diff --git a/bot/factory.py b/bot/factory.py
new file mode 100644
index 0000000..7b025d4
--- /dev/null
+++ b/bot/factory.py
@@ -0,0 +1,37 @@
+import importlib
+
+
+def new_crawler(target):
+ module_path = f"bot.crawlers.{target.name.lower()}"
+ module = importlib.import_module(module_path)
+ return module.Crawler(target)
+
+
+def new_entry(target, page_id):
+ module_path = f"bot.entries.{target.name.lower()}.entry"
+ module = importlib.import_module(module_path)
+ return module.Entry(target, page_id)
+
+
+def new_yomichan_exporter(target):
+ module_path = f"bot.yomichan.exporters.{target.name.lower()}"
+ module = importlib.import_module(module_path)
+ return module.Exporter(target)
+
+
+def new_yomichan_terminator(target):
+ module_path = f"bot.yomichan.terms.{target.name.lower()}"
+ module = importlib.import_module(module_path)
+ return module.Terminator(target)
+
+
+def new_mdict_exporter(target):
+ module_path = f"bot.mdict.exporters.{target.name.lower()}"
+ module = importlib.import_module(module_path)
+ return module.Exporter(target)
+
+
+def new_mdict_terminator(target):
+ module_path = f"bot.mdict.terms.{target.name.lower()}"
+ module = importlib.import_module(module_path)
+ return module.Terminator(target)
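
Note: the consolidated factory relies purely on a naming convention: each target package must expose a module at the derived path containing a class with a fixed name (`Crawler`, `Entry`, `Exporter`, or `Terminator`), which is exactly what the per-target modules added in this patch provide. A sketch of how a call resolves, assuming the `Targets` enum members used elsewhere in this patch:

    from bot.targets import Targets
    from bot.factory import new_entry

    # Targets.SMK8.name.lower() == "smk8", so this imports
    # bot.entries.smk8.entry and returns module.Entry(target, page_id)
    entry = new_entry(Targets.SMK8, 1234)

This trades the explicit per-target maps (deleted below) for less boilerplate, at the cost that a wrong module path now surfaces only at runtime as an ImportError.
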
diff --git a/bot/mdict/exporters/export.py b/bot/mdict/exporters/base/exporter.py
similarity index 74%
rename from bot/mdict/exporters/export.py
rename to bot/mdict/exporters/base/exporter.py
index b8e8347..37ed376 100644
--- a/bot/mdict/exporters/export.py
+++ b/bot/mdict/exporters/base/exporter.py
@@ -1,20 +1,19 @@
-# pylint: disable=too-few-public-methods
-
-import subprocess
import os
import shutil
+import subprocess
from abc import ABC, abstractmethod
from pathlib import Path
-from datetime import datetime
+
from platformdirs import user_documents_dir, user_cache_dir
-from bot.mdict.terms.factory import new_terminator
+from bot.time import timestamp
+from bot.factory import new_mdict_terminator
-class Exporter(ABC):
+class BaseExporter(ABC):
def __init__(self, target):
self._target = target
- self._terminator = new_terminator(target)
+ self._terminator = new_mdict_terminator(target)
self._build_dir = None
self._build_media_dir = None
self._description_file = None
@@ -34,7 +33,7 @@ class Exporter(ABC):
return self._build_dir
cache_dir = user_cache_dir("jitenbot")
build_directory = os.path.join(cache_dir, "mdict_build")
- print(f"Initializing build directory `{build_directory}`")
+ print(f"{timestamp()} Initializing build directory `{build_directory}`")
if Path(build_directory).is_dir():
shutil.rmtree(build_directory)
os.makedirs(build_directory)
@@ -45,7 +44,7 @@ class Exporter(ABC):
build_dir = self._get_build_dir()
build_media_dir = os.path.join(build_dir, self._target.value)
if media_dir is not None:
- print("Copying media files to build directory...")
+ print(f"{timestamp()} Copying media files to build directory...")
shutil.copytree(media_dir, build_media_dir)
else:
os.makedirs(build_media_dir)
@@ -71,7 +70,7 @@ class Exporter(ABC):
def _write_mdx_file(self, entries):
terms = self._get_terms(entries)
- print(f"Exporting {len(terms)} Mdict keys...")
+ print(f"{timestamp()} Exporting {len(terms)} Mdict keys...")
out_dir = self._get_out_dir()
out_file = os.path.join(out_dir, f"{self._target.value}.mdx")
params = [
@@ -87,7 +86,7 @@ class Exporter(ABC):
terms = []
entries_len = len(entries)
for idx, entry in enumerate(entries):
- update = f"Creating Mdict terms for entry {idx+1}/{entries_len}"
+ update = f"\tCreating MDict terms for entry {idx+1}/{entries_len}"
print(update, end='\r', flush=True)
new_terms = self._terminator.make_terms(entry)
for term in new_terms:
@@ -126,7 +125,7 @@ class Exporter(ABC):
return self._out_dir
out_dir = os.path.join(
user_documents_dir(), "jitenbot", "mdict", self._target.value)
- print(f"Initializing output directory `{out_dir}`")
+ print(f"{timestamp()} Initializing output directory `{out_dir}`")
if Path(out_dir).is_dir():
shutil.rmtree(out_dir)
os.makedirs(out_dir)
@@ -168,58 +167,8 @@ class Exporter(ABC):
@abstractmethod
def _get_revision(self, entries):
- pass
+ raise NotImplementedError
@abstractmethod
def _get_attribution(self, entries):
- pass
-
-
-class _JitenonExporter(Exporter):
- def _get_revision(self, entries):
- modified_date = None
- for entry in entries:
- if modified_date is None or entry.modified_date > modified_date:
- modified_date = entry.modified_date
- revision = modified_date.strftime("%Y年%m月%d日閲覧")
- return revision
-
- def _get_attribution(self, entries):
- modified_date = None
- for entry in entries:
- if modified_date is None or entry.modified_date > modified_date:
- attribution = entry.attribution
- return attribution
-
-
-class JitenonKokugoExporter(_JitenonExporter):
- pass
-
-
-class JitenonYojiExporter(_JitenonExporter):
- pass
-
-
-class JitenonKotowazaExporter(_JitenonExporter):
- pass
-
-
-class _MonokakidoExporter(Exporter):
- def _get_revision(self, entries):
- timestamp = datetime.now().strftime("%Y年%m月%d日作成")
- return timestamp
-
-
-class Smk8Exporter(_MonokakidoExporter):
- def _get_attribution(self, entries):
- return "© Sanseido Co., LTD. 2020"
-
-
-class Daijirin2Exporter(_MonokakidoExporter):
- def _get_attribution(self, entries):
- return "© Sanseido Co., LTD. 2019"
-
-
-class Sankoku8Exporter(_MonokakidoExporter):
- def _get_attribution(self, entries):
- return "© Sanseido Co., LTD. 2021"
+ raise NotImplementedError
diff --git a/bot/mdict/exporters/base/jitenon.py b/bot/mdict/exporters/base/jitenon.py
new file mode 100644
index 0000000..2e6b1df
--- /dev/null
+++ b/bot/mdict/exporters/base/jitenon.py
@@ -0,0 +1,18 @@
+from bot.mdict.exporters.base.exporter import BaseExporter
+
+
+class JitenonExporter(BaseExporter):
+ def _get_revision(self, entries):
+ modified_date = None
+ for entry in entries:
+ if modified_date is None or entry.modified_date > modified_date:
+ modified_date = entry.modified_date
+ revision = modified_date.strftime("%Y年%m月%d日閲覧")
+ return revision
+
+ def _get_attribution(self, entries):
+ modified_date = None
+ for entry in entries:
+ if modified_date is None or entry.modified_date > modified_date:
+ attribution = entry.attribution
+ return attribution
diff --git a/bot/mdict/exporters/base/monokakido.py b/bot/mdict/exporters/base/monokakido.py
new file mode 100644
index 0000000..b9b9629
--- /dev/null
+++ b/bot/mdict/exporters/base/monokakido.py
@@ -0,0 +1,8 @@
+from datetime import datetime
+from bot.mdict.exporters.base.exporter import BaseExporter
+
+
+class MonokakidoExporter(BaseExporter):
+ def _get_revision(self, entries):
+ timestamp = datetime.now().strftime("%Y年%m月%d日作成")
+ return timestamp
diff --git a/bot/mdict/exporters/daijirin2.py b/bot/mdict/exporters/daijirin2.py
new file mode 100644
index 0000000..4692470
--- /dev/null
+++ b/bot/mdict/exporters/daijirin2.py
@@ -0,0 +1,6 @@
+from bot.mdict.exporters.base.monokakido import MonokakidoExporter
+
+
+class Exporter(MonokakidoExporter):
+ def _get_attribution(self, entries):
+ return "© Sanseido Co., LTD. 2019"
diff --git a/bot/mdict/exporters/factory.py b/bot/mdict/exporters/factory.py
deleted file mode 100644
index 5417493..0000000
--- a/bot/mdict/exporters/factory.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from bot.targets import Targets
-
-from bot.mdict.exporters.export import JitenonKokugoExporter
-from bot.mdict.exporters.export import JitenonYojiExporter
-from bot.mdict.exporters.export import JitenonKotowazaExporter
-from bot.mdict.exporters.export import Smk8Exporter
-from bot.mdict.exporters.export import Daijirin2Exporter
-from bot.mdict.exporters.export import Sankoku8Exporter
-
-
-def new_mdict_exporter(target):
- exporter_map = {
- Targets.JITENON_KOKUGO: JitenonKokugoExporter,
- Targets.JITENON_YOJI: JitenonYojiExporter,
- Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter,
- Targets.SMK8: Smk8Exporter,
- Targets.DAIJIRIN2: Daijirin2Exporter,
- Targets.SANKOKU8: Sankoku8Exporter,
- }
- return exporter_map[target](target)
diff --git a/bot/mdict/exporters/jitenon_kokugo.py b/bot/mdict/exporters/jitenon_kokugo.py
new file mode 100644
index 0000000..5689fa8
--- /dev/null
+++ b/bot/mdict/exporters/jitenon_kokugo.py
@@ -0,0 +1,5 @@
+from bot.mdict.exporters.base.jitenon import JitenonExporter
+
+
+class Exporter(JitenonExporter):
+ pass
diff --git a/bot/mdict/exporters/jitenon_kotowaza.py b/bot/mdict/exporters/jitenon_kotowaza.py
new file mode 100644
index 0000000..5689fa8
--- /dev/null
+++ b/bot/mdict/exporters/jitenon_kotowaza.py
@@ -0,0 +1,5 @@
+from bot.mdict.exporters.base.jitenon import JitenonExporter
+
+
+class Exporter(JitenonExporter):
+ pass
diff --git a/bot/mdict/exporters/jitenon_yoji.py b/bot/mdict/exporters/jitenon_yoji.py
new file mode 100644
index 0000000..5689fa8
--- /dev/null
+++ b/bot/mdict/exporters/jitenon_yoji.py
@@ -0,0 +1,5 @@
+from bot.mdict.exporters.base.jitenon import JitenonExporter
+
+
+class Exporter(JitenonExporter):
+ pass
diff --git a/bot/mdict/exporters/sankoku8.py b/bot/mdict/exporters/sankoku8.py
new file mode 100644
index 0000000..6063864
--- /dev/null
+++ b/bot/mdict/exporters/sankoku8.py
@@ -0,0 +1,6 @@
+from bot.mdict.exporters.base.monokakido import MonokakidoExporter
+
+
+class Exporter(MonokakidoExporter):
+ def _get_attribution(self, entries):
+ return "© Sanseido Co., LTD. 2021"
diff --git a/bot/mdict/exporters/smk8.py b/bot/mdict/exporters/smk8.py
new file mode 100644
index 0000000..a030b4b
--- /dev/null
+++ b/bot/mdict/exporters/smk8.py
@@ -0,0 +1,6 @@
+from bot.mdict.exporters.base.monokakido import MonokakidoExporter
+
+
+class Exporter(MonokakidoExporter):
+ def _get_attribution(self, entries):
+ return "© Sanseido Co., LTD. 2020"
diff --git a/bot/mdict/terms/base/jitenon.py b/bot/mdict/terms/base/jitenon.py
new file mode 100644
index 0000000..4f255bf
--- /dev/null
+++ b/bot/mdict/terms/base/jitenon.py
@@ -0,0 +1,20 @@
+from bot.mdict.terms.base.terminator import BaseTerminator
+
+
+class JitenonTerminator(BaseTerminator):
+ def __init__(self, target):
+ super().__init__(target)
+ self._glossary_maker = None
+
+ def _glossary(self, entry):
+ if entry.entry_id in self._glossary_cache:
+ return self._glossary_cache[entry.entry_id]
+ glossary = self._glossary_maker.make_glossary(entry, self._media_dir)
+ self._glossary_cache[entry.entry_id] = glossary
+ return glossary
+
+ def _link_glossary_parameters(self, entry):
+ return []
+
+ def _subentry_lists(self, entry):
+ return []
diff --git a/bot/mdict/terms/terminator.py b/bot/mdict/terms/base/terminator.py
similarity index 95%
rename from bot/mdict/terms/terminator.py
rename to bot/mdict/terms/base/terminator.py
index ee62411..945a65b 100644
--- a/bot/mdict/terms/terminator.py
+++ b/bot/mdict/terms/base/terminator.py
@@ -2,7 +2,7 @@ import re
from abc import abstractmethod, ABC
-class Terminator(ABC):
+class BaseTerminator(ABC):
def __init__(self, target):
self._target = target
self._glossary_cache = {}
@@ -72,12 +72,12 @@ class Terminator(ABC):
@abstractmethod
def _glossary(self, entry):
- pass
+ raise NotImplementedError
@abstractmethod
def _link_glossary_parameters(self, entry):
- pass
+ raise NotImplementedError
@abstractmethod
def _subentry_lists(self, entry):
- pass
+ raise NotImplementedError
diff --git a/bot/mdict/terms/daijirin2.py b/bot/mdict/terms/daijirin2.py
index 3b5ce68..640b520 100644
--- a/bot/mdict/terms/daijirin2.py
+++ b/bot/mdict/terms/daijirin2.py
@@ -1,8 +1,8 @@
-from bot.mdict.terms.terminator import Terminator
+from bot.mdict.terms.base.terminator import BaseTerminator
from bot.mdict.glossary.daijirin2 import make_glossary
-class Daijirin2Terminator(Terminator):
+class Terminator(BaseTerminator):
def _glossary(self, entry):
if entry.entry_id in self._glossary_cache:
return self._glossary_cache[entry.entry_id]
diff --git a/bot/mdict/terms/factory.py b/bot/mdict/terms/factory.py
deleted file mode 100644
index 8cee8e7..0000000
--- a/bot/mdict/terms/factory.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from bot.targets import Targets
-
-from bot.mdict.terms.jitenon import JitenonKokugoTerminator
-from bot.mdict.terms.jitenon import JitenonYojiTerminator
-from bot.mdict.terms.jitenon import JitenonKotowazaTerminator
-from bot.mdict.terms.smk8 import Smk8Terminator
-from bot.mdict.terms.daijirin2 import Daijirin2Terminator
-from bot.mdict.terms.sankoku8 import Sankoku8Terminator
-
-
-def new_terminator(target):
- terminator_map = {
- Targets.JITENON_KOKUGO: JitenonKokugoTerminator,
- Targets.JITENON_YOJI: JitenonYojiTerminator,
- Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator,
- Targets.SMK8: Smk8Terminator,
- Targets.DAIJIRIN2: Daijirin2Terminator,
- Targets.SANKOKU8: Sankoku8Terminator,
- }
- return terminator_map[target](target)
diff --git a/bot/mdict/terms/jitenon.py b/bot/mdict/terms/jitenon.py
deleted file mode 100644
index 3f9cfc1..0000000
--- a/bot/mdict/terms/jitenon.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from bot.mdict.terms.terminator import Terminator
-
-from bot.mdict.glossary.jitenon import JitenonKokugoGlossary
-from bot.mdict.glossary.jitenon import JitenonYojiGlossary
-from bot.mdict.glossary.jitenon import JitenonKotowazaGlossary
-
-
-class JitenonTerminator(Terminator):
- def __init__(self, target):
- super().__init__(target)
- self._glossary_maker = None
-
- def _glossary(self, entry):
- if entry.entry_id in self._glossary_cache:
- return self._glossary_cache[entry.entry_id]
- glossary = self._glossary_maker.make_glossary(entry, self._media_dir)
- self._glossary_cache[entry.entry_id] = glossary
- return glossary
-
- def _link_glossary_parameters(self, entry):
- return []
-
- def _subentry_lists(self, entry):
- return []
-
-
-class JitenonKokugoTerminator(JitenonTerminator):
- def __init__(self, target):
- super().__init__(target)
- self._glossary_maker = JitenonKokugoGlossary()
-
-
-class JitenonYojiTerminator(JitenonTerminator):
- def __init__(self, target):
- super().__init__(target)
- self._glossary_maker = JitenonYojiGlossary()
-
-
-class JitenonKotowazaTerminator(JitenonTerminator):
- def __init__(self, target):
- super().__init__(target)
- self._glossary_maker = JitenonKotowazaGlossary()
diff --git a/bot/mdict/terms/jitenon_kokugo.py b/bot/mdict/terms/jitenon_kokugo.py
new file mode 100644
index 0000000..2a44b7b
--- /dev/null
+++ b/bot/mdict/terms/jitenon_kokugo.py
@@ -0,0 +1,8 @@
+from bot.mdict.terms.base.jitenon import JitenonTerminator
+from bot.mdict.glossary.jitenon import JitenonKokugoGlossary
+
+
+class Terminator(JitenonTerminator):
+ def __init__(self, target):
+ super().__init__(target)
+ self._glossary_maker = JitenonKokugoGlossary()
diff --git a/bot/mdict/terms/jitenon_kotowaza.py b/bot/mdict/terms/jitenon_kotowaza.py
new file mode 100644
index 0000000..3492a49
--- /dev/null
+++ b/bot/mdict/terms/jitenon_kotowaza.py
@@ -0,0 +1,8 @@
+from bot.mdict.terms.base.jitenon import JitenonTerminator
+from bot.mdict.glossary.jitenon import JitenonKotowazaGlossary
+
+
+class Terminator(JitenonTerminator):
+ def __init__(self, target):
+ super().__init__(target)
+ self._glossary_maker = JitenonKotowazaGlossary()
diff --git a/bot/mdict/terms/jitenon_yoji.py b/bot/mdict/terms/jitenon_yoji.py
new file mode 100644
index 0000000..a4175a1
--- /dev/null
+++ b/bot/mdict/terms/jitenon_yoji.py
@@ -0,0 +1,8 @@
+from bot.mdict.terms.base.jitenon import JitenonTerminator
+from bot.mdict.glossary.jitenon import JitenonYojiGlossary
+
+
+class Terminator(JitenonTerminator):
+ def __init__(self, target):
+ super().__init__(target)
+ self._glossary_maker = JitenonYojiGlossary()
diff --git a/bot/mdict/terms/sankoku8.py b/bot/mdict/terms/sankoku8.py
index 5c1bfb7..71a3b8f 100644
--- a/bot/mdict/terms/sankoku8.py
+++ b/bot/mdict/terms/sankoku8.py
@@ -1,8 +1,8 @@
-from bot.mdict.terms.terminator import Terminator
+from bot.mdict.terms.base.terminator import BaseTerminator
from bot.mdict.glossary.sankoku8 import make_glossary
-class Sankoku8Terminator(Terminator):
+class Terminator(BaseTerminator):
def _glossary(self, entry):
if entry.entry_id in self._glossary_cache:
return self._glossary_cache[entry.entry_id]
diff --git a/bot/mdict/terms/smk8.py b/bot/mdict/terms/smk8.py
index 22275d5..ef2b7a2 100644
--- a/bot/mdict/terms/smk8.py
+++ b/bot/mdict/terms/smk8.py
@@ -1,8 +1,8 @@
-from bot.mdict.terms.terminator import Terminator
+from bot.mdict.terms.base.terminator import BaseTerminator
from bot.mdict.glossary.smk8 import make_glossary
-class Smk8Terminator(Terminator):
+class Terminator(BaseTerminator):
def _glossary(self, entry):
if entry.entry_id in self._glossary_cache:
return self._glossary_cache[entry.entry_id]
diff --git a/bot/time.py b/bot/time.py
new file mode 100644
index 0000000..f8dae94
--- /dev/null
+++ b/bot/time.py
@@ -0,0 +1,5 @@
+import time
+
+
+def timestamp():
+ return time.strftime('%X')
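
Note: `time.strftime('%X')` formats the current local time in the locale's preferred representation (e.g. `14:05:09` in the C locale); this is the timestamp prefixed to the console messages introduced throughout this patch.
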
diff --git a/bot/yomichan/exporters/export.py b/bot/yomichan/exporters/base/exporter.py
similarity index 69%
rename from bot/yomichan/exporters/export.py
rename to bot/yomichan/exporters/base/exporter.py
index d348fed..5e4e870 100644
--- a/bot/yomichan/exporters/export.py
+++ b/bot/yomichan/exporters/base/exporter.py
@@ -1,24 +1,23 @@
-# pylint: disable=too-few-public-methods
-
import json
import os
import shutil
import copy
from pathlib import Path
-from datetime import datetime
from abc import ABC, abstractmethod
-from platformdirs import user_documents_dir, user_cache_dir
import fastjsonschema
+from platformdirs import user_documents_dir, user_cache_dir
+
+from bot.time import timestamp
from bot.data import load_yomichan_metadata
-from bot.yomichan.terms.factory import new_terminator
from bot.data import load_yomichan_term_schema
+from bot.factory import new_yomichan_terminator
-class Exporter(ABC):
+class BaseExporter(ABC):
def __init__(self, target):
self._target = target
- self._terminator = new_terminator(target)
+ self._terminator = new_yomichan_terminator(target)
self._build_dir = None
self._terms_per_file = 2000
@@ -36,18 +35,18 @@ class Exporter(ABC):
@abstractmethod
def _get_revision(self, entries):
- pass
+ raise NotImplementedError
@abstractmethod
def _get_attribution(self, entries):
- pass
+ raise NotImplementedError
def _get_build_dir(self):
if self._build_dir is not None:
return self._build_dir
cache_dir = user_cache_dir("jitenbot")
build_directory = os.path.join(cache_dir, "yomichan_build")
- print(f"Initializing build directory `{build_directory}`")
+ print(f"{timestamp()} Initializing build directory `{build_directory}`")
if Path(build_directory).is_dir():
shutil.rmtree(build_directory)
os.makedirs(build_directory)
@@ -66,8 +65,9 @@ class Exporter(ABC):
build_dir = self._get_build_dir()
build_img_dir = os.path.join(build_dir, self._target.value)
if image_dir is not None:
- print("Copying media files to build directory...")
+ print(f"{timestamp()} Copying media files to build directory...")
shutil.copytree(image_dir, build_img_dir)
+ print(f"{timestamp()} Finished copying files")
else:
os.makedirs(build_img_dir)
self._terminator.set_image_dir(build_img_dir)
@@ -76,7 +76,7 @@ class Exporter(ABC):
terms = []
entries_len = len(entries)
for idx, entry in enumerate(entries):
- update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}"
+ update = f"\tCreating Yomichan terms for entry {idx+1}/{entries_len}"
print(update, end='\r', flush=True)
new_terms = self._terminator.make_terms(entry)
for term in new_terms:
@@ -85,7 +85,7 @@ class Exporter(ABC):
return terms
def __validate_terms(self, terms):
- print("Making a copy of term data for validation...")
+ print(f"{timestamp()} Making a copy of term data for validation...")
terms_copy = copy.deepcopy(terms) # because validator will alter data!
term_count = len(terms_copy)
log_dir = self.__get_invalid_term_dir()
@@ -93,7 +93,7 @@ class Exporter(ABC):
validator = fastjsonschema.compile(schema)
failure_count = 0
for idx, term in enumerate(terms_copy):
- update = f"Validating term {idx+1}/{term_count}"
+ update = f"\tValidating term {idx+1}/{term_count}"
print(update, end='\r', flush=True)
try:
validator([term])
@@ -102,9 +102,9 @@ class Exporter(ABC):
term_file = os.path.join(log_dir, f"{idx}.json")
with open(term_file, "w", encoding='utf8') as f:
json.dump([term], f, indent=4, ensure_ascii=False)
- print(f"\nFinished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
+ print(f"\n{timestamp()} Finished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
if failure_count > 0:
- print(f"Invalid terms saved to `{log_dir}` for debugging")
+ print(f"{timestamp()} Invalid terms saved to `{log_dir}` for debugging")
def __make_dictionary(self, terms, index, tags):
self.__write_term_banks(terms)
@@ -114,14 +114,14 @@ class Exporter(ABC):
self.__rm_build_dir()
def __write_term_banks(self, terms):
- print(f"Exporting {len(terms)} JSON terms")
+ print(f"{timestamp()} Exporting {len(terms)} JSON terms")
build_dir = self._get_build_dir()
max_i = int(len(terms) / self._terms_per_file) + 1
for i in range(max_i):
+ update = f"\tWriting terms to term bank {i+1}/{max_i}"
+ print(update, end='\r', flush=True)
start = self._terms_per_file * i
end = self._terms_per_file * (i + 1)
- update = f"Writing terms to term banks {start} - {end}"
- print(update, end='\r', flush=True)
term_file = os.path.join(build_dir, f"term_bank_{i+1}.json")
with open(term_file, "w", encoding='utf8') as f:
json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
@@ -142,8 +142,8 @@ class Exporter(ABC):
json.dump(tags, f, indent=4, ensure_ascii=False)
def __write_archive(self, filename):
- print("Archiving data to ZIP file...")
archive_format = "zip"
+ print(f"{timestamp()} Archiving data to {archive_format.upper()} file...")
out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
if not Path(out_dir).is_dir():
os.makedirs(out_dir)
@@ -154,58 +154,8 @@ class Exporter(ABC):
base_filename = os.path.join(out_dir, filename)
build_dir = self._get_build_dir()
shutil.make_archive(base_filename, archive_format, build_dir)
- print(f"Dictionary file saved to {out_filepath}")
+ print(f"{timestamp()} Dictionary file saved to `{out_filepath}`")
def __rm_build_dir(self):
build_dir = self._get_build_dir()
shutil.rmtree(build_dir)
-
-
-class _JitenonExporter(Exporter):
- def _get_revision(self, entries):
- modified_date = None
- for entry in entries:
- if modified_date is None or entry.modified_date > modified_date:
- modified_date = entry.modified_date
- revision = f"{self._target.value};{modified_date}"
- return revision
-
- def _get_attribution(self, entries):
- modified_date = None
- for entry in entries:
- if modified_date is None or entry.modified_date > modified_date:
- attribution = entry.attribution
- return attribution
-
-
-class JitenonKokugoExporter(_JitenonExporter):
- pass
-
-
-class JitenonYojiExporter(_JitenonExporter):
- pass
-
-
-class JitenonKotowazaExporter(_JitenonExporter):
- pass
-
-
-class _MonokakidoExporter(Exporter):
- def _get_revision(self, entries):
- timestamp = datetime.now().strftime("%Y-%m-%d")
- return f"{self._target.value};{timestamp}"
-
-
-class Smk8Exporter(_MonokakidoExporter):
- def _get_attribution(self, entries):
- return "© Sanseido Co., LTD. 2020"
-
-
-class Daijirin2Exporter(_MonokakidoExporter):
- def _get_attribution(self, entries):
- return "© Sanseido Co., LTD. 2019"
-
-
-class Sankoku8Exporter(_MonokakidoExporter):
- def _get_attribution(self, entries):
- return "© Sanseido Co., LTD. 2021"
diff --git a/bot/yomichan/exporters/base/jitenon.py b/bot/yomichan/exporters/base/jitenon.py
new file mode 100644
index 0000000..80f0175
--- /dev/null
+++ b/bot/yomichan/exporters/base/jitenon.py
@@ -0,0 +1,18 @@
+from bot.yomichan.exporters.base.exporter import BaseExporter
+
+
+class JitenonExporter(BaseExporter):
+ def _get_revision(self, entries):
+ modified_date = None
+ for entry in entries:
+ if modified_date is None or entry.modified_date > modified_date:
+ modified_date = entry.modified_date
+ revision = f"{self._target.value};{modified_date}"
+ return revision
+
+ def _get_attribution(self, entries):
+ modified_date = None
+ for entry in entries:
+ if modified_date is None or entry.modified_date > modified_date:
+ attribution = entry.attribution
+ return attribution
diff --git a/bot/yomichan/exporters/base/monokakido.py b/bot/yomichan/exporters/base/monokakido.py
new file mode 100644
index 0000000..5c5f3fa
--- /dev/null
+++ b/bot/yomichan/exporters/base/monokakido.py
@@ -0,0 +1,8 @@
+from datetime import datetime
+from bot.yomichan.exporters.base.exporter import BaseExporter
+
+
+class MonokakidoExporter(BaseExporter):
+ def _get_revision(self, entries):
+ timestamp = datetime.now().strftime("%Y-%m-%d")
+ return f"{self._target.value};{timestamp}"
diff --git a/bot/yomichan/exporters/daijirin2.py b/bot/yomichan/exporters/daijirin2.py
new file mode 100644
index 0000000..7115342
--- /dev/null
+++ b/bot/yomichan/exporters/daijirin2.py
@@ -0,0 +1,6 @@
+from bot.yomichan.exporters.base.monokakido import MonokakidoExporter
+
+
+class Exporter(MonokakidoExporter):
+ def _get_attribution(self, entries):
+ return "© Sanseido Co., LTD. 2019"
diff --git a/bot/yomichan/exporters/factory.py b/bot/yomichan/exporters/factory.py
deleted file mode 100644
index afed7fd..0000000
--- a/bot/yomichan/exporters/factory.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from bot.targets import Targets
-
-from bot.yomichan.exporters.export import JitenonKokugoExporter
-from bot.yomichan.exporters.export import JitenonYojiExporter
-from bot.yomichan.exporters.export import JitenonKotowazaExporter
-from bot.yomichan.exporters.export import Smk8Exporter
-from bot.yomichan.exporters.export import Daijirin2Exporter
-from bot.yomichan.exporters.export import Sankoku8Exporter
-
-
-def new_yomi_exporter(target):
- exporter_map = {
- Targets.JITENON_KOKUGO: JitenonKokugoExporter,
- Targets.JITENON_YOJI: JitenonYojiExporter,
- Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter,
- Targets.SMK8: Smk8Exporter,
- Targets.DAIJIRIN2: Daijirin2Exporter,
- Targets.SANKOKU8: Sankoku8Exporter,
- }
- return exporter_map[target](target)
diff --git a/bot/yomichan/exporters/jitenon_kokugo.py b/bot/yomichan/exporters/jitenon_kokugo.py
new file mode 100644
index 0000000..0a3ef7a
--- /dev/null
+++ b/bot/yomichan/exporters/jitenon_kokugo.py
@@ -0,0 +1,5 @@
+from bot.yomichan.exporters.base.jitenon import JitenonExporter
+
+
+class Exporter(JitenonExporter):
+ pass
diff --git a/bot/yomichan/exporters/jitenon_kotowaza.py b/bot/yomichan/exporters/jitenon_kotowaza.py
new file mode 100644
index 0000000..0a3ef7a
--- /dev/null
+++ b/bot/yomichan/exporters/jitenon_kotowaza.py
@@ -0,0 +1,5 @@
+from bot.yomichan.exporters.base.jitenon import JitenonExporter
+
+
+class Exporter(JitenonExporter):
+ pass
diff --git a/bot/yomichan/exporters/jitenon_yoji.py b/bot/yomichan/exporters/jitenon_yoji.py
new file mode 100644
index 0000000..0a3ef7a
--- /dev/null
+++ b/bot/yomichan/exporters/jitenon_yoji.py
@@ -0,0 +1,5 @@
+from bot.yomichan.exporters.base.jitenon import JitenonExporter
+
+
+class Exporter(JitenonExporter):
+ pass
diff --git a/bot/yomichan/exporters/sankoku8.py b/bot/yomichan/exporters/sankoku8.py
new file mode 100644
index 0000000..b33c389
--- /dev/null
+++ b/bot/yomichan/exporters/sankoku8.py
@@ -0,0 +1,6 @@
+from bot.yomichan.exporters.base.monokakido import MonokakidoExporter
+
+
+class Exporter(MonokakidoExporter):
+ def _get_attribution(self, entries):
+ return "© Sanseido Co., LTD. 2021"
diff --git a/bot/yomichan/exporters/smk8.py b/bot/yomichan/exporters/smk8.py
new file mode 100644
index 0000000..7f71aa3
--- /dev/null
+++ b/bot/yomichan/exporters/smk8.py
@@ -0,0 +1,6 @@
+from bot.yomichan.exporters.base.monokakido import MonokakidoExporter
+
+
+class Exporter(MonokakidoExporter):
+ def _get_attribution(self, entries):
+ return "© Sanseido Co., LTD. 2020"
diff --git a/bot/yomichan/glossary/daijirin2.py b/bot/yomichan/glossary/daijirin2.py
index 0adaa96..178de00 100644
--- a/bot/yomichan/glossary/daijirin2.py
+++ b/bot/yomichan/glossary/daijirin2.py
@@ -1,9 +1,10 @@
import re
import os
-from bs4 import BeautifulSoup
from functools import cache
from pathlib import Path
+from bs4 import BeautifulSoup
+
import bot.yomichan.glossary.icons as Icons
from bot.soup import delete_soup_nodes
from bot.data import load_yomichan_name_conversion
diff --git a/bot/yomichan/terms/base/jitenon.py b/bot/yomichan/terms/base/jitenon.py
new file mode 100644
index 0000000..d0d5388
--- /dev/null
+++ b/bot/yomichan/terms/base/jitenon.py
@@ -0,0 +1,26 @@
+from bot.yomichan.terms.base.terminator import BaseTerminator
+
+
+class JitenonTerminator(BaseTerminator):
+ def __init__(self, target):
+ super().__init__(target)
+ self._glossary_maker = None
+
+ def _definition_tags(self, entry):
+ return None
+
+ def _glossary(self, entry):
+ if entry.entry_id in self._glossary_cache:
+ return self._glossary_cache[entry.entry_id]
+ glossary = self._glossary_maker.make_glossary(entry, self._image_dir)
+ self._glossary_cache[entry.entry_id] = glossary
+ return glossary
+
+ def _sequence(self, entry):
+ return entry.entry_id
+
+ def _link_glossary_parameters(self, entry):
+ return []
+
+ def _subentry_lists(self, entry):
+ return []
diff --git a/bot/yomichan/terms/terminator.py b/bot/yomichan/terms/base/terminator.py
similarity index 91%
rename from bot/yomichan/terms/terminator.py
rename to bot/yomichan/terms/base/terminator.py
index dd0c02d..f57c4cc 100644
--- a/bot/yomichan/terms/terminator.py
+++ b/bot/yomichan/terms/base/terminator.py
@@ -2,7 +2,7 @@ from abc import abstractmethod, ABC
from bot.data import load_yomichan_inflection_categories
-class Terminator(ABC):
+class BaseTerminator(ABC):
def __init__(self, target):
self._target = target
self._glossary_cache = {}
@@ -66,28 +66,28 @@ class Terminator(ABC):
@abstractmethod
def _definition_tags(self, entry):
- pass
+ raise NotImplementedError
@abstractmethod
def _inflection_rules(self, entry, expression):
- pass
+ raise NotImplementedError
@abstractmethod
def _glossary(self, entry):
- pass
+ raise NotImplementedError
@abstractmethod
def _sequence(self, entry):
- pass
+ raise NotImplementedError
@abstractmethod
def _term_tags(self, entry):
- pass
+ raise NotImplementedError
@abstractmethod
def _link_glossary_parameters(self, entry):
- pass
+ raise NotImplementedError
@abstractmethod
def _subentry_lists(self, entry):
- pass
+ raise NotImplementedError
diff --git a/bot/yomichan/terms/daijirin2.py b/bot/yomichan/terms/daijirin2.py
index 10aaa76..7cf06fb 100644
--- a/bot/yomichan/terms/daijirin2.py
+++ b/bot/yomichan/terms/daijirin2.py
@@ -1,14 +1,10 @@
-from bot.entries.daijirin2 import Daijirin2PhraseEntry as PhraseEntry
-
-from bot.yomichan.terms.terminator import Terminator
+from bot.entries.daijirin2.phrase_entry import PhraseEntry
+from bot.yomichan.terms.base.terminator import BaseTerminator
from bot.yomichan.glossary.daijirin2 import make_glossary
from bot.yomichan.grammar import sudachi_rules, tags_to_rules
-class Daijirin2Terminator(Terminator):
- def __init__(self, target):
- super().__init__(target)
-
+class Terminator(BaseTerminator):
def _definition_tags(self, entry):
return ""
diff --git a/bot/yomichan/terms/factory.py b/bot/yomichan/terms/factory.py
deleted file mode 100644
index 8c596cb..0000000
--- a/bot/yomichan/terms/factory.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from bot.targets import Targets
-
-from bot.yomichan.terms.jitenon import JitenonKokugoTerminator
-from bot.yomichan.terms.jitenon import JitenonYojiTerminator
-from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
-from bot.yomichan.terms.smk8 import Smk8Terminator
-from bot.yomichan.terms.daijirin2 import Daijirin2Terminator
-from bot.yomichan.terms.sankoku8 import Sankoku8Terminator
-
-
-def new_terminator(target):
- terminator_map = {
- Targets.JITENON_KOKUGO: JitenonKokugoTerminator,
- Targets.JITENON_YOJI: JitenonYojiTerminator,
- Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator,
- Targets.SMK8: Smk8Terminator,
- Targets.DAIJIRIN2: Daijirin2Terminator,
- Targets.SANKOKU8: Sankoku8Terminator,
- }
- return terminator_map[target](target)
diff --git a/bot/yomichan/terms/jitenon.py b/bot/yomichan/terms/jitenon.py
deleted file mode 100644
index 66bbed7..0000000
--- a/bot/yomichan/terms/jitenon.py
+++ /dev/null
@@ -1,68 +0,0 @@
-from bot.yomichan.grammar import sudachi_rules
-from bot.yomichan.terms.terminator import Terminator
-
-from bot.yomichan.glossary.jitenon import JitenonKokugoGlossary
-from bot.yomichan.glossary.jitenon import JitenonYojiGlossary
-from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
-
-
-class JitenonTerminator(Terminator):
- def __init__(self, target):
- super().__init__(target)
- self._glossary_maker = None
-
- def _definition_tags(self, entry):
- return None
-
- def _glossary(self, entry):
- if entry.entry_id in self._glossary_cache:
- return self._glossary_cache[entry.entry_id]
- glossary = self._glossary_maker.make_glossary(entry, self._image_dir)
- self._glossary_cache[entry.entry_id] = glossary
- return glossary
-
- def _sequence(self, entry):
- return entry.entry_id
-
- def _link_glossary_parameters(self, entry):
- return []
-
- def _subentry_lists(self, entry):
- return []
-
-
-class JitenonKokugoTerminator(JitenonTerminator):
- def __init__(self, target):
- super().__init__(target)
- self._glossary_maker = JitenonKokugoGlossary()
-
- def _inflection_rules(self, entry, expression):
- return sudachi_rules(expression)
-
- def _term_tags(self, entry):
- return ""
-
-
-class JitenonYojiTerminator(JitenonTerminator):
- def __init__(self, target):
- super().__init__(target)
- self._glossary_maker = JitenonYojiGlossary()
-
- def _inflection_rules(self, entry, expression):
- return ""
-
- def _term_tags(self, entry):
- tags = entry.kanken_level.split("/")
- return " ".join(tags)
-
-
-class JitenonKotowazaTerminator(JitenonTerminator):
- def __init__(self, target):
- super().__init__(target)
- self._glossary_maker = JitenonKotowazaGlossary()
-
- def _inflection_rules(self, entry, expression):
- return sudachi_rules(expression)
-
- def _term_tags(self, entry):
- return ""
diff --git a/bot/yomichan/terms/jitenon_kokugo.py b/bot/yomichan/terms/jitenon_kokugo.py
new file mode 100644
index 0000000..3e33b77
--- /dev/null
+++ b/bot/yomichan/terms/jitenon_kokugo.py
@@ -0,0 +1,15 @@
+from bot.yomichan.grammar import sudachi_rules
+from bot.yomichan.glossary.jitenon import JitenonKokugoGlossary
+from bot.yomichan.terms.base.jitenon import JitenonTerminator
+
+
+class Terminator(JitenonTerminator):
+ def __init__(self, target):
+ super().__init__(target)
+ self._glossary_maker = JitenonKokugoGlossary()
+
+ def _inflection_rules(self, entry, expression):
+ return sudachi_rules(expression)
+
+ def _term_tags(self, entry):
+ return ""
diff --git a/bot/yomichan/terms/jitenon_kotowaza.py b/bot/yomichan/terms/jitenon_kotowaza.py
new file mode 100644
index 0000000..a0651b9
--- /dev/null
+++ b/bot/yomichan/terms/jitenon_kotowaza.py
@@ -0,0 +1,15 @@
+from bot.yomichan.grammar import sudachi_rules
+from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
+from bot.yomichan.terms.base.jitenon import JitenonTerminator
+
+
+class Terminator(JitenonTerminator):
+ def __init__(self, target):
+ super().__init__(target)
+ self._glossary_maker = JitenonKotowazaGlossary()
+
+ def _inflection_rules(self, entry, expression):
+ return sudachi_rules(expression)
+
+ def _term_tags(self, entry):
+ return ""
diff --git a/bot/yomichan/terms/jitenon_yoji.py b/bot/yomichan/terms/jitenon_yoji.py
new file mode 100644
index 0000000..5087539
--- /dev/null
+++ b/bot/yomichan/terms/jitenon_yoji.py
@@ -0,0 +1,15 @@
+from bot.yomichan.glossary.jitenon import JitenonYojiGlossary
+from bot.yomichan.terms.base.jitenon import JitenonTerminator
+
+
+class Terminator(JitenonTerminator):
+ def __init__(self, target):
+ super().__init__(target)
+ self._glossary_maker = JitenonYojiGlossary()
+
+ def _inflection_rules(self, entry, expression):
+ return ""
+
+ def _term_tags(self, entry):
+ tags = entry.kanken_level.split("/")
+ return " ".join(tags)
diff --git a/bot/yomichan/terms/sankoku8.py b/bot/yomichan/terms/sankoku8.py
index 613f3bb..d6e6afd 100644
--- a/bot/yomichan/terms/sankoku8.py
+++ b/bot/yomichan/terms/sankoku8.py
@@ -1,14 +1,10 @@
-from bot.entries.sankoku8 import Sankoku8PhraseEntry as PhraseEntry
-
-from bot.yomichan.terms.terminator import Terminator
+from bot.entries.sankoku8.phrase_entry import PhraseEntry
+from bot.yomichan.terms.base.terminator import BaseTerminator
from bot.yomichan.glossary.sankoku8 import make_glossary
from bot.yomichan.grammar import sudachi_rules, tags_to_rules
-class Sankoku8Terminator(Terminator):
- def __init__(self, target):
- super().__init__(target)
-
+class Terminator(BaseTerminator):
def _definition_tags(self, entry):
return ""
diff --git a/bot/yomichan/terms/smk8.py b/bot/yomichan/terms/smk8.py
index d1e3ca7..9e85c17 100644
--- a/bot/yomichan/terms/smk8.py
+++ b/bot/yomichan/terms/smk8.py
@@ -1,12 +1,11 @@
-from bot.entries.smk8 import Smk8KanjiEntry as KanjiEntry
-from bot.entries.smk8 import Smk8PhraseEntry as PhraseEntry
-
-from bot.yomichan.terms.terminator import Terminator
+from bot.entries.smk8.kanji_entry import KanjiEntry
+from bot.entries.smk8.phrase_entry import PhraseEntry
+from bot.yomichan.terms.base.terminator import BaseTerminator
from bot.yomichan.glossary.smk8 import make_glossary
from bot.yomichan.grammar import sudachi_rules, tags_to_rules
-class Smk8Terminator(Terminator):
+class Terminator(BaseTerminator):
def __init__(self, target):
super().__init__(target)
diff --git a/data/entries/variant_kanji.csv b/data/entries/variant_kanji.csv
index 849eec3..0272164 100644
--- a/data/entries/variant_kanji.csv
+++ b/data/entries/variant_kanji.csv
@@ -1,65 +1,61 @@
-亙,亘
-俠,侠
-俱,倶
-儘,侭
-凜,凛
-剝,剥
+𠮟,叱
吞,呑
+靭,靱
+臈,﨟
啞,唖
-噓,嘘
嚙,噛
-囊,嚢
-塡,填
-壺,壷
屛,屏
-屢,屡
幷,并
彎,弯
搔,掻
-摑,掴
攪,撹
-曾,曽
枡,桝
-檜,桧
-檮,梼
-潑,溌
-濤,涛
濾,沪
-瀆,涜
-灌,潅
-焰,焔
-瘦,痩
-禰,祢
-禱,祷
-穎,頴
-竈,竃
-簞,箪
-籠,篭
繡,繍
-繫,繋
-萊,莱
蔣,蒋
-藪,薮
-蘆,芦
-蟬,蝉
-蠅,蝿
蠟,蝋
-蠣,蛎
-賤,賎
-軀,躯
-邇,迩
醬,醤
-醱,醗
-靱,靭
-頰,頬
-頸,頚
-顚,顛
-驒,騨
-鰺,鯵
-鶯,鴬
+穎,頴
鷗,鴎
鹼,鹸
麴,麹
-麵,麺
-﨟,臈
-𠮟,叱
+俠,侠
+俱,倶
+剝,剥
+噓,嘘
+囊,嚢
+塡,填
+屢,屡
+摑,掴
+瀆,涜
+潑,溌
+焰,焔
+簞,箪
+繫,繋
+萊,莱
+蟬,蝉
+軀,躯
+醱,醗
+頰,頬
+顚,顛
+驒,騨
+姸,妍
+攢,攅
+𣜜,杤
+檔,档
+槶,椢
+櫳,槞
+纊,絋
+纘,纉
+隯,陦
+筓,笄
+逬,迸
+腁,胼
+騈,駢
+拋,抛
+篡,簒
+檜,桧
+禰,祢
+禱,祷
+蘆,芦
+凜,凛
\ No newline at end of file
diff --git a/jitenbot.py b/jitenbot.py
index da44905..f0a2719 100644
--- a/jitenbot.py
+++ b/jitenbot.py
@@ -21,7 +21,7 @@ import sys
import argparse
import subprocess
from bot.targets import Targets
-from bot.crawlers.factory import new_crawler
+from bot.factory import new_crawler
def filename(f):
diff --git a/run_all.sh b/run_all.sh
index 706a911..9dcdfda 100755
--- a/run_all.sh
+++ b/run_all.sh
@@ -1,5 +1,7 @@
#!/bin/sh
+export PYTHONPYCACHEPREFIX=/tmp/pycache
+
python -m unittest discover -s tests
python jitenbot.py jitenon-kokugo
diff --git a/tests/test_daijirin_phrases.py b/tests/test_daijirin_phrases.py
new file mode 100644
index 0000000..3ab02dd
--- /dev/null
+++ b/tests/test_daijirin_phrases.py
@@ -0,0 +1,21 @@
+import unittest
+from bot.entries.daijirin2.phrase_entry import parse_phrase
+
+
+class TestDaijirin2PhraseParse(unittest.TestCase):
+ def test1(self):
+ text = "同じ穴の=狢(=狐・狸)"
+ exps = parse_phrase(text)
+ self.assertEqual(len(exps), 3)
+ self.assertIn("同じ穴の狢", exps)
+ self.assertIn("同じ穴の狐", exps)
+ self.assertIn("同じ穴の狸", exps)
+
+ def test2(self):
+ text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥"
+ exps = parse_phrase(text)
+ self.assertEqual(len(exps), 4)
+ self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps)
+ self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps)
+ self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps)
+ self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps)
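
The two cases above pin down the `=A(=B・C)` notation: the word before the parentheses may be swapped for any listed alternative, and multiple groups multiply out. A rough sketch of that expansion; the real `parse_phrase` in `bot.entries.daijirin2.phrase_entry` may differ, e.g. in exactly which bracket and equals characters it accepts:

```python
import re
from itertools import product


def expand_alternatives(text):
    # Collect each "=A(=B・C)" group, leave a placeholder behind, then
    # take the cartesian product of all groups.
    groups = []

    def collect(match):
        groups.append([match.group(1)] + match.group(2).split("・"))
        return "{}"

    template = re.sub(r"[=＝](.+?)[(（][=＝](.+?)[)）]", collect, text)
    return [template.format(*combo) for combo in product(*groups)]


print(expand_alternatives("同じ穴の=狢(=狐・狸)"))
# -> ['同じ穴の狢', '同じ穴の狐', '同じ穴の狸']
```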
diff --git a/tests/test_expressions.py b/tests/test_expressions.py
index b2ebc26..9091dda 100644
--- a/tests/test_expressions.py
+++ b/tests/test_expressions.py
@@ -1,5 +1,5 @@
import unittest
-import bot.entries.expressions as Expressions
+import bot.entries.base.expressions as Expressions
class TestExpressions(unittest.TestCase):
@@ -34,8 +34,8 @@ class TestExpressions(unittest.TestCase):
self.assertIn("凶々しい", exps)
self.assertIn("凶凶しい", exps)
- def test_add_variant_kanji(self):
- exps = ["剝く", "掴む", "摑む"]
+ def test_add_variant_kanji1(self):
+ exps = ["剥く", "摑む"]
Expressions.add_variant_kanji(exps)
self.assertEqual(len(exps), 4)
self.assertIn("剥く", exps)
@@ -44,6 +44,15 @@ class TestExpressions(unittest.TestCase):
self.assertIn("摑む", exps)
def test_add_variant_kanji2(self):
+ exps = ["剝く", "掴む", "摑む"]
+ Expressions.add_variant_kanji(exps)
+ self.assertEqual(len(exps), 4)
+ self.assertIn("剥く", exps)
+ self.assertIn("剝く", exps)
+ self.assertIn("掴む", exps)
+ self.assertIn("摑む", exps)
+
+ def test_add_variant_kanji3(self):
exps = ["剝摑"]
Expressions.add_variant_kanji(exps)
self.assertEqual(len(exps), 4)
@@ -52,6 +61,15 @@ class TestExpressions(unittest.TestCase):
self.assertIn("剥掴", exps)
self.assertIn("剥摑", exps)
+ def test_add_variant_kanji4(self):
+ exps = ["剥掴"]
+ Expressions.add_variant_kanji(exps)
+ self.assertEqual(len(exps), 4)
+ self.assertIn("剝摑", exps)
+ self.assertIn("剝掴", exps)
+ self.assertIn("剥掴", exps)
+ self.assertIn("剥摑", exps)
+
def test_expand_abbreviation(self):
text = "有(り)合(わ)せ"
abbrs = Expressions.expand_abbreviation(text)
@@ -69,28 +87,3 @@ class TestExpressions(unittest.TestCase):
self.assertIn("有合わせ", abbrs)
self.assertIn("有り合せ", abbrs)
self.assertIn("有合せ", abbrs)
-
- def test_smk_expand_alternatives(self):
- text = "△金(時間・暇)に飽かして"
- exps = Expressions.expand_smk_alternatives(text)
- self.assertEqual(len(exps), 3)
- self.assertIn("金に飽かして", exps)
- self.assertIn("時間に飽かして", exps)
- self.assertIn("暇に飽かして", exps)
-
- def test_daijirin_expand_alternatives(self):
- text = "同じ穴の=狢(=狐・狸)"
- exps = Expressions.expand_daijirin_alternatives(text)
- self.assertEqual(len(exps), 3)
- self.assertIn("同じ穴の狢", exps)
- self.assertIn("同じ穴の狐", exps)
- self.assertIn("同じ穴の狸", exps)
-
- def test_daijirin_expand_alternatives2(self):
- text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥"
- exps = Expressions.expand_daijirin_alternatives(text)
- self.assertEqual(len(exps), 4)
- self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps)
- self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps)
- self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps)
- self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps)
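
The four variant-kanji tests pin down in-place, bidirectional expansion: starting from either spelling, every combination of variant and standard forms ends up in the list exactly once. A self-contained sketch of that behaviour, with the variant table inlined rather than loaded from `variant_kanji.csv` (the real helper in `bot.entries.base.expressions` may be implemented differently):

```python
from itertools import product

# Interchangeable pairs, as in variant_kanji.csv (excerpt).
VARIANTS = {"剝": "剥", "摑": "掴"}


def add_variant_kanji(expressions):
    # Map every character of each pair to the full set of its spellings,
    # then append each unseen combination to the list in place.
    table = {}
    for a, b in VARIANTS.items():
        table.setdefault(a, {a}).add(b)
        table.setdefault(b, {b}).add(a)
    seen = set(expressions)
    for exp in list(expressions):
        choices = [sorted(table.get(ch, {ch})) for ch in exp]
        for combo in product(*choices):
            variant = "".join(combo)
            if variant not in seen:
                seen.add(variant)
                expressions.append(variant)


exps = ["剝摑"]
add_variant_kanji(exps)
print(len(exps))  # -> 4, as in test_add_variant_kanji3
```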
diff --git a/tests/test_sankoku_phrases.py b/tests/test_sankoku_phrases.py
index 7faf289..c3894e9 100644
--- a/tests/test_sankoku_phrases.py
+++ b/tests/test_sankoku_phrases.py
@@ -1,16 +1,16 @@
import unittest
-from bot.entries.sankoku8 import parse_hyouki_pattern
+from bot.entries.sankoku8.parse import parse_hyouki_pattern
-class TestSankokuPhrases(unittest.TestCase):
- def test_sankoku_phrases1(self):
+class TestSankoku8PhraseParse(unittest.TestCase):
+ def test1(self):
pattern = '耳にたこ(ができる)'
exps = parse_hyouki_pattern(pattern)
self.assertEqual(len(exps), 2)
self.assertIn("耳にたこ", exps)
self.assertIn("耳にたこができる", exps)
- def test_sankoku_phrases2(self):
+ def test2(self):
pattern = '一斑を〈見て/もって〉全豹を〈卜す/推す〉'
exps = parse_hyouki_pattern(pattern)
self.assertEqual(len(exps), 4)
@@ -19,14 +19,14 @@ class TestSankokuPhrases(unittest.TestCase):
self.assertIn("一斑をもって全豹を卜す", exps)
self.assertIn("一斑をもって全豹を推す", exps)
- def test_sankoku_phrases3(self):
+ def test3(self):
pattern = '{かじ・舵}を切る'
exps = parse_hyouki_pattern(pattern)
self.assertEqual(len(exps), 2)
self.assertIn("かじを切る", exps)
self.assertIn("舵を切る", exps)
- def test_sankoku_phrases4(self):
+ def test4(self):
pattern = '重箱の隅を(⦅ようじ\楊枝⦆で)〈つつく/ほじくる〉'
exps = parse_hyouki_pattern(pattern)
self.assertEqual(len(exps), 6)
@@ -37,7 +37,7 @@ class TestSankokuPhrases(unittest.TestCase):
self.assertIn("重箱の隅をようじでほじくる", exps)
self.assertIn("重箱の隅を楊枝でほじくる", exps)
- def test_sankoku_phrases5(self):
+ def test5(self):
pattern = '群盲象を〈{な・撫}でる/評する〉'
exps = parse_hyouki_pattern(pattern)
self.assertEqual(len(exps), 3)
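
These five cases cover the three bracket notations: `(…)` for optional text, and `⦅A\B⦆` / `{A・B}` / `〈A/B〉` for alternations, including nesting (test4, test5). A recursive sketch that reproduces the expected expansions; the real `parse_hyouki_pattern` in `bot.entries.sankoku8.parse` may differ, and the backslash separator inside `⦅ ⦆` is an assumption read off test4:

```python
import re


def expand_hyouki(pattern):
    # (…) marks optional text: expand both without and with it.
    m = re.search(r"[(（]([^()（）]*)[)）]", pattern)
    if m:
        without = pattern[:m.start()] + pattern[m.end():]
        within = pattern[:m.start()] + m.group(1) + pattern[m.end():]
        return sorted(set(expand_hyouki(without) + expand_hyouki(within)))
    # ⦅A\B⦆, {A・B} and 〈A/B〉 are alternations; expand the first one
    # found and recurse on each branch.
    for regex, sep in (
        (r"⦅([^⦅⦆]*)⦆", r"[\\＼]"),
        (r"\{([^{}]*)\}", "・"),
        (r"〈([^〈〉]*)〉", "/"),
    ):
        m = re.search(regex, pattern)
        if m:
            results = []
            for alt in re.split(sep, m.group(1)):
                results += expand_hyouki(
                    pattern[:m.start()] + alt + pattern[m.end():])
            return sorted(set(results))
    return [pattern]


print(expand_hyouki("{かじ・舵}を切る"))  # -> ['かじを切る', '舵を切る']
```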
diff --git a/tests/test_smk_phrases.py b/tests/test_smk_phrases.py
new file mode 100644
index 0000000..e5ce231
--- /dev/null
+++ b/tests/test_smk_phrases.py
@@ -0,0 +1,19 @@
+import unittest
+from bot.entries.smk8.phrase_entry import parse_phrase
+
+
+class TestSmk8PhraseParse(unittest.TestCase):
+ def test1(self):
+ text = "目と鼻の△先(間)"
+ exps = parse_phrase(text)
+ self.assertEqual(len(exps), 2)
+ self.assertIn("目と鼻の先", exps)
+ self.assertIn("目と鼻の間", exps)
+
+ def test2(self):
+ text = "△金(時間・暇)に飽かして"
+ exps = parse_phrase(text)
+ self.assertEqual(len(exps), 3)
+ self.assertIn("金に飽かして", exps)
+ self.assertIn("時間に飽かして", exps)
+ self.assertIn("暇に飽かして", exps)
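
The smk8 notation marks the replaceable word with `△` and lists its alternatives in the parentheses that follow. A rough sketch of the expansion these two tests exercise; the real `parse_phrase` in `bot.entries.smk8.phrase_entry` is not shown in this patch and may differ:

```python
import re
from itertools import product


def expand_smk_alternatives(text):
    # Collect each "△A(B・C)" group, leave a placeholder behind, then
    # take the cartesian product of all groups.
    groups = []

    def collect(match):
        groups.append([match.group(1)] + match.group(2).split("・"))
        return "{}"

    template = re.sub(r"△(.+?)[(（](.+?)[)）]", collect, text)
    return [template.format(*combo) for combo in product(*groups)]


print(expand_smk_alternatives("△金(時間・暇)に飽かして"))
# -> ['金に飽かして', '時間に飽かして', '暇に飽かして']
```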