Merge branch 'main' into patch-2

epistularum 2023-07-31 16:55:10 +09:00 committed by GitHub
commit 09b585c49d
92 changed files with 1514 additions and 1531 deletions

@@ -1,7 +1,7 @@
 ### Todo
 
 - [x] Add factory classes to reduce the amount of class import statements
-- [ ] Add dynamic import functionality to factory classes to reduce boilerplate
+- [x] Add dynamic import functionality to factory classes to reduce boilerplate
 - [x] Support exporting to MDict (.MDX) dictionary format
 - [x] Validate JSON schema of Yomichan terms during export
 - [ ] Add support for monokakido search keys from index files

@@ -0,0 +1,54 @@
import re
from abc import ABC, abstractmethod

from bot.factory import new_entry
from bot.factory import new_yomichan_exporter
from bot.factory import new_mdict_exporter


class BaseCrawler(ABC):
    def __init__(self, target):
        self._target = target
        self._page_map = {}
        self._entries = []
        self._page_id_pattern = None

    @abstractmethod
    def collect_pages(self, page_dir):
        raise NotImplementedError

    def read_pages(self):
        pages_len = len(self._page_map)
        items = self._page_map.items()
        for idx, (page_id, page_path) in enumerate(items):
            update = f"\tReading page {idx+1}/{pages_len}"
            print(update, end='\r', flush=True)
            entry = new_entry(self._target, page_id)
            with open(page_path, "r", encoding="utf-8") as f:
                page = f.read()
            try:
                entry.set_page(page)
            except ValueError as err:
                print(err)
                print("Try deleting and redownloading file:")
                print(f"\t{page_path}\n")
                continue
            self._entries.append(entry)
        print()

    def make_yomichan_dictionary(self, media_dir, validate):
        exporter = new_yomichan_exporter(self._target)
        exporter.export(self._entries, media_dir, validate)

    def make_mdict_dictionary(self, media_dir, icon_file):
        exporter = new_mdict_exporter(self._target)
        exporter.export(self._entries, media_dir, icon_file)

    def _parse_page_id(self, page_link):
        m = re.search(self._page_id_pattern, page_link)
        if m is None:
            return None
        page_id = int(m.group(1))
        if page_id in self._page_map:
            return None
        return page_id
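Review note: `BaseCrawler` is a template method around `collect_pages()`; a concrete crawler only needs to set `_page_id_pattern` and fill `_page_map`, after which the inherited `read_pages()` and `make_*_dictionary()` methods drive the rest of the pipeline. A minimal sketch of a conforming subclass (the class name and directory layout here are hypothetical, not part of this change):

```python
import os
from bot.crawlers.base.crawler import BaseCrawler


class ExampleCrawler(BaseCrawler):
    """Hypothetical crawler that indexes pre-downloaded HTML pages."""
    def __init__(self, target):
        super().__init__(target)
        self._page_id_pattern = r"^([0-9]+)\.html$"

    def collect_pages(self, page_dir):
        for filename in os.listdir(page_dir):
            page_id = self._parse_page_id(filename)
            if page_id is not None:
                self._page_map[page_id] = os.path.join(page_dir, filename)
```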

@@ -0,0 +1,30 @@
from bs4 import BeautifulSoup

from bot.time import timestamp
from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
from bot.crawlers.base.crawler import BaseCrawler


class JitenonCrawler(BaseCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = None

    def collect_pages(self, page_dir):
        print(f"{timestamp()} Scraping {self._gojuon_url}")
        jitenon = JitenonScraper()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            gojuon_href = gojuon_a['href']
            kana_doc, _ = jitenon.scrape(gojuon_href)
            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
            for kana_a in kana_soup.select(".word_box a", href=True):
                page_link = kana_a['href']
                page_id = self._parse_page_id(page_link)
                if page_id is None:
                    continue
                _, page_path = jitenon.scrape(page_link)
                self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
        print(f"\n{timestamp()} Found {pages_len} entry pages")

@@ -0,0 +1,20 @@
import os

from bot.time import timestamp
from bot.crawlers.base.crawler import BaseCrawler


class MonokakidoCrawler(BaseCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._page_id_pattern = r"^([0-9]+)\.xml$"

    def collect_pages(self, page_dir):
        print(f"{timestamp()} Searching for page files in `{page_dir}`")
        for pagefile in os.listdir(page_dir):
            page_id = self._parse_page_id(pagefile)
            if page_id is None or page_id == 0:
                continue
            path = os.path.join(page_dir, pagefile)
            self._page_map[page_id] = path
        pages_len = len(self._page_map)
        print(f"{timestamp()} Found {pages_len} page files for processing")

@@ -1,158 +0,0 @@
import os
import re
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup

import bot.crawlers.scraper as Scraper
from bot.entries.factory import new_entry
from bot.yomichan.exporters.factory import new_yomi_exporter
from bot.mdict.exporters.factory import new_mdict_exporter


class Crawler(ABC):
    def __init__(self, target):
        self._target = target
        self._page_map = {}
        self._entries = []
        self._page_id_pattern = None

    @abstractmethod
    def collect_pages(self, page_dir):
        pass

    def read_pages(self):
        pages_len = len(self._page_map)
        items = self._page_map.items()
        for idx, (page_id, page_path) in enumerate(items):
            update = f"Reading page {idx+1}/{pages_len}"
            print(update, end='\r', flush=True)
            entry = new_entry(self._target, page_id)
            with open(page_path, "r", encoding="utf-8") as f:
                page = f.read()
            try:
                entry.set_page(page)
            except ValueError as err:
                print(err)
                print("Try deleting and redownloading file:")
                print(f"\t{page_path}\n")
                continue
            self._entries.append(entry)
        print()

    def make_yomichan_dictionary(self, media_dir, validate):
        exporter = new_yomi_exporter(self._target)
        exporter.export(self._entries, media_dir, validate)

    def make_mdict_dictionary(self, media_dir, icon_file):
        exporter = new_mdict_exporter(self._target)
        exporter.export(self._entries, media_dir, icon_file)

    def _parse_page_id(self, page_link):
        m = re.search(self._page_id_pattern, page_link)
        if m is None:
            return None
        page_id = int(m.group(1))
        if page_id in self._page_map:
            return None
        return page_id


class JitenonKokugoCrawler(Crawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
        self._page_id_pattern = r"word/p([0-9]+)$"

    def collect_pages(self, page_dir):
        jitenon = Scraper.Jitenon()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            gojuon_href = gojuon_a['href']
            max_kana_page = 1
            current_kana_page = 1
            while current_kana_page <= max_kana_page:
                kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}")
                current_kana_page += 1
                kana_soup = BeautifulSoup(kana_doc, features="html.parser")
                page_total = kana_soup.find(class_="page_total").text
                m = re.search(r"全([0-9]+)件", page_total)
                if m:
                    max_kana_page = int(m.group(1))
                for kana_a in kana_soup.select(".word_box a", href=True):
                    page_link = kana_a['href']
                    page_id = self._parse_page_id(page_link)
                    if page_id is None:
                        continue
                    _, page_path = jitenon.scrape(page_link)
                    self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
        print(f"Finished scraping {pages_len} pages")


class _JitenonCrawler(Crawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = None

    def collect_pages(self, page_dir):
        print("Scraping jitenon.jp")
        jitenon = Scraper.Jitenon()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            gojuon_href = gojuon_a['href']
            kana_doc, _ = jitenon.scrape(gojuon_href)
            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
            for kana_a in kana_soup.select(".word_box a", href=True):
                page_link = kana_a['href']
                page_id = self._parse_page_id(page_link)
                if page_id is None:
                    continue
                _, page_path = jitenon.scrape(page_link)
                self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
        print(f"Finished scraping {pages_len} pages")


class JitenonYojiCrawler(_JitenonCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
        self._page_id_pattern = r"([0-9]+)\.html$"


class JitenonKotowazaCrawler(_JitenonCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
        self._page_id_pattern = r"([0-9]+)\.php$"


class _MonokakidoCrawler(Crawler):
    def __init__(self, target):
        super().__init__(target)
        self._page_id_pattern = r"^([0-9]+)\.xml$"

    def collect_pages(self, page_dir):
        print(f"Searching for page files in `{page_dir}`")
        for pagefile in os.listdir(page_dir):
            page_id = self._parse_page_id(pagefile)
            if page_id is None or page_id == 0:
                continue
            path = os.path.join(page_dir, pagefile)
            self._page_map[page_id] = path
        pages_len = len(self._page_map)
        print(f"Found {pages_len} page files for processing")


class Smk8Crawler(_MonokakidoCrawler):
    pass


class Daijirin2Crawler(_MonokakidoCrawler):
    pass


class Sankoku8Crawler(_MonokakidoCrawler):
    pass

@@ -0,0 +1,5 @@
from bot.crawlers.base.monokakido import MonokakidoCrawler


class Crawler(MonokakidoCrawler):
    pass

@@ -1,20 +0,0 @@
from bot.targets import Targets

from bot.crawlers.crawlers import JitenonKokugoCrawler
from bot.crawlers.crawlers import JitenonYojiCrawler
from bot.crawlers.crawlers import JitenonKotowazaCrawler
from bot.crawlers.crawlers import Smk8Crawler
from bot.crawlers.crawlers import Daijirin2Crawler
from bot.crawlers.crawlers import Sankoku8Crawler


def new_crawler(target):
    crawler_map = {
        Targets.JITENON_KOKUGO: JitenonKokugoCrawler,
        Targets.JITENON_YOJI: JitenonYojiCrawler,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaCrawler,
        Targets.SMK8: Smk8Crawler,
        Targets.DAIJIRIN2: Daijirin2Crawler,
        Targets.SANKOKU8: Sankoku8Crawler,
    }
    return crawler_map[target](target)
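Review note: this deleted factory imported every crawler class explicitly. Given the per-target `Crawler` modules added in this commit and the checked-off Todo item about dynamic imports, the replacement factory presumably resolves modules at runtime. A hypothetical sketch of such a factory (the actual replacement module is not shown in this excerpt, and the enum-to-module naming is an assumption):

```python
import importlib


def new_crawler(target):
    # e.g. Targets.SANKOKU8 -> bot.crawlers.sankoku8.Crawler (assumed mapping)
    module = importlib.import_module(f"bot.crawlers.{target.name.lower()}")
    return module.Crawler(target)
```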

@@ -0,0 +1,40 @@
import re
from bs4 import BeautifulSoup

from bot.time import timestamp
from bot.crawlers.base.crawler import BaseCrawler
from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper


class Crawler(BaseCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
        self._page_id_pattern = r"word/p([0-9]+)$"

    def collect_pages(self, page_dir):
        print(f"{timestamp()} Scraping {self._gojuon_url}")
        jitenon = JitenonScraper()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            gojuon_href = gojuon_a['href']
            max_kana_page = 1
            current_kana_page = 1
            while current_kana_page <= max_kana_page:
                kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}")
                current_kana_page += 1
                kana_soup = BeautifulSoup(kana_doc, features="html.parser")
                page_total = kana_soup.find(class_="page_total").text
                m = re.search(r"全([0-9]+)件", page_total)
                if m:
                    max_kana_page = int(m.group(1))
                for kana_a in kana_soup.select(".word_box a", href=True):
                    page_link = kana_a['href']
                    page_id = self._parse_page_id(page_link)
                    if page_id is None:
                        continue
                    _, page_path = jitenon.scrape(page_link)
                    self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
        print(f"\n{timestamp()} Found {pages_len} entry pages")

@@ -0,0 +1,8 @@
from bot.crawlers.base.jitenon import JitenonCrawler


class Crawler(JitenonCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
        self._page_id_pattern = r"([0-9]+)\.php$"

@@ -0,0 +1,8 @@
from bot.crawlers.base.jitenon import JitenonCrawler


class Crawler(JitenonCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
        self._page_id_pattern = r"([0-9]+)\.html$"

bot/crawlers/sankoku8.py (new file)
@@ -0,0 +1,5 @@
from bot.crawlers.base.monokakido import MonokakidoCrawler


class Crawler(MonokakidoCrawler):
    pass

@@ -0,0 +1,10 @@
import re

from bot.crawlers.scrapers.scraper import BaseScraper


class Jitenon(BaseScraper):
    def _get_netloc_re(self):
        domain = r"jitenon\.jp"
        pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + domain + r"$"
        netloc_re = re.compile(pattern)
        return netloc_re
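Review note: a quick sanity check of the pattern built here (illustrative, not part of the diff). It accepts the bare domain and any subdomain of jitenon.jp:

```python
import re

domain = r"jitenon\.jp"
netloc_re = re.compile(r"^(?:([A-Za-z0-9.\-]+)\.)?" + domain + r"$")

assert netloc_re.match("jitenon.jp")
assert netloc_re.match("kokugo.jitenon.jp")
assert netloc_re.match("example.com") is None
```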

@@ -1,24 +1,28 @@
 import time
-import requests
 import re
 import os
 import hashlib
+import random
+import math
 from datetime import datetime
+from pathlib import Path
+from platformdirs import user_cache_dir
 from urllib.parse import urlparse
-from pathlib import Path
+from abc import ABC, abstractmethod
+import requests
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
-from platformdirs import user_cache_dir
+from bot.time import timestamp
 from bot.data import load_config
 
 
-class Scraper():
+class BaseScraper(ABC):
     def __init__(self):
+        self.cache_count = 0
         self._config = load_config()
-        pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$"
-        self.netloc_re = re.compile(pattern)
+        self.netloc_re = self._get_netloc_re()
         self.__set_session()
 
     def scrape(self, urlstring):
@@ -31,9 +35,14 @@ class Scraper():
             with open(cache_path, "w", encoding="utf-8") as f:
                 f.write(html)
         else:
-            print("Discovering cached files...", end='\r', flush=True)
+            self.cache_count += 1
+            print(f"\tDiscovering cached file {self.cache_count}", end='\r', flush=True)
         return html, cache_path
 
+    @abstractmethod
+    def _get_netloc_re(self):
+        raise NotImplementedError
+
     def __set_session(self):
         retry_strategy = Retry(
             total=3,
@@ -87,21 +96,14 @@ class Scraper():
     def __get(self, urlstring):
         delay = 10
         time.sleep(delay)
-        now = datetime.now().strftime("%H:%M:%S")
-        print(f"{now} scraping {urlstring} ...", end='')
+        print(f"{timestamp()} Scraping {urlstring} ...", end='')
         try:
             response = self.session.get(urlstring, timeout=10)
-            print("OK")
+            print(f"{timestamp()} OK")
             return response.text
-        except Exception:
-            print("failed")
-            print("resetting session and trying again")
+        except Exception as ex:
+            print(f"\tFailed: {str(ex)}")
+            print(f"{timestamp()} Resetting session and trying again")
             self.__set_session()
             response = self.session.get(urlstring, timeout=10)
             return response.text
-
-
-class Jitenon(Scraper):
-    def __init__(self):
-        self.domain = r"jitenon\.jp"
-        super().__init__()
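Review note: a minimal sketch of how the refactored scraper is used by the crawlers (URL illustrative). `scrape()` returns the page HTML together with the path of its on-disk cache file, and repeat calls are served from the cache:

```python
from bot.crawlers.scrapers.jitenon import Jitenon

jitenon = Jitenon()
html, cache_path = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
```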

bot/crawlers/smk8.py (new file)
@@ -0,0 +1,5 @@
from bot.crawlers.base.monokakido import MonokakidoCrawler


class Crawler(MonokakidoCrawler):
    pass

@@ -18,15 +18,15 @@ class Entry(ABC):
     @abstractmethod
     def get_global_identifier(self):
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     def set_page(self, page):
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     def get_page_soup(self):
-        pass
+        raise NotImplementedError
 
     def get_headwords(self):
         if self._headwords is not None:
@@ -38,15 +38,15 @@ class Entry(ABC):
     @abstractmethod
     def _get_headwords(self):
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     def _add_variant_expressions(self, headwords):
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     def get_part_of_speech_tags(self):
-        pass
+        raise NotImplementedError
 
     def get_parent(self):
         if self.entry_id in self.SUBENTRY_ID_TO_ENTRY_ID:

@@ -31,11 +31,14 @@ def add_fullwidth(expressions):
 def add_variant_kanji(expressions):
     variant_kanji = load_variant_kanji()
-    for old_kanji, new_kanji in variant_kanji.items():
+    for kyuuji, shinji in variant_kanji.items():
         new_exps = []
         for expression in expressions:
-            if old_kanji in expression:
-                new_exp = expression.replace(old_kanji, new_kanji)
+            if kyuuji in expression:
+                new_exp = expression.replace(kyuuji, shinji)
+                new_exps.append(new_exp)
+            if shinji in expression:
+                new_exp = expression.replace(shinji, kyuuji)
                 new_exps.append(new_exp)
         for new_exp in new_exps:
             if new_exp not in expressions:
@@ -85,40 +88,3 @@ def expand_abbreviation_list(expressions):
         if new_exp not in new_exps:
             new_exps.append(new_exp)
     return new_exps
-
-
-def expand_smk_alternatives(text):
-    """Return a list of strings described by △ notation."""
-    m = re.search(r"△([^]+)([^]+)", text)
-    if m is None:
-        return [text]
-    alt_parts = [m.group(1)]
-    for alt_part in m.group(2).split(""):
-        alt_parts.append(alt_part)
-    alts = []
-    for alt_part in alt_parts:
-        alt_exp = re.sub(r"△[^]+[^]+", alt_part, text)
-        alts.append(alt_exp)
-    return alts
-
-
-def expand_daijirin_alternatives(text):
-    """Return a list of strings described by notation."""
-    group_pattern = r"([^]+)(([^]+)([^]+))?"
-    groups = re.findall(group_pattern, text)
-    expressions = [""]
-    for group in groups:
-        new_exps = []
-        for expression in expressions:
-            new_exps.append(expression + group[0])
-        expressions = new_exps.copy()
-        if group[1] == "":
-            continue
-        new_exps = []
-        for expression in expressions:
-            new_exps.append(expression + group[2])
-        for expression in expressions:
-            for alt in group[3].split(""):
-                new_exps.append(expression + alt)
-        expressions = new_exps.copy()
-    return expressions

@@ -3,11 +3,11 @@ from abc import abstractmethod
 from datetime import datetime, date
 from bs4 import BeautifulSoup
 
-from bot.entries.entry import Entry
-import bot.entries.expressions as Expressions
+from bot.entries.base.entry import Entry
+import bot.entries.base.expressions as Expressions
 
 
-class _JitenonEntry(Entry):
+class JitenonEntry(Entry):
     def __init__(self, target, entry_id):
         super().__init__(target, entry_id)
         self.expression = ""
@@ -58,7 +58,7 @@ class _JitenonEntry(Entry):
     @abstractmethod
     def _get_column_map(self):
-        pass
+        raise NotImplementedError
 
     def __set_modified_date(self, page):
         m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
@@ -140,104 +140,3 @@ class _JitenonEntry(Entry):
         elif isinstance(attr_val, list):
             colvals.append("".join(attr_val))
         return ",".join(colvals)
-
-
-class JitenonYojiEntry(_JitenonEntry):
-    def __init__(self, target, entry_id):
-        super().__init__(target, entry_id)
-        self.origin = ""
-        self.kanken_level = ""
-        self.category = ""
-        self.related_expressions = []
-
-    def _get_column_map(self):
-        return {
-            "四字熟語": "expression",
-            "読み方": "yomikata",
-            "意味": "definition",
-            "異形": "other_forms",
-            "出典": "origin",
-            "漢検級": "kanken_level",
-            "場面用途": "category",
-            "類義語": "related_expressions",
-        }
-
-    def _add_variant_expressions(self, headwords):
-        for expressions in headwords.values():
-            Expressions.add_variant_kanji(expressions)
-
-
-class JitenonKotowazaEntry(_JitenonEntry):
-    def __init__(self, target, entry_id):
-        super().__init__(target, entry_id)
-        self.origin = ""
-        self.example = ""
-        self.related_expressions = []
-
-    def _get_column_map(self):
-        return {
-            "言葉": "expression",
-            "読み方": "yomikata",
-            "意味": "definition",
-            "異形": "other_forms",
-            "出典": "origin",
-            "例文": "example",
-            "類句": "related_expressions",
-        }
-
-    def _get_headwords(self):
-        if self.expression == "金棒引き・鉄棒引き":
-            headwords = {
-                "かなぼうひき": ["金棒引き", "鉄棒引き"]
-            }
-        else:
-            headwords = super()._get_headwords()
-        return headwords
-
-    def _add_variant_expressions(self, headwords):
-        for expressions in headwords.values():
-            Expressions.add_variant_kanji(expressions)
-            Expressions.add_fullwidth(expressions)
-
-
-class JitenonKokugoEntry(_JitenonEntry):
-    def __init__(self, target, entry_id):
-        super().__init__(target, entry_id)
-        self.example = ""
-        self.alt_expression = ""
-        self.antonym = ""
-        self.attachments = ""
-        self.compounds = ""
-        self.related_words = ""
-
-    def _get_column_map(self):
-        return {
-            "言葉": "expression",
-            "読み方": "yomikata",
-            "意味": "definition",
-            "例文": "example",
-            "別表記": "alt_expression",
-            "対義語": "antonym",
-            "活用": "attachments",
-            "用例": "compounds",
-            "類語": "related_words",
-        }
-
-    def _get_headwords(self):
-        headwords = {}
-        for reading in self.yomikata.split(""):
-            if reading not in headwords:
-                headwords[reading] = []
-            for expression in self.expression.split(""):
-                headwords[reading].append(expression)
-            if self.alt_expression.strip() != "":
-                for expression in self.alt_expression.split(""):
-                    headwords[reading].append(expression)
-        return headwords
-
-    def _add_variant_expressions(self, headwords):
-        for expressions in headwords.values():
-            Expressions.add_variant_kanji(expressions)
-            Expressions.add_fullwidth(expressions)
-            Expressions.remove_iteration_mark(expressions)
-            Expressions.add_iteration_mark(expressions)

@@ -0,0 +1,60 @@
from abc import abstractmethod
from bs4 import BeautifulSoup

from bot.entries.base.entry import Entry
import bot.entries.base.expressions as Expressions


class SanseidoEntry(Entry):
    def set_page(self, page):
        page = self._decompose_subentries(page)
        self._page = page

    def get_page_soup(self):
        soup = BeautifulSoup(self._page, "xml")
        return soup

    def get_global_identifier(self):
        parent_part = format(self.entry_id[0], '06')
        child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
        return f"@{self.target.value}-{parent_part}-{child_part}"

    def _decompose_subentries(self, page):
        soup = BeautifulSoup(page, features="xml")
        for x in self._get_subentry_parameters():
            subentry_class, tags, subentry_list = x
            for tag in tags:
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(self.target, subentry_id)
                    page = tag_soup.decode()
                    subentry.set_page(page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @abstractmethod
    def _get_subentry_parameters(self):
        raise NotImplementedError

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    @staticmethod
    def id_string_to_entry_id(id_string):
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        elif len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        else:
            raise Exception(f"Invalid entry ID: {id_string}")

@@ -1,231 +0,0 @@
from bs4 import BeautifulSoup

import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.data import load_phrase_readings
from bot.data import load_daijirin2_kana_abbreviations
from bot.entries.entry import Entry
from bot.entries.daijirin2_preprocess import preprocess_page


class _BaseDaijirin2Entry(Entry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self._kana_abbreviations = load_daijirin2_kana_abbreviations()

    def get_global_identifier(self):
        parent_part = format(self.entry_id[0], '06')
        child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
        return f"@{self.target.value}-{parent_part}-{child_part}"

    def set_page(self, page):
        page = self.__decompose_subentries(page)
        self._page = page

    def get_page_soup(self):
        soup = BeautifulSoup(self._page, "xml")
        return soup

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        for pos_group in soup.find_all("品詞G"):
            if pos_group.parent.name == "大語義":
                self._set_part_of_speech_tags(pos_group)
        return self._part_of_speech_tags

    def _set_part_of_speech_tags(self, el):
        pos_names = ["品詞", "品詞活用", "品詞行", "用法"]
        for child in el.children:
            if child.name is not None:
                self._set_part_of_speech_tags(child)
                continue
            pos = str(child)
            if el.name not in pos_names:
                continue
            elif pos in ["", ""]:
                continue
            elif pos in self._part_of_speech_tags:
                continue
            else:
                self._part_of_speech_tags.append(pos)

    def _get_regular_headwords(self, soup):
        self._fill_alts(soup)
        reading = soup.find("見出仮名").text
        expressions = []
        for el in soup.find_all("標準表記"):
            expression = self._clean_expression(el.text)
            if "" in expression:
                kana_abbrs = self._kana_abbreviations[self.entry_id]
                for abbr in kana_abbrs:
                    expression = expression.replace("", abbr, 1)
            expressions.append(expression)
        expressions = Expressions.expand_abbreviation_list(expressions)
        if len(expressions) == 0:
            expressions.append(reading)
        headwords = {reading: expressions}
        return headwords

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    def __decompose_subentries(self, page):
        soup = BeautifulSoup(page, features="xml")
        subentry_parameters = [
            [Daijirin2ChildEntry, ["子項目"], self.children],
            [Daijirin2PhraseEntry, ["句項目"], self.phrases],
        ]
        for x in subentry_parameters:
            subentry_class, tags, subentry_list = x
            for tag in tags:
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(self.target, subentry_id)
                    page = tag_soup.decode()
                    subentry.set_page(page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @staticmethod
    def id_string_to_entry_id(id_string):
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        elif len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        else:
            raise Exception(f"Invalid entry ID: {id_string}")

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "漢字音logo", "活用分節", "連語句活用分節", "語構成",
            "表外字マーク", "表外字マーク", "ルビG"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)

    @staticmethod
    def _clean_expression(expression):
        for x in ["", "", "", "", " "]:
            expression = expression.replace(x, "")
        return expression

    @staticmethod
    def _fill_alts(soup):
        for gaiji in soup.find_all(class_="gaiji"):
            if gaiji.name == "img" and gaiji.has_attr("alt"):
                gaiji.name = "span"
                gaiji.string = gaiji.attrs["alt"]


class Daijirin2Entry(_BaseDaijirin2Entry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        if soup.find("漢字見出") is not None:
            headwords = self._get_kanji_headwords(soup)
        elif soup.find("略語G") is not None:
            headwords = self._get_acronym_headwords(soup)
        else:
            headwords = self._get_regular_headwords(soup)
        return headwords

    def _get_kanji_headwords(self, soup):
        readings = []
        for el in soup.find_all("漢字音"):
            hira = Expressions.kata_to_hira(el.text)
            readings.append(hira)
        if soup.find("漢字音") is None:
            readings.append("")
        expressions = []
        for el in soup.find_all("漢字見出"):
            expressions.append(el.text)
        headwords = {}
        for reading in readings:
            headwords[reading] = expressions
        return headwords

    def _get_acronym_headwords(self, soup):
        expressions = []
        for el in soup.find_all("略語"):
            expression_parts = []
            for part in el.find_all(["欧字", "和字"]):
                expression_parts.append(part.text)
            expression = "".join(expression_parts)
            expressions.append(expression)
        headwords = {"": expressions}
        return headwords


class Daijirin2ChildEntry(_BaseDaijirin2Entry):
    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        headwords = self._get_regular_headwords(soup)
        return headwords


class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        headwords = {}
        expressions = self._find_expressions(soup)
        readings = self._find_readings()
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        self._delete_unused_nodes(soup)
        text = soup.find("句表記").text
        text = self._clean_expression(text)
        alternatives = Expressions.expand_daijirin_alternatives(text)
        expressions = []
        for alt in alternatives:
            for exp in Expressions.expand_abbreviation(alt):
                expressions.append(exp)
        return expressions

    def _find_readings(self):
        phrase_readings = load_phrase_readings(self.target)
        text = phrase_readings[self.entry_id]
        alternatives = Expressions.expand_daijirin_alternatives(text)
        readings = []
        for alt in alternatives:
            for reading in Expressions.expand_abbreviation(alt):
                readings.append(reading)
        return readings

@@ -0,0 +1,88 @@
import bot.soup as Soup
from bot.data import load_daijirin2_kana_abbreviations
from bot.entries.base.sanseido_entry import SanseidoEntry
import bot.entries.base.expressions as Expressions


class BaseEntry(SanseidoEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self._kana_abbreviations = load_daijirin2_kana_abbreviations()

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        for pos_group in soup.find_all("品詞G"):
            if pos_group.parent.name == "大語義":
                self._set_part_of_speech_tags(pos_group)
        return self._part_of_speech_tags

    def _set_part_of_speech_tags(self, el):
        pos_names = ["品詞", "品詞活用", "品詞行", "用法"]
        for child in el.children:
            if child.name is not None:
                self._set_part_of_speech_tags(child)
                continue
            pos = str(child)
            if el.name not in pos_names:
                continue
            elif pos in ["", ""]:
                continue
            elif pos in self._part_of_speech_tags:
                continue
            else:
                self._part_of_speech_tags.append(pos)

    def _get_regular_headwords(self, soup):
        self._fill_alts(soup)
        reading = soup.find("見出仮名").text
        expressions = []
        for el in soup.find_all("標準表記"):
            expression = self._clean_expression(el.text)
            if "" in expression:
                kana_abbrs = self._kana_abbreviations[self.entry_id]
                for abbr in kana_abbrs:
                    expression = expression.replace("", abbr, 1)
            expressions.append(expression)
        expressions = Expressions.expand_abbreviation_list(expressions)
        if len(expressions) == 0:
            expressions.append(reading)
        headwords = {reading: expressions}
        return headwords

    def _get_subentry_parameters(self):
        from bot.entries.daijirin2.child_entry import ChildEntry
        from bot.entries.daijirin2.phrase_entry import PhraseEntry
        subentry_parameters = [
            [ChildEntry, ["子項目"], self.children],
            [PhraseEntry, ["句項目"], self.phrases],
        ]
        return subentry_parameters

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "漢字音logo", "活用分節", "連語句活用分節", "語構成",
            "表外字マーク", "表外字マーク", "ルビG"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)

    @staticmethod
    def _clean_expression(expression):
        for x in ["", "", "", "", " "]:
            expression = expression.replace(x, "")
        return expression

    @staticmethod
    def _fill_alts(soup):
        for gaiji in soup.find_all(class_="gaiji"):
            if gaiji.name == "img" and gaiji.has_attr("alt"):
                gaiji.name = "span"
                gaiji.string = gaiji.attrs["alt"]

@@ -0,0 +1,9 @@
from bot.entries.daijirin2.base_entry import BaseEntry


class ChildEntry(BaseEntry):
    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        headwords = self._get_regular_headwords(soup)
        return headwords

@@ -0,0 +1,50 @@
import bot.entries.base.expressions as Expressions
from bot.entries.daijirin2.base_entry import BaseEntry
from bot.entries.daijirin2.preprocess import preprocess_page


class Entry(BaseEntry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        if soup.find("漢字見出") is not None:
            headwords = self._get_kanji_headwords(soup)
        elif soup.find("略語G") is not None:
            headwords = self._get_acronym_headwords(soup)
        else:
            headwords = self._get_regular_headwords(soup)
        return headwords

    def _get_kanji_headwords(self, soup):
        readings = []
        for el in soup.find_all("漢字音"):
            hira = Expressions.kata_to_hira(el.text)
            readings.append(hira)
        if soup.find("漢字音") is None:
            readings.append("")
        expressions = []
        for el in soup.find_all("漢字見出"):
            expressions.append(el.text)
        headwords = {}
        for reading in readings:
            headwords[reading] = expressions
        return headwords

    def _get_acronym_headwords(self, soup):
        expressions = []
        for el in soup.find_all("略語"):
            expression_parts = []
            for part in el.find_all(["欧字", "和字"]):
                expression_parts.append(part.text)
            expression = "".join(expression_parts)
            expressions.append(expression)
        headwords = {"": expressions}
        return headwords

@@ -0,0 +1,67 @@
import re

import bot.entries.base.expressions as Expressions
from bot.data import load_phrase_readings
from bot.entries.daijirin2.base_entry import BaseEntry


class PhraseEntry(BaseEntry):
    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        headwords = {}
        expressions = self._find_expressions(soup)
        readings = self._find_readings()
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        self._delete_unused_nodes(soup)
        text = soup.find("句表記").text
        text = self._clean_expression(text)
        alternatives = parse_phrase(text)
        expressions = []
        for alt in alternatives:
            for exp in Expressions.expand_abbreviation(alt):
                expressions.append(exp)
        return expressions

    def _find_readings(self):
        phrase_readings = load_phrase_readings(self.target)
        text = phrase_readings[self.entry_id]
        alternatives = parse_phrase(text)
        readings = []
        for alt in alternatives:
            for reading in Expressions.expand_abbreviation(alt):
                readings.append(reading)
        return readings


def parse_phrase(text):
    """Return a list of strings described by notation."""
    group_pattern = r"([^]+)(([^]+)([^]+))?"
    groups = re.findall(group_pattern, text)
    expressions = [""]
    for group in groups:
        new_exps = []
        for expression in expressions:
            new_exps.append(expression + group[0])
        expressions = new_exps.copy()
        if group[1] == "":
            continue
        new_exps = []
        for expression in expressions:
            new_exps.append(expression + group[2])
        for expression in expressions:
            for alt in group[3].split(""):
                new_exps.append(expression + alt)
        expressions = new_exps.copy()
    return expressions

@@ -1,20 +0,0 @@
from bot.targets import Targets

from bot.entries.jitenon import JitenonKokugoEntry
from bot.entries.jitenon import JitenonYojiEntry
from bot.entries.jitenon import JitenonKotowazaEntry
from bot.entries.smk8 import Smk8Entry
from bot.entries.daijirin2 import Daijirin2Entry
from bot.entries.sankoku8 import Sankoku8Entry


def new_entry(target, page_id):
    entry_map = {
        Targets.JITENON_KOKUGO: JitenonKokugoEntry,
        Targets.JITENON_YOJI: JitenonYojiEntry,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaEntry,
        Targets.SMK8: Smk8Entry,
        Targets.DAIJIRIN2: Daijirin2Entry,
        Targets.SANKOKU8: Sankoku8Entry,
    }
    return entry_map[target](target, page_id)

@@ -0,0 +1,45 @@
from bot.entries.base.jitenon_entry import JitenonEntry
import bot.entries.base.expressions as Expressions


class Entry(JitenonEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.example = ""
        self.alt_expression = ""
        self.antonym = ""
        self.attachments = ""
        self.compounds = ""
        self.related_words = ""

    def _get_column_map(self):
        return {
            "言葉": "expression",
            "読み方": "yomikata",
            "意味": "definition",
            "例文": "example",
            "別表記": "alt_expression",
            "対義語": "antonym",
            "活用": "attachments",
            "用例": "compounds",
            "類語": "related_words",
        }

    def _get_headwords(self):
        headwords = {}
        for reading in self.yomikata.split(""):
            if reading not in headwords:
                headwords[reading] = []
            for expression in self.expression.split(""):
                headwords[reading].append(expression)
            if self.alt_expression.strip() != "":
                for expression in self.alt_expression.split(""):
                    headwords[reading].append(expression)
        return headwords

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

@@ -0,0 +1,35 @@
from bot.entries.base.jitenon_entry import JitenonEntry
import bot.entries.base.expressions as Expressions


class Entry(JitenonEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.origin = ""
        self.example = ""
        self.related_expressions = []

    def _get_column_map(self):
        return {
            "言葉": "expression",
            "読み方": "yomikata",
            "意味": "definition",
            "異形": "other_forms",
            "出典": "origin",
            "例文": "example",
            "類句": "related_expressions",
        }

    def _get_headwords(self):
        if self.expression == "金棒引き・鉄棒引き":
            headwords = {
                "かなぼうひき": ["金棒引き", "鉄棒引き"]
            }
        else:
            headwords = super()._get_headwords()
        return headwords

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)

@@ -0,0 +1,27 @@
import bot.entries.base.expressions as Expressions
from bot.entries.base.jitenon_entry import JitenonEntry


class Entry(JitenonEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.origin = ""
        self.kanken_level = ""
        self.category = ""
        self.related_expressions = []

    def _get_column_map(self):
        return {
            "四字熟語": "expression",
            "読み方": "yomikata",
            "意味": "definition",
            "異形": "other_forms",
            "出典": "origin",
            "漢検級": "kanken_level",
            "場面用途": "category",
            "類義語": "related_expressions",
        }

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)

@@ -1,260 +0,0 @@
from bs4 import BeautifulSoup

import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.entries.entry import Entry
from bot.data import load_phrase_readings
from bot.entries.sankoku8_preprocess import preprocess_page


class _BaseSankoku8Entry(Entry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self._hyouki_name = "表記"
        self._midashi_name = None
        self._midashi_kana_name = None

    def get_global_identifier(self):
        parent_part = format(self.entry_id[0], '06')
        child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
        return f"@{self.target.value}-{parent_part}-{child_part}"

    def set_page(self, page):
        page = self.__decompose_subentries(page)
        self._page = page

    def get_page_soup(self):
        soup = BeautifulSoup(self._page, "xml")
        return soup

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        readings = self._find_readings(soup)
        expressions = self._find_expressions(soup)
        headwords = {}
        for reading in readings:
            headwords[reading] = []
        if len(readings) == 1:
            reading = readings[0]
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                headwords[reading].append(reading)
            for exp in expressions:
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        elif len(readings) > 1 and len(expressions) == 0:
            for reading in readings:
                headwords[reading].append(reading)
        elif len(readings) > 1 and len(expressions) == 1:
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            expression = expressions[0]
            for reading in readings:
                if expression not in headwords[reading]:
                    headwords[reading].append(expression)
        elif len(readings) > 1 and len(expressions) == len(readings):
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            for idx, reading in enumerate(readings):
                exp = expressions[idx]
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        else:
            raise Exception()  # shouldn't happen
        return headwords

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        for midashi in soup.find_all([self._midashi_name, "見出部要素"]):
            pos_group = midashi.find("品詞G")
            if pos_group is None:
                continue
            for tag in pos_group.find_all("a"):
                if tag.text not in self._part_of_speech_tags:
                    self._part_of_speech_tags.append(tag.text)
        return self._part_of_speech_tags

    def _find_expressions(self, soup):
        expressions = []
        for hyouki in soup.find_all(self._hyouki_name):
            for expression in parse_hyouki_soup(hyouki, [""]):
                expressions.append(expression)
        return expressions

    def _find_readings(self, soup):
        midasi_kana = soup.find(self._midashi_kana_name)
        readings = parse_hyouki_soup(midasi_kana, [""])
        return readings

    def __decompose_subentries(self, page):
        soup = BeautifulSoup(page, features="xml")
        subentry_parameters = [
            [Sankoku8ChildEntry, ["子項目"], self.children],
            [Sankoku8PhraseEntry, ["句項目"], self.phrases],
        ]
        for x in subentry_parameters:
            subentry_class, tags, subentry_list = x
            for tag in tags:
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(self.target, subentry_id)
                    page = tag_soup.decode()
                    subentry.set_page(page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @staticmethod
    def id_string_to_entry_id(id_string):
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        elif len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        else:
            raise Exception(f"Invalid entry ID: {id_string}")

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク",
            "アクセント分節", "活用分節", "ルビG", "分書"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)


class Sankoku8Entry(_BaseSankoku8Entry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)
        self._midashi_name = "見出部"
        self._midashi_kana_name = "見出仮名"

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)


class Sankoku8ChildEntry(_BaseSankoku8Entry):
    def __init__(self, target, page_id):
        super().__init__(target, page_id)
        self._midashi_name = "子見出部"
        self._midashi_kana_name = "子見出仮名"


class Sankoku8PhraseEntry(_BaseSankoku8Entry):
    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        expressions = self._find_expressions(soup)
        readings = self._find_readings(soup)
        headwords = {}
        if len(expressions) != len(readings):
            raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}")
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        phrase_soup = soup.find("句表記")
        expressions = parse_hyouki_soup(phrase_soup, [""])
        return expressions

    def _find_readings(self, soup):
        reading_patterns = load_phrase_readings(self.target)
        reading_pattern = reading_patterns[self.entry_id]
        readings = parse_hyouki_pattern(reading_pattern)
        return readings


def parse_hyouki_soup(soup, base_exps):
    omitted_characters = [
        "", "", "", "", "", "", "", "", ""
    ]
    exps = base_exps.copy()
    for child in soup.children:
        new_exps = []
        if child.name == "言換G":
            for alt in child.find_all("言換"):
                parts = parse_hyouki_soup(alt, [""])
                for exp in exps:
                    for part in parts:
                        new_exps.append(exp + part)
        elif child.name == "補足表記":
            alt1 = child.find("表記対象")
            alt2 = child.find("表記内容G")
            parts1 = parse_hyouki_soup(alt1, [""])
            parts2 = parse_hyouki_soup(alt2, [""])
            for exp in exps:
                for part in parts1:
                    new_exps.append(exp + part)
                for part in parts2:
                    new_exps.append(exp + part)
        elif child.name == "省略":
            parts = parse_hyouki_soup(child, [""])
            for exp in exps:
                new_exps.append(exp)
                for part in parts:
                    new_exps.append(exp + part)
        elif child.name is not None:
            new_exps = parse_hyouki_soup(child, exps)
        else:
            text = child.text
            for char in omitted_characters:
                text = text.replace(char, "")
            for exp in exps:
                new_exps.append(exp + text)
        exps = new_exps.copy()
    return exps


def parse_hyouki_pattern(pattern):
    replacements = {
        "": "<省略>",
        "": "</省略>",
        "": "<補足表記><表記対象>",
        "": "</表記対象><表記内容G><表記内容>",
        "": "</表記内容></表記内容G></補足表記>",
        "": "<言換G>〈<言換>",
        "": "</言換><言換>",
        "": "</言換>〉</言換G>",
        "": "<補足表記><表記対象>",
        "": "</表記対象><表記内容G>⦅<表記内容>",
        "": "</表記内容>⦆</表記内容G></補足表記>",
    }
    markup = f"<span>{pattern}</span>"
    for key, val in replacements.items():
        markup = markup.replace(key, val)
    soup = BeautifulSoup(markup, "xml")
    hyouki_soup = soup.find("span")
    exps = parse_hyouki_soup(hyouki_soup, [""])
    return exps

@@ -0,0 +1,104 @@
import bot.soup as Soup
from bot.entries.base.sanseido_entry import SanseidoEntry
from bot.entries.sankoku8.parse import parse_hyouki_soup


class BaseEntry(SanseidoEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self._hyouki_name = "表記"
        self._midashi_name = None
        self._midashi_kana_name = None

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        readings = self._find_readings(soup)
        expressions = self._find_expressions(soup)
        headwords = {}
        for reading in readings:
            headwords[reading] = []
        if len(readings) == 1:
            reading = readings[0]
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                headwords[reading].append(reading)
            for exp in expressions:
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        elif len(readings) > 1 and len(expressions) == 0:
            for reading in readings:
                headwords[reading].append(reading)
        elif len(readings) > 1 and len(expressions) == 1:
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            expression = expressions[0]
            for reading in readings:
                if expression not in headwords[reading]:
                    headwords[reading].append(expression)
        elif len(readings) > 1 and len(expressions) == len(readings):
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            for idx, reading in enumerate(readings):
                exp = expressions[idx]
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        else:
            raise Exception()  # shouldn't happen
        return headwords

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        for midashi in soup.find_all([self._midashi_name, "見出部要素"]):
            pos_group = midashi.find("品詞G")
            if pos_group is None:
                continue
            for tag in pos_group.find_all("a"):
                if tag.text not in self._part_of_speech_tags:
                    self._part_of_speech_tags.append(tag.text)
        return self._part_of_speech_tags

    def _find_expressions(self, soup):
        expressions = []
        for hyouki in soup.find_all(self._hyouki_name):
            self._fill_alts(hyouki)
            for expression in parse_hyouki_soup(hyouki, [""]):
                expressions.append(expression)
        return expressions

    def _find_readings(self, soup):
        midasi_kana = soup.find(self._midashi_kana_name)
        readings = parse_hyouki_soup(midasi_kana, [""])
        return readings

    def _get_subentry_parameters(self):
        from bot.entries.sankoku8.child_entry import ChildEntry
        from bot.entries.sankoku8.phrase_entry import PhraseEntry
        subentry_parameters = [
            [ChildEntry, ["子項目"], self.children],
            [PhraseEntry, ["句項目"], self.phrases],
        ]
        return subentry_parameters

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク",
            "アクセント分節", "活用分節", "ルビG", "分書"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)

    @staticmethod
    def _fill_alts(soup):
        for img in soup.find_all("img"):
            if img.has_attr("alt"):
                img.string = img.attrs["alt"]

@@ -0,0 +1,8 @@
from bot.entries.sankoku8.base_entry import BaseEntry


class ChildEntry(BaseEntry):
    def __init__(self, target, page_id):
        super().__init__(target, page_id)
        self._midashi_name = "子見出部"
        self._midashi_kana_name = "子見出仮名"

@@ -0,0 +1,14 @@
from bot.entries.sankoku8.base_entry import BaseEntry
from bot.entries.sankoku8.preprocess import preprocess_page


class Entry(BaseEntry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)
        self._midashi_name = "見出部"
        self._midashi_kana_name = "見出仮名"

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)

@@ -0,0 +1,65 @@
from bs4 import BeautifulSoup


def parse_hyouki_soup(soup, base_exps):
    omitted_characters = [
        "", "", "", "", "", "", "", "", ""
    ]
    exps = base_exps.copy()
    for child in soup.children:
        new_exps = []
        if child.name == "言換G":
            for alt in child.find_all("言換"):
                parts = parse_hyouki_soup(alt, [""])
                for exp in exps:
                    for part in parts:
                        new_exps.append(exp + part)
        elif child.name == "補足表記":
            alt1 = child.find("表記対象")
            alt2 = child.find("表記内容G")
            parts1 = parse_hyouki_soup(alt1, [""])
            parts2 = parse_hyouki_soup(alt2, [""])
            for exp in exps:
                for part in parts1:
                    new_exps.append(exp + part)
                for part in parts2:
                    new_exps.append(exp + part)
        elif child.name == "省略":
            parts = parse_hyouki_soup(child, [""])
            for exp in exps:
                new_exps.append(exp)
                for part in parts:
                    new_exps.append(exp + part)
        elif child.name is not None:
            new_exps = parse_hyouki_soup(child, exps)
        else:
            text = child.text
            for char in omitted_characters:
                text = text.replace(char, "")
            for exp in exps:
                new_exps.append(exp + text)
        exps = new_exps.copy()
    return exps


def parse_hyouki_pattern(pattern):
    replacements = {
        "": "<省略>",
        "": "</省略>",
        "": "<補足表記><表記対象>",
        "": "</表記対象><表記内容G><表記内容>",
        "": "</表記内容></表記内容G></補足表記>",
        "": "<言換G>〈<言換>",
        "": "</言換><言換>",
        "": "</言換>〉</言換G>",
        "": "<補足表記><表記対象>",
        "": "</表記対象><表記内容G>⦅<表記内容>",
        "": "</表記内容>⦆</表記内容G></補足表記>",
    }
    markup = f"<span>{pattern}</span>"
    for key, val in replacements.items():
        markup = markup.replace(key, val)
    soup = BeautifulSoup(markup, "xml")
    hyouki_soup = soup.find("span")
    exps = parse_hyouki_soup(hyouki_soup, [""])
    return exps
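Review note: an illustrative call (markup invented for the example; the bracket characters in `omitted_characters` and `replacements` were lost in this rendering of the diff). An optional 省略 segment expands into both the short and the long form of the expression:

```python
from bs4 import BeautifulSoup

markup = "<span>甘えん坊<省略>さん</省略></span>"
soup = BeautifulSoup(markup, "xml")
parse_hyouki_soup(soup.find("span"), [""])
# -> ['甘えん坊', '甘えん坊さん']
```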

@@ -0,0 +1,37 @@
from bot.data import load_phrase_readings
from bot.entries.sankoku8.base_entry import BaseEntry
from bot.entries.sankoku8.parse import parse_hyouki_soup
from bot.entries.sankoku8.parse import parse_hyouki_pattern


class PhraseEntry(BaseEntry):
    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        expressions = self._find_expressions(soup)
        readings = self._find_readings(soup)
        headwords = {}
        if len(expressions) != len(readings):
            raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}")
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        phrase_soup = soup.find("句表記")
        expressions = parse_hyouki_soup(phrase_soup, [""])
        return expressions

    def _find_readings(self, soup):
        reading_patterns = load_phrase_readings(self.target)
        reading_pattern = reading_patterns[self.entry_id]
        readings = parse_hyouki_pattern(reading_pattern)
        return readings

@@ -4,9 +4,17 @@ from bs4 import BeautifulSoup
 from bot.data import get_adobe_glyph
 
 
+__GAIJI = {
+    "svg-gaiji/byan.svg": "𰻞",
+    "svg-gaiji/G16EF.svg": "",
+}
+
+
 def preprocess_page(page):
     soup = BeautifulSoup(page, features="xml")
     __replace_glyph_codes(soup)
+    __add_image_alt_text(soup)
+    __replace_tatehyphen(soup)
     page = __strip_page(soup)
     return page
@@ -20,6 +28,21 @@ def __replace_glyph_codes(soup):
             geta.replace_with(glyph)
 
 
+def __add_image_alt_text(soup):
+    for img in soup.find_all("img"):
+        if not img.has_attr("src"):
+            continue
+        src = img.attrs["src"]
+        if src in __GAIJI:
+            img.attrs["alt"] = __GAIJI[src]
+
+
+def __replace_tatehyphen(soup):
+    for img in soup.find_all("img", {"src": "svg-gaiji/tatehyphen.svg"}):
+        img.string = ""
+        img.unwrap()
+
+
 def __strip_page(soup):
     koumoku = soup.find(["項目"])
     if koumoku is not None:

@@ -1,221 +0,0 @@
from bs4 import BeautifulSoup

import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.data import load_phrase_readings
from bot.entries.entry import Entry
from bot.entries.smk8_preprocess import preprocess_page


class _BaseSmk8Entry(Entry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self.kanjis = []

    def get_global_identifier(self):
        parent_part = format(self.entry_id[0], '06')
        child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
        return f"@{self.target.value}-{parent_part}-{child_part}"

    def set_page(self, page):
        page = self.__decompose_subentries(page)
        self._page = page

    def get_page_soup(self):
        soup = BeautifulSoup(self._page, "xml")
        return soup

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        headword_info = soup.find("見出要素")
        if headword_info is None:
            return self._part_of_speech_tags
        for tag in headword_info.find_all("品詞M"):
            if tag.text not in self._part_of_speech_tags:
                self._part_of_speech_tags.append(tag.text)
        return self._part_of_speech_tags

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    def _find_reading(self, soup):
        midasi_kana = soup.find("見出仮名")
        reading = midasi_kana.text
        for x in [" ", ""]:
            reading = reading.replace(x, "")
        return reading

    def _find_expressions(self, soup):
        clean_expressions = []
        for expression in soup.find_all("標準表記"):
            clean_expression = self._clean_expression(expression.text)
            clean_expressions.append(clean_expression)
        expressions = Expressions.expand_abbreviation_list(clean_expressions)
        return expressions

    def __decompose_subentries(self, page):
        soup = BeautifulSoup(page, features="xml")
        subentry_parameters = [
            [Smk8ChildEntry, ["子項目F", "子項目"], self.children],
            [Smk8PhraseEntry, ["句項目F", "句項目"], self.phrases],
            [Smk8KanjiEntry, ["造語成分項目"], self.kanjis],
        ]
        for x in subentry_parameters:
            subentry_class, tags, subentry_list = x
            for tag in tags:
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(self.target, subentry_id)
                    page = tag_soup.decode()
                    subentry.set_page(page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @staticmethod
    def id_string_to_entry_id(id_string):
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        elif len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        else:
            raise Exception(f"Invalid entry ID: {id_string}")

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "表音表記", "表外音訓マーク", "表外字マーク", "ルビG"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)

    @staticmethod
    def _clean_expression(expression):
        for x in ["", "", "", "", "", " "]:
            expression = expression.replace(x, "")
        return expression

    @staticmethod
    def _fill_alts(soup):
        for el in soup.find_all(["親見出仮名", "親見出表記"]):
            el.string = el.attrs["alt"]
        for gaiji in soup.find_all("外字"):
            gaiji.string = gaiji.img.attrs["alt"]


class Smk8Entry(_BaseSmk8Entry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        reading = self._find_reading(soup)
        expressions = []
        if soup.find("見出部").find("標準表記") is None:
            expressions.append(reading)
        for expression in self._find_expressions(soup):
            if expression not in expressions:
                expressions.append(expression)
        headwords = {reading: expressions}
        return headwords


class Smk8ChildEntry(_BaseSmk8Entry):
    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        reading = self._find_reading(soup)
        expressions = []
        if soup.find("子見出部").find("標準表記") is None:
            expressions.append(reading)
        for expression in self._find_expressions(soup):
            if expression not in expressions:
                expressions.append(expression)
        headwords = {reading: expressions}
        return headwords


class Smk8PhraseEntry(_BaseSmk8Entry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.__phrase_readings = load_phrase_readings(self.target)

    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        headwords = {}
        expressions = self._find_expressions(soup)
        readings = self._find_readings()
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        text = soup.find("標準表記").text
        text = self._clean_expression(text)
        alternatives = Expressions.expand_smk_alternatives(text)
        expressions = []
        for alt in alternatives:
            for exp in Expressions.expand_abbreviation(alt):
                expressions.append(exp)
        return expressions

    def _find_readings(self):
        text = self.__phrase_readings[self.entry_id]
        alternatives = Expressions.expand_smk_alternatives(text)
        readings = []
        for alt in alternatives:
            for reading in Expressions.expand_abbreviation(alt):
                readings.append(reading)
        return readings


class Smk8KanjiEntry(_BaseSmk8Entry):
    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        reading = self.__get_parent_reading()
        expressions = self._find_expressions(soup)
        headwords = {reading: expressions}
        return headwords

    def __get_parent_reading(self):
        parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
        parent = self.ID_TO_ENTRY[parent_id]
        reading = parent.get_first_reading()
        return reading

@@ -0,0 +1,73 @@
import bot.soup as Soup
import bot.entries.base.expressions as Expressions
from bot.entries.base.sanseido_entry import SanseidoEntry
class BaseEntry(SanseidoEntry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.children = []
self.phrases = []
self.kanjis = []
def get_part_of_speech_tags(self):
if self._part_of_speech_tags is not None:
return self._part_of_speech_tags
self._part_of_speech_tags = []
soup = self.get_page_soup()
headword_info = soup.find("見出要素")
if headword_info is None:
return self._part_of_speech_tags
for tag in headword_info.find_all("品詞M"):
if tag.text not in self._part_of_speech_tags:
self._part_of_speech_tags.append(tag.text)
return self._part_of_speech_tags
def _find_reading(self, soup):
midasi_kana = soup.find("見出仮名")
reading = midasi_kana.text
for x in [" ", ""]:
reading = reading.replace(x, "")
return reading
def _find_expressions(self, soup):
clean_expressions = []
for expression in soup.find_all("標準表記"):
clean_expression = self._clean_expression(expression.text)
clean_expressions.append(clean_expression)
expressions = Expressions.expand_abbreviation_list(clean_expressions)
return expressions
def _get_subentry_parameters(self):
from bot.entries.smk8.child_entry import ChildEntry
from bot.entries.smk8.phrase_entry import PhraseEntry
from bot.entries.smk8.kanji_entry import KanjiEntry
subentry_parameters = [
[ChildEntry, ["子項目F", "子項目"], self.children],
[PhraseEntry, ["句項目F", "句項目"], self.phrases],
[KanjiEntry, ["造語成分項目"], self.kanjis],
]
return subentry_parameters
@staticmethod
def _delete_unused_nodes(soup):
"""Remove extra markup elements that appear in the entry
headword line which are not part of the entry headword"""
unused_nodes = [
"表音表記", "表外音訓マーク", "表外字マーク", "ルビG"
]
for name in unused_nodes:
Soup.delete_soup_nodes(soup, name)
@staticmethod
def _clean_expression(expression):
for x in ["", "", "", "", "", " "]:
expression = expression.replace(x, "")
return expression
@staticmethod
def _fill_alts(soup):
for elm in soup.find_all(["親見出仮名", "親見出表記"]):
elm.string = elm.attrs["alt"]
for gaiji in soup.find_all("外字"):
gaiji.string = gaiji.img.attrs["alt"]
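The two cleanup helpers above are easiest to see on a toy fragment. A minimal sketch, using made-up SMK8-style markup (only the tag names 見出仮名, ルビG, and 外字 come from the code; the sample XML and alt value are invented):

from bs4 import BeautifulSoup
from bot.entries.smk8.base_entry import BaseEntry

xml = (
    "<見出部>"
    "<見出仮名>みだし</見出仮名>"
    "<ルビG>るび</ルビG>"
    "<外字><img alt='〓' src='gaiji.svg'/></外字>"
    "</見出部>"
)
soup = BeautifulSoup(xml, "xml")
BaseEntry._delete_unused_nodes(soup)  # drops the ルビG markup node entirely
BaseEntry._fill_alts(soup)            # swaps the 外字 image for its alt text
print(soup.find("外字").text)          # 〓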

@@ -0,0 +1,17 @@
from bot.entries.smk8.base_entry import BaseEntry
class ChildEntry(BaseEntry):
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
self._fill_alts(soup)
reading = self._find_reading(soup)
expressions = []
if soup.find("子見出部").find("標準表記") is None:
expressions.append(reading)
for expression in self._find_expressions(soup):
if expression not in expressions:
expressions.append(expression)
headwords = {reading: expressions}
return headwords

bot/entries/smk8/entry.py (new file)
@@ -0,0 +1,26 @@
from bot.entries.smk8.base_entry import BaseEntry
from bot.entries.smk8.preprocess import preprocess_page
class Entry(BaseEntry):
def __init__(self, target, page_id):
entry_id = (page_id, 0)
super().__init__(target, entry_id)
def set_page(self, page):
page = preprocess_page(page)
super().set_page(page)
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
self._fill_alts(soup)
reading = self._find_reading(soup)
expressions = []
if soup.find("見出部").find("標準表記") is None:
expressions.append(reading)
for expression in self._find_expressions(soup):
if expression not in expressions:
expressions.append(expression)
headwords = {reading: expressions}
return headwords

@@ -0,0 +1,22 @@
from bot.entries.smk8.base_entry import BaseEntry
class KanjiEntry(BaseEntry):
def get_part_of_speech_tags(self):
# kanji entries do not contain these tags
return []
def _get_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
self._fill_alts(soup)
reading = self.__get_parent_reading()
expressions = self._find_expressions(soup)
headwords = {reading: expressions}
return headwords
def __get_parent_reading(self):
parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
parent = self.ID_TO_ENTRY[parent_id]
reading = parent.get_first_reading()
return reading

@@ -0,0 +1,64 @@
import re
import bot.entries.base.expressions as Expressions
from bot.data import load_phrase_readings
from bot.entries.smk8.base_entry import BaseEntry
class PhraseEntry(BaseEntry):
def __init__(self, target, entry_id):
super().__init__(target, entry_id)
self.__phrase_readings = load_phrase_readings(self.target)
def get_part_of_speech_tags(self):
# phrase entries do not contain these tags
return []
def _get_headwords(self):
soup = self.get_page_soup()
headwords = {}
expressions = self._find_expressions(soup)
readings = self._find_readings()
for idx, expression in enumerate(expressions):
reading = readings[idx]
if reading in headwords:
headwords[reading].append(expression)
else:
headwords[reading] = [expression]
return headwords
def _find_expressions(self, soup):
self._delete_unused_nodes(soup)
self._fill_alts(soup)
text = soup.find("標準表記").text
text = self._clean_expression(text)
alternatives = parse_phrase(text)
expressions = []
for alt in alternatives:
for exp in Expressions.expand_abbreviation(alt):
expressions.append(exp)
return expressions
def _find_readings(self):
text = self.__phrase_readings[self.entry_id]
alternatives = parse_phrase(text)
readings = []
for alt in alternatives:
for reading in Expressions.expand_abbreviation(alt):
readings.append(reading)
return readings
def parse_phrase(text):
    """Return a list of strings described by △ notation."""
    match = re.search(r"△([^（]+)（([^）]+)）", text)
    if match is None:
        return [text]
    alt_parts = [match.group(1)]
    for alt_part in match.group(2).split("・"):
        alt_parts.append(alt_part)
    alts = []
    for alt_part in alt_parts:
        alt_exp = re.sub(r"△[^（]+（[^）]+）", alt_part, text)
        alts.append(alt_exp)
    return alts
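For reference, the △ notation expands like this (these examples mirror the phrase tests added in this commit, assuming the full-width （）and ・ delimiters matched by the regex above):

from bot.entries.smk8.phrase_entry import parse_phrase

parse_phrase("目と鼻の△先（間）")
# ['目と鼻の先', '目と鼻の間']
parse_phrase("△金（時間・暇）に飽かして")
# ['金に飽かして', '時間に飽かして', '暇に飽かして']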

@@ -6,8 +6,8 @@ from bot.data import get_adobe_glyph

 __GAIJI = {
     "gaiji/5350.svg": "",
     "gaiji/62cb.svg": "",
     "gaiji/7be1.svg": "",
 }

bot/factory.py (new file)
@@ -0,0 +1,37 @@
import importlib
def new_crawler(target):
module_path = f"bot.crawlers.{target.name.lower()}"
module = importlib.import_module(module_path)
return module.Crawler(target)
def new_entry(target, page_id):
module_path = f"bot.entries.{target.name.lower()}.entry"
module = importlib.import_module(module_path)
return module.Entry(target, page_id)
def new_yomichan_exporter(target):
module_path = f"bot.yomichan.exporters.{target.name.lower()}"
module = importlib.import_module(module_path)
return module.Exporter(target)
def new_yomichan_terminator(target):
module_path = f"bot.yomichan.terms.{target.name.lower()}"
module = importlib.import_module(module_path)
return module.Terminator(target)
def new_mdict_exporter(target):
module_path = f"bot.mdict.exporters.{target.name.lower()}"
module = importlib.import_module(module_path)
return module.Exporter(target)
def new_mdict_terminator(target):
module_path = f"bot.mdict.terms.{target.name.lower()}"
module = importlib.import_module(module_path)
return module.Terminator(target)
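Each helper resolves its class at runtime from the target's name, so call sites need no per-dictionary imports. A minimal usage sketch, assuming the Targets enum member SMK8 and an arbitrary page_id (12345 is just an example value):

from bot.targets import Targets
from bot.factory import new_entry, new_yomichan_exporter

entry = new_entry(Targets.SMK8, 12345)          # resolves bot.entries.smk8.entry.Entry
exporter = new_yomichan_exporter(Targets.SMK8)  # resolves bot.yomichan.exporters.smk8.Exporter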

@@ -1,20 +1,19 @@
-# pylint: disable=too-few-public-methods
-import subprocess
 import os
 import shutil
+import subprocess
 from abc import ABC, abstractmethod
 from pathlib import Path
-from datetime import datetime
 from platformdirs import user_documents_dir, user_cache_dir
-from bot.mdict.terms.factory import new_terminator
+from bot.time import timestamp
+from bot.factory import new_mdict_terminator


-class Exporter(ABC):
+class BaseExporter(ABC):
     def __init__(self, target):
         self._target = target
-        self._terminator = new_terminator(target)
+        self._terminator = new_mdict_terminator(target)
         self._build_dir = None
         self._build_media_dir = None
         self._description_file = None
@@ -34,7 +33,7 @@ class Exporter(ABC):
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
         build_directory = os.path.join(cache_dir, "mdict_build")
-        print(f"Initializing build directory `{build_directory}`")
+        print(f"{timestamp()} Initializing build directory `{build_directory}`")
         if Path(build_directory).is_dir():
             shutil.rmtree(build_directory)
         os.makedirs(build_directory)
@@ -45,7 +44,7 @@ class Exporter(ABC):
         build_dir = self._get_build_dir()
         build_media_dir = os.path.join(build_dir, self._target.value)
         if media_dir is not None:
-            print("Copying media files to build directory...")
+            print(f"{timestamp()} Copying media files to build directory...")
             shutil.copytree(media_dir, build_media_dir)
         else:
             os.makedirs(build_media_dir)
@@ -71,7 +70,7 @@ class Exporter(ABC):

     def _write_mdx_file(self, entries):
         terms = self._get_terms(entries)
-        print(f"Exporting {len(terms)} Mdict keys...")
+        print(f"{timestamp()} Exporting {len(terms)} Mdict keys...")
         out_dir = self._get_out_dir()
         out_file = os.path.join(out_dir, f"{self._target.value}.mdx")
         params = [
@@ -87,7 +86,7 @@ class Exporter(ABC):
         terms = []
         entries_len = len(entries)
         for idx, entry in enumerate(entries):
-            update = f"Creating Mdict terms for entry {idx+1}/{entries_len}"
+            update = f"\tCreating MDict terms for entry {idx+1}/{entries_len}"
             print(update, end='\r', flush=True)
             new_terms = self._terminator.make_terms(entry)
             for term in new_terms:
@@ -126,7 +125,7 @@ class Exporter(ABC):
             return self._out_dir
         out_dir = os.path.join(
             user_documents_dir(), "jitenbot", "mdict", self._target.value)
-        print(f"Initializing output directory `{out_dir}`")
+        print(f"{timestamp()} Initializing output directory `{out_dir}`")
         if Path(out_dir).is_dir():
             shutil.rmtree(out_dir)
         os.makedirs(out_dir)
@@ -168,58 +167,8 @@ class Exporter(ABC):
     @abstractmethod
     def _get_revision(self, entries):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _get_attribution(self, entries):
-        pass
+        raise NotImplementedError
-
-
-class _JitenonExporter(Exporter):
-    def _get_revision(self, entries):
-        modified_date = None
-        for entry in entries:
-            if modified_date is None or entry.modified_date > modified_date:
-                modified_date = entry.modified_date
-        revision = modified_date.strftime("%Y年%m月%d日閲覧")
-        return revision
-
-    def _get_attribution(self, entries):
-        modified_date = None
-        for entry in entries:
-            if modified_date is None or entry.modified_date > modified_date:
-                attribution = entry.attribution
-        return attribution
-
-
-class JitenonKokugoExporter(_JitenonExporter):
-    pass
-
-
-class JitenonYojiExporter(_JitenonExporter):
-    pass
-
-
-class JitenonKotowazaExporter(_JitenonExporter):
-    pass
-
-
-class _MonokakidoExporter(Exporter):
-    def _get_revision(self, entries):
-        timestamp = datetime.now().strftime("%Y年%m月%d日作成")
-        return timestamp
-
-
-class Smk8Exporter(_MonokakidoExporter):
-    def _get_attribution(self, entries):
-        return "© Sanseido Co., LTD. 2020"
-
-
-class Daijirin2Exporter(_MonokakidoExporter):
-    def _get_attribution(self, entries):
-        return "© Sanseido Co., LTD. 2019"
-
-
-class Sankoku8Exporter(_MonokakidoExporter):
-    def _get_attribution(self, entries):
-        return "© Sanseido Co., LTD. 2021"

@@ -0,0 +1,18 @@
from bot.mdict.exporters.base.exporter import BaseExporter
class JitenonExporter(BaseExporter):
def _get_revision(self, entries):
modified_date = None
for entry in entries:
if modified_date is None or entry.modified_date > modified_date:
modified_date = entry.modified_date
revision = modified_date.strftime("%Y年%m月%d日閲覧")
return revision
def _get_attribution(self, entries):
modified_date = None
for entry in entries:
if modified_date is None or entry.modified_date > modified_date:
attribution = entry.attribution
return attribution

@@ -0,0 +1,8 @@
from datetime import datetime
from bot.mdict.exporters.base.exporter import BaseExporter
class MonokakidoExporter(BaseExporter):
def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y年%m月%d日作成")
return timestamp

@@ -0,0 +1,6 @@
from bot.mdict.exporters.base.monokakido import MonokakidoExporter
class Exporter(MonokakidoExporter):
def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2019"

@@ -1,20 +0,0 @@
from bot.targets import Targets
from bot.mdict.exporters.export import JitenonKokugoExporter
from bot.mdict.exporters.export import JitenonYojiExporter
from bot.mdict.exporters.export import JitenonKotowazaExporter
from bot.mdict.exporters.export import Smk8Exporter
from bot.mdict.exporters.export import Daijirin2Exporter
from bot.mdict.exporters.export import Sankoku8Exporter
def new_mdict_exporter(target):
exporter_map = {
Targets.JITENON_KOKUGO: JitenonKokugoExporter,
Targets.JITENON_YOJI: JitenonYojiExporter,
Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter,
Targets.SMK8: Smk8Exporter,
Targets.DAIJIRIN2: Daijirin2Exporter,
Targets.SANKOKU8: Sankoku8Exporter,
}
return exporter_map[target](target)

@@ -0,0 +1,5 @@
from bot.mdict.exporters.base.jitenon import JitenonExporter
class Exporter(JitenonExporter):
pass

@@ -0,0 +1,5 @@
from bot.mdict.exporters.base.jitenon import JitenonExporter
class Exporter(JitenonExporter):
pass

@@ -0,0 +1,5 @@
from bot.mdict.exporters.base.jitenon import JitenonExporter
class Exporter(JitenonExporter):
pass

@@ -0,0 +1,6 @@
from bot.mdict.exporters.base.monokakido import MonokakidoExporter
class Exporter(MonokakidoExporter):
def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2021"

@@ -0,0 +1,6 @@
from bot.mdict.exporters.base.monokakido import MonokakidoExporter
class Exporter(MonokakidoExporter):
def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2020"

@@ -0,0 +1,20 @@
from bot.mdict.terms.base.terminator import BaseTerminator
class JitenonTerminator(BaseTerminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = None
def _glossary(self, entry):
if entry.entry_id in self._glossary_cache:
return self._glossary_cache[entry.entry_id]
glossary = self._glossary_maker.make_glossary(entry, self._media_dir)
self._glossary_cache[entry.entry_id] = glossary
return glossary
def _link_glossary_parameters(self, entry):
return []
def _subentry_lists(self, entry):
return []

@@ -2,7 +2,7 @@ import re
 from abc import abstractmethod, ABC


-class Terminator(ABC):
+class BaseTerminator(ABC):
     def __init__(self, target):
         self._target = target
         self._glossary_cache = {}
@@ -72,12 +72,12 @@
     @abstractmethod
     def _glossary(self, entry):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _link_glossary_parameters(self, entry):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _subentry_lists(self, entry):
-        pass
+        raise NotImplementedError

@@ -1,8 +1,8 @@
-from bot.mdict.terms.terminator import Terminator
+from bot.mdict.terms.base.terminator import BaseTerminator
 from bot.mdict.glossary.daijirin2 import make_glossary


-class Daijirin2Terminator(Terminator):
+class Terminator(BaseTerminator):
     def _glossary(self, entry):
         if entry.entry_id in self._glossary_cache:
             return self._glossary_cache[entry.entry_id]

@@ -1,20 +0,0 @@
from bot.targets import Targets
from bot.mdict.terms.jitenon import JitenonKokugoTerminator
from bot.mdict.terms.jitenon import JitenonYojiTerminator
from bot.mdict.terms.jitenon import JitenonKotowazaTerminator
from bot.mdict.terms.smk8 import Smk8Terminator
from bot.mdict.terms.daijirin2 import Daijirin2Terminator
from bot.mdict.terms.sankoku8 import Sankoku8Terminator
def new_terminator(target):
terminator_map = {
Targets.JITENON_KOKUGO: JitenonKokugoTerminator,
Targets.JITENON_YOJI: JitenonYojiTerminator,
Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator,
Targets.SMK8: Smk8Terminator,
Targets.DAIJIRIN2: Daijirin2Terminator,
Targets.SANKOKU8: Sankoku8Terminator,
}
return terminator_map[target](target)

@@ -1,42 +0,0 @@
from bot.mdict.terms.terminator import Terminator
from bot.mdict.glossary.jitenon import JitenonKokugoGlossary
from bot.mdict.glossary.jitenon import JitenonYojiGlossary
from bot.mdict.glossary.jitenon import JitenonKotowazaGlossary
class JitenonTerminator(Terminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = None
def _glossary(self, entry):
if entry.entry_id in self._glossary_cache:
return self._glossary_cache[entry.entry_id]
glossary = self._glossary_maker.make_glossary(entry, self._media_dir)
self._glossary_cache[entry.entry_id] = glossary
return glossary
def _link_glossary_parameters(self, entry):
return []
def _subentry_lists(self, entry):
return []
class JitenonKokugoTerminator(JitenonTerminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = JitenonKokugoGlossary()
class JitenonYojiTerminator(JitenonTerminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = JitenonYojiGlossary()
class JitenonKotowazaTerminator(JitenonTerminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = JitenonKotowazaGlossary()

@@ -0,0 +1,8 @@
from bot.mdict.terms.base.jitenon import JitenonTerminator
from bot.mdict.glossary.jitenon import JitenonKokugoGlossary
class Terminator(JitenonTerminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = JitenonKokugoGlossary()

@@ -0,0 +1,8 @@
from bot.mdict.terms.base.jitenon import JitenonTerminator
from bot.mdict.glossary.jitenon import JitenonKotowazaGlossary
class Terminator(JitenonTerminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = JitenonKotowazaGlossary()

@@ -0,0 +1,8 @@
from bot.mdict.terms.base.jitenon import JitenonTerminator
from bot.mdict.glossary.jitenon import JitenonYojiGlossary
class Terminator(JitenonTerminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = JitenonYojiGlossary()

@@ -1,8 +1,8 @@
-from bot.mdict.terms.terminator import Terminator
+from bot.mdict.terms.base.terminator import BaseTerminator
 from bot.mdict.glossary.sankoku8 import make_glossary


-class Sankoku8Terminator(Terminator):
+class Terminator(BaseTerminator):
     def _glossary(self, entry):
         if entry.entry_id in self._glossary_cache:
             return self._glossary_cache[entry.entry_id]

@@ -1,8 +1,8 @@
-from bot.mdict.terms.terminator import Terminator
+from bot.mdict.terms.base.terminator import BaseTerminator
 from bot.mdict.glossary.smk8 import make_glossary


-class Smk8Terminator(Terminator):
+class Terminator(BaseTerminator):
     def _glossary(self, entry):
         if entry.entry_id in self._glossary_cache:
             return self._glossary_cache[entry.entry_id]

bot/time.py (new file)
@@ -0,0 +1,5 @@
import time
def timestamp():
return time.strftime('%X')
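time.strftime('%X') is the locale's clock-time format, so the exporters' log lines get a plain time-of-day prefix. For example (the printed time is illustrative):

from bot.time import timestamp

print(f"{timestamp()} Copying media files to build directory...")
# 16:55:10 Copying media files to build directory...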

@@ -1,24 +1,23 @@
-# pylint: disable=too-few-public-methods
 import json
 import os
 import shutil
 import copy
 from pathlib import Path
-from datetime import datetime
 from abc import ABC, abstractmethod
-from platformdirs import user_documents_dir, user_cache_dir
 import fastjsonschema
+from platformdirs import user_documents_dir, user_cache_dir
+from bot.time import timestamp
 from bot.data import load_yomichan_metadata
-from bot.yomichan.terms.factory import new_terminator
 from bot.data import load_yomichan_term_schema
+from bot.factory import new_yomichan_terminator


-class Exporter(ABC):
+class BaseExporter(ABC):
     def __init__(self, target):
         self._target = target
-        self._terminator = new_terminator(target)
+        self._terminator = new_yomichan_terminator(target)
         self._build_dir = None
         self._terms_per_file = 2000
@@ -36,18 +35,18 @@
     @abstractmethod
     def _get_revision(self, entries):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _get_attribution(self, entries):
-        pass
+        raise NotImplementedError

     def _get_build_dir(self):
         if self._build_dir is not None:
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
         build_directory = os.path.join(cache_dir, "yomichan_build")
-        print(f"Initializing build directory `{build_directory}`")
+        print(f"{timestamp()} Initializing build directory `{build_directory}`")
         if Path(build_directory).is_dir():
             shutil.rmtree(build_directory)
         os.makedirs(build_directory)
@@ -66,8 +65,9 @@
         build_dir = self._get_build_dir()
         build_img_dir = os.path.join(build_dir, self._target.value)
         if image_dir is not None:
-            print("Copying media files to build directory...")
+            print(f"{timestamp()} Copying media files to build directory...")
             shutil.copytree(image_dir, build_img_dir)
+            print(f"{timestamp()} Finished copying files")
         else:
             os.makedirs(build_img_dir)
         self._terminator.set_image_dir(build_img_dir)
@@ -76,7 +76,7 @@
         terms = []
         entries_len = len(entries)
         for idx, entry in enumerate(entries):
-            update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}"
+            update = f"\tCreating Yomichan terms for entry {idx+1}/{entries_len}"
             print(update, end='\r', flush=True)
             new_terms = self._terminator.make_terms(entry)
             for term in new_terms:
@@ -85,7 +85,7 @@
         return terms

     def __validate_terms(self, terms):
-        print("Making a copy of term data for validation...")
+        print(f"{timestamp()} Making a copy of term data for validation...")
         terms_copy = copy.deepcopy(terms)  # because validator will alter data!
         term_count = len(terms_copy)
         log_dir = self.__get_invalid_term_dir()
@@ -93,7 +93,7 @@
         validator = fastjsonschema.compile(schema)
         failure_count = 0
         for idx, term in enumerate(terms_copy):
-            update = f"Validating term {idx+1}/{term_count}"
+            update = f"\tValidating term {idx+1}/{term_count}"
             print(update, end='\r', flush=True)
             try:
                 validator([term])
@@ -102,9 +102,9 @@
                 term_file = os.path.join(log_dir, f"{idx}.json")
                 with open(term_file, "w", encoding='utf8') as f:
                     json.dump([term], f, indent=4, ensure_ascii=False)
-        print(f"\nFinished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
+        print(f"\n{timestamp()} Finished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
         if failure_count > 0:
-            print(f"Invalid terms saved to `{log_dir}` for debugging")
+            print(f"{timestamp()} Invalid terms saved to `{log_dir}` for debugging")

     def __make_dictionary(self, terms, index, tags):
         self.__write_term_banks(terms)
@@ -114,14 +114,14 @@
         self.__rm_build_dir()

     def __write_term_banks(self, terms):
-        print(f"Exporting {len(terms)} JSON terms")
+        print(f"{timestamp()} Exporting {len(terms)} JSON terms")
         build_dir = self._get_build_dir()
         max_i = int(len(terms) / self._terms_per_file) + 1
         for i in range(max_i):
+            update = f"\tWriting terms to term bank {i+1}/{max_i}"
+            print(update, end='\r', flush=True)
             start = self._terms_per_file * i
             end = self._terms_per_file * (i + 1)
-            update = f"Writing terms to term banks {start} - {end}"
-            print(update, end='\r', flush=True)
             term_file = os.path.join(build_dir, f"term_bank_{i+1}.json")
             with open(term_file, "w", encoding='utf8') as f:
                 json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
@@ -142,8 +142,8 @@
             json.dump(tags, f, indent=4, ensure_ascii=False)

     def __write_archive(self, filename):
-        print("Archiving data to ZIP file...")
         archive_format = "zip"
+        print(f"{timestamp()} Archiving data to {archive_format.upper()} file...")
         out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
         if not Path(out_dir).is_dir():
             os.makedirs(out_dir)
@@ -154,58 +154,8 @@
         base_filename = os.path.join(out_dir, filename)
         build_dir = self._get_build_dir()
         shutil.make_archive(base_filename, archive_format, build_dir)
-        print(f"Dictionary file saved to {out_filepath}")
+        print(f"{timestamp()} Dictionary file saved to `{out_filepath}`")

     def __rm_build_dir(self):
         build_dir = self._get_build_dir()
         shutil.rmtree(build_dir)
-
-
-class _JitenonExporter(Exporter):
-    def _get_revision(self, entries):
-        modified_date = None
-        for entry in entries:
-            if modified_date is None or entry.modified_date > modified_date:
-                modified_date = entry.modified_date
-        revision = f"{self._target.value};{modified_date}"
-        return revision
-
-    def _get_attribution(self, entries):
-        modified_date = None
-        for entry in entries:
-            if modified_date is None or entry.modified_date > modified_date:
-                attribution = entry.attribution
-        return attribution
-
-
-class JitenonKokugoExporter(_JitenonExporter):
-    pass
-
-
-class JitenonYojiExporter(_JitenonExporter):
-    pass
-
-
-class JitenonKotowazaExporter(_JitenonExporter):
-    pass
-
-
-class _MonokakidoExporter(Exporter):
-    def _get_revision(self, entries):
-        timestamp = datetime.now().strftime("%Y-%m-%d")
-        return f"{self._target.value};{timestamp}"
-
-
-class Smk8Exporter(_MonokakidoExporter):
-    def _get_attribution(self, entries):
-        return "© Sanseido Co., LTD. 2020"
-
-
-class Daijirin2Exporter(_MonokakidoExporter):
-    def _get_attribution(self, entries):
-        return "© Sanseido Co., LTD. 2019"
-
-
-class Sankoku8Exporter(_MonokakidoExporter):
-    def _get_attribution(self, entries):
-        return "© Sanseido Co., LTD. 2021"

@@ -0,0 +1,18 @@
from bot.yomichan.exporters.base.exporter import BaseExporter
class JitenonExporter(BaseExporter):
def _get_revision(self, entries):
modified_date = None
for entry in entries:
if modified_date is None or entry.modified_date > modified_date:
modified_date = entry.modified_date
revision = f"{self._target.value};{modified_date}"
return revision
def _get_attribution(self, entries):
modified_date = None
for entry in entries:
if modified_date is None or entry.modified_date > modified_date:
attribution = entry.attribution
return attribution

@@ -0,0 +1,8 @@
from datetime import datetime
from bot.yomichan.exporters.base.exporter import BaseExporter
class MonokakidoExporter(BaseExporter):
def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y-%m-%d")
return f"{self._target.value};{timestamp}"

@@ -0,0 +1,6 @@
from bot.yomichan.exporters.base.monokakido import MonokakidoExporter
class Exporter(MonokakidoExporter):
def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2019"

@@ -1,20 +0,0 @@
from bot.targets import Targets
from bot.yomichan.exporters.export import JitenonKokugoExporter
from bot.yomichan.exporters.export import JitenonYojiExporter
from bot.yomichan.exporters.export import JitenonKotowazaExporter
from bot.yomichan.exporters.export import Smk8Exporter
from bot.yomichan.exporters.export import Daijirin2Exporter
from bot.yomichan.exporters.export import Sankoku8Exporter
def new_yomi_exporter(target):
exporter_map = {
Targets.JITENON_KOKUGO: JitenonKokugoExporter,
Targets.JITENON_YOJI: JitenonYojiExporter,
Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter,
Targets.SMK8: Smk8Exporter,
Targets.DAIJIRIN2: Daijirin2Exporter,
Targets.SANKOKU8: Sankoku8Exporter,
}
return exporter_map[target](target)

@@ -0,0 +1,5 @@
from bot.yomichan.exporters.base.jitenon import JitenonExporter
class Exporter(JitenonExporter):
pass

@@ -0,0 +1,5 @@
from bot.yomichan.exporters.base.jitenon import JitenonExporter
class Exporter(JitenonExporter):
pass

@@ -0,0 +1,5 @@
from bot.yomichan.exporters.base.jitenon import JitenonExporter
class Exporter(JitenonExporter):
pass

@@ -0,0 +1,6 @@
from bot.yomichan.exporters.base.monokakido import MonokakidoExporter
class Exporter(MonokakidoExporter):
def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2021"

@@ -0,0 +1,6 @@
from bot.yomichan.exporters.base.monokakido import MonokakidoExporter
class Exporter(MonokakidoExporter):
def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2020"

@@ -1,9 +1,10 @@
 import re
 import os
-from bs4 import BeautifulSoup
 from functools import cache
 from pathlib import Path

+from bs4 import BeautifulSoup
+
 import bot.yomichan.glossary.icons as Icons
 from bot.soup import delete_soup_nodes
 from bot.data import load_yomichan_name_conversion

@@ -0,0 +1,26 @@
from bot.yomichan.terms.base.terminator import BaseTerminator
class JitenonTerminator(BaseTerminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = None
def _definition_tags(self, entry):
return None
def _glossary(self, entry):
if entry.entry_id in self._glossary_cache:
return self._glossary_cache[entry.entry_id]
glossary = self._glossary_maker.make_glossary(entry, self._image_dir)
self._glossary_cache[entry.entry_id] = glossary
return glossary
def _sequence(self, entry):
return entry.entry_id
def _link_glossary_parameters(self, entry):
return []
def _subentry_lists(self, entry):
return []

@@ -2,7 +2,7 @@ from abc import abstractmethod, ABC
 from bot.data import load_yomichan_inflection_categories


-class Terminator(ABC):
+class BaseTerminator(ABC):
     def __init__(self, target):
         self._target = target
         self._glossary_cache = {}
@@ -66,28 +66,28 @@
     @abstractmethod
     def _definition_tags(self, entry):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _inflection_rules(self, entry, expression):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _glossary(self, entry):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _sequence(self, entry):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _term_tags(self, entry):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _link_glossary_parameters(self, entry):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _subentry_lists(self, entry):
-        pass
+        raise NotImplementedError

@@ -1,14 +1,10 @@
-from bot.entries.daijirin2 import Daijirin2PhraseEntry as PhraseEntry
-from bot.yomichan.terms.terminator import Terminator
+from bot.entries.daijirin2.phrase_entry import PhraseEntry
+from bot.yomichan.terms.base.terminator import BaseTerminator
 from bot.yomichan.glossary.daijirin2 import make_glossary
 from bot.yomichan.grammar import sudachi_rules, tags_to_rules


-class Daijirin2Terminator(Terminator):
-    def __init__(self, target):
-        super().__init__(target)
-
+class Terminator(BaseTerminator):
     def _definition_tags(self, entry):
         return ""

@@ -1,20 +0,0 @@
from bot.targets import Targets
from bot.yomichan.terms.jitenon import JitenonKokugoTerminator
from bot.yomichan.terms.jitenon import JitenonYojiTerminator
from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
from bot.yomichan.terms.smk8 import Smk8Terminator
from bot.yomichan.terms.daijirin2 import Daijirin2Terminator
from bot.yomichan.terms.sankoku8 import Sankoku8Terminator
def new_terminator(target):
terminator_map = {
Targets.JITENON_KOKUGO: JitenonKokugoTerminator,
Targets.JITENON_YOJI: JitenonYojiTerminator,
Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator,
Targets.SMK8: Smk8Terminator,
Targets.DAIJIRIN2: Daijirin2Terminator,
Targets.SANKOKU8: Sankoku8Terminator,
}
return terminator_map[target](target)

@@ -1,68 +0,0 @@
from bot.yomichan.grammar import sudachi_rules
from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.jitenon import JitenonKokugoGlossary
from bot.yomichan.glossary.jitenon import JitenonYojiGlossary
from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
class JitenonTerminator(Terminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = None
def _definition_tags(self, entry):
return None
def _glossary(self, entry):
if entry.entry_id in self._glossary_cache:
return self._glossary_cache[entry.entry_id]
glossary = self._glossary_maker.make_glossary(entry, self._image_dir)
self._glossary_cache[entry.entry_id] = glossary
return glossary
def _sequence(self, entry):
return entry.entry_id
def _link_glossary_parameters(self, entry):
return []
def _subentry_lists(self, entry):
return []
class JitenonKokugoTerminator(JitenonTerminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = JitenonKokugoGlossary()
def _inflection_rules(self, entry, expression):
return sudachi_rules(expression)
def _term_tags(self, entry):
return ""
class JitenonYojiTerminator(JitenonTerminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = JitenonYojiGlossary()
def _inflection_rules(self, entry, expression):
return ""
def _term_tags(self, entry):
tags = entry.kanken_level.split("/")
return " ".join(tags)
class JitenonKotowazaTerminator(JitenonTerminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = JitenonKotowazaGlossary()
def _inflection_rules(self, entry, expression):
return sudachi_rules(expression)
def _term_tags(self, entry):
return ""

@@ -0,0 +1,15 @@
from bot.yomichan.grammar import sudachi_rules
from bot.yomichan.glossary.jitenon import JitenonKokugoGlossary
from bot.yomichan.terms.base.jitenon import JitenonTerminator
class Terminator(JitenonTerminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = JitenonKokugoGlossary()
def _inflection_rules(self, entry, expression):
return sudachi_rules(expression)
def _term_tags(self, entry):
return ""

@@ -0,0 +1,15 @@
from bot.yomichan.grammar import sudachi_rules
from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
from bot.yomichan.terms.base.jitenon import JitenonTerminator
class Terminator(JitenonTerminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = JitenonKotowazaGlossary()
def _inflection_rules(self, entry, expression):
return sudachi_rules(expression)
def _term_tags(self, entry):
return ""

@@ -0,0 +1,15 @@
from bot.yomichan.glossary.jitenon import JitenonYojiGlossary
from bot.yomichan.terms.base.jitenon import JitenonTerminator
class Terminator(JitenonTerminator):
def __init__(self, target):
super().__init__(target)
self._glossary_maker = JitenonYojiGlossary()
def _inflection_rules(self, entry, expression):
return ""
def _term_tags(self, entry):
tags = entry.kanken_level.split("/")
return " ".join(tags)

@@ -1,14 +1,10 @@
-from bot.entries.sankoku8 import Sankoku8PhraseEntry as PhraseEntry
-from bot.yomichan.terms.terminator import Terminator
+from bot.entries.sankoku8.phrase_entry import PhraseEntry
+from bot.yomichan.terms.base.terminator import BaseTerminator
 from bot.yomichan.glossary.sankoku8 import make_glossary
 from bot.yomichan.grammar import sudachi_rules, tags_to_rules


-class Sankoku8Terminator(Terminator):
-    def __init__(self, target):
-        super().__init__(target)
-
+class Terminator(BaseTerminator):
     def _definition_tags(self, entry):
         return ""

@@ -1,12 +1,11 @@
-from bot.entries.smk8 import Smk8KanjiEntry as KanjiEntry
-from bot.entries.smk8 import Smk8PhraseEntry as PhraseEntry
-from bot.yomichan.terms.terminator import Terminator
+from bot.entries.smk8.kanji_entry import KanjiEntry
+from bot.entries.smk8.phrase_entry import PhraseEntry
+from bot.yomichan.terms.base.terminator import BaseTerminator
 from bot.yomichan.glossary.smk8 import make_glossary
 from bot.yomichan.grammar import sudachi_rules, tags_to_rules


-class Smk8Terminator(Terminator):
+class Terminator(BaseTerminator):
     def __init__(self, target):
         super().__init__(target)

@@ -1,65 +1,61 @@
亙,亘
𠮟,叱
俠,侠
俱,倶
儘,侭
凜,凛
剝,剥
吞,呑
靭,靱
臈,﨟
啞,唖
噓,嘘
嚙,噛
囊,嚢
塡,填
壺,壷
屛,屏
屢,屡
幷,并
彎,弯
搔,掻
摑,掴
攪,撹
曾,曽
枡,桝
檜,桧
檮,梼
潑,溌
濤,涛
濾,沪
瀆,涜
灌,潅
焰,焔
瘦,痩
禰,祢
禱,祷
穎,頴
竈,竃
簞,箪
籠,篭
繡,繍
繫,繋
萊,莱
蔣,蒋
藪,薮
蘆,芦
蟬,蝉
蠅,蝿
蠟,蝋
蠣,蛎
賤,賎
軀,躯
邇,迩
醬,醤
醱,醗
靱,靭
頰,頬
頸,頚
顚,顛
驒,騨
鰺,鯵
鶯,鴬
鷗,鴎
鹼,鹸
麴,麹
麵,麺
﨟,臈
姸,妍
攢,攅
𣜜,杤
檔,档
槶,椢
櫳,槞
纊,絋
纘,纉
隯,陦
筓,笄
逬,迸
腁,胼
騈,駢
拋,抛
篡,簒
@@ -21,7 +21,7 @@ import sys
 import argparse
 import subprocess
 from bot.targets import Targets
-from bot.crawlers.factory import new_crawler
+from bot.factory import new_crawler


 def filename(f):

@@ -1,5 +1,7 @@
 #!/bin/sh

+export PYTHONPYCACHEPREFIX=/tmp/pycache
+
 python -m unittest discover -s tests
 python jitenbot.py jitenon-kokugo

@@ -0,0 +1,21 @@
import unittest
from bot.entries.daijirin2.phrase_entry import parse_phrase
class TestDaijirin2PhraseParse(unittest.TestCase):
def test1(self):
text = "同じ穴の=狢(=狐・狸)"
exps = parse_phrase(text)
self.assertEqual(len(exps), 3)
self.assertIn("同じ穴の狢", exps)
self.assertIn("同じ穴の狐", exps)
self.assertIn("同じ穴の狸", exps)
def test2(self):
text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥"
exps = parse_phrase(text)
self.assertEqual(len(exps), 4)
self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps)
self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps)
self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps)
self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps)

@@ -1,5 +1,5 @@
 import unittest

-import bot.entries.expressions as Expressions
+import bot.entries.base.expressions as Expressions


 class TestExpressions(unittest.TestCase):
@@ -34,8 +34,8 @@ class TestExpressions(unittest.TestCase):
         self.assertIn("凶々しい", exps)
         self.assertIn("凶凶しい", exps)

-    def test_add_variant_kanji(self):
-        exps = ["剝く", "掴む", "摑む"]
+    def test_add_variant_kanji1(self):
+        exps = ["剥く", "摑む"]
         Expressions.add_variant_kanji(exps)
         self.assertEqual(len(exps), 4)
         self.assertIn("剥く", exps)
@@ -44,6 +44,15 @@ class TestExpressions(unittest.TestCase):
         self.assertIn("摑む", exps)

     def test_add_variant_kanji2(self):
+        exps = ["剝く", "掴む", "摑む"]
+        Expressions.add_variant_kanji(exps)
+        self.assertEqual(len(exps), 4)
+        self.assertIn("剥く", exps)
+        self.assertIn("剝く", exps)
+        self.assertIn("掴む", exps)
+        self.assertIn("摑む", exps)
+
+    def test_add_variant_kanji3(self):
         exps = ["剝摑"]
         Expressions.add_variant_kanji(exps)
         self.assertEqual(len(exps), 4)
@@ -52,6 +61,15 @@
         self.assertIn("剥掴", exps)
         self.assertIn("剥摑", exps)

+    def test_add_variant_kanji4(self):
+        exps = ["剥掴"]
+        Expressions.add_variant_kanji(exps)
+        self.assertEqual(len(exps), 4)
+        self.assertIn("剝摑", exps)
+        self.assertIn("剝掴", exps)
+        self.assertIn("剥掴", exps)
+        self.assertIn("剥摑", exps)
+
     def test_expand_abbreviation(self):
         text = "有(り)合(わ)せ"
         abbrs = Expressions.expand_abbreviation(text)
@@ -69,28 +87,3 @@
         self.assertIn("有合わせ", abbrs)
         self.assertIn("有り合せ", abbrs)
         self.assertIn("有合せ", abbrs)
-
-    def test_smk_expand_alternatives(self):
-        text = "△金(時間・暇)に飽かして"
-        exps = Expressions.expand_smk_alternatives(text)
-        self.assertEqual(len(exps), 3)
-        self.assertIn("金に飽かして", exps)
-        self.assertIn("時間に飽かして", exps)
-        self.assertIn("暇に飽かして", exps)
-
-    def test_daijirin_expand_alternatives(self):
-        text = "同じ穴の=狢(=狐・狸)"
-        exps = Expressions.expand_daijirin_alternatives(text)
-        self.assertEqual(len(exps), 3)
-        self.assertIn("同じ穴の狢", exps)
-        self.assertIn("同じ穴の狐", exps)
-        self.assertIn("同じ穴の狸", exps)
-
-    def test_daijirin_expand_alternatives2(self):
-        text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥"
-        exps = Expressions.expand_daijirin_alternatives(text)
-        self.assertEqual(len(exps), 4)
-        self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps)
-        self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps)
-        self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps)
-        self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps)

@@ -1,16 +1,16 @@
 import unittest

-from bot.entries.sankoku8 import parse_hyouki_pattern
+from bot.entries.sankoku8.parse import parse_hyouki_pattern


-class TestSankokuPhrases(unittest.TestCase):
-    def test_sankoku_phrases1(self):
+class TestSankoku8PhraseParse(unittest.TestCase):
+    def test1(self):
         pattern = '耳にたこ(ができる)'
         exps = parse_hyouki_pattern(pattern)
         self.assertEqual(len(exps), 2)
         self.assertIn("耳にたこ", exps)
         self.assertIn("耳にたこができる", exps)

-    def test_sankoku_phrases2(self):
+    def test2(self):
         pattern = '一斑を〈見て/もって〉全豹を〈卜す/推す〉'
         exps = parse_hyouki_pattern(pattern)
         self.assertEqual(len(exps), 4)
@@ -19,14 +19,14 @@ class TestSankokuPhrases(unittest.TestCase):
         self.assertIn("一斑をもって全豹を卜す", exps)
         self.assertIn("一斑をもって全豹を推す", exps)

-    def test_sankoku_phrases3(self):
+    def test3(self):
         pattern = '{かじ・舵}を切る'
         exps = parse_hyouki_pattern(pattern)
         self.assertEqual(len(exps), 2)
         self.assertIn("かじを切る", exps)
         self.assertIn("舵を切る", exps)

-    def test_sankoku_phrases4(self):
+    def test4(self):
         pattern = '重箱の隅を(⦅ようじ\楊枝⦆で)〈つつく/ほじくる〉'
         exps = parse_hyouki_pattern(pattern)
         self.assertEqual(len(exps), 6)
@@ -37,7 +37,7 @@ class TestSankokuPhrases(unittest.TestCase):
         self.assertIn("重箱の隅をようじでほじくる", exps)
         self.assertIn("重箱の隅を楊枝でほじくる", exps)

-    def test_sankoku_phrases5(self):
+    def test5(self):
         pattern = '群盲象を〈{な・撫}でる/評する〉'
         exps = parse_hyouki_pattern(pattern)
         self.assertEqual(len(exps), 3)

tests/test_smk_phrases.py (new file)
@@ -0,0 +1,19 @@
import unittest
from bot.entries.smk8.phrase_entry import parse_phrase
class TestSmk8PhraseParse(unittest.TestCase):
def test1(self):
text = "目と鼻の△先(間)"
exps = parse_phrase(text)
self.assertEqual(len(exps), 2)
self.assertIn("目と鼻の先", exps)
self.assertIn("目と鼻の間", exps)
def test2(self):
text = "△金(時間・暇)に飽かして"
exps = parse_phrase(text)
self.assertEqual(len(exps), 3)
self.assertIn("金に飽かして", exps)
self.assertIn("時間に飽かして", exps)
self.assertIn("暇に飽かして", exps)