diff --git a/bot/crawlers.py b/bot/crawlers.py
index e520f76..7ba495c 100644
--- a/bot/crawlers.py
+++ b/bot/crawlers.py
@@ -4,11 +4,13 @@ from bs4 import BeautifulSoup
import bot.scraper as Scraper
+from bot.entries.jitenon import JitenonKokugoEntry
from bot.entries.jitenon import JitenonKotowazaEntry
from bot.entries.jitenon import JitenonYojiEntry
from bot.entries.smk8 import Smk8Entry
from bot.entries.daijirin2 import Daijirin2Entry
+from bot.yomichan.export import JitenonKokugoExporter
from bot.yomichan.export import JitenonKotowazaExporter
from bot.yomichan.export import JitenonYojiExporter
from bot.yomichan.export import Smk8Exporter
@@ -48,6 +50,41 @@ class _Crawler():
return page_id
+class JitenonKokugoCrawler(_Crawler):
+    """Crawler for kokugo.jitenon.jp.
+
+    Starts from the gojuon index page, follows every kana link through
+    its paginated word listing, and scrapes each linked word page.
+    """
+
+    def __init__(self, args):
+        super().__init__(args)
+        self._entry_class = JitenonKokugoEntry
+        self._yomi_exporter = JitenonKokugoExporter(args.target)
+        self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
+        self._page_id_pattern = r"word/p([0-9]+)$"
+
+    def collect_pages(self):
+        """Scrape every word page and store its path in ``self._page_map``.
+
+        Links for which ``self._parse_page_id`` returns ``None`` are skipped.
+        """
+        jitenon = Scraper.Jitenon()
+        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
+        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+        # The href requirement is part of the CSS selector so that the
+        # ['href'] lookup below can never hit an anchor without one.
+        for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
+            gojuon_href = gojuon_a['href']
+            max_kana_page = 1
+            current_kana_page = 1
+            while current_kana_page <= max_kana_page:
+                kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}")
+                current_kana_page += 1
+                kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+                # A listing without a page_total element leaves the current
+                # page limit unchanged instead of raising AttributeError.
+                page_total = kana_soup.find(class_="page_total")
+                m = None
+                if page_total is not None:
+                    # NOTE(review): the number captured from "全N件" is used
+                    # as the last page number; 件 usually counts items, not
+                    # pages - confirm against the live page markup.
+                    m = re.search(r"全([0-9]+)件", page_total.text)
+                if m:
+                    max_kana_page = int(m.group(1))
+                for kana_a in kana_soup.select(".word_box a[href]"):
+                    page_link = kana_a['href']
+                    page_id = self._parse_page_id(page_link)
+                    if page_id is None:
+                        continue
+                    _, page_path = jitenon.scrape(page_link)
+                    self._page_map[page_id] = page_path
+        pages_len = len(self._page_map)
+        print(f"Finished scraping {pages_len} pages")
+
+
class _JitenonCrawler(_Crawler):
def __init__(self, args):
super().__init__(args)
diff --git a/bot/entries/jitenon.py b/bot/entries/jitenon.py
index afff5b7..e1e17b4 100644
--- a/bot/entries/jitenon.py
+++ b/bot/entries/jitenon.py
@@ -7,8 +7,14 @@ import bot.expressions as Expressions
class _JitenonEntry(Entry):
+ ID_TO_ENTRY = {}
+
def __init__(self, entry_id):
super().__init__(entry_id)
+ if entry_id not in self.ID_TO_ENTRY:
+ self.ID_TO_ENTRY[entry_id] = self
+ else:
+ raise Exception(f"Duplicate entry ID: {entry_id}")
self.modified_date = date(1970, 1, 1)
self.attribution = ""
for column in self._COLUMNS.values():
@@ -44,9 +50,9 @@ class _JitenonEntry(Entry):
def _set_headwords(self):
headwords = {}
- for yomikata in self.__yomikatas():
+ for yomikata in self._yomikatas():
headwords[yomikata] = [self.expression]
- ikei_headwords = self.__ikei_headwords()
+ ikei_headwords = self._ikei_headwords()
for reading, expressions in ikei_headwords.items():
if reading not in headwords:
headwords[reading] = []
@@ -73,7 +79,7 @@ class _JitenonEntry(Entry):
else:
attr_value.append(colval)
- def __yomikatas(self):
+ def _yomikatas(self):
yomikata = self.yomikata
m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
if m:
@@ -94,7 +100,7 @@ class _JitenonEntry(Entry):
print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
return [""]
- def __ikei_headwords(self):
+ def _ikei_headwords(self):
ikei_headwords = {}
for val in self.ikei:
m = re.search(r"^([^(]+)(([ぁ-ヿ、]+))$", val)
@@ -174,3 +180,39 @@ class JitenonKotowazaEntry(_JitenonEntry):
for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._variant_kanji)
Expressions.add_fullwidth(expressions)
+
+
+class JitenonKokugoEntry(_JitenonEntry):
+    """Entry for a word page scraped from kokugo.jitenon.jp."""
+
+    # Table header text -> [attribute name, initial value].
+    # NOTE(review): the pair layout is inferred from how this class reads
+    # self.expression / self.yomikata / self.betsuhyouki - confirm against
+    # _JitenonEntry.
+    _COLUMNS = {
+        "言葉": ["expression", ""],
+        "読み方": ["yomikata", ""],
+        "意味": ["imi", ""],
+        "例文": ["reibun", ""],
+        "別表記": ["betsuhyouki", ""],
+        "対義語": ["taigigo", ""],
+        "活用": ["katsuyou", ""],
+        "用例": ["yourei", ""],
+        "類語": ["ruigo", ""],
+    }
+
+    def __init__(self, entry_id):
+        super().__init__(entry_id)
+
+    def _set_headwords(self):
+        """Populate ``self._headwords`` as a reading -> spellings mapping.
+
+        Readings, expressions and alternate spellings (別表記) are each
+        split on "・"; every reading receives all expressions followed by
+        all alternate spellings.
+        """
+        spellings = self.expression.split("・")
+        if self.betsuhyouki.strip():
+            spellings += self.betsuhyouki.split("・")
+        headword_map = {}
+        for reading in self.yomikata.split("・"):
+            # extend() copies the items, so each reading owns its own list.
+            headword_map.setdefault(reading, []).extend(spellings)
+        self._headwords = headword_map
+
+    def _set_variant_headwords(self):
+        """Apply the spelling-variant helpers to every headword list."""
+        single_arg_steps = (
+            Expressions.add_fullwidth,
+            Expressions.remove_iteration_mark,
+            Expressions.add_iteration_mark,
+        )
+        for spellings in self._headwords.values():
+            Expressions.add_variant_kanji(spellings, self._variant_kanji)
+            for step in single_arg_steps:
+                step(spellings)
diff --git a/bot/yomichan/export.py b/bot/yomichan/export.py
index 15b61c5..a2acf81 100644
--- a/bot/yomichan/export.py
+++ b/bot/yomichan/export.py
@@ -7,6 +7,7 @@ from platformdirs import user_documents_dir, user_cache_dir
from bot.data import load_yomichan_metadata
+from bot.yomichan.terms.jitenon import JitenonKokugoTerminator
from bot.yomichan.terms.jitenon import JitenonYojiTerminator
from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
from bot.yomichan.terms.smk8 import Smk8Terminator
@@ -20,8 +21,7 @@ class Exporter:
self._terms_per_file = 2000
def export(self, entries, image_dir):
- if image_dir is not None:
- self.__init_build_image_dir(image_dir)
+ self.__init_build_image_dir(image_dir)
meta = load_yomichan_metadata()
index = meta[self._name]["index"]
index["revision"] = self._get_revision(entries)
@@ -42,10 +42,13 @@ class Exporter:
return self._build_dir
def __init_build_image_dir(self, image_dir):
- print("Copying image files to build directory...")
build_dir = self._get_build_dir()
build_img_dir = os.path.join(build_dir, self._name)
- shutil.copytree(image_dir, build_img_dir)
+ if image_dir is not None:
+ print("Copying image files to build directory...")
+ shutil.copytree(image_dir, build_img_dir)
+ else:
+ os.makedirs(build_img_dir)
self._terminator.set_image_dir(build_img_dir)
def __get_terms(self, entries):
@@ -131,6 +134,12 @@ class JitenonExporter(Exporter):
return attribution
+class JitenonKokugoExporter(JitenonExporter):
+    """Exporter whose terms are produced by ``JitenonKokugoTerminator``."""
+
+    def __init__(self, name):
+        super().__init__(name)
+        self._terminator = JitenonKokugoTerminator(name)
+
+
class JitenonYojiExporter(JitenonExporter):
def __init__(self, name):
super().__init__(name)
diff --git a/bot/yomichan/glossary/jitenon.py b/bot/yomichan/glossary/jitenon.py
index ebee87c..6e3a192 100644
--- a/bot/yomichan/glossary/jitenon.py
+++ b/bot/yomichan/glossary/jitenon.py
@@ -1,93 +1,176 @@
import re
+import os
+from bs4 import BeautifulSoup
+import bot.icons as Icons
from bot.yomichan.glossary.gloss import make_gloss
-def make_glossary(entry):
- soup = entry.get_page_soup()
- __replace_punctuation(soup)
- __add_internal_links(soup)
- __convert_paragraphs(soup)
- __style_table_headers(soup)
- __unwrap_table_body(soup)
- __decompose_table_rows(soup, entry)
- __insert_headword_line(soup, entry)
- gloss = make_gloss(soup.body)
- glossary = [gloss]
- return glossary
+class JitenonGlossary():
+    """Soup-rewriting steps used to build glossaries for jitenon entries."""
+
+    def __init__(self):
+        # Regex whose first group is the numeric entry ID found in a link's
+        # href; read by _add_internal_links. None until a subclass sets it.
+        self._id_pattern = None
+        # Header-cell text of the table row that _decompose_table_rows
+        # removes. None until a subclass sets it.
+        self._expression_header = None
+    def _replace_punctuation(self, soup):
+        """Replace every string node in ``soup`` with a punctuation-mapped copy.
+
+        NOTE(review): as written, the "/" entry maps the character to
+        itself - confirm whether a fullwidth target was intended.
+        """
+        table = str.maketrans({
+            "/": "/",
+            ",": "、",
+        })
+        for node in soup.find_all(string=True):
+            node.replace_with(node.text.translate(table))
-def __replace_punctuation(soup):
- punctuation = {
- "/": "/",
- ",": "、",
- }
- for el in soup.find_all(string=True):
- text = el.text
- for old, new in punctuation.items():
- text = text.replace(old, new)
- el.replace_with(text)
+    def _add_internal_links(self, soup, entry):
+        """Point links to other entries at an in-dictionary query.
+
+        An href matching ``self._id_pattern`` is replaced with a
+        ``?query=...`` link built from the first expression of the entry
+        registered under that ID in ``entry.ID_TO_ENTRY``. Hrefs starting
+        with ``http:``, ``https:`` or ``?`` are left untouched.
+
+        Raises:
+            Exception: if an href matches none of the accepted forms.
+        """
+        for anchor in soup.find_all("a"):
+            href = anchor.attrs["href"]
+            id_match = re.search(self._id_pattern, href)
+            if id_match is None:
+                if re.match(r"^(?:https?:|\?)[\w\W]*", href):
+                    continue
+                raise Exception(f"Invalid href format: {href}")
+            target = entry.ID_TO_ENTRY[int(id_match.group(1))]
+            expression = target.get_first_expression()
+            anchor.attrs["href"] = f"?query={expression}&wildcards=off"
+    def _convert_paragraphs(self, soup):
+        """Rename every ``<p>`` element in ``soup`` to ``<div>``."""
+        paragraphs = soup.find_all("p")
+        for paragraph in paragraphs:
+            paragraph.name = "div"
-def __add_internal_links(soup):
- patterns = [
- r"^(.+)([ぁ-ヿ、\s]+)$",
- r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$"
- ]
- for a in soup.find_all("a"):
- for pattern in patterns:
- m = re.search(pattern, a.text)
- if m:
- a['href'] = f"?query={m.group(1)}&wildcards=off"
- break
+    def _style_table_headers(self, soup):
+        """Set a centering ``style`` attribute on every ``<th>`` in ``soup``."""
+        header_style = "vertical-align: middle; text-align: center;"
+        for header_cell in soup.find_all("th"):
+            header_cell['style'] = header_style
+    def _unwrap_table_body(self, soup):
+        """Unwrap the first ``<tbody>`` in ``soup``, if there is one."""
+        table_body = soup.find("tbody")
+        if table_body is None:
+            return
+        table_body.unwrap()
-def __convert_paragraphs(soup):
- for p in soup.find_all("p"):
- p.name = "span"
-
-
-def __style_table_headers(soup):
- for th in soup.find_all("th"):
- th['style'] = "vertical-align: middle; text-align: center;"
-
-
-def __unwrap_table_body(soup):
- if soup.find("tbody") is not None:
- soup.tbody.unwrap()
-
-
-def __decompose_table_rows(soup, entry):
- for tr in soup.find_all("tr"):
- if tr.find("th") is None:
- continue
- elif tr.th.text in ["四字熟語", "言葉"]:
- tr.decompose()
- elif tr.th.text == "読み方":
- if __do_display_yomikata_in_headword(entry):
+ def _decompose_table_rows(self, soup, entry):
+ for tr in soup.find_all("tr"):
+ if tr.find("th") is None:
+ continue
+ elif tr.th.text == self._expression_header:
tr.decompose()
- elif tr.th.text == "意味":
- imi = tr.td
- imi.name = "div"
- soup.body.insert(0, imi)
- tr.decompose()
- if soup.find("tr") is None:
- soup.table.decompose()
+ elif tr.th.text == "読み方":
+ if self._do_display_yomikata_in_headword(entry):
+ tr.decompose()
+ elif tr.th.text == "意味":
+ imi = tr.td
+ imi.name = "div"
+ soup.body.insert(0, imi)
+ tr.decompose()
+ if soup.find("tr") is None:
+ soup.table.decompose()
+
+    def _insert_headword_line(self, soup, entry):
+        """Insert a ``<span>`` headword line as the first child of the body.
+
+        The line reads ``reading【expression】``; the reading is left out
+        when ``_do_display_yomikata_in_headword`` returns a false value.
+        """
+        show_reading = self._do_display_yomikata_in_headword(entry)
+        reading = entry.yomikata if show_reading else ""
+        headword_line = soup.new_tag("span")
+        headword_line.string = f"{reading}【{entry.expression}】"
+        soup.body.insert(0, headword_line)
+
+    def _do_display_yomikata_in_headword(self, entry):
+        """Return True when the reading should appear in the headword line.
+
+        The reading must consist solely of characters in the range
+        ぁ-ヿ or "、", and be at most 10 characters long.
+        """
+        reading = entry.yomikata
+        is_kana_only = re.match(r"^[ぁ-ヿ、]+$", reading) is not None
+        return is_kana_only and len(reading) <= 10
-def __insert_headword_line(soup, entry):
- headword_line = soup.new_tag("span")
- if __do_display_yomikata_in_headword(entry):
- headword_line.string = f"{entry.yomikata}【{entry.expression}】"
- else:
- headword_line.string = f"【{entry.expression}】"
- soup.body.insert(0, headword_line)
+class JitenonKokugoGlossary(JitenonGlossary):
+    """Glossary builder for kokugo.jitenon.jp entries."""
+
+    def __init__(self):
+        super().__init__()
+        self._expression_header = "言葉"
+        self._id_pattern = r"kokugo.jitenon.jp/word/p([0-9]+)$"
+
+    def make_glossary(self, entry, image_dir):
+        """Return a one-element list holding the gloss built from the entry.
+
+        Args:
+            entry: object providing ``get_page_soup()`` plus the attributes
+                read by the individual rewriting steps.
+            image_dir: directory in which number-icon files are created.
+        """
+        soup = entry.get_page_soup()
+        self._remove_antonym_list_item(soup)
+        self._replace_number_icons(soup, image_dir)
+        self._replace_punctuation(soup)
+        self._add_internal_links(soup, entry)
+        self._convert_paragraphs(soup)
+        self._style_table_headers(soup)
+        self._unwrap_table_body(soup)
+        self._decompose_table_rows(soup, entry)
+        self._insert_headword_line(soup, entry)
+        gloss = make_gloss(soup.body)
+        glossary = [gloss]
+        return glossary
+
+    def _remove_antonym_list_item(self, soup):
+        """Remove every ``<li>`` whose text is exactly "対義語辞典"."""
+        for el in soup.find_all("li"):
+            if el.text == "対義語辞典":
+                el.decompose()
+
+    def _replace_number_icons(self, soup, image_dir):
+        """Turn each ``<img>`` into a ``<span>`` wrapping a generated icon.
+
+        The first run of digits in the image's ``alt`` text names the icon
+        file (``<digits>-fill.svg``) created under ``image_dir``.
+        """
+        for el in soup.find_all("img"):
+            alt = el.attrs["alt"]
+            text = re.search(r"[0-9]+", alt).group(0)
+            filename = f"{text}-fill.svg"
+            path = os.path.join(image_dir, filename)
+            Icons.make_monochrome_fill_rectangle(path, text)
+            ratio = Icons.calculate_ratio(path)
+            # The template must contain an <img/> element: parsing an empty
+            # document yields no tag, and ``.img`` would then be None.
+            img = BeautifulSoup("<img/>", "xml").img
+            img.attrs = {
+                "height": 1.0 if ratio > 1.0 else ratio,
+                "width": ratio if ratio > 1.0 else 1.0,
+                "sizeUnits": "em",
+                "collapsible": False,
+                "collapsed": False,
+                "background": False,
+                "appearance": "monochrome",
+                "title": alt,
+                "path": f"{os.path.basename(image_dir)}/{filename}",
+            }
+            # The original element is kept as a wrapper around the new tag.
+            el.name = "span"
+            el.append(img)
+            el.attrs["style"] = "margin-right: 0.25em;"
+
+    def _do_display_yomikata_in_headword(self, entry):
+        """Return True when the reading is at most 10 characters long."""
+        return len(entry.yomikata) <= 10
-def __do_display_yomikata_in_headword(entry):
- if not re.match(r"^[ぁ-ヿ、]+$", entry.yomikata):
- return False
- elif len(entry.yomikata) > 10:
- return False
- else:
- return True
+class JitenonYojiGlossary(JitenonGlossary):
+    """Glossary builder for yoji.jitenon.jp entries."""
+
+    def __init__(self):
+        super().__init__()
+        self._id_pattern = r"yoji.jitenon.jp/yoji.?/([0-9]+)\.html$"
+        self._expression_header = "四字熟語"
+
+    def make_glossary(self, entry, image_dir):
+        """Return a one-element list holding the gloss built from the entry.
+
+        ``image_dir`` is accepted but not used by this implementation.
+        """
+        page = entry.get_page_soup()
+        # Steps that need only the soup.
+        self._replace_punctuation(page)
+        self._add_internal_links(page, entry)
+        self._convert_paragraphs(page)
+        self._style_table_headers(page)
+        self._unwrap_table_body(page)
+        # Steps that also read fields of the entry.
+        self._decompose_table_rows(page, entry)
+        self._insert_headword_line(page, entry)
+        return [make_gloss(page.body)]
+
+
+class JitenonKotowazaGlossary(JitenonGlossary):
+    """Glossary builder for kotowaza.jitenon.jp entries."""
+
+    def __init__(self):
+        super().__init__()
+        self._id_pattern = r"kotowaza.jitenon.jp/kotowaza/([0-9]+)\.php$"
+        self._expression_header = "言葉"
+
+    def make_glossary(self, entry, image_dir):
+        """Return a one-element list holding the gloss built from the entry.
+
+        ``image_dir`` is accepted but not used by this implementation.
+        """
+        page = entry.get_page_soup()
+        # Steps that need only the soup.
+        self._replace_punctuation(page)
+        self._add_internal_links(page, entry)
+        self._convert_paragraphs(page)
+        self._style_table_headers(page)
+        self._unwrap_table_body(page)
+        # Steps that also read fields of the entry.
+        self._decompose_table_rows(page, entry)
+        self._insert_headword_line(page, entry)
+        return [make_gloss(page.body)]
diff --git a/bot/yomichan/terms/jitenon.py b/bot/yomichan/terms/jitenon.py
index 75a3a5f..45f4d5b 100644
--- a/bot/yomichan/terms/jitenon.py
+++ b/bot/yomichan/terms/jitenon.py
@@ -1,6 +1,9 @@
from bot.yomichan.grammar import sudachi_rules
from bot.yomichan.terms.terminator import Terminator
-from bot.yomichan.glossary.jitenon import make_glossary
+
+from bot.yomichan.glossary.jitenon import JitenonKokugoGlossary
+from bot.yomichan.glossary.jitenon import JitenonYojiGlossary
+from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
class JitenonTerminator(Terminator):
@@ -13,7 +16,7 @@ class JitenonTerminator(Terminator):
def _glossary(self, entry):
if entry.entry_id in self._glossary_cache:
return self._glossary_cache[entry.entry_id]
- glossary = make_glossary(entry)
+ glossary = self._glossary_maker.make_glossary(entry, self._image_dir)
self._glossary_cache[entry.entry_id] = glossary
return glossary
@@ -27,9 +30,22 @@ class JitenonTerminator(Terminator):
return []
+class JitenonKokugoTerminator(JitenonTerminator):
+    """Terminator whose glossaries come from ``JitenonKokugoGlossary``."""
+
+    def __init__(self, name):
+        super().__init__(name)
+        self._glossary_maker = JitenonKokugoGlossary()
+
+    def _inflection_rules(self, entry, expression):
+        """Return ``sudachi_rules(expression)``; ``entry`` is not consulted."""
+        return sudachi_rules(expression)
+
+    def _term_tags(self, entry):
+        """Return an empty string; no term tags are assigned here."""
+        return ""
+
+
class JitenonYojiTerminator(JitenonTerminator):
def __init__(self, name):
super().__init__(name)
+ self._glossary_maker = JitenonYojiGlossary()
def _inflection_rules(self, entry, expression):
return ""
@@ -42,6 +58,7 @@ class JitenonYojiTerminator(JitenonTerminator):
class JitenonKotowazaTerminator(JitenonTerminator):
def __init__(self, name):
super().__init__(name)
+ self._glossary_maker = JitenonKotowazaGlossary()
def _inflection_rules(self, entry, expression):
return sudachi_rules(expression)
diff --git a/data/yomichan_metadata.json b/data/yomichan_metadata.json
index c892015..3fa8cb4 100644
--- a/data/yomichan_metadata.json
+++ b/data/yomichan_metadata.json
@@ -1,4 +1,13 @@
{
+ "jitenon-kokugo": {
+ "index": {
+ "title": "国語辞典オンライン",
+ "sequenced": true,
+ "format": 3,
+ "url": "https://kokugo.jitenon.jp/"
+ },
+ "tags": []
+ },
"jitenon-yoji": {
"index": {
"title": "四字熟語辞典オンライン",
diff --git a/jitenbot.py b/jitenbot.py
index acd73a4..be42f5b 100644
--- a/jitenbot.py
+++ b/jitenbot.py
@@ -18,6 +18,7 @@ along with this program. If not, see .
import os
import argparse
+from bot.crawlers import JitenonKokugoCrawler
from bot.crawlers import JitenonYojiCrawler
from bot.crawlers import JitenonKotowazaCrawler
from bot.crawlers import Smk8Crawler
@@ -59,6 +60,7 @@ def parse_args(targets):
def main():
crawlers = {
+ "jitenon-kokugo": JitenonKokugoCrawler,
"jitenon-yoji": JitenonYojiCrawler,
"jitenon-kotowaza": JitenonKotowazaCrawler,
"smk8": Smk8Crawler,