Add support for jitenon-kokugo

This commit is contained in:
stephenmk 2023-05-05 22:53:17 -05:00
parent fa86377404
commit 7ad72a6e4f
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
7 changed files with 286 additions and 87 deletions

View file

@ -4,11 +4,13 @@ from bs4 import BeautifulSoup
import bot.scraper as Scraper import bot.scraper as Scraper
from bot.entries.jitenon import JitenonKokugoEntry
from bot.entries.jitenon import JitenonKotowazaEntry from bot.entries.jitenon import JitenonKotowazaEntry
from bot.entries.jitenon import JitenonYojiEntry from bot.entries.jitenon import JitenonYojiEntry
from bot.entries.smk8 import Smk8Entry from bot.entries.smk8 import Smk8Entry
from bot.entries.daijirin2 import Daijirin2Entry from bot.entries.daijirin2 import Daijirin2Entry
from bot.yomichan.export import JitenonKokugoExporter
from bot.yomichan.export import JitenonKotowazaExporter from bot.yomichan.export import JitenonKotowazaExporter
from bot.yomichan.export import JitenonYojiExporter from bot.yomichan.export import JitenonYojiExporter
from bot.yomichan.export import Smk8Exporter from bot.yomichan.export import Smk8Exporter
@ -48,6 +50,41 @@ class _Crawler():
return page_id return page_id
class JitenonKokugoCrawler(_Crawler):
    """Crawler for 国語辞典オンライン (kokugo.jitenon.jp).

    Starts from the gojuon index page, follows each kana listing page by
    page, scrapes every linked word page and stores the path returned by
    the scraper in ``self._page_map`` under the parsed page ID.
    """

    def __init__(self, args):
        super().__init__(args)
        self._entry_class = JitenonKokugoEntry
        self._yomi_exporter = JitenonKokugoExporter(args.target)
        self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
        self._page_id_pattern = r"word/p([0-9]+)$"

    def collect_pages(self):
        """Scrape all word pages reachable from the gojuon index."""
        jitenon = Scraper.Jitenon()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        # select() does not filter on an ``href=True`` keyword the way
        # find_all() does; the CSS attribute selector is what actually
        # guarantees that every matched anchor carries an href.
        for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
            self._collect_kana_pages(jitenon, gojuon_a['href'])
        pages_len = len(self._page_map)
        print(f"Finished scraping {pages_len} pages")

    def _collect_kana_pages(self, jitenon, gojuon_href):
        """Scrape every listing page of one kana and the words it links to."""
        max_kana_page = 1
        current_kana_page = 1
        while current_kana_page <= max_kana_page:
            kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}")
            current_kana_page += 1
            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
            page_total = kana_soup.find(class_="page_total").text
            # NOTE(review): the captured number is used as the last page
            # index; confirm that this text really reports a page count.
            m = re.search(r"全([0-9]+)件", page_total)
            if m:
                max_kana_page = int(m.group(1))
            for kana_a in kana_soup.select(".word_box a[href]"):
                page_link = kana_a['href']
                page_id = self._parse_page_id(page_link)
                if page_id is None:
                    # Not a word page link; nothing to scrape.
                    continue
                _, page_path = jitenon.scrape(page_link)
                self._page_map[page_id] = page_path
class _JitenonCrawler(_Crawler): class _JitenonCrawler(_Crawler):
def __init__(self, args): def __init__(self, args):
super().__init__(args) super().__init__(args)

View file

@ -7,8 +7,14 @@ import bot.expressions as Expressions
class _JitenonEntry(Entry): class _JitenonEntry(Entry):
ID_TO_ENTRY = {}
def __init__(self, entry_id): def __init__(self, entry_id):
super().__init__(entry_id) super().__init__(entry_id)
if entry_id not in self.ID_TO_ENTRY:
self.ID_TO_ENTRY[entry_id] = self
else:
raise Exception(f"Duplicate entry ID: {entry_id}")
self.modified_date = date(1970, 1, 1) self.modified_date = date(1970, 1, 1)
self.attribution = "" self.attribution = ""
for column in self._COLUMNS.values(): for column in self._COLUMNS.values():
@ -44,9 +50,9 @@ class _JitenonEntry(Entry):
def _set_headwords(self): def _set_headwords(self):
headwords = {} headwords = {}
for yomikata in self.__yomikatas(): for yomikata in self._yomikatas():
headwords[yomikata] = [self.expression] headwords[yomikata] = [self.expression]
ikei_headwords = self.__ikei_headwords() ikei_headwords = self._ikei_headwords()
for reading, expressions in ikei_headwords.items(): for reading, expressions in ikei_headwords.items():
if reading not in headwords: if reading not in headwords:
headwords[reading] = [] headwords[reading] = []
@ -73,7 +79,7 @@ class _JitenonEntry(Entry):
else: else:
attr_value.append(colval) attr_value.append(colval)
def __yomikatas(self): def _yomikatas(self):
yomikata = self.yomikata yomikata = self.yomikata
m = re.search(r"^[ぁ-ヿ、]+$", yomikata) m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
if m: if m:
@ -94,7 +100,7 @@ class _JitenonEntry(Entry):
print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n") print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
return [""] return [""]
def __ikei_headwords(self): def _ikei_headwords(self):
ikei_headwords = {} ikei_headwords = {}
for val in self.ikei: for val in self.ikei:
m = re.search(r"^([^]+)([ぁ-ヿ、]+)$", val) m = re.search(r"^([^]+)([ぁ-ヿ、]+)$", val)
@ -174,3 +180,39 @@ class JitenonKotowazaEntry(_JitenonEntry):
for expressions in self._headwords.values(): for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._variant_kanji) Expressions.add_variant_kanji(expressions, self._variant_kanji)
Expressions.add_fullwidth(expressions) Expressions.add_fullwidth(expressions)
class JitenonKokugoEntry(_JitenonEntry):
    """Entry for a word page of the jitenon-kokugo dictionary."""

    # Column label -> [attribute name, initial value].
    # Presumably consumed by the base class, which iterates
    # ``_COLUMNS.values()`` in its constructor -- verify there.
    _COLUMNS = {
        "言葉": ["expression", ""],
        "読み方": ["yomikata", ""],
        "意味": ["imi", ""],
        "例文": ["reibun", ""],
        "別表記": ["betsuhyouki", ""],
        "対義語": ["taigigo", ""],
        "活用": ["katsuyou", ""],
        "用例": ["yourei", ""],
        "類語": ["ruigo", ""],
    }

    # Delimiter between alternatives inside a single field.
    # NOTE(review): the delimiter literal was empty in the reviewed text and
    # str.split("") raises ValueError("empty separator"). The middle dot is
    # assumed here -- confirm against the scraped data.
    _SEPARATOR = "・"

    def __init__(self, entry_id):
        super().__init__(entry_id)

    def _set_headwords(self):
        """Build ``self._headwords``: reading -> list of expressions.

        Every reading receives all expressions from ``expression`` followed
        by those from ``betsuhyouki`` when that field is not blank.
        """
        expressions = self.expression.split(self._SEPARATOR)
        if self.betsuhyouki.strip() != "":
            expressions += self.betsuhyouki.split(self._SEPARATOR)
        headwords = {}
        for reading in self.yomikata.split(self._SEPARATOR):
            headwords.setdefault(reading, []).extend(expressions)
        self._headwords = headwords

    def _set_variant_headwords(self):
        """Run the expression-variant helpers over each headword list."""
        for expressions in self._headwords.values():
            Expressions.add_variant_kanji(expressions, self._variant_kanji)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

View file

@ -7,6 +7,7 @@ from platformdirs import user_documents_dir, user_cache_dir
from bot.data import load_yomichan_metadata from bot.data import load_yomichan_metadata
from bot.yomichan.terms.jitenon import JitenonKokugoTerminator
from bot.yomichan.terms.jitenon import JitenonYojiTerminator from bot.yomichan.terms.jitenon import JitenonYojiTerminator
from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
from bot.yomichan.terms.smk8 import Smk8Terminator from bot.yomichan.terms.smk8 import Smk8Terminator
@ -20,8 +21,7 @@ class Exporter:
self._terms_per_file = 2000 self._terms_per_file = 2000
def export(self, entries, image_dir): def export(self, entries, image_dir):
if image_dir is not None: self.__init_build_image_dir(image_dir)
self.__init_build_image_dir(image_dir)
meta = load_yomichan_metadata() meta = load_yomichan_metadata()
index = meta[self._name]["index"] index = meta[self._name]["index"]
index["revision"] = self._get_revision(entries) index["revision"] = self._get_revision(entries)
@ -42,10 +42,13 @@ class Exporter:
return self._build_dir return self._build_dir
def __init_build_image_dir(self, image_dir): def __init_build_image_dir(self, image_dir):
print("Copying image files to build directory...")
build_dir = self._get_build_dir() build_dir = self._get_build_dir()
build_img_dir = os.path.join(build_dir, self._name) build_img_dir = os.path.join(build_dir, self._name)
shutil.copytree(image_dir, build_img_dir) if image_dir is not None:
print("Copying image files to build directory...")
shutil.copytree(image_dir, build_img_dir)
else:
os.makedirs(build_img_dir)
self._terminator.set_image_dir(build_img_dir) self._terminator.set_image_dir(build_img_dir)
def __get_terms(self, entries): def __get_terms(self, entries):
@ -131,6 +134,12 @@ class JitenonExporter(Exporter):
return attribution return attribution
class JitenonKokugoExporter(JitenonExporter):
    """Exporter for jitenon-kokugo that uses JitenonKokugoTerminator."""

    def __init__(self, name):
        super().__init__(name)
        terminator = JitenonKokugoTerminator(name)
        self._terminator = terminator
class JitenonYojiExporter(JitenonExporter): class JitenonYojiExporter(JitenonExporter):
def __init__(self, name): def __init__(self, name):
super().__init__(name) super().__init__(name)

View file

@ -1,93 +1,176 @@
import re import re
import os
from bs4 import BeautifulSoup
import bot.icons as Icons
from bot.yomichan.glossary.gloss import make_gloss from bot.yomichan.glossary.gloss import make_gloss
def make_glossary(entry): class JitenonGlossary():
soup = entry.get_page_soup() def __init__(self):
__replace_punctuation(soup) self._id_pattern = None
__add_internal_links(soup) self._expression_header = None
__convert_paragraphs(soup)
__style_table_headers(soup)
__unwrap_table_body(soup)
__decompose_table_rows(soup, entry)
__insert_headword_line(soup, entry)
gloss = make_gloss(soup.body)
glossary = [gloss]
return glossary
def _replace_punctuation(self, soup):
    """Rewrite every text node of ``soup`` through the replacement table.

    Each string found by ``find_all(string=True)`` is replaced in place by
    its text with all listed substitutions applied.
    """
    replacements = (
        ("/", ""),
        (",", ""),
    )
    for node in soup.find_all(string=True):
        updated = node.text
        for old, new in replacements:
            updated = updated.replace(old, new)
        node.replace_with(updated)
def __replace_punctuation(soup): def _add_internal_links(self, soup, entry):
punctuation = { for el in soup.find_all("a"):
"/": "", href = el.attrs["href"]
",": "", m = re.search(self._id_pattern, href)
} if m is not None:
for el in soup.find_all(string=True): ref_entry_id = int(m.group(1))
text = el.text ref_entry = entry.ID_TO_ENTRY[ref_entry_id]
for old, new in punctuation.items(): expression = ref_entry.get_first_expression()
text = text.replace(old, new) el.attrs["href"] = f"?query={expression}&wildcards=off"
el.replace_with(text) elif re.match(r"^(?:https?:|\?)[\w\W]*", href):
pass
else:
raise Exception(f"Invalid href format: {href}")
def _convert_paragraphs(self, soup):
    """Rename every <p> element in ``soup`` to <div>."""
    for paragraph in soup.find_all("p"):
        paragraph.name = "div"
def __add_internal_links(soup): def _style_table_headers(self, soup):
patterns = [ for th in soup.find_all("th"):
r"^(.+)[ぁ-ヿ、\s]+$", th['style'] = "vertical-align: middle; text-align: center;"
r"^(.+)[ぁ-ヿ、\s]+[ぁ-ヿ、\s][ぁ-ヿ、\s]+$"
]
for a in soup.find_all("a"):
for pattern in patterns:
m = re.search(pattern, a.text)
if m:
a['href'] = f"?query={m.group(1)}&wildcards=off"
break
def _unwrap_table_body(self, soup):
    """Unwrap the first <tbody> in ``soup``, if there is one."""
    tbody = soup.find("tbody")
    if tbody is None:
        return
    tbody.unwrap()
def __convert_paragraphs(soup): def _decompose_table_rows(self, soup, entry):
for p in soup.find_all("p"): for tr in soup.find_all("tr"):
p.name = "span" if tr.find("th") is None:
continue
elif tr.th.text == self._expression_header:
def __style_table_headers(soup):
for th in soup.find_all("th"):
th['style'] = "vertical-align: middle; text-align: center;"
def __unwrap_table_body(soup):
if soup.find("tbody") is not None:
soup.tbody.unwrap()
def __decompose_table_rows(soup, entry):
for tr in soup.find_all("tr"):
if tr.find("th") is None:
continue
elif tr.th.text in ["四字熟語", "言葉"]:
tr.decompose()
elif tr.th.text == "読み方":
if __do_display_yomikata_in_headword(entry):
tr.decompose() tr.decompose()
elif tr.th.text == "意味": elif tr.th.text == "読み方":
imi = tr.td if self._do_display_yomikata_in_headword(entry):
imi.name = "div" tr.decompose()
soup.body.insert(0, imi) elif tr.th.text == "意味":
tr.decompose() imi = tr.td
if soup.find("tr") is None: imi.name = "div"
soup.table.decompose() soup.body.insert(0, imi)
tr.decompose()
if soup.find("tr") is None:
soup.table.decompose()
def _insert_headword_line(self, soup, entry):
    """Insert a <span> with the headword text as the first child of body.

    The reading is prepended to the expression only when
    ``_do_display_yomikata_in_headword`` approves it for this entry.
    """
    if self._do_display_yomikata_in_headword(entry):
        text = f"{entry.yomikata}{entry.expression}"
    else:
        text = f"{entry.expression}"
    line = soup.new_tag("span")
    line.string = text
    soup.body.insert(0, line)
def _do_display_yomikata_in_headword(self, entry):
if not re.match(r"^[ぁ-ヿ、]+$", entry.yomikata):
return False
elif len(entry.yomikata) > 10:
return False
else:
return True
def __insert_headword_line(soup, entry): class JitenonKokugoGlossary(JitenonGlossary):
headword_line = soup.new_tag("span") def __init__(self):
if __do_display_yomikata_in_headword(entry): super().__init__()
headword_line.string = f"{entry.yomikata}{entry.expression}" self._expression_header = "言葉"
else: self._id_pattern = r"kokugo.jitenon.jp/word/p([0-9]+)$"
headword_line.string = f"{entry.expression}"
soup.body.insert(0, headword_line) def make_glossary(self, entry, image_dir):
soup = entry.get_page_soup()
self._remove_antonym_list_item(soup)
self._replace_number_icons(soup, image_dir)
self._replace_punctuation(soup)
self._add_internal_links(soup, entry)
self._convert_paragraphs(soup)
self._style_table_headers(soup)
self._unwrap_table_body(soup)
self._decompose_table_rows(soup, entry)
self._insert_headword_line(soup, entry)
gloss = make_gloss(soup.body)
glossary = [gloss]
return glossary
def _remove_antonym_list_item(self, soup):
    """Delete every <li> whose text is exactly "対義語辞典"."""
    for item in soup.find_all("li"):
        if item.text != "対義語辞典":
            continue
        item.decompose()
def _replace_number_icons(self, soup, image_dir):
    """Turn each <img> in ``soup`` into a <span> wrapping a generated icon.

    For every image, the label matched in its ``alt`` text names an SVG
    file under ``image_dir``; the Icons helpers are called with that path
    (presumably to create the file and measure it -- see bot.icons), and a
    new <img> carrying the image attributes is appended to the renamed
    element.

    Raises KeyError when an image has no ``alt`` attribute and
    AttributeError when the ``alt`` text does not match the label pattern.
    """
    for icon in soup.find_all("img"):
        alt = icon.attrs["alt"]
        label = re.search(r"[-]+", alt).group(0)
        filename = f"{label}-fill.svg"
        path = os.path.join(image_dir, filename)
        Icons.make_monochrome_fill_rectangle(path, label)
        ratio = Icons.calculate_ratio(path)
        # The side that is not scaled by the ratio stays fixed at 1.0.
        if ratio > 1.0:
            height, width = 1.0, ratio
        else:
            height, width = ratio, 1.0
        img = BeautifulSoup("<img/>", "xml").img
        img.attrs = {
            "height": height,
            "width": width,
            "sizeUnits": "em",
            "collapsible": False,
            "collapsed": False,
            "background": False,
            "appearance": "monochrome",
            "title": alt,
            "path": f"{os.path.basename(image_dir)}/{filename}",
        }
        icon.attrs["style"] = "margin-right: 0.25em;"
        icon.name = "span"
        icon.append(img)
def _do_display_yomikata_in_headword(self, entry):
return len(entry.yomikata) <= 10
def __do_display_yomikata_in_headword(entry): class JitenonYojiGlossary(JitenonGlossary):
if not re.match(r"^[ぁ-ヿ、]+$", entry.yomikata): def __init__(self):
return False super().__init__()
elif len(entry.yomikata) > 10: self._expression_header = "四字熟語"
return False self._id_pattern = r"yoji.jitenon.jp/yoji.?/([0-9]+)\.html$"
else:
return True def make_glossary(self, entry, image_dir):
soup = entry.get_page_soup()
self._replace_punctuation(soup)
self._add_internal_links(soup, entry)
self._convert_paragraphs(soup)
self._style_table_headers(soup)
self._unwrap_table_body(soup)
self._decompose_table_rows(soup, entry)
self._insert_headword_line(soup, entry)
gloss = make_gloss(soup.body)
glossary = [gloss]
return glossary
class JitenonKotowazaGlossary(JitenonGlossary):
    """Glossary builder configured for kotowaza.jitenon.jp pages."""

    def __init__(self):
        super().__init__()
        self._id_pattern = r"kotowaza.jitenon.jp/kotowaza/([0-9]+)\.php$"
        self._expression_header = "言葉"

    def make_glossary(self, entry, image_dir):
        """Return a one-element glossary list built from the entry's page.

        ``image_dir`` is accepted but not used by this method.
        """
        page = entry.get_page_soup()
        self._replace_punctuation(page)
        self._add_internal_links(page, entry)
        self._convert_paragraphs(page)
        self._style_table_headers(page)
        self._unwrap_table_body(page)
        self._decompose_table_rows(page, entry)
        self._insert_headword_line(page, entry)
        return [make_gloss(page.body)]

View file

@ -1,6 +1,9 @@
from bot.yomichan.grammar import sudachi_rules from bot.yomichan.grammar import sudachi_rules
from bot.yomichan.terms.terminator import Terminator from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.jitenon import make_glossary
from bot.yomichan.glossary.jitenon import JitenonKokugoGlossary
from bot.yomichan.glossary.jitenon import JitenonYojiGlossary
from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
class JitenonTerminator(Terminator): class JitenonTerminator(Terminator):
@ -13,7 +16,7 @@ class JitenonTerminator(Terminator):
def _glossary(self, entry): def _glossary(self, entry):
if entry.entry_id in self._glossary_cache: if entry.entry_id in self._glossary_cache:
return self._glossary_cache[entry.entry_id] return self._glossary_cache[entry.entry_id]
glossary = make_glossary(entry) glossary = self._glossary_maker.make_glossary(entry, self._image_dir)
self._glossary_cache[entry.entry_id] = glossary self._glossary_cache[entry.entry_id] = glossary
return glossary return glossary
@ -27,9 +30,22 @@ class JitenonTerminator(Terminator):
return [] return []
class JitenonKokugoTerminator(JitenonTerminator):
    """Terminator for jitenon-kokugo using JitenonKokugoGlossary."""

    def __init__(self, name):
        super().__init__(name)
        glossary_maker = JitenonKokugoGlossary()
        self._glossary_maker = glossary_maker

    def _inflection_rules(self, entry, expression):
        """Return the result of ``sudachi_rules`` for ``expression``."""
        rules = sudachi_rules(expression)
        return rules

    def _term_tags(self, entry):
        """Return an empty string for every entry."""
        return ""
class JitenonYojiTerminator(JitenonTerminator): class JitenonYojiTerminator(JitenonTerminator):
def __init__(self, name): def __init__(self, name):
super().__init__(name) super().__init__(name)
self._glossary_maker = JitenonYojiGlossary()
def _inflection_rules(self, entry, expression): def _inflection_rules(self, entry, expression):
return "" return ""
@ -42,6 +58,7 @@ class JitenonYojiTerminator(JitenonTerminator):
class JitenonKotowazaTerminator(JitenonTerminator): class JitenonKotowazaTerminator(JitenonTerminator):
def __init__(self, name): def __init__(self, name):
super().__init__(name) super().__init__(name)
self._glossary_maker = JitenonKotowazaGlossary()
def _inflection_rules(self, entry, expression): def _inflection_rules(self, entry, expression):
return sudachi_rules(expression) return sudachi_rules(expression)

View file

@ -1,4 +1,13 @@
{ {
"jitenon-kokugo": {
"index": {
"title": "国語辞典オンライン",
"sequenced": true,
"format": 3,
"url": "https://kokugo.jitenon.jp/"
},
"tags": []
},
"jitenon-yoji": { "jitenon-yoji": {
"index": { "index": {
"title": "四字熟語辞典オンライン", "title": "四字熟語辞典オンライン",

View file

@ -18,6 +18,7 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
import os import os
import argparse import argparse
from bot.crawlers import JitenonKokugoCrawler
from bot.crawlers import JitenonYojiCrawler from bot.crawlers import JitenonYojiCrawler
from bot.crawlers import JitenonKotowazaCrawler from bot.crawlers import JitenonKotowazaCrawler
from bot.crawlers import Smk8Crawler from bot.crawlers import Smk8Crawler
@ -59,6 +60,7 @@ def parse_args(targets):
def main(): def main():
crawlers = { crawlers = {
"jitenon-kokugo": JitenonKokugoCrawler,
"jitenon-yoji": JitenonYojiCrawler, "jitenon-yoji": JitenonYojiCrawler,
"jitenon-kotowaza": JitenonKotowazaCrawler, "jitenon-kotowaza": JitenonKotowazaCrawler,
"smk8": Smk8Crawler, "smk8": Smk8Crawler,