Add support for jitenon-kokugo

This commit is contained in:
stephenmk 2023-05-05 22:53:17 -05:00
parent fa86377404
commit 7ad72a6e4f
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
7 changed files with 286 additions and 87 deletions

View file

@ -4,11 +4,13 @@ from bs4 import BeautifulSoup
import bot.scraper as Scraper
from bot.entries.jitenon import JitenonKokugoEntry
from bot.entries.jitenon import JitenonKotowazaEntry
from bot.entries.jitenon import JitenonYojiEntry
from bot.entries.smk8 import Smk8Entry
from bot.entries.daijirin2 import Daijirin2Entry
from bot.yomichan.export import JitenonKokugoExporter
from bot.yomichan.export import JitenonKotowazaExporter
from bot.yomichan.export import JitenonYojiExporter
from bot.yomichan.export import Smk8Exporter
@ -48,6 +50,41 @@ class _Crawler():
return page_id
class JitenonKokugoCrawler(_Crawler):
    """Crawler for the kokugo.jitenon.jp site.

    Starts from the gojuon index page, pages through every kana listing
    linked from it, and scrapes each word page whose link yields a page
    ID, recording the scraped path in ``self._page_map``.
    """

    def __init__(self, args):
        """Set the entry class, exporter, index URL and page-ID pattern.

        ``args.target`` is passed to ``JitenonKokugoExporter``.
        """
        super().__init__(args)
        self._entry_class = JitenonKokugoEntry
        self._yomi_exporter = JitenonKokugoExporter(args.target)
        self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
        self._page_id_pattern = r"word/p([0-9]+)$"

    def collect_pages(self):
        """Scrape all word pages reachable from the gojuon index."""
        jitenon = Scraper.Jitenon()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        # The attribute test lives in the selector: `select()` takes a CSS
        # selector, and a `href=True` keyword (a `find_all` filter) does not
        # restrict the match, so anchors without href would raise KeyError.
        for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
            self._collect_kana_pages(jitenon, gojuon_a["href"])
        pages_len = len(self._page_map)
        print(f"Finished scraping {pages_len} pages")

    def _collect_kana_pages(self, jitenon, gojuon_href):
        """Scrape every listing page of one kana index and its word pages."""
        current_kana_page = 1
        max_kana_page = 1
        while current_kana_page <= max_kana_page:
            kana_doc, _ = jitenon.scrape(
                f"{gojuon_href}&page={current_kana_page}")
            current_kana_page += 1
            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
            # A listing without a "page_total" element is treated as having
            # no further pages instead of failing on `None.text`.
            page_total = kana_soup.find(class_="page_total")
            if page_total is not None:
                # NOTE(review): the captured number is used as the last page
                # to request; confirm that the "全N件" text really carries a
                # page count rather than an item count.
                m = re.search(r"全([0-9]+)件", page_total.text)
                if m:
                    max_kana_page = int(m.group(1))
            for kana_a in kana_soup.select(".word_box a[href]"):
                page_link = kana_a["href"]
                page_id = self._parse_page_id(page_link)
                if page_id is None:
                    # Link does not point at a word page.
                    continue
                _, page_path = jitenon.scrape(page_link)
                self._page_map[page_id] = page_path
class _JitenonCrawler(_Crawler):
def __init__(self, args):
super().__init__(args)

View file

@ -7,8 +7,14 @@ import bot.expressions as Expressions
class _JitenonEntry(Entry):
ID_TO_ENTRY = {}
def __init__(self, entry_id):
super().__init__(entry_id)
if entry_id not in self.ID_TO_ENTRY:
self.ID_TO_ENTRY[entry_id] = self
else:
raise Exception(f"Duplicate entry ID: {entry_id}")
self.modified_date = date(1970, 1, 1)
self.attribution = ""
for column in self._COLUMNS.values():
@ -44,9 +50,9 @@ class _JitenonEntry(Entry):
def _set_headwords(self):
headwords = {}
for yomikata in self.__yomikatas():
for yomikata in self._yomikatas():
headwords[yomikata] = [self.expression]
ikei_headwords = self.__ikei_headwords()
ikei_headwords = self._ikei_headwords()
for reading, expressions in ikei_headwords.items():
if reading not in headwords:
headwords[reading] = []
@ -73,7 +79,7 @@ class _JitenonEntry(Entry):
else:
attr_value.append(colval)
def __yomikatas(self):
def _yomikatas(self):
yomikata = self.yomikata
m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
if m:
@ -94,7 +100,7 @@ class _JitenonEntry(Entry):
print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
return [""]
def __ikei_headwords(self):
def _ikei_headwords(self):
ikei_headwords = {}
for val in self.ikei:
m = re.search(r"^([^]+)([ぁ-ヿ、]+)$", val)
@ -174,3 +180,39 @@ class JitenonKotowazaEntry(_JitenonEntry):
for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._variant_kanji)
Expressions.add_fullwidth(expressions)
class JitenonKokugoEntry(_JitenonEntry):
    """Jitenon entry whose page table uses the columns listed in ``_COLUMNS``."""

    # Table header text -> [attribute name, initial value]; consumed by
    # _JitenonEntry (presumably to create the attributes read below --
    # confirm against the base class).
    _COLUMNS = {
        "言葉": ["expression", ""],
        "読み方": ["yomikata", ""],
        "意味": ["imi", ""],
        "例文": ["reibun", ""],
        "別表記": ["betsuhyouki", ""],
        "対義語": ["taigigo", ""],
        "活用": ["katsuyou", ""],
        "用例": ["yourei", ""],
        "類語": ["ruigo", ""],
    }

    # Separator between alternatives inside a single table cell.
    # NOTE(review): the separator literal was an empty string, which makes
    # str.split raise "ValueError: empty separator" on every call.  It is
    # restored here as the katakana middle dot; confirm the exact character
    # against the scraped site data.
    _DELIMITER = "・"

    def __init__(self, entry_id):
        """Create the entry; all setup is done by ``_JitenonEntry``."""
        super().__init__(entry_id)

    def _set_headwords(self):
        """Build ``self._headwords`` as a reading -> list of expressions map.

        Every reading in ``yomikata`` receives every expression from
        ``expression`` followed, when ``betsuhyouki`` is not blank, by every
        alternative spelling listed there.
        """
        # The expression list is identical for every reading, so it is
        # computed once rather than inside the loop.
        expressions = self.expression.split(self._DELIMITER)
        if self.betsuhyouki.strip() != "":
            expressions.extend(self.betsuhyouki.split(self._DELIMITER))
        headwords = {}
        for reading in self.yomikata.split(self._DELIMITER):
            # A repeated reading appends again, as the per-reading loop did;
            # each reading owns its own list object.
            headwords.setdefault(reading, []).extend(expressions)
        self._headwords = headwords

    def _set_variant_headwords(self):
        """Expand each reading's expression list with its variant spellings."""
        for expressions in self._headwords.values():
            Expressions.add_variant_kanji(expressions, self._variant_kanji)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

View file

@ -7,6 +7,7 @@ from platformdirs import user_documents_dir, user_cache_dir
from bot.data import load_yomichan_metadata
from bot.yomichan.terms.jitenon import JitenonKokugoTerminator
from bot.yomichan.terms.jitenon import JitenonYojiTerminator
from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
from bot.yomichan.terms.smk8 import Smk8Terminator
@ -20,8 +21,7 @@ class Exporter:
self._terms_per_file = 2000
def export(self, entries, image_dir):
if image_dir is not None:
self.__init_build_image_dir(image_dir)
self.__init_build_image_dir(image_dir)
meta = load_yomichan_metadata()
index = meta[self._name]["index"]
index["revision"] = self._get_revision(entries)
@ -42,10 +42,13 @@ class Exporter:
return self._build_dir
def __init_build_image_dir(self, image_dir):
    """Create the build's image directory and register it with the terminator.

    The directory is ``<build dir>/<self._name>``.  When ``image_dir`` is
    given, its tree is copied there; when it is ``None``, an empty
    directory is created instead.  Either way the resulting path is passed
    to ``self._terminator.set_image_dir``.
    """
    build_dir = self._get_build_dir()
    build_img_dir = os.path.join(build_dir, self._name)
    # The copy must happen only inside the None check: an unconditional
    # copytree ahead of it fails for image_dir=None and, for a real
    # directory, makes the guarded copy fail because the target exists.
    if image_dir is not None:
        print("Copying image files to build directory...")
        shutil.copytree(image_dir, build_img_dir)
    else:
        os.makedirs(build_img_dir)
    self._terminator.set_image_dir(build_img_dir)
def __get_terms(self, entries):
@ -131,6 +134,12 @@ class JitenonExporter(Exporter):
return attribution
class JitenonKokugoExporter(JitenonExporter):
    """Jitenon exporter that builds its terms with ``JitenonKokugoTerminator``."""

    def __init__(self, name):
        """Initialise the base exporter, then attach the terminator for ``name``."""
        super().__init__(name)
        terminator = JitenonKokugoTerminator(name)
        self._terminator = terminator
class JitenonYojiExporter(JitenonExporter):
def __init__(self, name):
super().__init__(name)

View file

@ -1,93 +1,176 @@
import re
import os
from bs4 import BeautifulSoup
import bot.icons as Icons
from bot.yomichan.glossary.gloss import make_gloss
def make_glossary(entry):
soup = entry.get_page_soup()
__replace_punctuation(soup)
__add_internal_links(soup)
__convert_paragraphs(soup)
__style_table_headers(soup)
__unwrap_table_body(soup)
__decompose_table_rows(soup, entry)
__insert_headword_line(soup, entry)
gloss = make_gloss(soup.body)
glossary = [gloss]
return glossary
class JitenonGlossary():
def __init__(self):
self._id_pattern = None
self._expression_header = None
def _replace_punctuation(self, soup):
    """Rewrite "/" and "," in every text node of ``soup``.

    Each string node is replaced by a copy with the characters mapped
    according to the table below.
    """
    # NOTE(review): both replacement values are empty strings, so the
    # characters are simply removed.  If replacement characters were
    # intended they may have been lost from this copy -- confirm.
    table = str.maketrans({
        "/": "",
        ",": "",
    })
    for node in soup.find_all(string=True):
        node.replace_with(node.text.translate(table))
def __replace_punctuation(soup):
punctuation = {
"/": "",
",": "",
}
for el in soup.find_all(string=True):
text = el.text
for old, new in punctuation.items():
text = text.replace(old, new)
el.replace_with(text)
def _add_internal_links(self, soup, entry):
    """Point links to other entries at an in-dictionary query.

    An ``href`` matching ``self._id_pattern`` is replaced by a
    ``?query=...`` link built from the first expression of the entry
    registered under that ID in ``entry.ID_TO_ENTRY``.  An ``href``
    starting with ``http:``, ``https:`` or ``?`` is left unchanged.

    Raises:
        Exception: if an ``href`` matches neither form.
    """
    for anchor in soup.find_all("a"):
        href = anchor.attrs["href"]
        id_match = re.search(self._id_pattern, href)
        if id_match is None:
            # Not an entry link: only external or query links are accepted.
            if re.match(r"^(?:https?:|\?)[\w\W]*", href):
                continue
            raise Exception(f"Invalid href format: {href}")
        target = entry.ID_TO_ENTRY[int(id_match.group(1))]
        expression = target.get_first_expression()
        anchor.attrs["href"] = f"?query={expression}&wildcards=off"
def _convert_paragraphs(self, soup):
    """Rename every ``<p>`` element in ``soup`` to ``<div>``."""
    paragraphs = soup.find_all("p")
    for paragraph in paragraphs:
        paragraph.name = "div"
def __add_internal_links(soup):
patterns = [
r"^(.+)[ぁ-ヿ、\s]+$",
r"^(.+)[ぁ-ヿ、\s]+[ぁ-ヿ、\s][ぁ-ヿ、\s]+$"
]
for a in soup.find_all("a"):
for pattern in patterns:
m = re.search(pattern, a.text)
if m:
a['href'] = f"?query={m.group(1)}&wildcards=off"
break
def _style_table_headers(self, soup):
    """Set a centring inline style on every ``<th>`` element in ``soup``."""
    header_style = "vertical-align: middle; text-align: center;"
    for header_cell in soup.find_all("th"):
        header_cell["style"] = header_style
def _unwrap_table_body(self, soup):
    """Unwrap the first ``<tbody>`` in ``soup``, if there is one."""
    tbody = soup.find("tbody")
    if tbody is None:
        return
    tbody.unwrap()
def __convert_paragraphs(soup):
for p in soup.find_all("p"):
p.name = "span"
def __style_table_headers(soup):
for th in soup.find_all("th"):
th['style'] = "vertical-align: middle; text-align: center;"
def __unwrap_table_body(soup):
if soup.find("tbody") is not None:
soup.tbody.unwrap()
def __decompose_table_rows(soup, entry):
for tr in soup.find_all("tr"):
if tr.find("th") is None:
continue
elif tr.th.text in ["四字熟語", "言葉"]:
tr.decompose()
elif tr.th.text == "読み方":
if __do_display_yomikata_in_headword(entry):
def _decompose_table_rows(self, soup, entry):
for tr in soup.find_all("tr"):
if tr.find("th") is None:
continue
elif tr.th.text == self._expression_header:
tr.decompose()
elif tr.th.text == "意味":
imi = tr.td
imi.name = "div"
soup.body.insert(0, imi)
tr.decompose()
if soup.find("tr") is None:
soup.table.decompose()
elif tr.th.text == "読み方":
if self._do_display_yomikata_in_headword(entry):
tr.decompose()
elif tr.th.text == "意味":
imi = tr.td
imi.name = "div"
soup.body.insert(0, imi)
tr.decompose()
if soup.find("tr") is None:
soup.table.decompose()
def _insert_headword_line(self, soup, entry):
    """Insert a ``<span>`` holding the headword text at the top of the body.

    The text is the reading followed by the expression when
    ``_do_display_yomikata_in_headword`` allows it, otherwise the
    expression alone.
    """
    # NOTE(review): reading and expression are joined with nothing between
    # them; if delimiters were intended they may have been lost from this
    # copy -- confirm.
    line = soup.new_tag("span")
    show_reading = self._do_display_yomikata_in_headword(entry)
    line.string = (
        f"{entry.yomikata}{entry.expression}"
        if show_reading
        else f"{entry.expression}"
    )
    soup.body.insert(0, line)
def _do_display_yomikata_in_headword(self, entry):
if not re.match(r"^[ぁ-ヿ、]+$", entry.yomikata):
return False
elif len(entry.yomikata) > 10:
return False
else:
return True
def __insert_headword_line(soup, entry):
headword_line = soup.new_tag("span")
if __do_display_yomikata_in_headword(entry):
headword_line.string = f"{entry.yomikata}{entry.expression}"
else:
headword_line.string = f"{entry.expression}"
soup.body.insert(0, headword_line)
class JitenonKokugoGlossary(JitenonGlossary):
    """Glossary builder for entries linked under kokugo.jitenon.jp."""

    def __init__(self):
        """Set the expression-row header and the entry-link ID pattern."""
        super().__init__()
        self._expression_header = "言葉"
        self._id_pattern = r"kokugo.jitenon.jp/word/p([0-9]+)$"

    def make_glossary(self, entry, image_dir):
        """Return a one-element glossary list built from the entry's page.

        The page soup is cleaned up step by step and its ``body`` is then
        converted with ``make_gloss``.  ``image_dir`` is where the number
        icons are written.
        """
        soup = entry.get_page_soup()
        # Steps specific to this dictionary.
        self._remove_antonym_list_item(soup)
        self._replace_number_icons(soup, image_dir)
        # Shared clean-up steps.
        self._replace_punctuation(soup)
        self._add_internal_links(soup, entry)
        self._convert_paragraphs(soup)
        self._style_table_headers(soup)
        self._unwrap_table_body(soup)
        self._decompose_table_rows(soup, entry)
        self._insert_headword_line(soup, entry)
        return [make_gloss(soup.body)]

    def _remove_antonym_list_item(self, soup):
        """Remove every ``<li>`` whose text is exactly "対義語辞典"."""
        for item in soup.find_all("li"):
            if item.text != "対義語辞典":
                continue
            item.decompose()

    def _replace_number_icons(self, soup, image_dir):
        """Turn each ``<img>`` into a ``<span>`` wrapping a generated icon.

        The icon text is taken from the image's ``alt`` attribute, an SVG
        named ``<text>-fill.svg`` is produced in ``image_dir`` through
        ``Icons.make_monochrome_fill_rectangle``, and a new ``<img>`` that
        references it is appended to the renamed element.
        """
        for el in soup.find_all("img"):
            alt = el.attrs["alt"]
            # NOTE(review): this character class contains only "-", so an
            # alt text without a hyphen makes re.search return None and the
            # .group(0) call fail.  Part of the class may have been lost
            # from this copy -- confirm the intended pattern.
            text = re.search(r"[-]+", alt).group(0)
            filename = f"{text}-fill.svg"
            path = os.path.join(image_dir, filename)
            Icons.make_monochrome_fill_rectangle(path, text)
            ratio = Icons.calculate_ratio(path)
            # Whichever side is not driven by the ratio is fixed at 1.0.
            if ratio > 1.0:
                height, width = 1.0, ratio
            else:
                height, width = ratio, 1.0
            img = BeautifulSoup("<img/>", "xml").img
            img.attrs = {
                "height": height,
                "width": width,
                "sizeUnits": "em",
                "collapsible": False,
                "collapsed": False,
                "background": False,
                "appearance": "monochrome",
                "title": alt,
                "path": f"{os.path.basename(image_dir)}/{filename}",
            }
            el.name = "span"
            el.append(img)
            el.attrs["style"] = "margin-right: 0.25em;"

    def _do_display_yomikata_in_headword(self, entry):
        """Return True when the reading is at most 10 characters long."""
        reading_length = len(entry.yomikata)
        return reading_length <= 10
def __do_display_yomikata_in_headword(entry):
if not re.match(r"^[ぁ-ヿ、]+$", entry.yomikata):
return False
elif len(entry.yomikata) > 10:
return False
else:
return True
class JitenonYojiGlossary(JitenonGlossary):
    """Glossary builder for entries linked under yoji.jitenon.jp."""

    def __init__(self):
        """Set the expression-row header and the entry-link ID pattern."""
        super().__init__()
        self._expression_header = "四字熟語"
        self._id_pattern = r"yoji.jitenon.jp/yoji.?/([0-9]+)\.html$"

    def make_glossary(self, entry, image_dir):
        """Return a one-element glossary list built from the entry's page.

        ``image_dir`` is accepted but not used by this builder.
        """
        soup = entry.get_page_soup()
        # Text and link clean-up.
        self._replace_punctuation(soup)
        self._add_internal_links(soup, entry)
        self._convert_paragraphs(soup)
        # Table restructuring.
        self._style_table_headers(soup)
        self._unwrap_table_body(soup)
        self._decompose_table_rows(soup, entry)
        self._insert_headword_line(soup, entry)
        return [make_gloss(soup.body)]
class JitenonKotowazaGlossary(JitenonGlossary):
    """Glossary builder for entries linked under kotowaza.jitenon.jp."""

    def __init__(self):
        """Set the expression-row header and the entry-link ID pattern."""
        super().__init__()
        self._expression_header = "言葉"
        self._id_pattern = r"kotowaza.jitenon.jp/kotowaza/([0-9]+)\.php$"

    def make_glossary(self, entry, image_dir):
        """Return a one-element glossary list built from the entry's page.

        ``image_dir`` is accepted but not used by this builder.
        """
        soup = entry.get_page_soup()
        # Text and link clean-up.
        self._replace_punctuation(soup)
        self._add_internal_links(soup, entry)
        self._convert_paragraphs(soup)
        # Table restructuring.
        self._style_table_headers(soup)
        self._unwrap_table_body(soup)
        self._decompose_table_rows(soup, entry)
        self._insert_headword_line(soup, entry)
        return [make_gloss(soup.body)]

View file

@ -1,6 +1,9 @@
from bot.yomichan.grammar import sudachi_rules
from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.jitenon import make_glossary
from bot.yomichan.glossary.jitenon import JitenonKokugoGlossary
from bot.yomichan.glossary.jitenon import JitenonYojiGlossary
from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
class JitenonTerminator(Terminator):
@ -13,7 +16,7 @@ class JitenonTerminator(Terminator):
def _glossary(self, entry):
if entry.entry_id in self._glossary_cache:
return self._glossary_cache[entry.entry_id]
glossary = make_glossary(entry)
glossary = self._glossary_maker.make_glossary(entry, self._image_dir)
self._glossary_cache[entry.entry_id] = glossary
return glossary
@ -27,9 +30,22 @@ class JitenonTerminator(Terminator):
return []
class JitenonKokugoTerminator(JitenonTerminator):
    """Jitenon terminator that builds glossaries with ``JitenonKokugoGlossary``."""

    def __init__(self, name):
        """Initialise the base terminator and create the glossary maker."""
        super().__init__(name)
        glossary_maker = JitenonKokugoGlossary()
        self._glossary_maker = glossary_maker

    def _inflection_rules(self, entry, expression):
        """Return the result of ``sudachi_rules`` for ``expression``; ``entry`` is unused."""
        rules = sudachi_rules(expression)
        return rules

    def _term_tags(self, entry):
        """Return an empty tag string for every entry."""
        no_tags = ""
        return no_tags
class JitenonYojiTerminator(JitenonTerminator):
def __init__(self, name):
super().__init__(name)
self._glossary_maker = JitenonYojiGlossary()
def _inflection_rules(self, entry, expression):
return ""
@ -42,6 +58,7 @@ class JitenonYojiTerminator(JitenonTerminator):
class JitenonKotowazaTerminator(JitenonTerminator):
def __init__(self, name):
super().__init__(name)
self._glossary_maker = JitenonKotowazaGlossary()
def _inflection_rules(self, entry, expression):
return sudachi_rules(expression)

View file

@ -1,4 +1,13 @@
{
"jitenon-kokugo": {
"index": {
"title": "国語辞典オンライン",
"sequenced": true,
"format": 3,
"url": "https://kokugo.jitenon.jp/"
},
"tags": []
},
"jitenon-yoji": {
"index": {
"title": "四字熟語辞典オンライン",

View file

@ -18,6 +18,7 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import argparse
from bot.crawlers import JitenonKokugoCrawler
from bot.crawlers import JitenonYojiCrawler
from bot.crawlers import JitenonKotowazaCrawler
from bot.crawlers import Smk8Crawler
@ -59,6 +60,7 @@ def parse_args(targets):
def main():
crawlers = {
"jitenon-kokugo": JitenonKokugoCrawler,
"jitenon-yoji": JitenonYojiCrawler,
"jitenon-kotowaza": JitenonKotowazaCrawler,
"smk8": Smk8Crawler,