Add support for jitenon-kokugo
This commit is contained in:
parent
fa86377404
commit
7ad72a6e4f
|
@ -4,11 +4,13 @@ from bs4 import BeautifulSoup
|
||||||
|
|
||||||
import bot.scraper as Scraper
|
import bot.scraper as Scraper
|
||||||
|
|
||||||
|
from bot.entries.jitenon import JitenonKokugoEntry
|
||||||
from bot.entries.jitenon import JitenonKotowazaEntry
|
from bot.entries.jitenon import JitenonKotowazaEntry
|
||||||
from bot.entries.jitenon import JitenonYojiEntry
|
from bot.entries.jitenon import JitenonYojiEntry
|
||||||
from bot.entries.smk8 import Smk8Entry
|
from bot.entries.smk8 import Smk8Entry
|
||||||
from bot.entries.daijirin2 import Daijirin2Entry
|
from bot.entries.daijirin2 import Daijirin2Entry
|
||||||
|
|
||||||
|
from bot.yomichan.export import JitenonKokugoExporter
|
||||||
from bot.yomichan.export import JitenonKotowazaExporter
|
from bot.yomichan.export import JitenonKotowazaExporter
|
||||||
from bot.yomichan.export import JitenonYojiExporter
|
from bot.yomichan.export import JitenonYojiExporter
|
||||||
from bot.yomichan.export import Smk8Exporter
|
from bot.yomichan.export import Smk8Exporter
|
||||||
|
@ -48,6 +50,41 @@ class _Crawler():
|
||||||
return page_id
|
return page_id
|
||||||
|
|
||||||
|
|
||||||
|
class JitenonKokugoCrawler(_Crawler):
|
||||||
|
def __init__(self, args):
|
||||||
|
super().__init__(args)
|
||||||
|
self._entry_class = JitenonKokugoEntry
|
||||||
|
self._yomi_exporter = JitenonKokugoExporter(args.target)
|
||||||
|
self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
|
||||||
|
self._page_id_pattern = r"word/p([0-9]+)$"
|
||||||
|
|
||||||
|
def collect_pages(self):
|
||||||
|
jitenon = Scraper.Jitenon()
|
||||||
|
gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
|
||||||
|
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
|
||||||
|
for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
|
||||||
|
gojuon_href = gojuon_a['href']
|
||||||
|
max_kana_page = 1
|
||||||
|
current_kana_page = 1
|
||||||
|
while current_kana_page <= max_kana_page:
|
||||||
|
kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}")
|
||||||
|
current_kana_page += 1
|
||||||
|
kana_soup = BeautifulSoup(kana_doc, features="html.parser")
|
||||||
|
page_total = kana_soup.find(class_="page_total").text
|
||||||
|
m = re.search(r"全([0-9]+)件", page_total)
|
||||||
|
if m:
|
||||||
|
max_kana_page = int(m.group(1))
|
||||||
|
for kana_a in kana_soup.select(".word_box a", href=True):
|
||||||
|
page_link = kana_a['href']
|
||||||
|
page_id = self._parse_page_id(page_link)
|
||||||
|
if page_id is None:
|
||||||
|
continue
|
||||||
|
_, page_path = jitenon.scrape(page_link)
|
||||||
|
self._page_map[page_id] = page_path
|
||||||
|
pages_len = len(self._page_map)
|
||||||
|
print(f"Finished scraping {pages_len} pages")
|
||||||
|
|
||||||
|
|
||||||
class _JitenonCrawler(_Crawler):
|
class _JitenonCrawler(_Crawler):
|
||||||
def __init__(self, args):
|
def __init__(self, args):
|
||||||
super().__init__(args)
|
super().__init__(args)
|
||||||
|
|
|
@ -7,8 +7,14 @@ import bot.expressions as Expressions
|
||||||
|
|
||||||
|
|
||||||
class _JitenonEntry(Entry):
|
class _JitenonEntry(Entry):
|
||||||
|
ID_TO_ENTRY = {}
|
||||||
|
|
||||||
def __init__(self, entry_id):
|
def __init__(self, entry_id):
|
||||||
super().__init__(entry_id)
|
super().__init__(entry_id)
|
||||||
|
if entry_id not in self.ID_TO_ENTRY:
|
||||||
|
self.ID_TO_ENTRY[entry_id] = self
|
||||||
|
else:
|
||||||
|
raise Exception(f"Duplicate entry ID: {entry_id}")
|
||||||
self.modified_date = date(1970, 1, 1)
|
self.modified_date = date(1970, 1, 1)
|
||||||
self.attribution = ""
|
self.attribution = ""
|
||||||
for column in self._COLUMNS.values():
|
for column in self._COLUMNS.values():
|
||||||
|
@ -44,9 +50,9 @@ class _JitenonEntry(Entry):
|
||||||
|
|
||||||
def _set_headwords(self):
|
def _set_headwords(self):
|
||||||
headwords = {}
|
headwords = {}
|
||||||
for yomikata in self.__yomikatas():
|
for yomikata in self._yomikatas():
|
||||||
headwords[yomikata] = [self.expression]
|
headwords[yomikata] = [self.expression]
|
||||||
ikei_headwords = self.__ikei_headwords()
|
ikei_headwords = self._ikei_headwords()
|
||||||
for reading, expressions in ikei_headwords.items():
|
for reading, expressions in ikei_headwords.items():
|
||||||
if reading not in headwords:
|
if reading not in headwords:
|
||||||
headwords[reading] = []
|
headwords[reading] = []
|
||||||
|
@ -73,7 +79,7 @@ class _JitenonEntry(Entry):
|
||||||
else:
|
else:
|
||||||
attr_value.append(colval)
|
attr_value.append(colval)
|
||||||
|
|
||||||
def __yomikatas(self):
|
def _yomikatas(self):
|
||||||
yomikata = self.yomikata
|
yomikata = self.yomikata
|
||||||
m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
|
m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
|
||||||
if m:
|
if m:
|
||||||
|
@ -94,7 +100,7 @@ class _JitenonEntry(Entry):
|
||||||
print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
|
print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
|
||||||
return [""]
|
return [""]
|
||||||
|
|
||||||
def __ikei_headwords(self):
|
def _ikei_headwords(self):
|
||||||
ikei_headwords = {}
|
ikei_headwords = {}
|
||||||
for val in self.ikei:
|
for val in self.ikei:
|
||||||
m = re.search(r"^([^(]+)(([ぁ-ヿ、]+))$", val)
|
m = re.search(r"^([^(]+)(([ぁ-ヿ、]+))$", val)
|
||||||
|
@ -174,3 +180,39 @@ class JitenonKotowazaEntry(_JitenonEntry):
|
||||||
for expressions in self._headwords.values():
|
for expressions in self._headwords.values():
|
||||||
Expressions.add_variant_kanji(expressions, self._variant_kanji)
|
Expressions.add_variant_kanji(expressions, self._variant_kanji)
|
||||||
Expressions.add_fullwidth(expressions)
|
Expressions.add_fullwidth(expressions)
|
||||||
|
|
||||||
|
|
||||||
|
class JitenonKokugoEntry(_JitenonEntry):
|
||||||
|
_COLUMNS = {
|
||||||
|
"言葉": ["expression", ""],
|
||||||
|
"読み方": ["yomikata", ""],
|
||||||
|
"意味": ["imi", ""],
|
||||||
|
"例文": ["reibun", ""],
|
||||||
|
"別表記": ["betsuhyouki", ""],
|
||||||
|
"対義語": ["taigigo", ""],
|
||||||
|
"活用": ["katsuyou", ""],
|
||||||
|
"用例": ["yourei", ""],
|
||||||
|
"類語": ["ruigo", ""],
|
||||||
|
}
|
||||||
|
|
||||||
|
def __init__(self, entry_id):
|
||||||
|
super().__init__(entry_id)
|
||||||
|
|
||||||
|
def _set_headwords(self):
|
||||||
|
headwords = {}
|
||||||
|
for reading in self.yomikata.split("・"):
|
||||||
|
if reading not in headwords:
|
||||||
|
headwords[reading] = []
|
||||||
|
for expression in self.expression.split("・"):
|
||||||
|
headwords[reading].append(expression)
|
||||||
|
if self.betsuhyouki.strip() != "":
|
||||||
|
for expression in self.betsuhyouki.split("・"):
|
||||||
|
headwords[reading].append(expression)
|
||||||
|
self._headwords = headwords
|
||||||
|
|
||||||
|
def _set_variant_headwords(self):
|
||||||
|
for expressions in self._headwords.values():
|
||||||
|
Expressions.add_variant_kanji(expressions, self._variant_kanji)
|
||||||
|
Expressions.add_fullwidth(expressions)
|
||||||
|
Expressions.remove_iteration_mark(expressions)
|
||||||
|
Expressions.add_iteration_mark(expressions)
|
||||||
|
|
|
@ -7,6 +7,7 @@ from platformdirs import user_documents_dir, user_cache_dir
|
||||||
|
|
||||||
from bot.data import load_yomichan_metadata
|
from bot.data import load_yomichan_metadata
|
||||||
|
|
||||||
|
from bot.yomichan.terms.jitenon import JitenonKokugoTerminator
|
||||||
from bot.yomichan.terms.jitenon import JitenonYojiTerminator
|
from bot.yomichan.terms.jitenon import JitenonYojiTerminator
|
||||||
from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
|
from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
|
||||||
from bot.yomichan.terms.smk8 import Smk8Terminator
|
from bot.yomichan.terms.smk8 import Smk8Terminator
|
||||||
|
@ -20,8 +21,7 @@ class Exporter:
|
||||||
self._terms_per_file = 2000
|
self._terms_per_file = 2000
|
||||||
|
|
||||||
def export(self, entries, image_dir):
|
def export(self, entries, image_dir):
|
||||||
if image_dir is not None:
|
self.__init_build_image_dir(image_dir)
|
||||||
self.__init_build_image_dir(image_dir)
|
|
||||||
meta = load_yomichan_metadata()
|
meta = load_yomichan_metadata()
|
||||||
index = meta[self._name]["index"]
|
index = meta[self._name]["index"]
|
||||||
index["revision"] = self._get_revision(entries)
|
index["revision"] = self._get_revision(entries)
|
||||||
|
@ -42,10 +42,13 @@ class Exporter:
|
||||||
return self._build_dir
|
return self._build_dir
|
||||||
|
|
||||||
def __init_build_image_dir(self, image_dir):
|
def __init_build_image_dir(self, image_dir):
|
||||||
print("Copying image files to build directory...")
|
|
||||||
build_dir = self._get_build_dir()
|
build_dir = self._get_build_dir()
|
||||||
build_img_dir = os.path.join(build_dir, self._name)
|
build_img_dir = os.path.join(build_dir, self._name)
|
||||||
shutil.copytree(image_dir, build_img_dir)
|
if image_dir is not None:
|
||||||
|
print("Copying image files to build directory...")
|
||||||
|
shutil.copytree(image_dir, build_img_dir)
|
||||||
|
else:
|
||||||
|
os.makedirs(build_img_dir)
|
||||||
self._terminator.set_image_dir(build_img_dir)
|
self._terminator.set_image_dir(build_img_dir)
|
||||||
|
|
||||||
def __get_terms(self, entries):
|
def __get_terms(self, entries):
|
||||||
|
@ -131,6 +134,12 @@ class JitenonExporter(Exporter):
|
||||||
return attribution
|
return attribution
|
||||||
|
|
||||||
|
|
||||||
|
class JitenonKokugoExporter(JitenonExporter):
|
||||||
|
def __init__(self, name):
|
||||||
|
super().__init__(name)
|
||||||
|
self._terminator = JitenonKokugoTerminator(name)
|
||||||
|
|
||||||
|
|
||||||
class JitenonYojiExporter(JitenonExporter):
|
class JitenonYojiExporter(JitenonExporter):
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
super().__init__(name)
|
super().__init__(name)
|
||||||
|
|
|
@ -1,93 +1,176 @@
|
||||||
import re
|
import re
|
||||||
|
import os
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
import bot.icons as Icons
|
||||||
from bot.yomichan.glossary.gloss import make_gloss
|
from bot.yomichan.glossary.gloss import make_gloss
|
||||||
|
|
||||||
|
|
||||||
def make_glossary(entry):
|
class JitenonGlossary():
|
||||||
soup = entry.get_page_soup()
|
def __init__(self):
|
||||||
__replace_punctuation(soup)
|
self._id_pattern = None
|
||||||
__add_internal_links(soup)
|
self._expression_header = None
|
||||||
__convert_paragraphs(soup)
|
|
||||||
__style_table_headers(soup)
|
|
||||||
__unwrap_table_body(soup)
|
|
||||||
__decompose_table_rows(soup, entry)
|
|
||||||
__insert_headword_line(soup, entry)
|
|
||||||
gloss = make_gloss(soup.body)
|
|
||||||
glossary = [gloss]
|
|
||||||
return glossary
|
|
||||||
|
|
||||||
|
def _replace_punctuation(self, soup):
|
||||||
|
punctuation = {
|
||||||
|
"/": "/",
|
||||||
|
",": "、",
|
||||||
|
}
|
||||||
|
for el in soup.find_all(string=True):
|
||||||
|
text = el.text
|
||||||
|
for old, new in punctuation.items():
|
||||||
|
text = text.replace(old, new)
|
||||||
|
el.replace_with(text)
|
||||||
|
|
||||||
def __replace_punctuation(soup):
|
def _add_internal_links(self, soup, entry):
|
||||||
punctuation = {
|
for el in soup.find_all("a"):
|
||||||
"/": "/",
|
href = el.attrs["href"]
|
||||||
",": "、",
|
m = re.search(self._id_pattern, href)
|
||||||
}
|
if m is not None:
|
||||||
for el in soup.find_all(string=True):
|
ref_entry_id = int(m.group(1))
|
||||||
text = el.text
|
ref_entry = entry.ID_TO_ENTRY[ref_entry_id]
|
||||||
for old, new in punctuation.items():
|
expression = ref_entry.get_first_expression()
|
||||||
text = text.replace(old, new)
|
el.attrs["href"] = f"?query={expression}&wildcards=off"
|
||||||
el.replace_with(text)
|
elif re.match(r"^(?:https?:|\?)[\w\W]*", href):
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
raise Exception(f"Invalid href format: {href}")
|
||||||
|
|
||||||
|
def _convert_paragraphs(self, soup):
|
||||||
|
for p in soup.find_all("p"):
|
||||||
|
p.name = "div"
|
||||||
|
|
||||||
def __add_internal_links(soup):
|
def _style_table_headers(self, soup):
|
||||||
patterns = [
|
for th in soup.find_all("th"):
|
||||||
r"^(.+)([ぁ-ヿ、\s]+)$",
|
th['style'] = "vertical-align: middle; text-align: center;"
|
||||||
r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$"
|
|
||||||
]
|
|
||||||
for a in soup.find_all("a"):
|
|
||||||
for pattern in patterns:
|
|
||||||
m = re.search(pattern, a.text)
|
|
||||||
if m:
|
|
||||||
a['href'] = f"?query={m.group(1)}&wildcards=off"
|
|
||||||
break
|
|
||||||
|
|
||||||
|
def _unwrap_table_body(self, soup):
|
||||||
|
if soup.find("tbody") is not None:
|
||||||
|
soup.tbody.unwrap()
|
||||||
|
|
||||||
def __convert_paragraphs(soup):
|
def _decompose_table_rows(self, soup, entry):
|
||||||
for p in soup.find_all("p"):
|
for tr in soup.find_all("tr"):
|
||||||
p.name = "span"
|
if tr.find("th") is None:
|
||||||
|
continue
|
||||||
|
elif tr.th.text == self._expression_header:
|
||||||
def __style_table_headers(soup):
|
|
||||||
for th in soup.find_all("th"):
|
|
||||||
th['style'] = "vertical-align: middle; text-align: center;"
|
|
||||||
|
|
||||||
|
|
||||||
def __unwrap_table_body(soup):
|
|
||||||
if soup.find("tbody") is not None:
|
|
||||||
soup.tbody.unwrap()
|
|
||||||
|
|
||||||
|
|
||||||
def __decompose_table_rows(soup, entry):
|
|
||||||
for tr in soup.find_all("tr"):
|
|
||||||
if tr.find("th") is None:
|
|
||||||
continue
|
|
||||||
elif tr.th.text in ["四字熟語", "言葉"]:
|
|
||||||
tr.decompose()
|
|
||||||
elif tr.th.text == "読み方":
|
|
||||||
if __do_display_yomikata_in_headword(entry):
|
|
||||||
tr.decompose()
|
tr.decompose()
|
||||||
elif tr.th.text == "意味":
|
elif tr.th.text == "読み方":
|
||||||
imi = tr.td
|
if self._do_display_yomikata_in_headword(entry):
|
||||||
imi.name = "div"
|
tr.decompose()
|
||||||
soup.body.insert(0, imi)
|
elif tr.th.text == "意味":
|
||||||
tr.decompose()
|
imi = tr.td
|
||||||
if soup.find("tr") is None:
|
imi.name = "div"
|
||||||
soup.table.decompose()
|
soup.body.insert(0, imi)
|
||||||
|
tr.decompose()
|
||||||
|
if soup.find("tr") is None:
|
||||||
|
soup.table.decompose()
|
||||||
|
|
||||||
|
def _insert_headword_line(self, soup, entry):
|
||||||
|
headword_line = soup.new_tag("span")
|
||||||
|
if self._do_display_yomikata_in_headword(entry):
|
||||||
|
headword_line.string = f"{entry.yomikata}【{entry.expression}】"
|
||||||
|
else:
|
||||||
|
headword_line.string = f"【{entry.expression}】"
|
||||||
|
soup.body.insert(0, headword_line)
|
||||||
|
|
||||||
|
def _do_display_yomikata_in_headword(self, entry):
|
||||||
|
if not re.match(r"^[ぁ-ヿ、]+$", entry.yomikata):
|
||||||
|
return False
|
||||||
|
elif len(entry.yomikata) > 10:
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
def __insert_headword_line(soup, entry):
|
class JitenonKokugoGlossary(JitenonGlossary):
|
||||||
headword_line = soup.new_tag("span")
|
def __init__(self):
|
||||||
if __do_display_yomikata_in_headword(entry):
|
super().__init__()
|
||||||
headword_line.string = f"{entry.yomikata}【{entry.expression}】"
|
self._expression_header = "言葉"
|
||||||
else:
|
self._id_pattern = r"kokugo.jitenon.jp/word/p([0-9]+)$"
|
||||||
headword_line.string = f"【{entry.expression}】"
|
|
||||||
soup.body.insert(0, headword_line)
|
def make_glossary(self, entry, image_dir):
|
||||||
|
soup = entry.get_page_soup()
|
||||||
|
self._remove_antonym_list_item(soup)
|
||||||
|
self._replace_number_icons(soup, image_dir)
|
||||||
|
self._replace_punctuation(soup)
|
||||||
|
self._add_internal_links(soup, entry)
|
||||||
|
self._convert_paragraphs(soup)
|
||||||
|
self._style_table_headers(soup)
|
||||||
|
self._unwrap_table_body(soup)
|
||||||
|
self._decompose_table_rows(soup, entry)
|
||||||
|
self._insert_headword_line(soup, entry)
|
||||||
|
gloss = make_gloss(soup.body)
|
||||||
|
glossary = [gloss]
|
||||||
|
return glossary
|
||||||
|
|
||||||
|
def _remove_antonym_list_item(self, soup):
|
||||||
|
for el in soup.find_all("li"):
|
||||||
|
if el.text == "対義語辞典":
|
||||||
|
el.decompose()
|
||||||
|
|
||||||
|
def _replace_number_icons(self, soup, image_dir):
|
||||||
|
for el in soup.find_all("img"):
|
||||||
|
alt = el.attrs["alt"]
|
||||||
|
text = re.search(r"[0-9]+", alt).group(0)
|
||||||
|
filename = f"{text}-fill.svg"
|
||||||
|
path = os.path.join(image_dir, filename)
|
||||||
|
Icons.make_monochrome_fill_rectangle(path, text)
|
||||||
|
ratio = Icons.calculate_ratio(path)
|
||||||
|
img = BeautifulSoup("<img/>", "xml").img
|
||||||
|
img.attrs = {
|
||||||
|
"height": 1.0 if ratio > 1.0 else ratio,
|
||||||
|
"width": ratio if ratio > 1.0 else 1.0,
|
||||||
|
"sizeUnits": "em",
|
||||||
|
"collapsible": False,
|
||||||
|
"collapsed": False,
|
||||||
|
"background": False,
|
||||||
|
"appearance": "monochrome",
|
||||||
|
"title": alt,
|
||||||
|
"path": f"{os.path.basename(image_dir)}/{filename}",
|
||||||
|
}
|
||||||
|
el.name = "span"
|
||||||
|
el.append(img)
|
||||||
|
el.attrs["style"] = "margin-right: 0.25em;"
|
||||||
|
|
||||||
|
def _do_display_yomikata_in_headword(self, entry):
|
||||||
|
return len(entry.yomikata) <= 10
|
||||||
|
|
||||||
|
|
||||||
def __do_display_yomikata_in_headword(entry):
|
class JitenonYojiGlossary(JitenonGlossary):
|
||||||
if not re.match(r"^[ぁ-ヿ、]+$", entry.yomikata):
|
def __init__(self):
|
||||||
return False
|
super().__init__()
|
||||||
elif len(entry.yomikata) > 10:
|
self._expression_header = "四字熟語"
|
||||||
return False
|
self._id_pattern = r"yoji.jitenon.jp/yoji.?/([0-9]+)\.html$"
|
||||||
else:
|
|
||||||
return True
|
def make_glossary(self, entry, image_dir):
|
||||||
|
soup = entry.get_page_soup()
|
||||||
|
self._replace_punctuation(soup)
|
||||||
|
self._add_internal_links(soup, entry)
|
||||||
|
self._convert_paragraphs(soup)
|
||||||
|
self._style_table_headers(soup)
|
||||||
|
self._unwrap_table_body(soup)
|
||||||
|
self._decompose_table_rows(soup, entry)
|
||||||
|
self._insert_headword_line(soup, entry)
|
||||||
|
gloss = make_gloss(soup.body)
|
||||||
|
glossary = [gloss]
|
||||||
|
return glossary
|
||||||
|
|
||||||
|
|
||||||
|
class JitenonKotowazaGlossary(JitenonGlossary):
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
self._expression_header = "言葉"
|
||||||
|
self._id_pattern = r"kotowaza.jitenon.jp/kotowaza/([0-9]+)\.php$"
|
||||||
|
|
||||||
|
def make_glossary(self, entry, image_dir):
|
||||||
|
soup = entry.get_page_soup()
|
||||||
|
self._replace_punctuation(soup)
|
||||||
|
self._add_internal_links(soup, entry)
|
||||||
|
self._convert_paragraphs(soup)
|
||||||
|
self._style_table_headers(soup)
|
||||||
|
self._unwrap_table_body(soup)
|
||||||
|
self._decompose_table_rows(soup, entry)
|
||||||
|
self._insert_headword_line(soup, entry)
|
||||||
|
gloss = make_gloss(soup.body)
|
||||||
|
glossary = [gloss]
|
||||||
|
return glossary
|
||||||
|
|
|
@ -1,6 +1,9 @@
|
||||||
from bot.yomichan.grammar import sudachi_rules
|
from bot.yomichan.grammar import sudachi_rules
|
||||||
from bot.yomichan.terms.terminator import Terminator
|
from bot.yomichan.terms.terminator import Terminator
|
||||||
from bot.yomichan.glossary.jitenon import make_glossary
|
|
||||||
|
from bot.yomichan.glossary.jitenon import JitenonKokugoGlossary
|
||||||
|
from bot.yomichan.glossary.jitenon import JitenonYojiGlossary
|
||||||
|
from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
|
||||||
|
|
||||||
|
|
||||||
class JitenonTerminator(Terminator):
|
class JitenonTerminator(Terminator):
|
||||||
|
@ -13,7 +16,7 @@ class JitenonTerminator(Terminator):
|
||||||
def _glossary(self, entry):
|
def _glossary(self, entry):
|
||||||
if entry.entry_id in self._glossary_cache:
|
if entry.entry_id in self._glossary_cache:
|
||||||
return self._glossary_cache[entry.entry_id]
|
return self._glossary_cache[entry.entry_id]
|
||||||
glossary = make_glossary(entry)
|
glossary = self._glossary_maker.make_glossary(entry, self._image_dir)
|
||||||
self._glossary_cache[entry.entry_id] = glossary
|
self._glossary_cache[entry.entry_id] = glossary
|
||||||
return glossary
|
return glossary
|
||||||
|
|
||||||
|
@ -27,9 +30,22 @@ class JitenonTerminator(Terminator):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
class JitenonKokugoTerminator(JitenonTerminator):
|
||||||
|
def __init__(self, name):
|
||||||
|
super().__init__(name)
|
||||||
|
self._glossary_maker = JitenonKokugoGlossary()
|
||||||
|
|
||||||
|
def _inflection_rules(self, entry, expression):
|
||||||
|
return sudachi_rules(expression)
|
||||||
|
|
||||||
|
def _term_tags(self, entry):
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
class JitenonYojiTerminator(JitenonTerminator):
|
class JitenonYojiTerminator(JitenonTerminator):
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
super().__init__(name)
|
super().__init__(name)
|
||||||
|
self._glossary_maker = JitenonYojiGlossary()
|
||||||
|
|
||||||
def _inflection_rules(self, entry, expression):
|
def _inflection_rules(self, entry, expression):
|
||||||
return ""
|
return ""
|
||||||
|
@ -42,6 +58,7 @@ class JitenonYojiTerminator(JitenonTerminator):
|
||||||
class JitenonKotowazaTerminator(JitenonTerminator):
|
class JitenonKotowazaTerminator(JitenonTerminator):
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
super().__init__(name)
|
super().__init__(name)
|
||||||
|
self._glossary_maker = JitenonKotowazaGlossary()
|
||||||
|
|
||||||
def _inflection_rules(self, entry, expression):
|
def _inflection_rules(self, entry, expression):
|
||||||
return sudachi_rules(expression)
|
return sudachi_rules(expression)
|
||||||
|
|
|
@ -1,4 +1,13 @@
|
||||||
{
|
{
|
||||||
|
"jitenon-kokugo": {
|
||||||
|
"index": {
|
||||||
|
"title": "国語辞典オンライン",
|
||||||
|
"sequenced": true,
|
||||||
|
"format": 3,
|
||||||
|
"url": "https://kokugo.jitenon.jp/"
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
"jitenon-yoji": {
|
"jitenon-yoji": {
|
||||||
"index": {
|
"index": {
|
||||||
"title": "四字熟語辞典オンライン",
|
"title": "四字熟語辞典オンライン",
|
||||||
|
|
|
@ -18,6 +18,7 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import argparse
|
import argparse
|
||||||
|
from bot.crawlers import JitenonKokugoCrawler
|
||||||
from bot.crawlers import JitenonYojiCrawler
|
from bot.crawlers import JitenonYojiCrawler
|
||||||
from bot.crawlers import JitenonKotowazaCrawler
|
from bot.crawlers import JitenonKotowazaCrawler
|
||||||
from bot.crawlers import Smk8Crawler
|
from bot.crawlers import Smk8Crawler
|
||||||
|
@ -59,6 +60,7 @@ def parse_args(targets):
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
crawlers = {
|
crawlers = {
|
||||||
|
"jitenon-kokugo": JitenonKokugoCrawler,
|
||||||
"jitenon-yoji": JitenonYojiCrawler,
|
"jitenon-yoji": JitenonYojiCrawler,
|
||||||
"jitenon-kotowaza": JitenonKotowazaCrawler,
|
"jitenon-kotowaza": JitenonKotowazaCrawler,
|
||||||
"smk8": Smk8Crawler,
|
"smk8": Smk8Crawler,
|
||||||
|
|
Loading…
Reference in a new issue