Redesign Jitenon glossaries for yomichan
parent 934f6534f1
commit 08b180f442
README.md (35 lines changed)

@@ -1,35 +0,0 @@
# jitenbot

Jitenbot is a program for scraping Japanese dictionary websites and converting the scraped data into structured dictionary files.

### Target Websites

* [四字熟語辞典オンライン](https://yoji.jitenon.jp/)
* [故事・ことわざ・慣用句オンライン](https://kotowaza.jitenon.jp/)

### Export Formats

* [Yomichan](https://github.com/foosoft/yomichan)

# Usage

Add your desired HTTP request headers to [config.json](https://github.com/stephenmk/jitenbot/blob/main/config.json) and ensure that all [requirements](https://github.com/stephenmk/jitenbot/blob/main/requirements.txt) are installed.
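
A minimal sketch of how such a config might be consumed; the `http-request-headers` key and the use of `requests` are assumptions for illustration, since the diff does not show `config.json`'s schema or the project's HTTP client:

```python
import json

import requests  # assumed HTTP client; check the project's requirements

# Hypothetical: read custom request headers from config.json and use
# them for a polite scrape. Verify the real key name in config.json.
with open("config.json", encoding="utf-8") as f:
    config = json.load(f)

headers = config.get("http-request-headers", {})
response = requests.get("https://yoji.jitenon.jp/", headers=headers)
print(response.status_code)
```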

```
jitenbot [-h] {all,jitenon-yoji,jitenon-kotowaza}

positional arguments:
  {all,jitenon-yoji,jitenon-kotowaza}
                        website to crawl

options:
  -h, --help            show this help message and exit
```

Scraped webpages are written to a `webcache` directory. Each page may be as large as 100 KiB, and a single dictionary may include thousands of pages. Ensure that adequate disk space is available.

Jitenbot will pause for at least 10 seconds between each web request. Depending upon the size of the target dictionary, it may take hours or days to finish scraping.
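
For example, at 100 KiB per page and a 10-second request interval, a dictionary of 5,000 pages occupies roughly 500 MiB of cache and takes at least 50,000 seconds (around 14 hours) to scrape.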

Exported dictionary files will be saved in an `output` directory.

@@ -36,6 +36,17 @@ class JitenonEntry:
        self._set_headwords()
        return self._headwords

+    def get_first_expression(self):
+        headwords = self.get_headwords()
+        expressions = next(iter(headwords.values()))
+        expression = expressions[0]
+        return expression
+
+    def get_first_reading(self):
+        headwords = self.get_headwords()
+        reading = next(iter(headwords.keys()))
+        return reading
+
    def _set_headwords(self):
        headwords = {}
        for yomikata in self.__yomikatas():
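
The two new accessors rely on `dict` preserving insertion order (Python 3.7+): they take the first reading and the first expression listed for it. The same traversal in isolation, with invented sample data:

```python
# Invented sample of the headword mapping shape the accessors assume:
# reading -> list of written forms, in insertion order.
headwords = {
    "いっせきにちょう": ["一石二鳥"],
    "いっせきにてう": ["一石二鳥"],
}

reading = next(iter(headwords.keys()))        # "いっせきにちょう"
expressions = next(iter(headwords.values()))  # ["一石二鳥"]
print(reading, expressions[0])
```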
@@ -15,7 +15,7 @@ def __get_markup_structure(soup):
    content = []
    for child in soup.children:
        if child.name is None:
-            text = __clean(child.text)
+            text = child.text.strip()
            if text != "":
                content.append(text)
        else:
@@ -35,12 +35,6 @@ def __get_markup_structure(soup):
    return node


-def __clean(text):
-    text = text.replace("/", "/")
-    text = text.strip()
-    return text
-
-
def __get_attributes(attrs):
    attributes = {}
    if "href" in attrs:
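
(The slash normalization that `__clean` performed does not disappear: it resurfaces below as part of `__replace_punctuation` in the Yomichan glossary builder, leaving the generic markup parser with plain whitespace stripping.)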
@@ -6,6 +6,31 @@ from bot.yomichan.glossary.gloss import make_gloss


def make_glossary(entry):
    soup = BeautifulSoup(entry.markup, "html5lib")
+    __replace_punctuation(soup)
+    __add_internal_links(soup)
+    __convert_paragraphs(soup)
+    __style_table_headers(soup)
+    __unwrap_table_body(soup)
+    __decompose_table_rows(soup, entry)
+    __insert_headword_line(soup, entry)
+    gloss = make_gloss(soup.body)
+    glossary = [gloss]
+    return glossary
+
+
+def __replace_punctuation(soup):
+    punctuation = {
+        "/": "/",
+        ",": "、",
+    }
+    for el in soup.find_all(string=True):
+        text = el.text
+        for old, new in punctuation.items():
+            text = text.replace(old, new)
+        el.replace_with(text)
+
+
+def __add_internal_links(soup):
    patterns = [
        r"^(.+)([ぁ-ヿ、\s]+)$",
        r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$"
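
`__replace_punctuation` walks every text node and rewrites it in place. The same `find_all(string=True)` / `replace_with` pattern in isolation, with invented markup:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup("<body><p>漢検1級/準1級,2級</p></body>", "html5lib")
punctuation = {"/": "/", ",": "、"}
for el in soup.find_all(string=True):
    text = el.text
    for old, new in punctuation.items():
        text = text.replace(old, new)
    el.replace_with(text)  # NavigableString accepts a plain str here
print(soup.body)  # <body><p>漢検1級/準1級、2級</p></body>
```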

@@ -16,10 +41,42 @@ def make_glossary(entry):
        if m:
            a['href'] = f"?query={m.group(1)}&wildcards=off"
            break


+def __convert_paragraphs(soup):
    for p in soup.find_all("p"):
        p.name = "span"


+def __style_table_headers(soup):
    for th in soup.find_all("th"):
        th['style'] = "vertical-align: middle; text-align: center;"
-    gloss = make_gloss(soup.body)
-    glossary = [gloss]
-    return glossary
+
+
+def __unwrap_table_body(soup):
+    if soup.find("tbody") is not None:
+        soup.tbody.unwrap()
+
+
+def __decompose_table_rows(soup, entry):
+    for tr in soup.find_all("tr"):
+        if tr.find("th") is None:
+            continue
+        elif tr.th.text in ["四字熟語", "言葉"]:
+            tr.decompose()
+        elif tr.th.text == "読み方":
+            if re.match(r"^[ぁ-ヿ、]+$", entry.yomikata):
+                tr.decompose()
+        elif tr.th.text == "意味":
+            imi = tr.td
+            imi.name = "div"
+            soup.body.insert(0, imi)
+            tr.decompose()
+    if soup.find("tr") is None:
+        soup.table.decompose()
+
+
+def __insert_headword_line(soup, entry):
+    headword_line = soup.new_tag("span")
+    headword_line.string = f"{entry.get_first_reading()}【{entry.expression}】"
+    soup.body.insert(0, headword_line)
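
Note that `__decompose_table_rows` removes the 読み方 row only when `entry.yomikata` consists entirely of kana (plus 、); anything else keeps the row visible in the glossary. The kana gate in isolation, with invented samples:

```python
import re

def is_plain_kana(s):
    return bool(re.match(r"^[ぁ-ヿ、]+$", s))

print(is_plain_kana("いっせきにちょう"))    # True  -> 読み方 row removed
print(is_plain_kana("いっせき(にちょう)"))  # False -> row kept for display
```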
@@ -35,7 +35,7 @@ class JitenonYojiTerminator(JitenonTerminator):
        return ""

    def _term_tags(self, entry):
-        tags = entry.kankenkyuu.replace(" ", "").split("/")
+        tags = entry.kankenkyuu.split("/")
        return " ".join(tags)

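
The simplified `_term_tags` assumes the scraped 漢検 field no longer carries stray spaces: split on "/" and rejoin with spaces for Yomichan's tag column. A quick illustration (sample value invented):

```python
kankenkyuu = "準1級/1級"  # invented sample; real values come from the scrape
tags = kankenkyuu.split("/")
print(" ".join(tags))  # 準1級 1級 -> two space-separated term tags
```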
@@ -12,7 +12,7 @@ class Terminator:
        score = -len(terms)
        glossary = self._glossary(entry)
        sequence = self._sequence(entry)
-        term_tags = ""
+        term_tags = self._term_tags(entry)
        term = [
            expression, reading, definition_tags, inflection_rules,
            score, glossary, sequence, term_tags
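
The eight-element `term` list is one row of a Yomichan `term_bank_*.json` file: expression, reading, definition tags, inflection rules, score, glossary, sequence, and term tags, in that order. A sketch of one exported row, with invented values:

```python
term = [
    "一石二鳥",            # expression
    "いっせきにちょう",    # reading
    "",                    # definition_tags
    "",                    # inflection_rules
    -1,                    # score (negative term count, per the code above)
    ["一つの行為で二つの利益を得ること。"],  # glossary
    1,                     # sequence
    "準1級",               # term_tags, now filled in by _term_tags
]
```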
jitenbot.py (30 lines changed)

@@ -22,19 +22,31 @@ from bot.crawlers import JitenonKotowazaCrawler


crawlers = {
-    'jitenon-yoji': JitenonYojiCrawler,
-    'jitenon-kotowaza': JitenonKotowazaCrawler,
+    "jitenon-yoji": JitenonYojiCrawler,
+    "jitenon-kotowaza": JitenonKotowazaCrawler,
}


+def add_target_argument(parser):
+    target_argument_params = {
+        "choices": crawlers.keys(),
+        "help": "Dictionary to convert."
+    }
+    parser.add_argument("target", **target_argument_params)
+
+
+def make_parser():
+    argument_parser_params = {
+        "prog": "jitenbot",
+        "description": "Convert Japanese dictionary files to new formats.",
+    }
+    parser = argparse.ArgumentParser(**argument_parser_params)
+    return parser
+
+
def parse_args():
-    parser = argparse.ArgumentParser(
-        prog='jitenbot',
-        description='Convert Japanese dictionary files to new formats.')
-    parser.add_argument(
-        'target',
-        choices=crawlers.keys(),
-        help='Dictionary to convert.')
+    parser = make_parser()
+    add_target_argument(parser)
    args = parser.parse_args()
    return args
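
Factoring `make_parser` and `add_target_argument` out of `parse_args` makes the CLI exercisable without touching `sys.argv`. A minimal sketch (crawler classes stubbed, since `bot.crawlers` is not shown in this diff):

```python
import argparse

crawlers = {"jitenon-yoji": None, "jitenon-kotowaza": None}  # stubs


def make_parser():
    argument_parser_params = {
        "prog": "jitenbot",
        "description": "Convert Japanese dictionary files to new formats.",
    }
    return argparse.ArgumentParser(**argument_parser_params)


def add_target_argument(parser):
    parser.add_argument("target", choices=crawlers.keys(),
                        help="Dictionary to convert.")


parser = make_parser()
add_target_argument(parser)
args = parser.parse_args(["jitenon-yoji"])  # simulated argv
print(args.target)  # jitenon-yoji
```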