Redesign Jitenon glossaries for yomichan

parent 934f6534f1
commit 08b180f442

README.md | 35

@@ -1,35 +0,0 @@
-# jitenbot
-Jitenbot is a program for scraping Japanese dictionary websites and converting the scraped data into structured dictionary files.
-
-### Target Websites
-
-* [四字熟語辞典オンライン](https://yoji.jitenon.jp/)
-* [故事・ことわざ・慣用句オンライン](https://kotowaza.jitenon.jp/)
-
-### Export Formats
-
-* [Yomichan](https://github.com/foosoft/yomichan)
-
-# Usage
-Add your desired HTTP request headers to [config.json](https://github.com/stephenmk/jitenbot/blob/main/config.json)
-and ensure that all [requirements](https://github.com/stephenmk/jitenbot/blob/main/requirements.txt)
-are installed.
-
-```
-jitenbot [-h] {all,jitenon-yoji,jitenon-kotowaza}
-
-positional arguments:
-  {all,jitenon-yoji,jitenon-kotowaza}
-                        website to crawl
-
-options:
-  -h, --help            show this help message and exit
-```
-
-Scraped webpages are written to a `webcache` directory. Each page may be as large as 100 KiB,
-and a single dictionary may include thousands of pages. Ensure that adequate disk space is available.
-
-Jitenbot will pause for at least 10 seconds between each web request. Depending upon the size of
-the target dictionary, it may take hours or days to finish scraping.
-
-Exported dictionary files will be saved in an `output` directory.
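For reference, the usage block above corresponds to an invocation such as the following (target name taken from the help text):

```
jitenbot jitenon-yoji
```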
@@ -36,6 +36,17 @@ class JitenonEntry:
         self._set_headwords()
         return self._headwords
 
+    def get_first_expression(self):
+        headwords = self.get_headwords()
+        expressions = next(iter(headwords.values()))
+        expression = expressions[0]
+        return expression
+
+    def get_first_reading(self):
+        headwords = self.get_headwords()
+        reading = next(iter(headwords.keys()))
+        return reading
+
     def _set_headwords(self):
         headwords = {}
         for yomikata in self.__yomikatas():
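The new accessors assume `_headwords` is an insertion-ordered dict mapping each reading to a list of expressions; a minimal sketch with hypothetical entry data:

```python
# hypothetical headword structure assumed by the accessors above:
# an insertion-ordered dict of reading -> list of expressions
headwords = {"いしんでんしん": ["以心伝心"]}

first_reading = next(iter(headwords.keys()))          # "いしんでんしん"
first_expression = next(iter(headwords.values()))[0]  # "以心伝心"
```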
@@ -15,7 +15,7 @@ def __get_markup_structure(soup):
     content = []
     for child in soup.children:
         if child.name is None:
-            text = __clean(child.text)
+            text = child.text.strip()
             if text != "":
                 content.append(text)
         else:
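`child.name is None` is the BeautifulSoup idiom for telling text nodes apart from element nodes; a self-contained sketch of the traversal pattern:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>hello <b>world</b></p>", "html5lib")
for child in soup.p.children:
    if child.name is None:      # NavigableString, i.e. a text node
        print("text:", child.text.strip())
    else:                       # a Tag
        print("element:", child.name)
```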
@@ -35,12 +35,6 @@ def __get_markup_structure(soup):
     return node
 
 
-def __clean(text):
-    text = text.replace("/", "／")
-    text = text.strip()
-    return text
-
-
 def __get_attributes(attrs):
     attributes = {}
     if "href" in attrs:
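The removed `__clean` helper normalized slashes before stripping whitespace; assuming its slash mapping matched the `__replace_punctuation` table added below, its effect reduces to:

```python
# equivalent of the removed helper, assuming an ASCII-to-full-width slash mapping
text = " 2級/準2級 ".replace("/", "／").strip()
print(text)  # 2級／準2級
```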
@@ -6,6 +6,31 @@ from bot.yomichan.glossary.gloss import make_gloss
 
 
 def make_glossary(entry):
     soup = BeautifulSoup(entry.markup, "html5lib")
+    __replace_punctuation(soup)
+    __add_internal_links(soup)
+    __convert_paragraphs(soup)
+    __style_table_headers(soup)
+    __unwrap_table_body(soup)
+    __decompose_table_rows(soup, entry)
+    __insert_headword_line(soup, entry)
+    gloss = make_gloss(soup.body)
+    glossary = [gloss]
+    return glossary
+
+
+def __replace_punctuation(soup):
+    punctuation = {
+        "/": "／",
+        ",": "、",
+    }
+    for el in soup.find_all(string=True):
+        text = el.text
+        for old, new in punctuation.items():
+            text = text.replace(old, new)
+        el.replace_with(text)
+
+
+def __add_internal_links(soup):
     patterns = [
         r"^(.+)（([ぁ-ヿ、\s]+)）$",
         r"^(.+)（([ぁ-ヿ、\s]+（[ぁ-ヿ、\s]+）[ぁ-ヿ、\s]+)）$"
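`__replace_punctuation` rewrites every text node in the soup through its mapping table; a standalone demo of the same `find_all(string=True)` / `replace_with` pattern:

```python
from bs4 import BeautifulSoup

# find_all(string=True) returns a static list of text nodes,
# so each node can safely be replaced while iterating
soup = BeautifulSoup("<p>A,B/C</p>", "html5lib")
for el in soup.find_all(string=True):
    el.replace_with(el.text.replace(",", "、").replace("/", "／"))
print(soup.p)  # <p>A、B／C</p>
```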
@@ -16,10 +41,42 @@ def make_glossary(entry):
             if m:
                 a['href'] = f"?query={m.group(1)}&wildcards=off"
                 break
 
 
+def __convert_paragraphs(soup):
+    for p in soup.find_all("p"):
+        p.name = "span"
+
+
+def __style_table_headers(soup):
     for th in soup.find_all("th"):
         th['style'] = "vertical-align: middle; text-align: center;"
-    gloss = make_gloss(soup.body)
-    glossary = [gloss]
-    return glossary
+
+
+def __unwrap_table_body(soup):
+    if soup.find("tbody") is not None:
+        soup.tbody.unwrap()
+
+
+def __decompose_table_rows(soup, entry):
+    for tr in soup.find_all("tr"):
+        if tr.find("th") is None:
+            continue
+        elif tr.th.text in ["四字熟語", "言葉"]:
+            tr.decompose()
+        elif tr.th.text == "読み方":
+            if re.match(r"^[ぁ-ヿ、]+$", entry.yomikata):
+                tr.decompose()
+        elif tr.th.text == "意味":
+            imi = tr.td
+            imi.name = "div"
+            soup.body.insert(0, imi)
+            tr.decompose()
+    if soup.find("tr") is None:
+        soup.table.decompose()
+
+
+def __insert_headword_line(soup, entry):
+    headword_line = soup.new_tag("span")
+    headword_line.string = f"{entry.get_first_reading()}【{entry.expression}】"
+    soup.body.insert(0, headword_line)
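`__unwrap_table_body` and `__decompose_table_rows` prune jitenon's metadata table with plain BeautifulSoup calls; a minimal sketch of the same operations on a hypothetical single-row table:

```python
from bs4 import BeautifulSoup

html = ("<table><tbody>"
        "<tr><th>読み方</th><td>いしんでんしん</td></tr>"
        "</tbody></table>")
soup = BeautifulSoup(html, "html5lib")
if soup.find("tbody") is not None:
    soup.tbody.unwrap()        # keep the rows, drop the <tbody> wrapper
for tr in soup.find_all("tr"):
    if tr.th is not None and tr.th.text == "読み方":
        tr.decompose()         # reading row removed entirely
if soup.find("tr") is None:
    soup.table.decompose()     # table is now empty, so remove it too
print(soup.body.contents)      # []
```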
@@ -35,7 +35,7 @@ class JitenonYojiTerminator(JitenonTerminator):
         return ""
 
     def _term_tags(self, entry):
-        tags = entry.kankenkyuu.replace(" ", "").split("/")
+        tags = entry.kankenkyuu.split("/")
         return " ".join(tags)
 
 
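With a hypothetical `kankenkyuu` value such as "2級/準2級", `_term_tags` yields a space-separated tag string:

```python
tags = "2級/準2級".split("/")  # hypothetical kankenkyuu value
print(" ".join(tags))          # 2級 準2級
```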
@@ -12,7 +12,7 @@ class Terminator:
         score = -len(terms)
         glossary = self._glossary(entry)
         sequence = self._sequence(entry)
-        term_tags = ""
+        term_tags = self._term_tags(entry)
         term = [
             expression, reading, definition_tags, inflection_rules,
             score, glossary, sequence, term_tags
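The assembled list follows Yomichan's term-bank row layout (expression, reading, definition tags, inflection rules, score, glossary, sequence, term tags); a hypothetical example row:

```python
# hypothetical term-bank row: [expression, reading, definition_tags,
# inflection_rules, score, glossary, sequence, term_tags]
term = ["以心伝心", "いしんでんしん", "", "", -1, ["..."], 1, "準1級"]
```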
jitenbot.py | 30
@@ -22,19 +22,31 @@ from bot.crawlers import JitenonKotowazaCrawler
 
 
 crawlers = {
-    'jitenon-yoji': JitenonYojiCrawler,
-    'jitenon-kotowaza': JitenonKotowazaCrawler,
+    "jitenon-yoji": JitenonYojiCrawler,
+    "jitenon-kotowaza": JitenonKotowazaCrawler,
 }
 
 
+def add_target_argument(parser):
+    target_argument_params = {
+        "choices": crawlers.keys(),
+        "help": "Dictionary to convert."
+    }
+    parser.add_argument("target", **target_argument_params)
+
+
+def make_parser():
+    argument_parser_params = {
+        "prog": "jitenbot",
+        "description": "Convert Japanese dictionary files to new formats.",
+    }
+    parser = argparse.ArgumentParser(**argument_parser_params)
+    return parser
+
+
 def parse_args():
-    parser = argparse.ArgumentParser(
-        prog='jitenbot',
-        description='Convert Japanese dictionary files to new formats.')
-    parser.add_argument(
-        'target',
-        choices=crawlers.keys(),
-        help='Dictionary to convert.')
+    parser = make_parser()
+    add_target_argument(parser)
     args = parser.parse_args()
     return args
 
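The refactor splits parser construction from argument registration; used together (a sketch assuming both helpers are in scope):

```python
parser = make_parser()
add_target_argument(parser)
args = parser.parse_args(["jitenon-yoji"])
print(args.target)  # jitenon-yoji
```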