First version

Support for Jitenon's yoji dictionary
stephenmk 2023-04-07 22:05:36 -05:00
parent 0a155809fe
commit f9ad9e6d21
GPG key ID: B6DA730DB06235F1 (no known key found for this signature in database)
8 changed files with 427 additions and 0 deletions

.gitignore (vendored, 4 lines added)

@@ -1,3 +1,7 @@
webcache/
output/
notes/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

config.json (new file, 6 lines added)

@@ -0,0 +1,6 @@
{
    "http-request-headers": {
        "User-Agent": "",
        "Accept-Language": ""
    }
}

crawlers.py (new file, 41 lines added)

@@ -0,0 +1,41 @@
import re
from bs4 import BeautifulSoup
import scraper as Scraper
import yomichan as Yomichan
from jitenon_yoji import JitenonYoji


def jitenon_yoji_crawler():
    entries = {}
    jitenon = Scraper.Jitenon()
    gojuon = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
    gojuon_soup = BeautifulSoup(gojuon, features="html.parser")
    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
        gojuon_href = gojuon_a['href']
        kana = jitenon.scrape(gojuon_href)
        kana_soup = BeautifulSoup(kana, features="html.parser")
        for kana_a in kana_soup.select(".word_box a", href=True):
            kana_href = kana_a['href']
            sequence = int(re.search(r"([0-9]+).html", kana_href).group(1))
            if sequence in entries:
                continue
            yoji = jitenon.scrape(kana_href)
            yoji_soup = BeautifulSoup(yoji, features="html5lib")
            entry = JitenonYoji(sequence)
            entry.add_soup(yoji_soup)
            entries[sequence] = entry

    terms = []
    for entry in entries.values():
        for term in entry.yomichan_terms():
            terms.append(term)
    index = {
        "title": "四字熟語辞典オンライン",
        "revision": "test",
        "sequenced": True,
        "format": 3,
        "url": "https://yoji.jitenon.jp/",
        "attribution": "© 2012-2023 四字熟語辞典オンライン",
        "description": "",
    }
    Yomichan.create_zip(terms, index)

jitenbot.py (new file, 22 lines added)

@@ -0,0 +1,22 @@
""" jitenbot
Copyright (C) 2023 Stephen Kraus
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
from crawlers import jitenon_yoji_crawler
if __name__ == "__main__":
jitenon_yoji_crawler()

jitenon_yoji.py (new file, 116 lines added)

@@ -0,0 +1,116 @@
import re
import yomichan as Yomichan
import util as Util


class JitenonYoji:
    columns = {
        "四字熟語": ["yojijukugo", ""],
        "読み方": ["yomikata", ""],
        "意味": ["imi", ""],
        "出典": ["shutten", ""],
        "漢検級": ["kankenkyuu", ""],
        "場面用途": ["bamenyouto", ""],
        "異形": ["ikei", []],
        "類義語": ["ruigigo", []],
    }

    def __init__(self, sequence):
        self.sequence = sequence
        self.yomichan_glossary = [""]
        for column in self.columns.values():
            setattr(self, column[0], column[1])

    def add_soup(self, yoji_soup):
        table = yoji_soup.find(class_="kanjirighttb")
        rows = table.find("tbody").find_all("tr")
        colname = ""
        for row in rows:
            colname = row.th.text if row.th is not None else colname
            colval = row.td.decode_contents()
            self.__set_column(colname, colval)
        self.yomichan_glossary = [Yomichan.soup_to_gloss(table)]

    def yomichan_terms(self):
        terms = []
        for idx, headword in enumerate(self.__headwords()):
            (yoji, reading) = headword
            definition_tags = None
            inflection_rules = ""
            score = -idx
            glossary = self.yomichan_glossary
            sequence = self.sequence
            term_tags = ""
            term = [
                yoji, reading, definition_tags, inflection_rules,
                score, glossary, sequence, term_tags
            ]
            terms.append(term)
        return terms

    def __set_column(self, colname, colval):
        attr_name = self.columns[colname][0]
        attr_value = getattr(self, attr_name)
        colval = colval.replace("\n", "").replace(",", "").strip()
        if isinstance(attr_value, str):
            setattr(self, attr_name, colval)
        elif isinstance(attr_value, list):
            if len(attr_value) == 0:
                setattr(self, attr_name, [colval])
            else:
                attr_value.append(colval)
                setattr(self, attr_name, attr_value)

    def __headwords(self):
        words = []
        for yomikata in self.__yomikatas():
            headword = [self.yojijukugo, yomikata]
            if headword in words:
                words.remove(headword)
            words.append(headword)
        for headword in self.__ikei_headwords():
            if headword in words:
                words.remove(headword)
            words.append(headword)
        return words

    def __yomikatas(self):
        m = re.search(r"^[ぁ-ヿ]+$", self.yomikata)
        if m:
            return [self.yomikata]
        m = re.search(r"^([ぁ-ヿ]+)<br/>", self.yomikata)
        if m:
            return [m.group(1)]
        # reading written in 省略 notation, with an optional kana in full-width parentheses
        m = re.search(r"^[ぁ-ヿ]+（[ぁ-ヿ]）[ぁ-ヿ]+$", self.yomikata)
        if m:
            return Util.expand_shouryaku(self.yomikata)
        # primary reading followed by slash-separated alternatives in full-width parentheses
        m = re.search(r"^([ぁ-ヿ]+)（([ぁ-ヿ/\s]+)）$", self.yomikata)
        if m:
            yomikatas = [m.group(1)]
            alts = m.group(2).split("/")
            for alt in alts:
                yomikatas.append(alt.strip())
            return yomikatas
        raise Exception(f"Invalid 読み方 format: {self.yomikata}\n{self}")

    def __ikei_headwords(self):
        ikei_headwords = []
        for val in self.ikei:
            # each 異形 value has the form 表記（よみかた）
            m = re.search(r"^([^（]+)（([ぁ-ヿ]+)）$", val)
            if m:
                headword = [m.group(1), m.group(2)]
                ikei_headwords.append(headword)
            else:
                raise Exception(f"Invalid 異形 format: {val}\n{self}")
        return ikei_headwords

    def __str__(self):
        colvals = [str(self.sequence)]
        for attr in self.columns.values():
            attr_val = getattr(self, attr[0])
            if isinstance(attr_val, str):
                colvals.append(attr_val)
            elif isinstance(attr_val, list):
                colvals.append("".join(attr_val))
        return ",".join(colvals)

scraper.py (new file, 103 lines added)

@@ -0,0 +1,103 @@
import time
import requests
import re
import os
import json
import hashlib
from pathlib import Path
from urllib.parse import urlparse
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from datetime import datetime


class Scraper():
    def __init__(self):
        self.netloc_re = \
            re.compile(r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$")
        self.__set_session()

    def scrape(self, urlstring):
        url = urlparse(urlstring, scheme='https://', allow_fragments=True)
        self.__validate_url(url)
        cache_path = self.__cache_path(url)
        cache_contents = self.__read_cache(cache_path)
        if cache_contents is not None:
            return cache_contents
        html = self.__get(urlstring)
        with open(cache_path, "w") as f:
            f.write(html)
        return html

    def __set_session(self):
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        with open("config.json", "r") as f:
            config = json.load(f)
        headers = config["http-request-headers"]
        self.session = requests.Session()
        self.session.mount("https://", adapter)
        self.session.headers.update(headers)

    def __validate_url(self, url):
        valid = False
        if self.netloc_re.match(url.netloc):
            valid = True
        # may add more validators later
        if not valid:
            raise Exception(f"Invalid URL: {url.geturl()}")

    def __cache_path(self, url):
        cache_dir = os.path.join("webcache", self.__class__.__name__.lower())
        netloc_match = self.netloc_re.match(url.netloc)
        if netloc_match.group(1) is not None:
            subdomain = netloc_match.group(1)
            cache_dir = os.path.join(cache_dir, subdomain)
        paths = re.findall(r"/([^/]+)", url.path)
        if len(paths) < 1:
            raise Exception(f"Invalid path in URL: {url.geturl()}")
        for x in paths[:len(paths)-1]:
            cache_dir = os.path.join(cache_dir, x)
        if not Path(cache_dir).is_dir():
            os.makedirs(cache_dir)
        basename = paths[-1].replace(".", "_")
        urlstring_hash = hashlib.md5(url.geturl().encode()).hexdigest()
        filename = f"{basename}-{urlstring_hash}.html"
        cache_path = os.path.join(cache_dir, filename)
        return cache_path

    def __read_cache(self, cache_path):
        if Path(cache_path).is_file():
            with open(cache_path, "r") as f:
                file_contents = f.read()
        else:
            file_contents = None
        return file_contents

    def __get(self, urlstring):
        delay = 10
        time.sleep(delay)
        now = datetime.now().strftime("%H:%M:%S")
        print(f"{now} scraping {urlstring} ...", end='')
        try:
            response = self.session.get(urlstring, timeout=10)
            print("OK")
            return response.text
        except Exception:
            print("failed")
            print("resetting session and trying again")
            self.__set_session()
            response = self.session.get(urlstring, timeout=10)
            return response.text

class Jitenon(Scraper):
    def __init__(self):
        self.domain = r"jitenon\.jp"
        Scraper.__init__(self)
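
A hypothetical usage sketch of the scraper defined above: scrape() returns page HTML, reading it back from webcache/ when the URL has already been fetched and writing a new cache file otherwise.

# Hypothetical standalone use of the scraper
import scraper as Scraper

jitenon = Scraper.Jitenon()
html = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
# The first call sleeps 10 seconds and fetches over HTTPS with retries;
# later calls for the same URL are served from webcache/jitenon/...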

util.py (new file, 26 lines added)

@@ -0,0 +1,26 @@
import re


def expand_shouryaku(shouryaku):
    """Return a list of words described by a 省略 notation.

    e.g. "有（り）合（わ）せ" -> [
        "有り合わせ",
        "有合わせ",
        "有り合せ",
        "有合せ"
    ]
    """
    groups = re.findall(r"([^（]*)(（([^）]+)）)?", shouryaku)
    forms = [""]
    for group in groups:
        # always append the text that precedes the parentheses
        new_forms = []
        for form in forms:
            new_forms.append(form + group[0])
        forms = new_forms.copy()
        if group[2] == '':
            continue
        # fork each form: once with the parenthesized text, once without
        new_forms = []
        for form in forms:
            new_forms.append(form + group[2])
        forms = new_forms.copy() + forms.copy()
    return forms
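
A hypothetical check of the expansion, assuming full-width parentheses in the notation as in the pattern above:

# Hypothetical call to expand_shouryaku()
import util as Util

forms = Util.expand_shouryaku("有（り）合（わ）せ")
# forms == ["有り合わせ", "有合わせ", "有り合せ", "有合せ"]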

yomichan.py (new file, 109 lines added)

@@ -0,0 +1,109 @@
import json
import os
import shutil
import uuid
from pathlib import Path
from css_parser import parseStyle


def create_zip(terms, index, tags=[]):
    build_directory = str(uuid.uuid4())
    os.mkdir(build_directory)

    terms_per_file = 500
    max_i = int(len(terms) / terms_per_file) + 1
    for i in range(max_i):
        term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
        with open(term_file, "w", encoding='utf8') as f:
            start = terms_per_file * i
            end = terms_per_file * (i + 1)
            json.dump(terms[start:end], f, indent=4, ensure_ascii=False)

    index_file = os.path.join(build_directory, "index.json")
    with open(index_file, 'w', encoding='utf8') as f:
        json.dump(index, f, indent=4, ensure_ascii=False)

    if len(tags) > 0:
        tag_file = os.path.join(build_directory, "tag_bank_1.json")
        with open(tag_file, 'w', encoding='utf8') as f:
            json.dump(tags, f, indent=4, ensure_ascii=False)

    zip_filename = index["title"]
    zip_file = f"{zip_filename}.zip"
    if Path(zip_file).is_file():
        os.remove(zip_file)
    shutil.make_archive(zip_filename, "zip", build_directory)

    if not Path("output").is_dir():
        os.mkdir("output")
    shutil.move(zip_file, "output")
    shutil.rmtree(build_directory)


def soup_to_gloss(soup):
    structured_content = __get_markup_structure(soup)
    return {
        "type": "structured-content",
        "content": structured_content
    }


def __get_markup_structure(soup):
    node = {}
    content = []
    for child in soup.children:
        if child.name is None:
            text = child.text.strip()
            if text != "":
                content.append(text)
        else:
            content.append(__get_markup_structure(child))

    node["tag"] = soup.name
    attributes = __get_attributes(soup.attrs)
    for key, val in attributes.items():
        node[key] = val

    if node["tag"] == "th":
        node["style"] = {"verticalAlign": "middle", "textAlign": "center"}
    elif node["tag"] == "p":
        node["tag"] = "span"

    if len(content) == 0:
        pass
    elif len(content) == 1:
        node["content"] = content[0]
    else:
        node["content"] = content

    return node


def __get_attributes(attrs):
    attributes = {}
    if "href" in attrs:
        attributes["href"] = attrs["href"]
    if "rowspan" in attrs:
        attributes["rowSpan"] = int(attrs["rowspan"])
    if "colspan" in attrs:
        attributes["colSpan"] = int(attrs["colspan"])
    if "style" in attrs:
        attributes["style"] = __get_style(attrs["style"])
    return attributes


def __get_style(inline_style_string):
    style = {}
    parsedStyle = parseStyle(inline_style_string)
    if parsedStyle.fontSize != "":
        style["fontSize"] = parsedStyle.fontSize
    if parsedStyle.verticalAlign != "":
        style["verticalAlign"] = parsedStyle.verticalAlign
    if parsedStyle.textDecoration != "":
        style["textDecorationLine"] = parsedStyle.textDecoration
    if parsedStyle.listStyleType != "":
        style["listStyleType"] = parsedStyle.listStyleType
    if parsedStyle.fontStyle != "":
        style["fontStyle"] = parsedStyle.fontStyle
    if parsedStyle.fontWeight != "":
        style["fontWeight"] = parsedStyle.fontWeight
    return style
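
A hypothetical demonstration of soup_to_gloss(), which converts a BeautifulSoup tag into the nested structured-content nodes that Yomichan renders:

# Hypothetical demonstration of soup_to_gloss() on a small fragment
from bs4 import BeautifulSoup
import yomichan as Yomichan

html = "<table><tr><th>意味</th><td>ある意味</td></tr></table>"
soup = BeautifulSoup(html, features="html.parser")
gloss = Yomichan.soup_to_gloss(soup.table)
# gloss == {
#     "type": "structured-content",
#     "content": {"tag": "table", "content": {"tag": "tr", "content": [
#         {"tag": "th", "style": {"verticalAlign": "middle", "textAlign": "center"}, "content": "意味"},
#         {"tag": "td", "content": "ある意味"},
#     ]}}
# }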