First version
Support for Jitenon's yoji dictionary
parent 0a155809fe
commit f9ad9e6d21
.gitignore (vendored) | 4
@@ -1,3 +1,7 @@
webcache/
output/
notes/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
config.json | 6 (new file)
@@ -0,0 +1,6 @@
{
    "http-request-headers": {
        "User-Agent": "",
        "Accept-Language": ""
    }
}
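scraper.py (later in this commit) loads these headers when it builds its requests session and applies them to every request, so the empty User-Agent and Accept-Language strings are placeholders for the user to fill in.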
crawlers.py | 41 (new file)
@@ -0,0 +1,41 @@
import re

from bs4 import BeautifulSoup

import scraper as Scraper
import yomichan as Yomichan
from jitenon_yoji import JitenonYoji


def jitenon_yoji_crawler():
    entries = {}
    jitenon = Scraper.Jitenon()
    gojuon = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
    gojuon_soup = BeautifulSoup(gojuon, features="html.parser")
    # select() takes only a CSS selector, so restrict to anchors
    # with an href attribute via [href] rather than a keyword filter
    for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
        gojuon_href = gojuon_a['href']
        kana = jitenon.scrape(gojuon_href)
        kana_soup = BeautifulSoup(kana, features="html.parser")
        for kana_a in kana_soup.select(".word_box a[href]"):
            kana_href = kana_a['href']
            sequence = int(re.search(r"([0-9]+)\.html", kana_href).group(1))
            if sequence in entries:
                continue
            yoji = jitenon.scrape(kana_href)
            yoji_soup = BeautifulSoup(yoji, features="html5lib")
            entry = JitenonYoji(sequence)
            entry.add_soup(yoji_soup)
            entries[sequence] = entry
    terms = []
    for entry in entries.values():
        for term in entry.yomichan_terms():
            terms.append(term)
    index = {
        "title": "四字熟語辞典オンライン",
        "revision": "test",
        "sequenced": True,
        "format": 3,
        "url": "https://yoji.jitenon.jp/",
        "attribution": "© 2012-2023 四字熟語辞典オンライン",
        "description": "",
    }
    Yomichan.create_zip(terms, index)
jitenbot.py | 22 (new file)
@@ -0,0 +1,22 @@
""" jitenbot
Copyright (C) 2023 Stephen Kraus

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.

"""

from crawlers import jitenon_yoji_crawler

if __name__ == "__main__":
    jitenon_yoji_crawler()
jitenon_yoji.py | 116 (new file)
@@ -0,0 +1,116 @@
import re

import yomichan as Yomichan
import util as Util


class JitenonYoji:
    columns = {
        "四字熟語": ["yojijukugo", ""],
        "読み方": ["yomikata", ""],
        "意味": ["imi", ""],
        "出典": ["shutten", ""],
        "漢検級": ["kankenkyuu", ""],
        "場面用途": ["bamenyouto", ""],
        "異形": ["ikei", []],
        "類義語": ["ruigigo", []],
    }

    def __init__(self, sequence):
        self.sequence = sequence
        self.yomichan_glossary = [""]
        for column in self.columns.values():
            setattr(self, column[0], column[1])

    def add_soup(self, yoji_soup):
        table = yoji_soup.find(class_="kanjirighttb")
        rows = table.find("tbody").find_all("tr")
        colname = ""
        for row in rows:
            # rows under a rowspan'd <th> reuse the previous column name
            colname = row.th.text if row.th is not None else colname
            colval = row.td.decode_contents()
            self.__set_column(colname, colval)
        self.yomichan_glossary = [Yomichan.soup_to_gloss(table)]

    def yomichan_terms(self):
        terms = []
        for idx, headword in enumerate(self.__headwords()):
            (yoji, reading) = headword
            definition_tags = None
            inflection_rules = ""
            score = -idx
            glossary = self.yomichan_glossary
            sequence = self.sequence
            term_tags = ""
            term = [
                yoji, reading, definition_tags, inflection_rules,
                score, glossary, sequence, term_tags
            ]
            terms.append(term)
        return terms

    def __set_column(self, colname, colval):
        attr_name = self.columns[colname][0]
        attr_value = getattr(self, attr_name)
        colval = colval.replace("\n", "").replace(",", "、").strip()
        if isinstance(attr_value, str):
            setattr(self, attr_name, colval)
        elif isinstance(attr_value, list):
            if len(attr_value) == 0:
                setattr(self, attr_name, [colval])
            else:
                attr_value.append(colval)
                setattr(self, attr_name, attr_value)

    def __headwords(self):
        words = []
        for yomikata in self.__yomikatas():
            headword = [self.yojijukugo, yomikata]
            if headword in words:
                words.remove(headword)
            words.append(headword)
        for headword in self.__ikei_headwords():
            if headword in words:
                words.remove(headword)
            words.append(headword)
        return words

    def __yomikatas(self):
        # the patterns below match literal fullwidth （） parentheses
        # used in the site's reading notations
        m = re.search(r"^[ぁ-ヿ]+$", self.yomikata)
        if m:
            return [self.yomikata]
        m = re.search(r"^([ぁ-ヿ]+)<br/>", self.yomikata)
        if m:
            return [m.group(1)]
        m = re.search(r"^[ぁ-ヿ]+（[ぁ-ヿ]）[ぁ-ヿ]+$", self.yomikata)
        if m:
            return Util.expand_shouryaku(self.yomikata)
        m = re.search(r"^([ぁ-ヿ]+)（([ぁ-ヿ/\s]+)）$", self.yomikata)
        if m:
            yomikatas = [m.group(1)]
            alts = m.group(2).split("/")
            for alt in alts:
                yomikatas.append(alt.strip())
            return yomikatas
        raise Exception(f"Invalid 読み方 format: {self.yomikata}\n{self}")

    def __ikei_headwords(self):
        ikei_headwords = []
        for val in self.ikei:
            m = re.search(r"^([^（]+)（([ぁ-ヿ]+)）$", val)
            if m:
                headword = [m.group(1), m.group(2)]
                ikei_headwords.append(headword)
            else:
                raise Exception(f"Invalid 異形 format: {val}\n{self}")
        return ikei_headwords

    def __str__(self):
        colvals = [str(self.sequence)]
        for attr in self.columns.values():
            attr_val = getattr(self, attr[0])
            if isinstance(attr_val, str):
                colvals.append(attr_val)
            elif isinstance(attr_val, list):
                colvals.append(";".join(attr_val))
        return ",".join(colvals)
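For a sense of the output shape, a minimal sketch of yomichan_terms() with hypothetical field values (no page scrape involved, so the glossary is still the placeholder set in __init__):

from jitenon_yoji import JitenonYoji

entry = JitenonYoji(1)
entry.yojijukugo = "以心伝心"       # hypothetical 四字熟語
entry.yomikata = "いしんでんしん"   # plain-kana reading, first __yomikatas branch
print(entry.yomichan_terms())
# [['以心伝心', 'いしんでんしん', None, '', 0, [''], 1, '']]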
scraper.py | 103 (new file)
@@ -0,0 +1,103 @@
import time
import requests
import re
import os
import json
import hashlib

from pathlib import Path
from urllib.parse import urlparse
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from datetime import datetime


class Scraper():
    def __init__(self):
        self.netloc_re = \
            re.compile(r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$")
        self.__set_session()

    def scrape(self, urlstring):
        # urlparse takes a bare scheme name, not "https://"
        url = urlparse(urlstring, scheme='https', allow_fragments=True)
        self.__validate_url(url)
        cache_path = self.__cache_path(url)
        cache_contents = self.__read_cache(cache_path)
        if cache_contents is not None:
            return cache_contents
        html = self.__get(urlstring)
        with open(cache_path, "w") as f:
            f.write(html)
        return html

    def __set_session(self):
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        with open("config.json", "r") as f:
            config = json.load(f)
        headers = config["http-request-headers"]
        self.session = requests.Session()
        self.session.mount("https://", adapter)
        self.session.headers.update(headers)

    def __validate_url(self, url):
        valid = False
        if self.netloc_re.match(url.netloc):
            valid = True
        # may add more validators later
        if not valid:
            raise Exception(f"Invalid URL: {url.geturl()}")

    def __cache_path(self, url):
        cache_dir = os.path.join("webcache", self.__class__.__name__.lower())
        netloc_match = self.netloc_re.match(url.netloc)
        if netloc_match.group(1) is not None:
            subdomain = netloc_match.group(1)
            cache_dir = os.path.join(cache_dir, subdomain)
        paths = re.findall(r"/([^/]+)", url.path)
        if len(paths) < 1:
            raise Exception(f"Invalid path in URL: {url.geturl()}")
        for x in paths[:len(paths)-1]:
            cache_dir = os.path.join(cache_dir, x)
        if not Path(cache_dir).is_dir():
            os.makedirs(cache_dir)
        basename = paths[-1].replace(".", "_")
        urlstring_hash = hashlib.md5(url.geturl().encode()).hexdigest()
        filename = f"{basename}-{urlstring_hash}.html"
        cache_path = os.path.join(cache_dir, filename)
        return cache_path

    def __read_cache(self, cache_path):
        if Path(cache_path).is_file():
            with open(cache_path, "r") as f:
                file_contents = f.read()
        else:
            file_contents = None
        return file_contents

    def __get(self, urlstring):
        # this method receives the raw URL string, so it can be printed
        # and passed to requests directly
        delay = 10
        time.sleep(delay)
        now = datetime.now().strftime("%H:%M:%S")
        print(f"{now} scraping {urlstring} ...", end='')
        try:
            response = self.session.get(urlstring, timeout=10)
            print("OK")
            return response.text
        except Exception:
            print("failed")
            print("resetting session and trying again")
            self.__set_session()
            response = self.session.get(urlstring, timeout=10)
            return response.text


class Jitenon(Scraper):
    def __init__(self):
        self.domain = r"jitenon\.jp"
        Scraper.__init__(self)
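A quick usage sketch (the URL is one the crawler actually visits; the cache location follows from __cache_path):

import scraper as Scraper

jitenon = Scraper.Jitenon()
# First call sleeps 10 seconds, fetches the page, and writes the HTML
# under webcache/jitenon/yoji/cat/; repeat calls are answered from cache.
html = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")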
util.py | 26 (new file)
@@ -0,0 +1,26 @@
import re


def expand_shouryaku(shouryaku):
    """Return a list of words described by a 省略 notation.

    e.g. "有（り）合（わ）せ" -> [
        "有り合わせ",
        "有合わせ",
        "有り合せ",
        "有合せ"
    ]
    """
    # literal fullwidth parentheses mark the optional segments
    groups = re.findall(r"([^（]*)(（([^（]+)）)?", shouryaku)
    forms = [""]
    for group in groups:
        new_forms = []
        for form in forms:
            new_forms.append(form + group[0])
        forms = new_forms.copy()
        if group[2] == '':
            continue
        new_forms = []
        for form in forms:
            new_forms.append(form + group[2])
        forms = new_forms.copy() + forms.copy()
    return forms
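A quick check of the docstring example (assuming the fullwidth 省略 parentheses shown above):

import util

# Each （…） group is optional, so n groups yield 2**n spelling variants.
print(util.expand_shouryaku("有（り）合（わ）せ"))
# ['有り合わせ', '有合わせ', '有り合せ', '有合せ']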
yomichan.py | 109 (new file)
@@ -0,0 +1,109 @@
import json
import os
import shutil
import uuid

from pathlib import Path
from css_parser import parseStyle


def create_zip(terms, index, tags=[]):
    build_directory = str(uuid.uuid4())
    os.mkdir(build_directory)

    # split the terms into term_bank_N.json files of 500 entries each
    terms_per_file = 500
    max_i = int(len(terms) / terms_per_file) + 1
    for i in range(max_i):
        term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
        with open(term_file, "w", encoding='utf8') as f:
            start = terms_per_file * i
            end = terms_per_file * (i + 1)
            json.dump(terms[start:end], f, indent=4, ensure_ascii=False)

    index_file = os.path.join(build_directory, "index.json")
    with open(index_file, 'w', encoding='utf8') as f:
        json.dump(index, f, indent=4, ensure_ascii=False)

    if len(tags) > 0:
        tag_file = os.path.join(build_directory, "tag_bank_1.json")
        with open(tag_file, 'w', encoding='utf8') as f:
            json.dump(tags, f, indent=4, ensure_ascii=False)

    zip_filename = index["title"]
    zip_file = f"{zip_filename}.zip"
    if Path(zip_file).is_file():
        os.remove(zip_file)
    shutil.make_archive(zip_filename, "zip", build_directory)
    if not Path("output").is_dir():
        os.mkdir("output")
    shutil.move(zip_file, "output")
    shutil.rmtree(build_directory)


def soup_to_gloss(soup):
    structured_content = __get_markup_structure(soup)
    return {
        "type": "structured-content",
        "content": structured_content
    }


def __get_markup_structure(soup):
    node = {}
    content = []
    for child in soup.children:
        if child.name is None:
            text = child.text.strip()
            if text != "":
                content.append(text)
        else:
            content.append(__get_markup_structure(child))

    node["tag"] = soup.name
    attributes = __get_attributes(soup.attrs)
    for key, val in attributes.items():
        node[key] = val

    if node["tag"] == "th":
        node["style"] = {"verticalAlign": "middle", "textAlign": "center"}
    elif node["tag"] == "p":
        node["tag"] = "span"

    if len(content) == 0:
        pass
    elif len(content) == 1:
        node["content"] = content[0]
    else:
        node["content"] = content

    return node


def __get_attributes(attrs):
    attributes = {}
    if "href" in attrs:
        attributes["href"] = attrs["href"]
    if "rowspan" in attrs:
        attributes["rowSpan"] = int(attrs["rowspan"])
    if "colspan" in attrs:
        attributes["colSpan"] = int(attrs["colspan"])
    if "style" in attrs:
        attributes["style"] = __get_style(attrs["style"])
    return attributes


def __get_style(inline_style_string):
    style = {}
    parsedStyle = parseStyle(inline_style_string)
    if parsedStyle.fontSize != "":
        style["fontSize"] = parsedStyle.fontSize
    if parsedStyle.verticalAlign != "":
        style["verticalAlign"] = parsedStyle.verticalAlign
    if parsedStyle.textDecoration != "":
        style["textDecorationLine"] = parsedStyle.textDecoration
    if parsedStyle.listStyleType != "":
        style["listStyleType"] = parsedStyle.listStyleType
    if parsedStyle.fontStyle != "":
        style["fontStyle"] = parsedStyle.fontStyle
    if parsedStyle.fontWeight != "":
        style["fontWeight"] = parsedStyle.fontWeight
    return style
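For reference, each term row follows Yomichan's format-3 term-bank layout, [expression, reading, definition_tags, inflection_rules, score, glossary, sequence, term_tags]; a minimal sketch of create_zip with hypothetical data:

import yomichan as Yomichan

terms = [
    ["一石二鳥", "いっせきにちょう", None, "", 0,
     ["two birds with one stone"], 1, ""],
]
index = {"title": "example", "revision": "1", "sequenced": True, "format": 3}
Yomichan.create_zip(terms, index)   # writes output/example.zip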