diff --git a/.gitignore b/.gitignore
index b6e4761..b009cb5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
+webcache/
+output/
+notes/
+
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..9bd9bed
--- /dev/null
+++ b/config.json
@@ -0,0 +1,6 @@
+{
+ "http-request-headers": {
+ "User-Agent": "",
+ "Accept-Language": ""
+ }
+}
diff --git a/crawlers.py b/crawlers.py
new file mode 100644
index 0000000..51017c5
--- /dev/null
+++ b/crawlers.py
@@ -0,0 +1,41 @@
+import re
+from bs4 import BeautifulSoup
+
+import scraper as Scraper
+import yomichan as Yomichan
+from jitenon_yoji import JitenonYoji
+
+
+def jitenon_yoji_crawler():
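+    """Scrape 四字熟語辞典オンライン (yoji.jitenon.jp) into a Yomichan dictionary.
+
+    Walks the gojuon index page, then each kana listing page, then each
+    individual yoji entry page. Entries are keyed by the numeric sequence
+    in the entry URL so pages already seen are skipped.
+    """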
+ entries = {}
+ jitenon = Scraper.Jitenon()
+ gojuon = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
+ gojuon_soup = BeautifulSoup(gojuon, features="html.parser")
+    for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
+ gojuon_href = gojuon_a['href']
+ kana = jitenon.scrape(gojuon_href)
+ kana_soup = BeautifulSoup(kana, features="html.parser")
+        for kana_a in kana_soup.select(".word_box a[href]"):
+ kana_href = kana_a['href']
+            sequence = int(re.search(r"([0-9]+)\.html", kana_href).group(1))
+ if sequence in entries:
+ continue
+ yoji = jitenon.scrape(kana_href)
+ yoji_soup = BeautifulSoup(yoji, features="html5lib")
+ entry = JitenonYoji(sequence)
+ entry.add_soup(yoji_soup)
+ entries[sequence] = entry
+ terms = []
+ for entry in entries.values():
+ for term in entry.yomichan_terms():
+ terms.append(term)
+ index = {
+ "title": "四字熟語辞典オンライン",
+ "revision": "test",
+ "sequenced": True,
+ "format": 3,
+ "url": "https://yoji.jitenon.jp/",
+ "attribution": "© 2012-2023 四字熟語辞典オンライン",
+ "description": "",
+ }
+ Yomichan.create_zip(terms, index)
diff --git a/jitenbot.py b/jitenbot.py
new file mode 100644
index 0000000..76a2f1f
--- /dev/null
+++ b/jitenbot.py
@@ -0,0 +1,22 @@
+""" jitenbot
+Copyright (C) 2023 Stephen Kraus
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""
+
+from crawlers import jitenon_yoji_crawler
+
+if __name__ == "__main__":
+ jitenon_yoji_crawler()
diff --git a/jitenon_yoji.py b/jitenon_yoji.py
new file mode 100644
index 0000000..ceabac2
--- /dev/null
+++ b/jitenon_yoji.py
@@ -0,0 +1,116 @@
+import re
+
+import yomichan as Yomichan
+import util as Util
+
+
+class JitenonYoji:
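+    """A single 四字熟語 entry scraped from yoji.jitenon.jp."""
+
+    # Table headers on the entry page mapped to (attribute name, default value).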
+ columns = {
+ "四字熟語": ["yojijukugo", ""],
+ "読み方": ["yomikata", ""],
+ "意味": ["imi", ""],
+ "出典": ["shutten", ""],
+ "漢検級": ["kankenkyuu", ""],
+ "場面用途": ["bamenyouto", ""],
+ "異形": ["ikei", []],
+ "類義語": ["ruigigo", []],
+ }
+
+ def __init__(self, sequence):
+ self.sequence = sequence
+ self.yomichan_glossary = [""]
+ for column in self.columns.values():
+ setattr(self, column[0], column[1])
+
+ def add_soup(self, yoji_soup):
+ table = yoji_soup.find(class_="kanjirighttb")
+ rows = table.find("tbody").find_all("tr")
+ colname = ""
+ for row in rows:
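+            # A <th> may span several rows (rowspan), so rows without one
+            # reuse the previous column name.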
+ colname = row.th.text if row.th is not None else colname
+ colval = row.td.decode_contents()
+ self.__set_column(colname, colval)
+ self.yomichan_glossary = [Yomichan.soup_to_gloss(table)]
+
+ def yomichan_terms(self):
+ terms = []
+ for idx, headword in enumerate(self.__headwords()):
+ (yoji, reading) = headword
+ definition_tags = None
+ inflection_rules = ""
+ score = -idx
+ glossary = self.yomichan_glossary
+ sequence = self.sequence
+ term_tags = ""
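+            # Yomichan term-bank (format 3) row: [expression, reading,
+            # definition_tags, rules, score, glossary, sequence, term_tags]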
+ term = [
+ yoji, reading, definition_tags, inflection_rules,
+ score, glossary, sequence, term_tags
+ ]
+ terms.append(term)
+ return terms
+
+ def __set_column(self, colname, colval):
+ attr_name = self.columns[colname][0]
+ attr_value = getattr(self, attr_name)
+ colval = colval.replace("\n", "").replace(",", "、").strip()
+ if isinstance(attr_value, str):
+ setattr(self, attr_name, colval)
+ elif isinstance(attr_value, list):
+ if len(attr_value) == 0:
+ setattr(self, attr_name, [colval])
+ else:
+ attr_value.append(colval)
+ setattr(self, attr_name, attr_value)
+
+ def __headwords(self):
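+        """Return [term, reading] pairs for this entry.
+
+        The main reading(s) come first, followed by 異形 variants; a duplicate
+        pair is moved to the back so only its last position counts.
+        """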
+ words = []
+ for yomikata in self.__yomikatas():
+ headword = [self.yojijukugo, yomikata]
+ if headword in words:
+ words.remove(headword)
+ words.append(headword)
+ for headword in self.__ikei_headwords():
+ if headword in words:
+ words.remove(headword)
+ words.append(headword)
+ return words
+
+ def __yomikatas(self):
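+        """Extract the kana reading(s) from the raw 読み方 cell.
+
+        Handles plain kana, kana followed by <br/> and further notes,
+        省略 notation (an optional part in full-width parentheses, expanded
+        by Util.expand_shouryaku), and alternative readings listed as
+        （…/…） after the main reading.
+        """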
+ m = re.search(r"^[ぁ-ヿ]+$", self.yomikata)
+ if m:
+ return [self.yomikata]
+        m = re.search(r"^([ぁ-ヿ]+)<br/>", self.yomikata)
+ if m:
+ return [m.group(1)]
+        m = re.search(r"^[ぁ-ヿ]+（[ぁ-ヿ]+）[ぁ-ヿ]+$", self.yomikata)
+ if m:
+ return Util.expand_shouryaku(self.yomikata)
+        m = re.search(r"^([ぁ-ヿ]+)（([ぁ-ヿ/\s]+)）$", self.yomikata)
+ if m:
+ yomikatas = [m.group(1)]
+ alts = m.group(2).split("/")
+ for alt in alts:
+ yomikatas.append(alt.strip())
+ return yomikatas
+ raise Exception(f"Invalid 読み方 format: {self.yomikata}\n{self}")
+
+ def __ikei_headwords(self):
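+        """Parse 異形 variants of the form 「表記（よみかた）」 into [term, reading] pairs."""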
+ ikei_headwords = []
+ for val in self.ikei:
+            m = re.search(r"^([^（]+)（([ぁ-ヿ]+)）$", val)
+ if m:
+ headword = [m.group(1), m.group(2)]
+ ikei_headwords.append(headword)
+ else:
+ raise Exception(f"Invalid 異形 format: {val}\n{self}")
+ return ikei_headwords
+
+ def __str__(self):
+ colvals = [str(self.sequence)]
+ for attr in self.columns.values():
+ attr_val = getattr(self, attr[0])
+ if isinstance(attr_val, str):
+ colvals.append(attr_val)
+ elif isinstance(attr_val, list):
+ colvals.append(";".join(attr_val))
+ return ",".join(colvals)
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..42cd4b8
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,103 @@
+import time
+import requests
+import re
+import os
+import json
+import hashlib
+
+from pathlib import Path
+from urllib.parse import urlparse
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+from datetime import datetime
+
+
+class Scraper():
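+    """Caching HTTP client: validates URLs against the subclass domain,
+    retries transient failures, and stores fetched pages under webcache/.
+    """
+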
+ def __init__(self):
+ self.netloc_re = \
+ re.compile(r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$")
+ self.__set_session()
+
+ def scrape(self, urlstring):
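+        """Return the page HTML, read from the local cache when available
+        and otherwise fetched and written to the cache."""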
+        url = urlparse(urlstring, scheme='https', allow_fragments=True)
+ self.__validate_url(url)
+ cache_path = self.__cache_path(url)
+ cache_contents = self.__read_cache(cache_path)
+ if cache_contents is not None:
+ return cache_contents
+ html = self.__get(urlstring)
+ with open(cache_path, "w") as f:
+ f.write(html)
+ return html
+
+ def __set_session(self):
+ retry_strategy = Retry(
+ total=3,
+ backoff_factor=1,
+ status_forcelist=[429, 500, 502, 503, 504],
+ allowed_methods=["HEAD", "GET", "OPTIONS"]
+ )
+ adapter = HTTPAdapter(max_retries=retry_strategy)
+ with open("config.json", "r") as f:
+ config = json.load(f)
+ headers = config["http-request-headers"]
+ self.session = requests.Session()
+ self.session.mount("https://", adapter)
+ self.session.headers.update(headers)
+
+ def __validate_url(self, url):
+ valid = False
+ if self.netloc_re.match(url.netloc):
+ valid = True
+ # may add more validators later
+ if not valid:
+ raise Exception(f"Invalid URL: {url.geturl()}")
+
+ def __cache_path(self, url):
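+        # Mirror the URL under webcache/<scraper>/<subdomain>/<path dirs>/,
+        # e.g. webcache/jitenon/yoji/cat/gojuon_html-<md5>.html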
+ cache_dir = os.path.join("webcache", self.__class__.__name__.lower())
+ netloc_match = self.netloc_re.match(url.netloc)
+ if netloc_match.group(1) is not None:
+ subdomain = netloc_match.group(1)
+ cache_dir = os.path.join(cache_dir, subdomain)
+ paths = re.findall(r"/([^/]+)", url.path)
+ if len(paths) < 1:
+ raise Exception(f"Invalid path in URL: {url.geturl()}")
+        for x in paths[:-1]:
+ cache_dir = os.path.join(cache_dir, x)
+ if not Path(cache_dir).is_dir():
+ os.makedirs(cache_dir)
+ basename = paths[-1].replace(".", "_")
+ urlstring_hash = hashlib.md5(url.geturl().encode()).hexdigest()
+ filename = f"{basename}-{urlstring_hash}.html"
+ cache_path = os.path.join(cache_dir, filename)
+ return cache_path
+
+ def __read_cache(self, cache_path):
+ if Path(cache_path).is_file():
+ with open(cache_path, "r") as f:
+ file_contents = f.read()
+ else:
+ file_contents = None
+ return file_contents
+
+ def __get(self, url):
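+        # Throttle: wait a fixed delay before every request so the site
+        # is not hammered.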
+ delay = 10
+ time.sleep(delay)
+ now = datetime.now().strftime("%H:%M:%S")
+        print(f"{now} scraping {url} ...", end='')
+ try:
+ response = self.session.get(url, timeout=10)
+ print("OK")
+ return response.text
+ except Exception:
+ print("failed")
+ print("resetting session and trying again")
+ self.__set_session()
+ response = self.session.get(url, timeout=10)
+ return response.text
+
+
+class Jitenon(Scraper):
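+    # Accepts jitenon.jp and any of its subdomains (e.g. yoji.jitenon.jp).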
+ def __init__(self):
+ self.domain = r"jitenon\.jp"
+ Scraper.__init__(self)
diff --git a/util.py b/util.py
new file mode 100644
index 0000000..ba63084
--- /dev/null
+++ b/util.py
@@ -0,0 +1,26 @@
+import re
+
+
+def expand_shouryaku(shouryaku):
+ """Return a list of words described by a 省略 notation.
+    eg. "有（り）合（わ）せ" -> [
+ "有り合わせ",
+ "有合わせ",
+ "有り合せ",
+ "有合せ"
+ ]
+ """
+    groups = re.findall(r"([^（]*)(（([^（]+)）)?", shouryaku)
+ forms = [""]
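+    # Each optional （…） group doubles the candidate list: once with the
+    # parenthesized text included and once with it omitted.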
+ for group in groups:
+ new_forms = []
+ for form in forms:
+ new_forms.append(form + group[0])
+ forms = new_forms.copy()
+ if group[2] == '':
+ continue
+ new_forms = []
+ for form in forms:
+ new_forms.append(form + group[2])
+ forms = new_forms.copy() + forms.copy()
+ return forms
diff --git a/yomichan.py b/yomichan.py
new file mode 100644
index 0000000..9d970e2
--- /dev/null
+++ b/yomichan.py
@@ -0,0 +1,109 @@
+import json
+import os
+import shutil
+import uuid
+from pathlib import Path
+from css_parser import parseStyle
+
+
+def create_zip(terms, index, tags=[]):
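+    """Write the terms (and optional tags) as Yomichan bank files in a
+    temporary build directory, zip it, and move the archive into output/.
+    """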
+ build_directory = str(uuid.uuid4())
+ os.mkdir(build_directory)
+
+ terms_per_file = 500
+    max_i = (len(terms) + terms_per_file - 1) // terms_per_file  # ceiling division, no empty trailing bank
+ for i in range(max_i):
+ term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
+ with open(term_file, "w", encoding='utf8') as f:
+ start = terms_per_file * i
+ end = terms_per_file * (i + 1)
+ json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
+
+ index_file = os.path.join(build_directory, "index.json")
+ with open(index_file, 'w', encoding='utf8') as f:
+ json.dump(index, f, indent=4, ensure_ascii=False)
+
+ if len(tags) > 0:
+ tag_file = os.path.join(build_directory, "tag_bank_1.json")
+ with open(tag_file, 'w', encoding='utf8') as f:
+ json.dump(tags, f, indent=4, ensure_ascii=False)
+
+ zip_filename = index["title"]
+ zip_file = f"{zip_filename}.zip"
+ if Path(zip_file).is_file():
+ os.remove(zip_file)
+ shutil.make_archive(zip_filename, "zip", build_directory)
+ if not Path("output").is_dir():
+ os.mkdir("output")
+    shutil.move(zip_file, os.path.join("output", zip_file))
+ shutil.rmtree(build_directory)
+
+
+def soup_to_gloss(soup):
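+    """Wrap a BeautifulSoup element as a Yomichan structured-content gloss."""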
+ structured_content = __get_markup_structure(soup)
+ return {
+ "type": "structured-content",
+ "content": structured_content
+ }
+
+
+def __get_markup_structure(soup):
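+    """Recursively convert a BeautifulSoup node into a Yomichan
+    structured-content node, keeping text, tag names, and a whitelist
+    of attributes and inline styles.
+    """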
+ node = {}
+ content = []
+ for child in soup.children:
+ if child.name is None:
+ text = child.text.strip()
+ if text != "":
+ content.append(text)
+ else:
+ content.append(__get_markup_structure(child))
+
+ node["tag"] = soup.name
+ attributes = __get_attributes(soup.attrs)
+ for key, val in attributes.items():
+ node[key] = val
+
+ if node["tag"] == "th":
+ node["style"] = {"verticalAlign": "middle", "textAlign": "center"}
+ elif node["tag"] == "p":
+ node["tag"] = "span"
+
+ if len(content) == 0:
+ pass
+ elif len(content) == 1:
+ node["content"] = content[0]
+ else:
+ node["content"] = content
+
+ return node
+
+
+def __get_attributes(attrs):
+ attributes = {}
+ if "href" in attrs:
+ attributes["href"] = attrs["href"]
+ if "rowspan" in attrs:
+ attributes["rowSpan"] = int(attrs["rowspan"])
+ if "colspan" in attrs:
+ attributes["colSpan"] = int(attrs["colspan"])
+ if "style" in attrs:
+ attributes["style"] = __get_style(attrs["style"])
+ return attributes
+
+
+def __get_style(inline_style_string):
+ style = {}
+ parsedStyle = parseStyle(inline_style_string)
+ if parsedStyle.fontSize != "":
+ style["fontSize"] = parsedStyle.fontSize
+ if parsedStyle.verticalAlign != "":
+ style["verticalAlign"] = parsedStyle.verticalAlign
+ if parsedStyle.textDecoration != "":
+ style["textDecorationLine"] = parsedStyle.textDecoration
+ if parsedStyle.listStyleType != "":
+ style["listStyleType"] = parsedStyle.listStyleType
+ if parsedStyle.fontStyle != "":
+ style["fontStyle"] = parsedStyle.fontStyle
+ if parsedStyle.fontWeight != "":
+ style["fontWeight"] = parsedStyle.fontWeight
+ return style