diff --git a/.gitignore b/.gitignore
index b6e4761..b009cb5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
+webcache/
+output/
+notes/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..9bd9bed
--- /dev/null
+++ b/config.json
@@ -0,0 +1,6 @@
+{
+    "http-request-headers": {
+        "User-Agent": "",
+        "Accept-Language": ""
+    }
+}
diff --git a/crawlers.py b/crawlers.py
new file mode 100644
index 0000000..51017c5
--- /dev/null
+++ b/crawlers.py
@@ -0,0 +1,41 @@
+import re
+from bs4 import BeautifulSoup
+
+import scraper as Scraper
+import yomichan as Yomichan
+from jitenon_yoji import JitenonYoji
+
+
+def jitenon_yoji_crawler():
+    entries = {}
+    jitenon = Scraper.Jitenon()
+    gojuon = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
+    gojuon_soup = BeautifulSoup(gojuon, features="html.parser")
+    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
+        gojuon_href = gojuon_a['href']
+        kana = jitenon.scrape(gojuon_href)
+        kana_soup = BeautifulSoup(kana, features="html.parser")
+        for kana_a in kana_soup.select(".word_box a", href=True):
+            kana_href = kana_a['href']
+            sequence = int(re.search(r"([0-9]+).html", kana_href).group(1))
+            if sequence in entries:
+                continue
+            yoji = jitenon.scrape(kana_href)
+            yoji_soup = BeautifulSoup(yoji, features="html5lib")
+            entry = JitenonYoji(sequence)
+            entry.add_soup(yoji_soup)
+            entries[sequence] = entry
+    terms = []
+    for entry in entries.values():
+        for term in entry.yomichan_terms():
+            terms.append(term)
+    index = {
+        "title": "四字熟語辞典オンライン",
+        "revision": "test",
+        "sequenced": True,
+        "format": 3,
+        "url": "https://yoji.jitenon.jp/",
+        "attribution": "© 2012-2023 四字熟語辞典オンライン",
+        "description": "",
+    }
+    Yomichan.create_zip(terms, index)
diff --git a/jitenbot.py b/jitenbot.py
new file mode 100644
index 0000000..76a2f1f
--- /dev/null
+++ b/jitenbot.py
@@ -0,0 +1,22 @@
+""" jitenbot
+Copyright (C) 2023 Stephen Kraus
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>.
+ +""" + +from crawlers import jitenon_yoji_crawler + +if __name__ == "__main__": + jitenon_yoji_crawler() diff --git a/jitenon_yoji.py b/jitenon_yoji.py new file mode 100644 index 0000000..ceabac2 --- /dev/null +++ b/jitenon_yoji.py @@ -0,0 +1,116 @@ +import re + +import yomichan as Yomichan +import util as Util + + +class JitenonYoji: + columns = { + "四字熟語": ["yojijukugo", ""], + "読み方": ["yomikata", ""], + "意味": ["imi", ""], + "出典": ["shutten", ""], + "漢検級": ["kankenkyuu", ""], + "場面用途": ["bamenyouto", ""], + "異形": ["ikei", []], + "類義語": ["ruigigo", []], + } + + def __init__(self, sequence): + self.sequence = sequence + self.yomichan_glossary = [""] + for column in self.columns.values(): + setattr(self, column[0], column[1]) + + def add_soup(self, yoji_soup): + table = yoji_soup.find(class_="kanjirighttb") + rows = table.find("tbody").find_all("tr") + colname = "" + for row in rows: + colname = row.th.text if row.th is not None else colname + colval = row.td.decode_contents() + self.__set_column(colname, colval) + self.yomichan_glossary = [Yomichan.soup_to_gloss(table)] + + def yomichan_terms(self): + terms = [] + for idx, headword in enumerate(self.__headwords()): + (yoji, reading) = headword + definition_tags = None + inflection_rules = "" + score = -idx + glossary = self.yomichan_glossary + sequence = self.sequence + term_tags = "" + term = [ + yoji, reading, definition_tags, inflection_rules, + score, glossary, sequence, term_tags + ] + terms.append(term) + return terms + + def __set_column(self, colname, colval): + attr_name = self.columns[colname][0] + attr_value = getattr(self, attr_name) + colval = colval.replace("\n", "").replace(",", "、").strip() + if isinstance(attr_value, str): + setattr(self, attr_name, colval) + elif isinstance(attr_value, list): + if len(attr_value) == 0: + setattr(self, attr_name, [colval]) + else: + attr_value.append(colval) + setattr(self, attr_name, attr_value) + + def __headwords(self): + words = [] + for yomikata in self.__yomikatas(): + headword = [self.yojijukugo, yomikata] + if headword in words: + words.remove(headword) + words.append(headword) + for headword in self.__ikei_headwords(): + if headword in words: + words.remove(headword) + words.append(headword) + return words + + def __yomikatas(self): + m = re.search(r"^[ぁ-ヿ]+$", self.yomikata) + if m: + return [self.yomikata] + m = re.search(r"^([ぁ-ヿ]+)
", self.yomikata) + if m: + return [m.group(1)] + m = re.search(r"^[ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+$", self.yomikata) + if m: + return Util.expand_shouryaku(self.yomikata) + m = re.search(r"^([ぁ-ヿ]+)(([ぁ-ヿ/\s]+))$", self.yomikata) + if m: + yomikatas = [m.group(1)] + alts = m.group(2).split("/") + for alt in alts: + yomikatas.append(alt.strip()) + return yomikatas + raise Exception(f"Invalid 読み方 format: {self.yomikata}\n{self}") + + def __ikei_headwords(self): + ikei_headwords = [] + for val in self.ikei: + m = re.search(r"^([^(]+)(([ぁ-ヿ]+))$", val) + if m: + headword = [m.group(1), m.group(2)] + ikei_headwords.append(headword) + else: + raise Exception(f"Invalid 異形 format: {val}\n{self}") + return ikei_headwords + + def __str__(self): + colvals = [str(self.sequence)] + for attr in self.columns.values(): + attr_val = getattr(self, attr[0]) + if isinstance(attr_val, str): + colvals.append(attr_val) + elif isinstance(attr_val, list): + colvals.append(";".join(attr_val)) + return ",".join(colvals) diff --git a/scraper.py b/scraper.py new file mode 100644 index 0000000..42cd4b8 --- /dev/null +++ b/scraper.py @@ -0,0 +1,103 @@ +import time +import requests +import re +import os +import json +import hashlib + +from pathlib import Path +from urllib.parse import urlparse +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry +from datetime import datetime + + +class Scraper(): + def __init__(self): + self.netloc_re = \ + re.compile(r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$") + self.__set_session() + + def scrape(self, urlstring): + url = urlparse(urlstring, scheme='https://', allow_fragments=True) + self.__validate_url(url) + cache_path = self.__cache_path(url) + cache_contents = self.__read_cache(cache_path) + if cache_contents is not None: + return cache_contents + html = self.__get(urlstring) + with open(cache_path, "w") as f: + f.write(html) + return html + + def __set_session(self): + retry_strategy = Retry( + total=3, + backoff_factor=1, + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods=["HEAD", "GET", "OPTIONS"] + ) + adapter = HTTPAdapter(max_retries=retry_strategy) + with open("config.json", "r") as f: + config = json.load(f) + headers = config["http-request-headers"] + self.session = requests.Session() + self.session.mount("https://", adapter) + self.session.headers.update(headers) + + def __validate_url(self, url): + valid = False + if self.netloc_re.match(url.netloc): + valid = True + # may add more validators later + if not valid: + raise Exception(f"Invalid URL: {url.geturl()}") + + def __cache_path(self, url): + cache_dir = os.path.join("webcache", self.__class__.__name__.lower()) + netloc_match = self.netloc_re.match(url.netloc) + if netloc_match.group(1) is not None: + subdomain = netloc_match.group(1) + cache_dir = os.path.join(cache_dir, subdomain) + paths = re.findall(r"/([^/]+)", url.path) + if len(paths) < 1: + raise Exception(f"Invalid path in URL: {url.geturl()}") + for x in paths[:len(paths)-1]: + cache_dir = os.path.join(cache_dir, x) + if not Path(cache_dir).is_dir(): + os.makedirs(cache_dir) + basename = paths[-1].replace(".", "_") + urlstring_hash = hashlib.md5(url.geturl().encode()).hexdigest() + filename = f"{basename}-{urlstring_hash}.html" + cache_path = os.path.join(cache_dir, filename) + return cache_path + + def __read_cache(self, cache_path): + if Path(cache_path).is_file(): + with open(cache_path, "r") as f: + file_contents = f.read() + else: + file_contents = None + return file_contents + + def __get(self, 
+        delay = 10
+        time.sleep(delay)
+        now = datetime.now().strftime("%H:%M:%S")
+        print(f"{now} scraping {url.geturl()} ...", end='')
+        try:
+            response = self.session.get(url.geturl(), timeout=10)
+            print("OK")
+            return response.text
+        except Exception:
+            print("failed")
+            print("resetting session and trying again")
+            self.__set_session()
+            response = self.session.get(url.geturl(), timeout=10)
+            return response.text
+
+
+class Jitenon(Scraper):
+    def __init__(self):
+        self.domain = r"jitenon\.jp"
+        Scraper.__init__(self)
diff --git a/util.py b/util.py
new file mode 100644
index 0000000..ba63084
--- /dev/null
+++ b/util.py
@@ -0,0 +1,26 @@
+import re
+
+
+def expand_shouryaku(shouryaku):
+    """Return a list of words described by a 省略 notation.
+    eg. "有（り）合（わ）せ" -> [
+        "有り合わせ",
+        "有合わせ",
+        "有り合せ",
+        "有合せ"
+    ]
+    """
+    groups = re.findall(r"([^（]*)(（([^（]+)）)?", shouryaku)
+    forms = [""]
+    for group in groups:
+        new_forms = []
+        for form in forms:
+            new_forms.append(form + group[0])
+        forms = new_forms.copy()
+        if group[2] == '':
+            continue
+        new_forms = []
+        for form in forms:
+            new_forms.append(form + group[2])
+        forms = new_forms.copy() + forms.copy()
+    return forms
diff --git a/yomichan.py b/yomichan.py
new file mode 100644
index 0000000..9d970e2
--- /dev/null
+++ b/yomichan.py
@@ -0,0 +1,109 @@
+import json
+import os
+import shutil
+import uuid
+from pathlib import Path
+from css_parser import parseStyle
+
+
+def create_zip(terms, index, tags=[]):
+    build_directory = str(uuid.uuid4())
+    os.mkdir(build_directory)
+
+    terms_per_file = 500
+    max_i = int(len(terms) / terms_per_file) + 1
+    for i in range(max_i):
+        term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
+        with open(term_file, "w", encoding='utf8') as f:
+            start = terms_per_file * i
+            end = terms_per_file * (i + 1)
+            json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
+
+    index_file = os.path.join(build_directory, "index.json")
+    with open(index_file, 'w', encoding='utf8') as f:
+        json.dump(index, f, indent=4, ensure_ascii=False)
+
+    if len(tags) > 0:
+        tag_file = os.path.join(build_directory, "tag_bank_1.json")
+        with open(tag_file, 'w', encoding='utf8') as f:
+            json.dump(tags, f, indent=4, ensure_ascii=False)
+
+    zip_filename = index["title"]
+    zip_file = f"{zip_filename}.zip"
+    if Path(zip_file).is_file():
+        os.remove(zip_file)
+    shutil.make_archive(zip_filename, "zip", build_directory)
+    if not Path("output").is_dir():
+        os.mkdir("output")
+    shutil.move(zip_file, "output")
+    shutil.rmtree(build_directory)
+
+
+def soup_to_gloss(soup):
+    structured_content = __get_markup_structure(soup)
+    return {
+        "type": "structured-content",
+        "content": structured_content
+    }
+
+
+def __get_markup_structure(soup):
+    node = {}
+    content = []
+    for child in soup.children:
+        if child.name is None:
+            text = child.text.strip()
+            if text != "":
+                content.append(text)
+        else:
+            content.append(__get_markup_structure(child))
+
+    node["tag"] = soup.name
+    attributes = __get_attributes(soup.attrs)
+    for key, val in attributes.items():
+        node[key] = val
+
+    if node["tag"] == "th":
+        node["style"] = {"verticalAlign": "middle", "textAlign": "center"}
+    elif node["tag"] == "p":
+        node["tag"] = "span"
+
+    if len(content) == 0:
+        pass
+    elif len(content) == 1:
+        node["content"] = content[0]
+    else:
+        node["content"] = content
+
+    return node
+
+
+def __get_attributes(attrs):
+    attributes = {}
+    if "href" in attrs:
+        attributes["href"] = attrs["href"]
+    if "rowspan" in attrs:
+        attributes["rowSpan"] = int(attrs["rowspan"])
+    if "colspan" in attrs:
attributes["colSpan"] = int(attrs["colspan"]) + if "style" in attrs: + attributes["style"] = __get_style(attrs["style"]) + return attributes + + +def __get_style(inline_style_string): + style = {} + parsedStyle = parseStyle(inline_style_string) + if parsedStyle.fontSize != "": + style["fontSize"] = parsedStyle.fontSize + if parsedStyle.verticalAlign != "": + style["verticalAlign"] = parsedStyle.verticalAlign + if parsedStyle.textDecoration != "": + style["textDecorationLine"] = parsedStyle.textDecoration + if parsedStyle.listStyleType != "": + style["listStyleType"] = parsedStyle.listStyleType + if parsedStyle.fontStyle != "": + style["fontStyle"] = parsedStyle.fontStyle + if parsedStyle.fontWeight != "": + style["fontWeight"] = parsedStyle.fontWeight + return style