diff --git a/crawlers.py b/crawlers.py
index 76c2c84..ebaf0ed 100644
--- a/crawlers.py
+++ b/crawlers.py
@@ -4,9 +4,15 @@ from bs4 import BeautifulSoup
 import scraper as Scraper
 import yomichan as Yomichan
 from jitenon_yoji import JitenonYoji
+from jitenon_kotowaza import JitenonKotowaza
 
 
-def jitenon_yoji_crawler():
+def run_all():
+    jitenon_yoji()
+    jitenon_kotowaza()
+
+
+def jitenon_yoji():
     entries = {}
     jitenon = Scraper.Jitenon()
     gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
@@ -24,7 +30,6 @@ def jitenon_yoji_crawler():
             entry = JitenonYoji(sequence)
             entry.add_document(yoji_doc)
             entries[sequence] = entry
-
     terms = []
     attribution = ""
     modified_date = None
@@ -43,3 +48,46 @@ def jitenon_yoji_crawler():
"attribution": attribution,
}
Yomichan.create_zip(terms, index)
+
+
+def jitenon_kotowaza():
+    entries = {}
+    jitenon = Scraper.Jitenon()
+    gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
+    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+    for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
+        gojuon_href = gojuon_a['href']
+        kana_doc = jitenon.scrape(gojuon_href)
+        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+        for kana_a in kana_soup.select(".word_box a[href]"):
+            kana_href = kana_a['href']
+            m = re.search(r"([0-9]+)\.php", kana_href)
+            if m:
+                sequence = int(m.group(1))
+            else:
+                # print(f"Skipping {kana_href}")
+                continue
+            if sequence in entries:
+                continue
+            kotowaza_doc = jitenon.scrape(kana_href)
+            entry = JitenonKotowaza(sequence)
+            entry.add_document(kotowaza_doc)
+            entries[sequence] = entry
+    terms = []
+    attribution = ""
+    modified_date = None
+    for entry in entries.values():
+        if modified_date is None or entry.modified_date > modified_date:
+            modified_date = entry.modified_date
+            attribution = entry.attribution
+        for term in entry.yomichan_terms():
+            terms.append(term)
+    index = {
+        "title": "故事・ことわざ・慣用句オンライン",
+        "revision": f"jitenon-kotowaza.{modified_date}",
+        "sequenced": True,
+        "format": 3,
+        "url": "https://kotowaza.jitenon.jp/",
+        "attribution": attribution,
+    }
+    Yomichan.create_zip(terms, index)
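
Note: the sequence ID used to deduplicate entries is pulled from each entry's
href, which ends in a numeric filename such as "1024.php" (the full URL shape
below is an assumption for illustration). A minimal sketch of that step:

    import re

    href = "https://kotowaza.jitenon.jp/kotowaza/1024.php"  # hypothetical href
    m = re.search(r"([0-9]+)\.php", href)
    if m:
        sequence = int(m.group(1))  # -> 1024, the key used in the entries dict
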
diff --git a/jitenbot.py b/jitenbot.py
index 76a2f1f..1a6503a 100644
--- a/jitenbot.py
+++ b/jitenbot.py
@@ -16,7 +16,34 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
-from crawlers import jitenon_yoji_crawler
+import argparse
+import crawlers as Crawlers
+
+
+choices = {
+ 'all': Crawlers.run_all,
+ 'jitenon-yoji': Crawlers.jitenon_yoji,
+ 'jitenon-kotowaza': Crawlers.jitenon_kotowaza,
+}
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ prog='jitenbot',
+ description='Crawl and convert Japanese web dictionaries.')
+ parser.add_argument(
+ 'target',
+ choices=choices.keys(),
+ help='website to crawl')
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+ crawler = choices[args.target]
+ crawler()
+
if __name__ == "__main__":
- jitenon_yoji_crawler()
+ main()
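
With this entry point, a crawler is selected by name on the command line; a
usage sketch (run from the repository root):

    python jitenbot.py jitenon-kotowaza
    python jitenbot.py all

Any other target name causes argparse to exit with a usage error that lists
the valid choices.
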
diff --git a/jitenon_yoji.py b/jitenon_yoji.py
index 5b48e6d..4345dca 100644
--- a/jitenon_yoji.py
+++ b/jitenon_yoji.py
@@ -8,7 +8,7 @@ import util as Util
 
 class JitenonYoji:
     columns = {
-        "四字熟語": ["yojijukugo", ""],
+        "四字熟語": ["expression", ""],
         "読み方": ["yomikata", ""],
         "意味": ["imi", ""],
         "出典": ["shutten", ""],
@@ -79,7 +79,7 @@ class JitenonYoji:
     def __headwords(self):
         words = []
         for yomikata in self.__yomikatas():
-            headword = [self.yojijukugo, yomikata]
+            headword = [self.expression, yomikata]
             if headword in words:
                 words.remove(headword)
             words.append(headword)
@@ -90,16 +90,17 @@ class JitenonYoji:
         return words
 
     def __yomikatas(self):
-        m = re.search(r"^[ぁ-ヿ]+$", self.yomikata)
+        yomikata = self.yomikata.replace(" ", "")
+        m = re.search(r"^[ぁ-ヿ]+$", yomikata)
         if m:
-            return [self.yomikata]
-        m = re.search(r"^([ぁ-ヿ]+)<br/>", self.yomikata)
+            return [yomikata]
+        m = re.search(r"^([ぁ-ヿ]+)<br/>", yomikata)
         if m:
             return [m.group(1)]
-        m = re.search(r"^[ぁ-ヿ]+（[ぁ-ヿ]）[ぁ-ヿ]+$", self.yomikata)
+        m = re.search(r"^[ぁ-ヿ]+（[ぁ-ヿ]）[ぁ-ヿ]+$", yomikata)
         if m:
-            return Util.expand_shouryaku(self.yomikata)
-        m = re.search(r"^([ぁ-ヿ]+)（([ぁ-ヿ/\s]+)）$", self.yomikata)
+            return Util.expand_shouryaku(yomikata)
+        m = re.search(r"^([ぁ-ヿ]+)（([ぁ-ヿ/\s]+)）$", yomikata)
         if m:
             yomikatas = [m.group(1)]
             alts = m.group(2).split("/")
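
Util.expand_shouryaku is not part of this patch; judging from the pattern it
guards (a reading with optional kana in full-width parentheses), it expands
the reading into its long and short variants. A hypothetical stand-in, for
illustration only:

    import re

    def expand_shouryaku(yomikata):
        # e.g. "あさせ（に）あだなみ" -> ["あさせにあだなみ", "あさせあだなみ"]
        included = yomikata.replace("（", "").replace("）", "")
        omitted = re.sub(r"（[ぁ-ヿ]+）", "", yomikata)
        return [included, omitted]
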
diff --git a/scraper.py b/scraper.py
index 42cd4b8..a8de968 100644
--- a/scraper.py
+++ b/scraper.py
@@ -14,8 +14,8 @@ from datetime import datetime
 
 class Scraper():
     def __init__(self):
-        self.netloc_re = \
-            re.compile(r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$")
+        pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$"
+        self.netloc_re = re.compile(pattern)
         self.__set_session()
 
     def scrape(self, urlstring):
@@ -80,20 +80,20 @@ class Scraper():
             file_contents = None
         return file_contents
 
-    def __get(self, url):
+    def __get(self, urlstring):
         delay = 10
         time.sleep(delay)
         now = datetime.now().strftime("%H:%M:%S")
-        print(f"{now} scraping {url.geturl()} ...", end='')
+        print(f"{now} scraping {urlstring} ...", end='')
         try:
-            response = self.session.get(url, timeout=10)
+            response = self.session.get(urlstring, timeout=10)
             print("OK")
             return response.text
         except Exception:
             print("failed")
             print("resetting session and trying again")
             self.__set_session()
-            response = self.session.get(url, timeout=10)
+            response = self.session.get(urlstring, timeout=10)
             return response.text
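
For reference, the netloc_re built in __init__ matches the bare domain as well
as any subdomain, capturing the subdomain when present; a quick check with an
assumed domain value (the real value is defined by the Scraper subclass):

    import re

    domain = r"jitenon\.jp"  # assumption; set by the subclass
    netloc_re = re.compile(r"^(?:([A-Za-z0-9.\-]+)\.)?" + domain + r"$")

    netloc_re.match("kotowaza.jitenon.jp").group(1)  # "kotowaza"
    netloc_re.match("jitenon.jp").group(1)           # None (no subdomain)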