Add support for Jitenon Kotowaza
This commit is contained in:
parent
79632843cb
commit
16d694d2d2
52
crawlers.py
52
crawlers.py
|
@ -4,9 +4,15 @@ from bs4 import BeautifulSoup
|
||||||
import scraper as Scraper
|
import scraper as Scraper
|
||||||
import yomichan as Yomichan
|
import yomichan as Yomichan
|
||||||
from jitenon_yoji import JitenonYoji
|
from jitenon_yoji import JitenonYoji
|
||||||
|
from jitenon_kotowaza import JitenonKotowaza
|
||||||
|
|
||||||
|
|
||||||
def jitenon_yoji_crawler():
|
def run_all():
    """Run every crawler defined in this module, one after another."""
    for crawler in (jitenon_yoji, jitenon_kotowaza):
        crawler()
|
||||||
|
|
||||||
|
|
||||||
|
def jitenon_yoji():
|
||||||
entries = {}
|
entries = {}
|
||||||
jitenon = Scraper.Jitenon()
|
jitenon = Scraper.Jitenon()
|
||||||
gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
|
gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
|
||||||
|
@ -24,7 +30,6 @@ def jitenon_yoji_crawler():
|
||||||
entry = JitenonYoji(sequence)
|
entry = JitenonYoji(sequence)
|
||||||
entry.add_document(yoji_doc)
|
entry.add_document(yoji_doc)
|
||||||
entries[sequence] = entry
|
entries[sequence] = entry
|
||||||
|
|
||||||
terms = []
|
terms = []
|
||||||
attribution = ""
|
attribution = ""
|
||||||
modified_date = None
|
modified_date = None
|
||||||
|
@ -43,3 +48,46 @@ def jitenon_yoji_crawler():
|
||||||
"attribution": attribution,
|
"attribution": attribution,
|
||||||
}
|
}
|
||||||
Yomichan.create_zip(terms, index)
|
Yomichan.create_zip(terms, index)
|
||||||
|
|
||||||
|
|
||||||
|
def jitenon_kotowaza():
    """Crawl kotowaza.jitenon.jp and package the results via Yomichan.

    Starting from the gojuon index page, every link inside ``.kana_area``
    is followed to a per-kana listing page; every link inside
    ``.word_box`` whose URL contains ``<digits>.php`` is then scraped and
    stored as a ``JitenonKotowaza`` entry keyed by that number.  The
    collected terms and an index dict are finally passed to
    ``Yomichan.create_zip``.
    """
    entries = {}
    jitenon = Scraper.Jitenon()
    gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
    # `href=True` is a find_all() filter and does not filter select()
    # results; the `[href]` attribute selector guarantees that the
    # subscript lookups below cannot raise KeyError.
    for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
        kana_doc = jitenon.scrape(gojuon_a['href'])
        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
        for kana_a in kana_soup.select(".word_box a[href]"):
            kana_href = kana_a['href']
            # The dot is escaped so only a literal ".php" suffix matches.
            m = re.search(r"([0-9]+)\.php", kana_href)
            if m is None:
                # Not an entry page; nothing to scrape.
                continue
            sequence = int(m.group(1))
            if sequence in entries:
                # Already scraped through another listing page.
                continue
            kotowaza_doc = jitenon.scrape(kana_href)
            entry = JitenonKotowaza(sequence)
            entry.add_document(kotowaza_doc)
            entries[sequence] = entry

    terms = []
    attribution = ""
    modified_date = None
    for entry in entries.values():
        # Track the most recent modification date across all entries and
        # take the attribution text from that same entry.
        if modified_date is None or entry.modified_date > modified_date:
            modified_date = entry.modified_date
            attribution = entry.attribution
        terms.extend(entry.yomichan_terms())

    index = {
        "title": "故事・ことわざ・慣用句オンライン",
        "revision": f"jitenon-kotowaza.{modified_date}",
        "sequenced": True,
        "format": 3,
        "url": "https://kotowaza.jitenon.jp/",
        "attribution": attribution,
    }
    Yomichan.create_zip(terms, index)
|
||||||
|
|
31
jitenbot.py
31
jitenbot.py
|
@ -16,7 +16,34 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from crawlers import jitenon_yoji_crawler
|
import argparse
|
||||||
|
import crawlers as Crawlers
|
||||||
|
|
||||||
|
|
||||||
|
choices = {
|
||||||
|
'all': Crawlers.run_all,
|
||||||
|
'jitenon-yoji': Crawlers.jitenon_yoji,
|
||||||
|
'jitenon-kotowaza': Crawlers.jitenon_kotowaza,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command line and return the resulting namespace.

    A single positional argument, ``target``, is accepted; it must be
    one of the keys of the module-level ``choices`` mapping.
    """
    arg_parser = argparse.ArgumentParser(
        prog='jitenbot',
        description='Crawl and convert Japanese web dictionaries.',
    )
    arg_parser.add_argument(
        'target',
        choices=choices.keys(),
        help='website to crawl',
    )
    return arg_parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Look up the crawler selected on the command line and run it."""
    target = parse_args().target
    run_crawler = choices[target]
    run_crawler()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
jitenon_yoji_crawler()
|
main()
|
||||||
|
|
|
@ -8,7 +8,7 @@ import util as Util
|
||||||
|
|
||||||
class JitenonYoji:
|
class JitenonYoji:
|
||||||
columns = {
|
columns = {
|
||||||
"四字熟語": ["yojijukugo", ""],
|
"四字熟語": ["expression", ""],
|
||||||
"読み方": ["yomikata", ""],
|
"読み方": ["yomikata", ""],
|
||||||
"意味": ["imi", ""],
|
"意味": ["imi", ""],
|
||||||
"出典": ["shutten", ""],
|
"出典": ["shutten", ""],
|
||||||
|
@ -79,7 +79,7 @@ class JitenonYoji:
|
||||||
def __headwords(self):
|
def __headwords(self):
|
||||||
words = []
|
words = []
|
||||||
for yomikata in self.__yomikatas():
|
for yomikata in self.__yomikatas():
|
||||||
headword = [self.yojijukugo, yomikata]
|
headword = [self.expression, yomikata]
|
||||||
if headword in words:
|
if headword in words:
|
||||||
words.remove(headword)
|
words.remove(headword)
|
||||||
words.append(headword)
|
words.append(headword)
|
||||||
|
@ -90,16 +90,17 @@ class JitenonYoji:
|
||||||
return words
|
return words
|
||||||
|
|
||||||
def __yomikatas(self):
|
def __yomikatas(self):
|
||||||
m = re.search(r"^[ぁ-ヿ]+$", self.yomikata)
|
yomikata = self.yomikata.replace(" ", "")
|
||||||
|
m = re.search(r"^[ぁ-ヿ]+$", yomikata)
|
||||||
if m:
|
if m:
|
||||||
return [self.yomikata]
|
return [yomikata]
|
||||||
m = re.search(r"^([ぁ-ヿ]+)<br/>", self.yomikata)
|
m = re.search(r"^([ぁ-ヿ]+)<br/>", yomikata)
|
||||||
if m:
|
if m:
|
||||||
return [m.group(1)]
|
return [m.group(1)]
|
||||||
m = re.search(r"^[ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+$", self.yomikata)
|
m = re.search(r"^[ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+$", yomikata)
|
||||||
if m:
|
if m:
|
||||||
return Util.expand_shouryaku(self.yomikata)
|
return Util.expand_shouryaku(yomikata)
|
||||||
m = re.search(r"^([ぁ-ヿ]+)(([ぁ-ヿ/\s]+))$", self.yomikata)
|
m = re.search(r"^([ぁ-ヿ]+)(([ぁ-ヿ/\s]+))$", yomikata)
|
||||||
if m:
|
if m:
|
||||||
yomikatas = [m.group(1)]
|
yomikatas = [m.group(1)]
|
||||||
alts = m.group(2).split("/")
|
alts = m.group(2).split("/")
|
||||||
|
|
12
scraper.py
12
scraper.py
|
@ -14,8 +14,8 @@ from datetime import datetime
|
||||||
|
|
||||||
class Scraper():
|
class Scraper():
|
||||||
def __init__(self):
    """Compile the netloc-matching regex and set up the session.

    The pattern accepts ``self.domain`` on its own or preceded by a
    dotted prefix of letters, digits, dots and hyphens, which is
    captured as group 1.
    """
    # NOTE(review): self.domain is concatenated into the pattern as-is,
    # so it is presumably already a regex fragment -- confirm where it
    # is assigned.
    prefix = r"^(?:([A-Za-z0-9.\-]+)\.)?"
    suffix = r"$"
    self.netloc_re = re.compile(prefix + self.domain + suffix)
    self.__set_session()
|
||||||
|
|
||||||
def scrape(self, urlstring):
|
def scrape(self, urlstring):
|
||||||
|
@ -80,20 +80,20 @@ class Scraper():
|
||||||
file_contents = None
|
file_contents = None
|
||||||
return file_contents
|
return file_contents
|
||||||
|
|
||||||
def __get(self, urlstring):
    """Fetch ``urlstring`` through the session and return the body text.

    Sleeps for a fixed delay before the request.  If the request raises,
    the session is rebuilt with ``__set_session`` and the request is
    retried once; an exception from the retry propagates to the caller.
    """
    delay = 10
    time.sleep(delay)
    now = datetime.now().strftime("%H:%M:%S")
    # Flush explicitly: the message has no trailing newline, so without
    # it a line-buffered stdout would not show the progress line until
    # the request has already finished.
    print(f"{now} scraping {urlstring} ...", end='', flush=True)
    try:
        response = self.session.get(urlstring, timeout=10)
    except Exception:
        print("failed")
        print("resetting session and trying again")
        self.__set_session()
        response = self.session.get(urlstring, timeout=10)
    else:
        print("OK")
    return response.text
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue