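"""Crawlers that collect dictionary pages and export them as Yomichan dictionaries.

Each concrete crawler locates the pages for a single dictionary target, either
by scraping jitenon.jp or by reading Monokakido page files from a local
directory, then hands the resulting entries to a Yomichan exporter.
"""
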
import os
import re
from abc import ABC, abstractmethod

from bs4 import BeautifulSoup

import bot.scraper as Scraper
from bot.entries.factory import new_entry
from bot.yomichan.exporters.factory import new_exporter


class Crawler(ABC):
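    """Base class for collecting dictionary pages and turning them into entries.

    Subclasses populate ``self._page_map`` (page ID -> path of the page file)
    in ``collect_pages()``; reading the pages and exporting the finished
    entries is handled here.
    """
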
    def __init__(self, target):
        self._target = target
        self._page_map = {}
        self._entries = []
        self._page_id_pattern = None

    @abstractmethod
    def collect_pages(self, page_dir):
        pass

    def read_pages(self):
        pages_len = len(self._page_map)
        items = self._page_map.items()
        for idx, (page_id, page_path) in enumerate(items):
            # Overwrite the same console line to show reading progress.
            update = f"Reading page {idx+1}/{pages_len}"
            print(update, end='\r', flush=True)
            entry = new_entry(self._target, page_id)
            with open(page_path, "r", encoding="utf-8") as f:
                page = f.read()
            entry.set_page(page)
            self._entries.append(entry)
        print()

    def make_yomichan_dictionary(self, image_dir):
        exporter = new_exporter(self._target)
        exporter.export(self._entries, image_dir)

    def _parse_page_id(self, page_link):
        m = re.search(self._page_id_pattern, page_link)
        if m is None:
            return None
        page_id = int(m.group(1))
        # Skip pages that have already been collected.
        if page_id in self._page_map:
            return None
        return page_id


class JitenonKokugoCrawler(Crawler):
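    """Scrapes kokugo.jitenon.jp, walking the paginated kana index pages."""
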
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
        self._page_id_pattern = r"word/p([0-9]+)$"

    def collect_pages(self, page_dir):
        jitenon = Scraper.Jitenon()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            gojuon_href = gojuon_a['href']
            # The kana index is paginated; the number of index pages is taken
            # from the 全〇件 figure shown in the page_total element.
            max_kana_page = 1
            current_kana_page = 1
            while current_kana_page <= max_kana_page:
                kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}")
                current_kana_page += 1
                kana_soup = BeautifulSoup(kana_doc, features="html.parser")
                page_total = kana_soup.find(class_="page_total").text
                m = re.search(r"全([0-9]+)件", page_total)
                if m:
                    max_kana_page = int(m.group(1))
                for kana_a in kana_soup.select(".word_box a", href=True):
                    page_link = kana_a['href']
                    page_id = self._parse_page_id(page_link)
                    if page_id is None:
                        continue
                    _, page_path = jitenon.scrape(page_link)
                    self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
        print(f"Finished scraping {pages_len} pages")


class _JitenonCrawler(Crawler):
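    """Shared crawler for jitenon.jp sites; each kana index page is scraped once."""
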
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = None

    def collect_pages(self, page_dir):
        print("Scraping jitenon.jp")
        jitenon = Scraper.Jitenon()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            gojuon_href = gojuon_a['href']
            kana_doc, _ = jitenon.scrape(gojuon_href)
            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
            for kana_a in kana_soup.select(".word_box a", href=True):
                page_link = kana_a['href']
                page_id = self._parse_page_id(page_link)
                if page_id is None:
                    continue
                _, page_path = jitenon.scrape(page_link)
                self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
        print(f"Finished scraping {pages_len} pages")


class JitenonYojiCrawler(_JitenonCrawler):
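    """Crawler for the four-character idiom (四字熟語) dictionary at yoji.jitenon.jp."""
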
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
        self._page_id_pattern = r"([0-9]+)\.html$"


class JitenonKotowazaCrawler(_JitenonCrawler):
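    """Crawler for the proverb (ことわざ) dictionary at kotowaza.jitenon.jp."""
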
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
        self._page_id_pattern = r"([0-9]+)\.php$"


class _MonokakidoCrawler(Crawler):
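    """Shared crawler for dictionaries stored as numbered Monokakido XML files on disk."""
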
    def __init__(self, target):
        super().__init__(target)
        self._page_id_pattern = r"^([0-9]+)\.xml$"

    def collect_pages(self, page_dir):
        print(f"Searching for page files in `{page_dir}`")
        for pagefile in os.listdir(page_dir):
            page_id = self._parse_page_id(pagefile)
            # Skip filenames that do not match the page ID pattern; page 0 is
            # also excluded.
            if page_id is None or page_id == 0:
                continue
            path = os.path.join(page_dir, pagefile)
            self._page_map[page_id] = path
        pages_len = len(self._page_map)
        print(f"Found {pages_len} page files for processing")


class Smk8Crawler(_MonokakidoCrawler):
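    """Crawler for the Smk8 dictionary's Monokakido page files."""
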
    def __init__(self, target):
        super().__init__(target)


class Daijirin2Crawler(_MonokakidoCrawler):
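    """Crawler for the Daijirin2 dictionary's Monokakido page files."""
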
    def __init__(self, target):
        super().__init__(target)
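

# Minimal usage sketch (assumptions: ``target`` is whatever identifier the
# entry and exporter factories expect for a dictionary, and the paths shown
# here are hypothetical):
#
#     crawler = Daijirin2Crawler(target)
#     crawler.collect_pages("path/to/xml_pages")       # populate the page map
#     crawler.read_pages()                             # build entry objects
#     crawler.make_yomichan_dictionary("path/to/img")  # write the dictionary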