Organize crawler logic into classes

stephenmk 2023-04-22 17:56:52 -05:00
parent 4721eed4c6
commit 8868383a08
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
5 changed files with 96 additions and 87 deletions

bot/crawlers.py

@@ -2,78 +2,84 @@ import re
 from bs4 import BeautifulSoup
 import bot.scraper as Scraper
-import bot.yomichan.export as YomichanExport
-from bot.entries.jitenon_kotowaza import JitenonKotowaza
-from bot.entries.jitenon_yoji import JitenonYoji
+from bot.entries.jitenon_kotowaza import JitenonKotowazaEntry
+from bot.yomichan.export import JitenonKotowazaExporter
+from bot.entries.jitenon_yoji import JitenonYojiEntry
+from bot.yomichan.export import JitenonYojiExporter


-def run_all():
-    jitenon_yoji()
-    jitenon_kotowaza()
+class Crawler():
+    def __init__(self):
+        self.crawl_map = {}
+        self.entries = []
+
+    def make_entries(self):
+        entries_len = len(self.crawl_map)
+        items = self.crawl_map.items()
+        for idx, (entry_id, entry_path) in enumerate(items):
+            update = f"Reading entry {idx+1}/{entries_len}"
+            print(update, end='\r', flush=True)
+            entry = self.entry_class(entry_id)
+            entry.add_document(entry_path)
+            self.entries.append(entry)
+        print()
+
+    def make_yomichan_dictionary(self):
+        self.yomi_exporter.export(self.entries)


-def jitenon_yoji():
-    print("Scraping jitenon-yoji...")
-    entry_id_to_entry_path = {}
-    jitenon = Scraper.Jitenon()
-    gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
-    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
-    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
-        gojuon_href = gojuon_a['href']
-        kana_doc, _ = jitenon.scrape(gojuon_href)
-        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
-        for kana_a in kana_soup.select(".word_box a", href=True):
-            kana_href = kana_a['href']
-            entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
-            if entry_id in entry_id_to_entry_path:
-                continue
-            _, entry_path = jitenon.scrape(kana_href)
-            entry_id_to_entry_path[entry_id] = entry_path
-    entries_len = len(entry_id_to_entry_path)
-    print(f"Finished scraping {entries_len} entries")
-    entries = []
-    items = entry_id_to_entry_path.items()
-    for idx, (entry_id, entry_path) in enumerate(items):
-        update = f"Reading entry {idx+1}/{entries_len}"
-        print(update, end='\r', flush=True)
-        entry = JitenonYoji(entry_id)
-        entry.add_document(entry_path)
-        entries.append(entry)
-    print()
-    exporter = YomichanExport.JitenonYojiExporter()
-    exporter.export(entries)
+class JitenonYojiCrawler(Crawler):
+    def __init__(self):
+        super().__init__()
+        self.entry_class = JitenonYojiEntry
+        self.yomi_exporter = JitenonYojiExporter()
+
+    def crawl(self):
+        print("Scraping jitenon-yoji...")
+        jitenon = Scraper.Jitenon()
+        gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
+        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
+            gojuon_href = gojuon_a['href']
+            kana_doc, _ = jitenon.scrape(gojuon_href)
+            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+            for kana_a in kana_soup.select(".word_box a", href=True):
+                kana_href = kana_a['href']
+                entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
+                if entry_id in self.crawl_map:
+                    continue
+                _, entry_path = jitenon.scrape(kana_href)
+                self.crawl_map[entry_id] = entry_path
+        entries_len = len(self.crawl_map)
+        print(f"Finished scraping {entries_len} entries")


-def jitenon_kotowaza():
-    print("Scraping jitenon-kotowaza...")
-    entry_id_to_entry_path = {}
-    jitenon = Scraper.Jitenon()
-    gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
-    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
-    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
-        gojuon_href = gojuon_a['href']
-        kana_doc, _ = jitenon.scrape(gojuon_href)
-        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
-        for kana_a in kana_soup.select(".word_box a", href=True):
-            kana_href = kana_a['href']
-            m = re.search(r"([0-9]+).php", kana_href)
-            if not m:
-                continue
-            entry_id = int(m.group(1))
-            if entry_id in entry_id_to_entry_path:
-                continue
-            _, entry_path = jitenon.scrape(kana_href)
-            entry_id_to_entry_path[entry_id] = entry_path
-    entries_len = len(entry_id_to_entry_path)
-    print(f"Finished scraping {entries_len} entries")
-    entries = []
-    items = entry_id_to_entry_path.items()
-    for idx, (entry_id, entry_path) in enumerate(items):
-        update = f"Reading entry {idx+1}/{entries_len}"
-        print(update, end='\r', flush=True)
-        entry = JitenonKotowaza(entry_id)
-        entry.add_document(entry_path)
-        entries.append(entry)
-    print()
-    exporter = YomichanExport.JitenonKotowazaExporter()
-    exporter.export(entries)
+class JitenonKotowazaCrawler(Crawler):
+    def __init__(self):
+        super().__init__()
+        self.entry_class = JitenonKotowazaEntry
+        self.yomi_exporter = JitenonKotowazaExporter()
+
+    def crawl(self):
+        print("Scraping jitenon-kotowaza...")
+        jitenon = Scraper.Jitenon()
+        gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
+        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
+            gojuon_href = gojuon_a['href']
+            kana_doc, _ = jitenon.scrape(gojuon_href)
+            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+            for kana_a in kana_soup.select(".word_box a", href=True):
+                kana_href = kana_a['href']
+                m = re.search(r"([0-9]+).php", kana_href)
+                if not m:
+                    continue
+                entry_id = int(m.group(1))
+                if entry_id in self.crawl_map:
+                    continue
+                _, entry_path = jitenon.scrape(kana_href)
+                self.crawl_map[entry_id] = entry_path
+        entries_len = len(self.crawl_map)
+        print(f"Finished scraping {entries_len} entries")
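
The new base class is a small template-method arrangement: Crawler.make_entries() and make_yomichan_dictionary() assume each subclass assigns self.entry_class and self.yomi_exporter in __init__ and fills self.crawl_map with {entry_id: entry_path} pairs during crawl(). A rough sketch of how a further source could plug in; the "Kokugo" names below are hypothetical and not part of this commit:

# Hypothetical third crawler under the new pattern (illustrative only).
class JitenonKokugoCrawler(Crawler):
    def __init__(self):
        super().__init__()
        self.entry_class = JitenonKokugoEntry         # hypothetical entry class
        self.yomi_exporter = JitenonKokugoExporter()  # hypothetical exporter

    def crawl(self):
        # Fill self.crawl_map with {entry_id: entry_path} pairs; the
        # inherited make_entries() and make_yomichan_dictionary() then
        # handle parsing and export with no further subclass code.
        ...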

bot/entries/jitenon.py

@@ -6,7 +6,7 @@ import bot.yomichan.html_gloss as YomichanGloss
 import bot.util as Util


-class Jitenon:
+class JitenonEntry:
    def __init__(self, sequence):
        self.sequence = sequence
        self.yomichan_glossary = [""]

bot/entries/jitenon_kotowaza.py

@@ -1,8 +1,8 @@
-from bot.entries.jitenon import Jitenon
+from bot.entries.jitenon import JitenonEntry
 import bot.yomichan.grammar as Grammar


-class JitenonKotowaza(Jitenon):
+class JitenonKotowazaEntry(JitenonEntry):
     columns = {
         "言葉": ["expression", ""],
         "読み方": ["yomikata", ""],
@@ -14,7 +14,7 @@ class JitenonKotowaza(Jitenon):
     }

     def __init__(self, sequence):
-        Jitenon.__init__(self, sequence)
+        super().__init__(sequence)

     def yomichan_terms(self):
         terms = []
@@ -38,4 +38,4 @@ class JitenonKotowaza(Jitenon):
             return [["金棒引き", "かなぼうひき"],
                     ["鉄棒引き", "かなぼうひき"]]
         else:
-            return Jitenon._headwords(self)
+            return super()._headwords()

bot/entries/jitenon_yoji.py

@@ -1,7 +1,7 @@
-from bot.entries.jitenon import Jitenon
+from bot.entries.jitenon import JitenonEntry


-class JitenonYoji(Jitenon):
+class JitenonYojiEntry(JitenonEntry):
     columns = {
         "四字熟語": ["expression", ""],
         "読み方": ["yomikata", ""],
@@ -14,7 +14,7 @@ class JitenonYoji(Jitenon):
     }

     def __init__(self, sequence):
-        Jitenon.__init__(self, sequence)
+        super().__init__(sequence)

     def yomichan_terms(self):
         terms = []

jitenbot.py

@@ -17,32 +17,35 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
 """
 import argparse
-import bot.crawlers as Crawlers
+from bot.crawlers import JitenonYojiCrawler
+from bot.crawlers import JitenonKotowazaCrawler


-choices = {
-    'all': Crawlers.run_all,
-    'jitenon-yoji': Crawlers.jitenon_yoji,
-    'jitenon-kotowaza': Crawlers.jitenon_kotowaza,
+crawlers = {
+    'jitenon-yoji': JitenonYojiCrawler,
+    'jitenon-kotowaza': JitenonKotowazaCrawler,
 }


 def parse_args():
     parser = argparse.ArgumentParser(
         prog='jitenbot',
-        description='Crawl and convert Japanese web dictionaries.')
+        description='Convert Japanese dictionary files to new formats.')
     parser.add_argument(
         'target',
-        choices=choices.keys(),
-        help='website to crawl')
+        choices=crawlers.keys(),
+        help='Dictionary to convert.')
     args = parser.parse_args()
     return args


 def main():
     args = parse_args()
-    crawler = choices[args.target]
-    crawler()
+    crawler_class = crawlers[args.target]
+    crawler = crawler_class()
+    crawler.crawl()
+    crawler.make_entries()
+    crawler.make_yomichan_dictionary()


 if __name__ == "__main__":
     main()
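
With the dictionary of classes in place, main() drives the pipeline in three explicit phases. Driving the same pipeline programmatically would look roughly like this; a sketch equivalent to running "python jitenbot.py jitenon-yoji", where the script name is assumed from prog='jitenbot':

# Sketch: using the new crawler API directly instead of via argparse.
from bot.crawlers import JitenonYojiCrawler

crawler = JitenonYojiCrawler()
crawler.crawl()                     # scrape jitenon.jp, filling crawler.crawl_map
crawler.make_entries()              # parse each saved document into an entry
crawler.make_yomichan_dictionary()  # export the entries via JitenonYojiExporter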