Organize crawler logic into classes

stephenmk 2023-04-22 17:56:52 -05:00
parent 4721eed4c6
commit 8868383a08
5 changed files with 96 additions and 87 deletions

bot/crawlers.py

@@ -2,78 +2,84 @@ import re
 from bs4 import BeautifulSoup

 import bot.scraper as Scraper
-import bot.yomichan.export as YomichanExport
-from bot.entries.jitenon_kotowaza import JitenonKotowaza
-from bot.entries.jitenon_yoji import JitenonYoji
+from bot.entries.jitenon_kotowaza import JitenonKotowazaEntry
+from bot.yomichan.export import JitenonKotowazaExporter
+from bot.entries.jitenon_yoji import JitenonYojiEntry
+from bot.yomichan.export import JitenonYojiExporter
-def run_all():
-    jitenon_yoji()
-    jitenon_kotowaza()
+class Crawler():
+    def __init__(self):
+        self.crawl_map = {}
+        self.entries = []
+
+    def make_entries(self):
+        entries_len = len(self.crawl_map)
+        items = self.crawl_map.items()
+        for idx, (entry_id, entry_path) in enumerate(items):
+            update = f"Reading entry {idx+1}/{entries_len}"
+            print(update, end='\r', flush=True)
+            entry = self.entry_class(entry_id)
+            entry.add_document(entry_path)
+            self.entries.append(entry)
+        print()
+
+    def make_yomichan_dictionary(self):
+        self.yomi_exporter.export(self.entries)
-def jitenon_yoji():
-    print("Scraping jitenon-yoji...")
-    entry_id_to_entry_path = {}
-    jitenon = Scraper.Jitenon()
-    gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
-    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
-    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
-        gojuon_href = gojuon_a['href']
-        kana_doc, _ = jitenon.scrape(gojuon_href)
-        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
-        for kana_a in kana_soup.select(".word_box a", href=True):
-            kana_href = kana_a['href']
-            entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
-            if entry_id in entry_id_to_entry_path:
-                continue
-            _, entry_path = jitenon.scrape(kana_href)
-            entry_id_to_entry_path[entry_id] = entry_path
-    entries_len = len(entry_id_to_entry_path)
-    print(f"Finished scraping {entries_len} entries")
-    entries = []
-    items = entry_id_to_entry_path.items()
-    for idx, (entry_id, entry_path) in enumerate(items):
-        update = f"Reading entry {idx+1}/{entries_len}"
-        print(update, end='\r', flush=True)
-        entry = JitenonYoji(entry_id)
-        entry.add_document(entry_path)
-        entries.append(entry)
-    print()
-    exporter = YomichanExport.JitenonYojiExporter()
-    exporter.export(entries)
+class JitenonYojiCrawler(Crawler):
+    def __init__(self):
+        super().__init__()
+        self.entry_class = JitenonYojiEntry
+        self.yomi_exporter = JitenonYojiExporter()
+
+    def crawl(self):
+        print("Scraping jitenon-yoji...")
+        jitenon = Scraper.Jitenon()
+        gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
+        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
+            gojuon_href = gojuon_a['href']
+            kana_doc, _ = jitenon.scrape(gojuon_href)
+            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+            for kana_a in kana_soup.select(".word_box a", href=True):
+                kana_href = kana_a['href']
+                entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
+                if entry_id in self.crawl_map:
+                    continue
+                _, entry_path = jitenon.scrape(kana_href)
+                self.crawl_map[entry_id] = entry_path
+        entries_len = len(self.crawl_map)
+        print(f"Finished scraping {entries_len} entries")
-def jitenon_kotowaza():
-    print("Scraping jitenon-kotowaza...")
-    entry_id_to_entry_path = {}
-    jitenon = Scraper.Jitenon()
-    gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
-    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
-    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
-        gojuon_href = gojuon_a['href']
-        kana_doc, _ = jitenon.scrape(gojuon_href)
-        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
-        for kana_a in kana_soup.select(".word_box a", href=True):
-            kana_href = kana_a['href']
-            m = re.search(r"([0-9]+).php", kana_href)
-            if not m:
-                continue
-            entry_id = int(m.group(1))
-            if entry_id in entry_id_to_entry_path:
-                continue
-            _, entry_path = jitenon.scrape(kana_href)
-            entry_id_to_entry_path[entry_id] = entry_path
-    entries_len = len(entry_id_to_entry_path)
-    print(f"Finished scraping {entries_len} entries")
-    entries = []
-    items = entry_id_to_entry_path.items()
-    for idx, (entry_id, entry_path) in enumerate(items):
-        update = f"Reading entry {idx+1}/{entries_len}"
-        print(update, end='\r', flush=True)
-        entry = JitenonKotowaza(entry_id)
-        entry.add_document(entry_path)
-        entries.append(entry)
-    print()
-    exporter = YomichanExport.JitenonKotowazaExporter()
-    exporter.export(entries)
+class JitenonKotowazaCrawler(Crawler):
+    def __init__(self):
+        super().__init__()
+        self.entry_class = JitenonKotowazaEntry
+        self.yomi_exporter = JitenonKotowazaExporter()
+
+    def crawl(self):
+        print("Scraping jitenon-kotowaza...")
+        jitenon = Scraper.Jitenon()
+        gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
+        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
+            gojuon_href = gojuon_a['href']
+            kana_doc, _ = jitenon.scrape(gojuon_href)
+            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+            for kana_a in kana_soup.select(".word_box a", href=True):
+                kana_href = kana_a['href']
+                m = re.search(r"([0-9]+).php", kana_href)
+                if not m:
+                    continue
+                entry_id = int(m.group(1))
+                if entry_id in self.crawl_map:
+                    continue
+                _, entry_path = jitenon.scrape(kana_href)
+                self.crawl_map[entry_id] = entry_path
+        entries_len = len(self.crawl_map)
+        print(f"Finished scraping {entries_len} entries")

bot/entries/jitenon.py

@@ -6,7 +6,7 @@ import bot.yomichan.html_gloss as YomichanGloss
 import bot.util as Util


-class Jitenon:
+class JitenonEntry:
     def __init__(self, sequence):
         self.sequence = sequence
         self.yomichan_glossary = [""]

bot/entries/jitenon_kotowaza.py

@@ -1,8 +1,8 @@
-from bot.entries.jitenon import Jitenon
+from bot.entries.jitenon import JitenonEntry
 import bot.yomichan.grammar as Grammar


-class JitenonKotowaza(Jitenon):
+class JitenonKotowazaEntry(JitenonEntry):
     columns = {
         "言葉": ["expression", ""],
         "読み方": ["yomikata", ""],
@@ -14,7 +14,7 @@ class JitenonKotowaza(Jitenon):
     }

     def __init__(self, sequence):
-        Jitenon.__init__(self, sequence)
+        super().__init__(sequence)

     def yomichan_terms(self):
         terms = []
@@ -38,4 +38,4 @@ class JitenonKotowaza(Jitenon):
             return [["金棒引き", "かなぼうひき"],
                     ["鉄棒引き", "かなぼうひき"]]
         else:
-            return Jitenon._headwords(self)
+            return super()._headwords()

bot/entries/jitenon_yoji.py

@@ -1,7 +1,7 @@
-from bot.entries.jitenon import Jitenon
+from bot.entries.jitenon import JitenonEntry


-class JitenonYoji(Jitenon):
+class JitenonYojiEntry(JitenonEntry):
     columns = {
         "四字熟語": ["expression", ""],
         "読み方": ["yomikata", ""],
@@ -14,7 +14,7 @@ class JitenonYoji(Jitenon):
     }

     def __init__(self, sequence):
-        Jitenon.__init__(self, sequence)
+        super().__init__(sequence)

     def yomichan_terms(self):
         terms = []
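
In both entry classes, explicit base-class calls such as Jitenon.__init__(self, sequence) become super().__init__(sequence), so the parent class is named only in the class statement, and a later rename or re-parenting would not touch the method bodies. A minimal illustration with hypothetical classes, not taken from the repository:

class Base:
    def __init__(self, sequence):
        self.sequence = sequence

    def _headwords(self):
        return [["base", "headword"]]


class Child(Base):
    def __init__(self, sequence):
        # Equivalent to Base.__init__(self, sequence), but survives
        # renaming or re-parenting Child without editing this line.
        super().__init__(sequence)

    def _headwords(self):
        if self.sequence == 9999:  # special case, like the 金棒引き entry above
            return [["special", "case"]]
        return super()._headwords()  # defer to the base implementation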

jitenbot.py

@@ -17,32 +17,35 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
 """
 import argparse

-import bot.crawlers as Crawlers
+from bot.crawlers import JitenonYojiCrawler
+from bot.crawlers import JitenonKotowazaCrawler


-choices = {
-    'all': Crawlers.run_all,
-    'jitenon-yoji': Crawlers.jitenon_yoji,
-    'jitenon-kotowaza': Crawlers.jitenon_kotowaza,
+crawlers = {
+    'jitenon-yoji': JitenonYojiCrawler,
+    'jitenon-kotowaza': JitenonKotowazaCrawler,
 }


 def parse_args():
     parser = argparse.ArgumentParser(
         prog='jitenbot',
-        description='Crawl and convert Japanese web dictionaries.')
+        description='Convert Japanese dictionary files to new formats.')
     parser.add_argument(
         'target',
-        choices=choices.keys(),
-        help='website to crawl')
+        choices=crawlers.keys(),
+        help='Dictionary to convert.')
     args = parser.parse_args()
     return args


 def main():
     args = parse_args()
-    crawler = choices[args.target]
-    crawler()
+    crawler_class = crawlers[args.target]
+    crawler = crawler_class()
+    crawler.crawl()
+    crawler.make_entries()
+    crawler.make_yomichan_dictionary()


 if __name__ == "__main__":
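
Since each CLI target now maps to a class rather than a function, the three pipeline stages that main() runs can also be driven directly, for example from a REPL. A sketch assuming the package layout shown above:

from bot.crawlers import JitenonKotowazaCrawler

crawler = JitenonKotowazaCrawler()
crawler.crawl()                     # scrape index pages into crawl_map
crawler.make_entries()              # parse each scraped document into an entry
crawler.make_yomichan_dictionary()  # hand the entries to the Yomichan exporter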