Organize crawler logic into classes
parent 4721eed4c6
commit 8868383a08
bot/crawlers.py (144 lines changed)
@@ -2,78 +2,84 @@ import re
 
 from bs4 import BeautifulSoup
 
 import bot.scraper as Scraper
-import bot.yomichan.export as YomichanExport
-from bot.entries.jitenon_kotowaza import JitenonKotowaza
-from bot.entries.jitenon_yoji import JitenonYoji
+
+from bot.entries.jitenon_kotowaza import JitenonKotowazaEntry
+from bot.yomichan.export import JitenonKotowazaExporter
+
+from bot.entries.jitenon_yoji import JitenonYojiEntry
+from bot.yomichan.export import JitenonYojiExporter
 
 
-def run_all():
-    jitenon_yoji()
-    jitenon_kotowaza()
+class Crawler():
+    def __init__(self):
+        self.crawl_map = {}
+        self.entries = []
+
+    def make_entries(self):
+        entries_len = len(self.crawl_map)
+        items = self.crawl_map.items()
+        for idx, (entry_id, entry_path) in enumerate(items):
+            update = f"Reading entry {idx+1}/{entries_len}"
+            print(update, end='\r', flush=True)
+            entry = self.entry_class(entry_id)
+            entry.add_document(entry_path)
+            self.entries.append(entry)
+        print()
+
+    def make_yomichan_dictionary(self):
+        self.yomi_exporter.export(self.entries)
 
 
-def jitenon_yoji():
-    print("Scraping jitenon-yoji...")
-    entry_id_to_entry_path = {}
-    jitenon = Scraper.Jitenon()
-    gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
-    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
-    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
-        gojuon_href = gojuon_a['href']
-        kana_doc, _ = jitenon.scrape(gojuon_href)
-        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
-        for kana_a in kana_soup.select(".word_box a", href=True):
-            kana_href = kana_a['href']
-            entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
-            if entry_id in entry_id_to_entry_path:
-                continue
-            _, entry_path = jitenon.scrape(kana_href)
-            entry_id_to_entry_path[entry_id] = entry_path
-    entries_len = len(entry_id_to_entry_path)
-    print(f"Finished scraping {entries_len} entries")
-    entries = []
-    items = entry_id_to_entry_path.items()
-    for idx, (entry_id, entry_path) in enumerate(items):
-        update = f"Reading entry {idx+1}/{entries_len}"
-        print(update, end='\r', flush=True)
-        entry = JitenonYoji(entry_id)
-        entry.add_document(entry_path)
-        entries.append(entry)
-    print()
-    exporter = YomichanExport.JitenonYojiExporter()
-    exporter.export(entries)
+class JitenonYojiCrawler(Crawler):
+    def __init__(self):
+        super().__init__()
+        self.entry_class = JitenonYojiEntry
+        self.yomi_exporter = JitenonYojiExporter()
+
+    def crawl(self):
+        print("Scraping jitenon-yoji...")
+        jitenon = Scraper.Jitenon()
+        gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
+        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
+            gojuon_href = gojuon_a['href']
+            kana_doc, _ = jitenon.scrape(gojuon_href)
+            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+            for kana_a in kana_soup.select(".word_box a", href=True):
+                kana_href = kana_a['href']
+                entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
+                if entry_id in self.crawl_map:
+                    continue
+                _, entry_path = jitenon.scrape(kana_href)
+                self.crawl_map[entry_id] = entry_path
+        entries_len = len(self.crawl_map)
+        print(f"Finished scraping {entries_len} entries")
 
 
-def jitenon_kotowaza():
-    print("Scraping jitenon-kotowaza...")
-    entry_id_to_entry_path = {}
-    jitenon = Scraper.Jitenon()
-    gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
-    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
-    for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
-        gojuon_href = gojuon_a['href']
-        kana_doc, _ = jitenon.scrape(gojuon_href)
-        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
-        for kana_a in kana_soup.select(".word_box a", href=True):
-            kana_href = kana_a['href']
-            m = re.search(r"([0-9]+).php", kana_href)
-            if not m:
-                continue
-            entry_id = int(m.group(1))
-            if entry_id in entry_id_to_entry_path:
-                continue
-            _, entry_path = jitenon.scrape(kana_href)
-            entry_id_to_entry_path[entry_id] = entry_path
-    entries_len = len(entry_id_to_entry_path)
-    print(f"Finished scraping {entries_len} entries")
-    entries = []
-    items = entry_id_to_entry_path.items()
-    for idx, (entry_id, entry_path) in enumerate(items):
-        update = f"Reading entry {idx+1}/{entries_len}"
-        print(update, end='\r', flush=True)
-        entry = JitenonKotowaza(entry_id)
-        entry.add_document(entry_path)
-        entries.append(entry)
-    print()
-    exporter = YomichanExport.JitenonKotowazaExporter()
-    exporter.export(entries)
+class JitenonKotowazaCrawler(Crawler):
+    def __init__(self):
+        super().__init__()
+        self.entry_class = JitenonKotowazaEntry
+        self.yomi_exporter = JitenonKotowazaExporter()
+
+    def crawl(self):
+        print("Scraping jitenon-kotowaza...")
+        jitenon = Scraper.Jitenon()
+        gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
+        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
+        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
+            gojuon_href = gojuon_a['href']
+            kana_doc, _ = jitenon.scrape(gojuon_href)
+            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
+            for kana_a in kana_soup.select(".word_box a", href=True):
+                kana_href = kana_a['href']
+                m = re.search(r"([0-9]+).php", kana_href)
+                if not m:
+                    continue
+                entry_id = int(m.group(1))
+                if entry_id in self.crawl_map:
+                    continue
+                _, entry_path = jitenon.scrape(kana_href)
+                self.crawl_map[entry_id] = entry_path
+        entries_len = len(self.crawl_map)
+        print(f"Finished scraping {entries_len} entries")
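As a design note, the new Crawler base class is a small template-method pattern: the generic pipeline (crawl map -> entry objects -> export) lives in the base class, and each subclass supplies only its own crawl() plus an entry_class and a yomi_exporter. Below is a minimal, self-contained sketch of that pattern; DemoEntry, DemoExporter, and DemoCrawler are hypothetical stand-ins, not classes from this repository.

class DemoEntry:
    def __init__(self, entry_id):
        self.entry_id = entry_id

    def add_document(self, path):
        self.path = path  # the real entry classes parse the scraped page here


class DemoExporter:
    def export(self, entries):
        print(f"Exported {len(entries)} entries")


class Crawler:
    def __init__(self):
        self.crawl_map = {}  # entry_id -> path of the scraped document
        self.entries = []

    def make_entries(self):
        # generic step: turn every crawled document into an entry object
        for entry_id, entry_path in self.crawl_map.items():
            entry = self.entry_class(entry_id)
            entry.add_document(entry_path)
            self.entries.append(entry)

    def make_yomichan_dictionary(self):
        self.yomi_exporter.export(self.entries)


class DemoCrawler(Crawler):
    def __init__(self):
        super().__init__()
        self.entry_class = DemoEntry
        self.yomi_exporter = DemoExporter()

    def crawl(self):
        # the real subclasses scrape jitenon.jp here
        self.crawl_map = {1: "entry1.html", 2: "entry2.html"}


crawler = DemoCrawler()
crawler.crawl()
crawler.make_entries()
crawler.make_yomichan_dictionary()  # prints: Exported 2 entries

Under this scheme, adding a new dictionary source amounts to writing one subclass with a crawl() method and those two attributes.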
bot/entries/jitenon.py

@@ -6,7 +6,7 @@ import bot.yomichan.html_gloss as YomichanGloss
 import bot.util as Util
 
 
-class Jitenon:
+class JitenonEntry:
     def __init__(self, sequence):
         self.sequence = sequence
         self.yomichan_glossary = [""]
bot/entries/jitenon_kotowaza.py

@@ -1,8 +1,8 @@
-from bot.entries.jitenon import Jitenon
+from bot.entries.jitenon import JitenonEntry
 import bot.yomichan.grammar as Grammar
 
 
-class JitenonKotowaza(Jitenon):
+class JitenonKotowazaEntry(JitenonEntry):
     columns = {
         "言葉": ["expression", ""],
         "読み方": ["yomikata", ""],
@@ -14,7 +14,7 @@ class JitenonKotowaza(Jitenon):
     }
 
     def __init__(self, sequence):
-        Jitenon.__init__(self, sequence)
+        super().__init__(sequence)
 
     def yomichan_terms(self):
         terms = []
@@ -38,4 +38,4 @@ class JitenonKotowaza(Jitenon):
             return [["金棒引き", "かなぼうひき"],
                     ["鉄棒引き", "かなぼうひき"]]
         else:
-            return Jitenon._headwords(self)
+            return super()._headwords()
bot/entries/jitenon_yoji.py

@@ -1,7 +1,7 @@
-from bot.entries.jitenon import Jitenon
+from bot.entries.jitenon import JitenonEntry
 
 
-class JitenonYoji(Jitenon):
+class JitenonYojiEntry(JitenonEntry):
     columns = {
         "四字熟語": ["expression", ""],
         "読み方": ["yomikata", ""],
@@ -14,7 +14,7 @@ class JitenonYoji(Jitenon):
     }
 
     def __init__(self, sequence):
-        Jitenon.__init__(self, sequence)
+        super().__init__(sequence)
 
     def yomichan_terms(self):
         terms = []
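Besides the *Entry renames, the entry-file hunks above all make the same mechanical change: explicit base-class calls such as Jitenon.__init__(self, sequence) become super() calls. A tiny sketch of the difference, using hypothetical Base, OldStyle, and NewStyle names:

class Base:
    def __init__(self, sequence):
        self.sequence = sequence

class OldStyle(Base):
    def __init__(self, sequence):
        Base.__init__(self, sequence)  # pre-commit style: names the base directly

class NewStyle(Base):
    def __init__(self, sequence):
        super().__init__(sequence)     # post-commit style: resolved via the MRO

With super(), the parent is looked up through the method resolution order rather than hard-coded, so a base-class rename like Jitenon -> JitenonEntry only needs to touch the class statement and the import.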
jitenbot.py (23 lines changed)
@@ -17,32 +17,35 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
 """
 
 import argparse
-import bot.crawlers as Crawlers
+from bot.crawlers import JitenonYojiCrawler
+from bot.crawlers import JitenonKotowazaCrawler
 
 
-choices = {
-    'all': Crawlers.run_all,
-    'jitenon-yoji': Crawlers.jitenon_yoji,
-    'jitenon-kotowaza': Crawlers.jitenon_kotowaza,
+crawlers = {
+    'jitenon-yoji': JitenonYojiCrawler,
+    'jitenon-kotowaza': JitenonKotowazaCrawler,
 }
 
 
 def parse_args():
     parser = argparse.ArgumentParser(
         prog='jitenbot',
-        description='Crawl and convert Japanese web dictionaries.')
+        description='Convert Japanese dictionary files to new formats.')
     parser.add_argument(
         'target',
-        choices=choices.keys(),
-        help='website to crawl')
+        choices=crawlers.keys(),
+        help='Dictionary to convert.')
     args = parser.parse_args()
     return args
 
 
 def main():
     args = parse_args()
-    crawler = choices[args.target]
-    crawler()
+    crawler_class = crawlers[args.target]
+    crawler = crawler_class()
+    crawler.crawl()
+    crawler.make_entries()
+    crawler.make_yomichan_dictionary()
 
 
 if __name__ == "__main__":
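With the dispatch table now mapping each target name to a crawler class, a run such as python jitenbot.py jitenon-yoji walks the three-phase pipeline explicitly. The equivalent programmatic use, assuming the repository's bot package is importable, would be:

from bot.crawlers import JitenonYojiCrawler

# equivalent to `python jitenbot.py jitenon-yoji` on the command line
crawler = JitenonYojiCrawler()
crawler.crawl()                     # scrape jitenon.jp pages into crawler.crawl_map
crawler.make_entries()              # parse each scraped page into an entry object
crawler.make_yomichan_dictionary()  # export the entries via the Yomichan exporter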