Organize crawler logic into classes
This commit is contained in:
parent
4721eed4c6
commit
8868383a08
|
@ -2,19 +2,42 @@ import re
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
import bot.scraper as Scraper
|
import bot.scraper as Scraper
|
||||||
import bot.yomichan.export as YomichanExport
|
|
||||||
from bot.entries.jitenon_kotowaza import JitenonKotowaza
|
from bot.entries.jitenon_kotowaza import JitenonKotowazaEntry
|
||||||
from bot.entries.jitenon_yoji import JitenonYoji
|
from bot.yomichan.export import JitenonKotowazaExporter
|
||||||
|
|
||||||
|
from bot.entries.jitenon_yoji import JitenonYojiEntry
|
||||||
|
from bot.yomichan.export import JitenonYojiExporter
|
||||||
|
|
||||||
|
|
||||||
def run_all():
|
class Crawler():
|
||||||
jitenon_yoji()
|
def __init__(self):
|
||||||
jitenon_kotowaza()
|
self.crawl_map = {}
|
||||||
|
self.entries = []
|
||||||
|
|
||||||
|
def make_entries(self):
|
||||||
|
entries_len = len(self.crawl_map)
|
||||||
|
items = self.crawl_map.items()
|
||||||
|
for idx, (entry_id, entry_path) in enumerate(items):
|
||||||
|
update = f"Reading entry {idx+1}/{entries_len}"
|
||||||
|
print(update, end='\r', flush=True)
|
||||||
|
entry = self.entry_class(entry_id)
|
||||||
|
entry.add_document(entry_path)
|
||||||
|
self.entries.append(entry)
|
||||||
|
print()
|
||||||
|
|
||||||
|
def make_yomichan_dictionary(self):
|
||||||
|
self.yomi_exporter.export(self.entries)
|
||||||
|
|
||||||
|
|
||||||
def jitenon_yoji():
|
class JitenonYojiCrawler(Crawler):
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
self.entry_class = JitenonYojiEntry
|
||||||
|
self.yomi_exporter = JitenonYojiExporter()
|
||||||
|
|
||||||
|
def crawl(self):
|
||||||
print("Scraping jitenon-yoji...")
|
print("Scraping jitenon-yoji...")
|
||||||
entry_id_to_entry_path = {}
|
|
||||||
jitenon = Scraper.Jitenon()
|
jitenon = Scraper.Jitenon()
|
||||||
gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
|
gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
|
||||||
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
|
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
|
||||||
|
@ -25,28 +48,22 @@ def jitenon_yoji():
|
||||||
for kana_a in kana_soup.select(".word_box a", href=True):
|
for kana_a in kana_soup.select(".word_box a", href=True):
|
||||||
kana_href = kana_a['href']
|
kana_href = kana_a['href']
|
||||||
entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
|
entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
|
||||||
if entry_id in entry_id_to_entry_path:
|
if entry_id in self.crawl_map:
|
||||||
continue
|
continue
|
||||||
_, entry_path = jitenon.scrape(kana_href)
|
_, entry_path = jitenon.scrape(kana_href)
|
||||||
entry_id_to_entry_path[entry_id] = entry_path
|
self.crawl_map[entry_id] = entry_path
|
||||||
entries_len = len(entry_id_to_entry_path)
|
entries_len = len(self.crawl_map)
|
||||||
print(f"Finished scraping {entries_len} entries")
|
print(f"Finished scraping {entries_len} entries")
|
||||||
entries = []
|
|
||||||
items = entry_id_to_entry_path.items()
|
|
||||||
for idx, (entry_id, entry_path) in enumerate(items):
|
|
||||||
update = f"Reading entry {idx+1}/{entries_len}"
|
|
||||||
print(update, end='\r', flush=True)
|
|
||||||
entry = JitenonYoji(entry_id)
|
|
||||||
entry.add_document(entry_path)
|
|
||||||
entries.append(entry)
|
|
||||||
print()
|
|
||||||
exporter = YomichanExport.JitenonYojiExporter()
|
|
||||||
exporter.export(entries)
|
|
||||||
|
|
||||||
|
|
||||||
def jitenon_kotowaza():
|
class JitenonKotowazaCrawler(Crawler):
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
self.entry_class = JitenonKotowazaEntry
|
||||||
|
self.yomi_exporter = JitenonKotowazaExporter()
|
||||||
|
|
||||||
|
def crawl(self):
|
||||||
print("Scraping jitenon-kotowaza...")
|
print("Scraping jitenon-kotowaza...")
|
||||||
entry_id_to_entry_path = {}
|
|
||||||
jitenon = Scraper.Jitenon()
|
jitenon = Scraper.Jitenon()
|
||||||
gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
|
gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
|
||||||
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
|
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
|
||||||
|
@ -60,20 +77,9 @@ def jitenon_kotowaza():
|
||||||
if not m:
|
if not m:
|
||||||
continue
|
continue
|
||||||
entry_id = int(m.group(1))
|
entry_id = int(m.group(1))
|
||||||
if entry_id in entry_id_to_entry_path:
|
if entry_id in self.crawl_map:
|
||||||
continue
|
continue
|
||||||
_, entry_path = jitenon.scrape(kana_href)
|
_, entry_path = jitenon.scrape(kana_href)
|
||||||
entry_id_to_entry_path[entry_id] = entry_path
|
self.crawl_map[entry_id] = entry_path
|
||||||
entries_len = len(entry_id_to_entry_path)
|
entries_len = len(self.crawl_map)
|
||||||
print(f"Finished scraping {entries_len} entries")
|
print(f"Finished scraping {entries_len} entries")
|
||||||
entries = []
|
|
||||||
items = entry_id_to_entry_path.items()
|
|
||||||
for idx, (entry_id, entry_path) in enumerate(items):
|
|
||||||
update = f"Reading entry {idx+1}/{entries_len}"
|
|
||||||
print(update, end='\r', flush=True)
|
|
||||||
entry = JitenonKotowaza(entry_id)
|
|
||||||
entry.add_document(entry_path)
|
|
||||||
entries.append(entry)
|
|
||||||
print()
|
|
||||||
exporter = YomichanExport.JitenonKotowazaExporter()
|
|
||||||
exporter.export(entries)
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ import bot.yomichan.html_gloss as YomichanGloss
|
||||||
import bot.util as Util
|
import bot.util as Util
|
||||||
|
|
||||||
|
|
||||||
class Jitenon:
|
class JitenonEntry:
|
||||||
def __init__(self, sequence):
|
def __init__(self, sequence):
|
||||||
self.sequence = sequence
|
self.sequence = sequence
|
||||||
self.yomichan_glossary = [""]
|
self.yomichan_glossary = [""]
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from bot.entries.jitenon import Jitenon
|
from bot.entries.jitenon import JitenonEntry
|
||||||
import bot.yomichan.grammar as Grammar
|
import bot.yomichan.grammar as Grammar
|
||||||
|
|
||||||
|
|
||||||
class JitenonKotowaza(Jitenon):
|
class JitenonKotowazaEntry(JitenonEntry):
|
||||||
columns = {
|
columns = {
|
||||||
"言葉": ["expression", ""],
|
"言葉": ["expression", ""],
|
||||||
"読み方": ["yomikata", ""],
|
"読み方": ["yomikata", ""],
|
||||||
|
@ -14,7 +14,7 @@ class JitenonKotowaza(Jitenon):
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, sequence):
|
def __init__(self, sequence):
|
||||||
Jitenon.__init__(self, sequence)
|
super().__init__(sequence)
|
||||||
|
|
||||||
def yomichan_terms(self):
|
def yomichan_terms(self):
|
||||||
terms = []
|
terms = []
|
||||||
|
@ -38,4 +38,4 @@ class JitenonKotowaza(Jitenon):
|
||||||
return [["金棒引き", "かなぼうひき"],
|
return [["金棒引き", "かなぼうひき"],
|
||||||
["鉄棒引き", "かなぼうひき"]]
|
["鉄棒引き", "かなぼうひき"]]
|
||||||
else:
|
else:
|
||||||
return Jitenon._headwords(self)
|
return super()._headwords()
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from bot.entries.jitenon import Jitenon
|
from bot.entries.jitenon import JitenonEntry
|
||||||
|
|
||||||
|
|
||||||
class JitenonYoji(Jitenon):
|
class JitenonYojiEntry(JitenonEntry):
|
||||||
columns = {
|
columns = {
|
||||||
"四字熟語": ["expression", ""],
|
"四字熟語": ["expression", ""],
|
||||||
"読み方": ["yomikata", ""],
|
"読み方": ["yomikata", ""],
|
||||||
|
@ -14,7 +14,7 @@ class JitenonYoji(Jitenon):
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, sequence):
|
def __init__(self, sequence):
|
||||||
Jitenon.__init__(self, sequence)
|
super().__init__(sequence)
|
||||||
|
|
||||||
def yomichan_terms(self):
|
def yomichan_terms(self):
|
||||||
terms = []
|
terms = []
|
||||||
|
|
23
jitenbot.py
23
jitenbot.py
|
@ -17,32 +17,35 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import bot.crawlers as Crawlers
|
from bot.crawlers import JitenonYojiCrawler
|
||||||
|
from bot.crawlers import JitenonKotowazaCrawler
|
||||||
|
|
||||||
|
|
||||||
choices = {
|
crawlers = {
|
||||||
'all': Crawlers.run_all,
|
'jitenon-yoji': JitenonYojiCrawler,
|
||||||
'jitenon-yoji': Crawlers.jitenon_yoji,
|
'jitenon-kotowaza': JitenonKotowazaCrawler,
|
||||||
'jitenon-kotowaza': Crawlers.jitenon_kotowaza,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
prog='jitenbot',
|
prog='jitenbot',
|
||||||
description='Crawl and convert Japanese web dictionaries.')
|
description='Convert Japanese dictionary files to new formats.')
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'target',
|
'target',
|
||||||
choices=choices.keys(),
|
choices=crawlers.keys(),
|
||||||
help='website to crawl')
|
help='Dictionary to convert.')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
return args
|
return args
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
crawler = choices[args.target]
|
crawler_class = crawlers[args.target]
|
||||||
crawler()
|
crawler = crawler_class()
|
||||||
|
crawler.crawl()
|
||||||
|
crawler.make_entries()
|
||||||
|
crawler.make_yomichan_dictionary()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
Loading…
Reference in a new issue