Add crawler factory

This commit is contained in:
stephenmk 2023-05-06 13:15:38 -05:00
parent 68949dde6c
commit 3d795ab49f
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
5 changed files with 38 additions and 18 deletions

View file

@ -6,8 +6,10 @@
- [ ] Add factory classes to reduce the number of class import statements
- [ ] Add build scripts for producing program binaries
- [ ] Support exporting to MDict (.MDX) dictionary format
- [ ] Validate scraped webpages after downloading
- [ ] Log non-fatal failures to a log file instead of raising exceptions
- [ ] Support more dictionary websites
- [ ] [国語辞典オンライン](https://kokugo.jitenon.jp/)
- [x] [国語辞典オンライン](https://kokugo.jitenon.jp/)
- [ ] [Yoji-Jukugo.com](https://yoji-jukugo.com/)
- [ ] [実用日本語表現辞典](https://www.weblio.jp/cat/dictionary/jtnhj)
- [ ] Support more Monokakido dictionaries

18
bot/crawlers/factory.py Normal file
View file

@ -0,0 +1,18 @@
from bot.targets import Targets
from bot.crawlers.crawlers import JitenonKokugoCrawler
from bot.crawlers.crawlers import JitenonYojiCrawler
from bot.crawlers.crawlers import JitenonKotowazaCrawler
from bot.crawlers.crawlers import Smk8Crawler
from bot.crawlers.crawlers import Daijirin2Crawler
def new_crawler(target, args):
    """Instantiate the crawler class registered for ``target``.

    ``target`` is looked up in a table keyed by ``Targets`` members, and
    ``args`` is passed unchanged as the single argument to the selected
    crawler class. A ``KeyError`` is raised if ``target`` is not in the
    table.
    """
    # Lookup table: one crawler class per supported target.
    registry = {
        Targets.JITENON_KOKUGO: JitenonKokugoCrawler,
        Targets.JITENON_YOJI: JitenonYojiCrawler,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaCrawler,
        Targets.SMK8: Smk8Crawler,
        Targets.DAIJIRIN2: Daijirin2Crawler,
    }
    crawler_class = registry[target]
    return crawler_class(args)

9
bot/targets.py Normal file
View file

@ -0,0 +1,9 @@
from enum import Enum
from enum import unique
@unique
class Targets(Enum):
    """Supported dictionary targets.

    Each member's value is the string name of the target. ``@unique``
    makes a duplicated value fail at import time instead of silently
    turning the later member into an alias of the earlier one.
    """

    JITENON_KOKUGO = "jitenon-kokugo"
    JITENON_YOJI = "jitenon-yoji"
    JITENON_KOTOWAZA = "jitenon-kotowaza"
    SMK8 = "smk8"
    DAIJIRIN2 = "daijirin2"

View file

@ -18,11 +18,8 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import argparse
from bot.crawlers import JitenonKokugoCrawler
from bot.crawlers import JitenonYojiCrawler
from bot.crawlers import JitenonKotowazaCrawler
from bot.crawlers import Smk8Crawler
from bot.crawlers import Daijirin2Crawler
from bot.targets import Targets
from bot.crawlers.factory import new_crawler
def directory(d):
@ -34,14 +31,14 @@ def directory(d):
return d
def parse_args(targets):
def parse_args(target_names):
parser = argparse.ArgumentParser(
prog="jitenbot",
description="Convert Japanese dictionary files to new formats.",
)
parser.add_argument(
"target",
choices=targets,
choices=target_names,
help="name of dictionary to convert"
)
parser.add_argument(
@ -59,16 +56,10 @@ def parse_args(targets):
def main():
crawlers = {
"jitenon-kokugo": JitenonKokugoCrawler,
"jitenon-yoji": JitenonYojiCrawler,
"jitenon-kotowaza": JitenonKotowazaCrawler,
"smk8": Smk8Crawler,
"daijirin2": Daijirin2Crawler,
}
args = parse_args(crawlers.keys())
crawler_class = crawlers[args.target]
crawler = crawler_class(args)
target_names = [x.value for x in Targets]
args = parse_args(target_names)
selected_target = Targets(args.target)
crawler = new_crawler(selected_target, args)
crawler.collect_pages()
crawler.read_pages()
crawler.make_yomichan_dictionary()