Reorganize file structure of all other modules

stephenmk, 2023-07-26 23:48:24 -05:00
commit 7b2ba96db9 (parent 9b3fdc86d1)
GPG key ID: B6DA730DB06235F1 (no known key found for this signature in database)
61 changed files with 517 additions and 547 deletions

@@ -0,0 +1,54 @@
import re
from abc import ABC, abstractmethod

from bot.factory import new_entry
from bot.factory import new_yomichan_exporter
from bot.factory import new_mdict_exporter


class BaseCrawler(ABC):
    def __init__(self, target):
        self._target = target
        self._page_map = {}
        self._entries = []
        self._page_id_pattern = None

    @abstractmethod
    def collect_pages(self, page_dir):
        raise NotImplementedError

    def read_pages(self):
        pages_len = len(self._page_map)
        items = self._page_map.items()
        for idx, (page_id, page_path) in enumerate(items):
            update = f"Reading page {idx+1}/{pages_len}"
            print(update, end='\r', flush=True)
            entry = new_entry(self._target, page_id)
            with open(page_path, "r", encoding="utf-8") as f:
                page = f.read()
            try:
                entry.set_page(page)
            except ValueError as err:
                print(err)
                print("Try deleting and redownloading file:")
                print(f"\t{page_path}\n")
                continue
            self._entries.append(entry)
        print()

    def make_yomichan_dictionary(self, media_dir, validate):
        exporter = new_yomichan_exporter(self._target)
        exporter.export(self._entries, media_dir, validate)

    def make_mdict_dictionary(self, media_dir, icon_file):
        exporter = new_mdict_exporter(self._target)
        exporter.export(self._entries, media_dir, icon_file)

    def _parse_page_id(self, page_link):
        m = re.search(self._page_id_pattern, page_link)
        if m is None:
            return None
        page_id = int(m.group(1))
        if page_id in self._page_map:
            return None
        return page_id
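
For reference, a minimal sketch of how `_parse_page_id` both extracts and deduplicates page ids; the subclass, pattern, and paths below are hypothetical and used only for illustration:

from bot.crawlers.base.crawler import BaseCrawler

class DemoCrawler(BaseCrawler):  # hypothetical subclass for illustration
    def collect_pages(self, page_dir):
        pass

crawler = DemoCrawler(target=None)
crawler._page_id_pattern = r"([0-9]+)\.html$"   # same style as the jitenon patterns below
print(crawler._parse_page_id("1234.html"))       # 1234
crawler._page_map[1234] = "/path/to/1234.html"
print(crawler._parse_page_id("1234.html"))       # None -- already collected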

@@ -0,0 +1,29 @@
from bs4 import BeautifulSoup

from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
from bot.crawlers.base.crawler import BaseCrawler


class JitenonCrawler(BaseCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = None

    def collect_pages(self, page_dir):
        print("Scraping jitenon.jp")
        jitenon = JitenonScraper()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            gojuon_href = gojuon_a['href']
            kana_doc, _ = jitenon.scrape(gojuon_href)
            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
            for kana_a in kana_soup.select(".word_box a", href=True):
                page_link = kana_a['href']
                page_id = self._parse_page_id(page_link)
                if page_id is None:
                    continue
                _, page_path = jitenon.scrape(page_link)
                self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
        print(f"Finished scraping {pages_len} pages")

@@ -0,0 +1,19 @@
import os

from bot.crawlers.base.crawler import BaseCrawler


class MonokakidoCrawler(BaseCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._page_id_pattern = r"^([0-9]+)\.xml$"

    def collect_pages(self, page_dir):
        print(f"Searching for page files in `{page_dir}`")
        for pagefile in os.listdir(page_dir):
            page_id = self._parse_page_id(pagefile)
            if page_id is None or page_id == 0:
                continue
            path = os.path.join(page_dir, pagefile)
            self._page_map[page_id] = path
        pages_len = len(self._page_map)
        print(f"Found {pages_len} page files for processing")

@@ -1,158 +0,0 @@
import os
import re
from abc import ABC, abstractmethod

from bs4 import BeautifulSoup

import bot.crawlers.scraper as Scraper
from bot.entries.factory import new_entry
from bot.yomichan.exporters.factory import new_yomi_exporter
from bot.mdict.exporters.factory import new_mdict_exporter


class Crawler(ABC):
    def __init__(self, target):
        self._target = target
        self._page_map = {}
        self._entries = []
        self._page_id_pattern = None

    @abstractmethod
    def collect_pages(self, page_dir):
        pass

    def read_pages(self):
        pages_len = len(self._page_map)
        items = self._page_map.items()
        for idx, (page_id, page_path) in enumerate(items):
            update = f"Reading page {idx+1}/{pages_len}"
            print(update, end='\r', flush=True)
            entry = new_entry(self._target, page_id)
            with open(page_path, "r", encoding="utf-8") as f:
                page = f.read()
            try:
                entry.set_page(page)
            except ValueError as err:
                print(err)
                print("Try deleting and redownloading file:")
                print(f"\t{page_path}\n")
                continue
            self._entries.append(entry)
        print()

    def make_yomichan_dictionary(self, media_dir, validate):
        exporter = new_yomi_exporter(self._target)
        exporter.export(self._entries, media_dir, validate)

    def make_mdict_dictionary(self, media_dir, icon_file):
        exporter = new_mdict_exporter(self._target)
        exporter.export(self._entries, media_dir, icon_file)

    def _parse_page_id(self, page_link):
        m = re.search(self._page_id_pattern, page_link)
        if m is None:
            return None
        page_id = int(m.group(1))
        if page_id in self._page_map:
            return None
        return page_id


class JitenonKokugoCrawler(Crawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
        self._page_id_pattern = r"word/p([0-9]+)$"

    def collect_pages(self, page_dir):
        jitenon = Scraper.Jitenon()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            gojuon_href = gojuon_a['href']
            max_kana_page = 1
            current_kana_page = 1
            while current_kana_page <= max_kana_page:
                kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}")
                current_kana_page += 1
                kana_soup = BeautifulSoup(kana_doc, features="html.parser")
                page_total = kana_soup.find(class_="page_total").text
                m = re.search(r"全([0-9]+)件", page_total)
                if m:
                    max_kana_page = int(m.group(1))
                for kana_a in kana_soup.select(".word_box a", href=True):
                    page_link = kana_a['href']
                    page_id = self._parse_page_id(page_link)
                    if page_id is None:
                        continue
                    _, page_path = jitenon.scrape(page_link)
                    self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
        print(f"Finished scraping {pages_len} pages")


class _JitenonCrawler(Crawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = None

    def collect_pages(self, page_dir):
        print("Scraping jitenon.jp")
        jitenon = Scraper.Jitenon()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            gojuon_href = gojuon_a['href']
            kana_doc, _ = jitenon.scrape(gojuon_href)
            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
            for kana_a in kana_soup.select(".word_box a", href=True):
                page_link = kana_a['href']
                page_id = self._parse_page_id(page_link)
                if page_id is None:
                    continue
                _, page_path = jitenon.scrape(page_link)
                self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
        print(f"Finished scraping {pages_len} pages")


class JitenonYojiCrawler(_JitenonCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
        self._page_id_pattern = r"([0-9]+)\.html$"


class JitenonKotowazaCrawler(_JitenonCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
        self._page_id_pattern = r"([0-9]+)\.php$"


class _MonokakidoCrawler(Crawler):
    def __init__(self, target):
        super().__init__(target)
        self._page_id_pattern = r"^([0-9]+)\.xml$"

    def collect_pages(self, page_dir):
        print(f"Searching for page files in `{page_dir}`")
        for pagefile in os.listdir(page_dir):
            page_id = self._parse_page_id(pagefile)
            if page_id is None or page_id == 0:
                continue
            path = os.path.join(page_dir, pagefile)
            self._page_map[page_id] = path
        pages_len = len(self._page_map)
        print(f"Found {pages_len} page files for processing")


class Smk8Crawler(_MonokakidoCrawler):
    pass


class Daijirin2Crawler(_MonokakidoCrawler):
    pass


class Sankoku8Crawler(_MonokakidoCrawler):
    pass

@@ -0,0 +1,5 @@
from bot.crawlers.base.monokakido import MonokakidoCrawler


class Crawler(MonokakidoCrawler):
    pass

@@ -1,20 +0,0 @@
from bot.targets import Targets

from bot.crawlers.crawlers import JitenonKokugoCrawler
from bot.crawlers.crawlers import JitenonYojiCrawler
from bot.crawlers.crawlers import JitenonKotowazaCrawler
from bot.crawlers.crawlers import Smk8Crawler
from bot.crawlers.crawlers import Daijirin2Crawler
from bot.crawlers.crawlers import Sankoku8Crawler


def new_crawler(target):
    crawler_map = {
        Targets.JITENON_KOKUGO: JitenonKokugoCrawler,
        Targets.JITENON_YOJI: JitenonYojiCrawler,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaCrawler,
        Targets.SMK8: Smk8Crawler,
        Targets.DAIJIRIN2: Daijirin2Crawler,
        Targets.SANKOKU8: Sankoku8Crawler,
    }
    return crawler_map[target](target)

@@ -0,0 +1,38 @@
import re

from bs4 import BeautifulSoup

from bot.crawlers.base.crawler import BaseCrawler
from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper


class Crawler(BaseCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
        self._page_id_pattern = r"word/p([0-9]+)$"

    def collect_pages(self, page_dir):
        jitenon = JitenonScraper()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            gojuon_href = gojuon_a['href']
            max_kana_page = 1
            current_kana_page = 1
            while current_kana_page <= max_kana_page:
                kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}")
                current_kana_page += 1
                kana_soup = BeautifulSoup(kana_doc, features="html.parser")
                page_total = kana_soup.find(class_="page_total").text
                m = re.search(r"全([0-9]+)件", page_total)
                if m:
                    max_kana_page = int(m.group(1))
                for kana_a in kana_soup.select(".word_box a", href=True):
                    page_link = kana_a['href']
                    page_id = self._parse_page_id(page_link)
                    if page_id is None:
                        continue
                    _, page_path = jitenon.scrape(page_link)
                    self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
        print(f"Finished scraping {pages_len} pages")

@@ -0,0 +1,8 @@
from bot.crawlers.base.jitenon import JitenonCrawler


class Crawler(JitenonCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
        self._page_id_pattern = r"([0-9]+)\.php$"

@@ -0,0 +1,8 @@
from bot.crawlers.base.jitenon import JitenonCrawler


class Crawler(JitenonCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
        self._page_id_pattern = r"([0-9]+)\.html$"

bot/crawlers/sankoku8.py
@@ -0,0 +1,5 @@
from bot.crawlers.base.monokakido import MonokakidoCrawler


class Crawler(MonokakidoCrawler):
    pass

@@ -0,0 +1,10 @@
import re

from bot.crawlers.scrapers.scraper import BaseScraper


class Jitenon(BaseScraper):
    def _get_netloc_re(self):
        domain = r"jitenon\.jp"
        pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + domain + r"$"
        netloc_re = re.compile(pattern)
        return netloc_re
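
A quick check of the network-location pattern returned above (illustrative only): it accepts jitenon.jp and any of its subdomains and rejects other hosts.

import re

domain = r"jitenon\.jp"
netloc_re = re.compile(r"^(?:([A-Za-z0-9.\-]+)\.)?" + domain + r"$")
print(bool(netloc_re.match("kokugo.jitenon.jp")))  # True
print(bool(netloc_re.match("jitenon.jp")))         # True
print(bool(netloc_re.match("example.com")))        # False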

@@ -1,24 +1,24 @@
 import time
-import requests
 import re
 import os
 import hashlib
 from datetime import datetime
-from pathlib import Path
-from platformdirs import user_cache_dir
 from urllib.parse import urlparse
+from pathlib import Path
+from abc import ABC, abstractmethod

+import requests
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
+from platformdirs import user_cache_dir

 from bot.data import load_config


-class Scraper():
+class BaseScraper(ABC):
     def __init__(self):
         self._config = load_config()
-        pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$"
-        self.netloc_re = re.compile(pattern)
+        self.netloc_re = self._get_netloc_re()
         self.__set_session()

     def scrape(self, urlstring):
@@ -34,6 +34,10 @@ class Scraper():
         print("Discovering cached files...", end='\r', flush=True)
         return html, cache_path

+    @abstractmethod
+    def _get_netloc_re(self):
+        raise NotImplementedError
+
     def __set_session(self):
         retry_strategy = Retry(
             total=3,
@@ -99,9 +103,3 @@ class Scraper():
             self.__set_session()
             response = self.session.get(urlstring, timeout=10)
             return response.text
-
-
-class Jitenon(Scraper):
-    def __init__(self):
-        self.domain = r"jitenon\.jp"
-        super().__init__()

bot/crawlers/smk8.py
@@ -0,0 +1,5 @@
from bot.crawlers.base.monokakido import MonokakidoCrawler


class Crawler(MonokakidoCrawler):
    pass

@@ -18,15 +18,15 @@ class Entry(ABC):
     @abstractmethod
     def get_global_identifier(self):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def set_page(self, page):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def get_page_soup(self):
-        pass
+        raise NotImplementedError

     def get_headwords(self):
         if self._headwords is not None:
@@ -38,15 +38,15 @@ class Entry(ABC):
     @abstractmethod
     def _get_headwords(self):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _add_variant_expressions(self, headwords):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def get_part_of_speech_tags(self):
-        pass
+        raise NotImplementedError

     def get_parent(self):
         if self.entry_id in self.SUBENTRY_ID_TO_ENTRY_ID:

@@ -58,7 +58,7 @@ class JitenonEntry(Entry):
     @abstractmethod
     def _get_column_map(self):
-        pass
+        raise NotImplementedError

     def __set_modified_date(self, page):
         m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)

@@ -39,7 +39,7 @@ class SanseidoEntry(Entry):
     @abstractmethod
     def _get_subentry_parameters(self):
-        pass
+        raise NotImplementedError

     def _add_variant_expressions(self, headwords):
         for expressions in headwords.values():

@@ -1,7 +0,0 @@
import importlib


def new_entry(target, page_id):
    module_path = f"bot.entries.{target.name.lower()}.entry"
    module = importlib.import_module(module_path)
    return module.Entry(target, page_id)

bot/factory.py
@@ -0,0 +1,37 @@
import importlib


def new_crawler(target):
    module_path = f"bot.crawlers.{target.name.lower()}"
    module = importlib.import_module(module_path)
    return module.Crawler(target)


def new_entry(target, page_id):
    module_path = f"bot.entries.{target.name.lower()}.entry"
    module = importlib.import_module(module_path)
    return module.Entry(target, page_id)


def new_yomichan_exporter(target):
    module_path = f"bot.yomichan.exporters.{target.name.lower()}"
    module = importlib.import_module(module_path)
    return module.Exporter(target)


def new_yomichan_terminator(target):
    module_path = f"bot.yomichan.terms.{target.name.lower()}"
    module = importlib.import_module(module_path)
    return module.Terminator(target)


def new_mdict_exporter(target):
    module_path = f"bot.mdict.exporters.{target.name.lower()}"
    module = importlib.import_module(module_path)
    return module.Exporter(target)


def new_mdict_terminator(target):
    module_path = f"bot.mdict.terms.{target.name.lower()}"
    module = importlib.import_module(module_path)
    return module.Terminator(target)
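
With every factory collected in this one module, construction is driven purely by the target name. A minimal usage sketch, assuming the Targets enum member SMK8 seen elsewhere in this commit:

from bot.targets import Targets
from bot.factory import new_crawler

# Imports bot.crawlers.smk8 and returns its Crawler class,
# i.e. the MonokakidoCrawler subclass added in this commit.
crawler = new_crawler(Targets.SMK8)
crawler.collect_pages("path/to/page/files")  # hypothetical page directory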

@@ -1,20 +1,18 @@
-# pylint: disable=too-few-public-methods
-import subprocess
 import os
 import shutil
+import subprocess
 from abc import ABC, abstractmethod
 from pathlib import Path
-from datetime import datetime
 from platformdirs import user_documents_dir, user_cache_dir

-from bot.mdict.terms.factory import new_terminator
+from bot.factory import new_mdict_terminator


-class Exporter(ABC):
+class BaseExporter(ABC):
     def __init__(self, target):
         self._target = target
-        self._terminator = new_terminator(target)
+        self._terminator = new_mdict_terminator(target)
         self._build_dir = None
         self._build_media_dir = None
         self._description_file = None
@@ -168,58 +166,8 @@ class Exporter(ABC):
     @abstractmethod
     def _get_revision(self, entries):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _get_attribution(self, entries):
-        pass
+        raise NotImplementedError
-
-
-class _JitenonExporter(Exporter):
-    def _get_revision(self, entries):
-        modified_date = None
-        for entry in entries:
-            if modified_date is None or entry.modified_date > modified_date:
-                modified_date = entry.modified_date
-        revision = modified_date.strftime("%Y年%m月%d日閲覧")
-        return revision
-
-    def _get_attribution(self, entries):
-        modified_date = None
-        for entry in entries:
-            if modified_date is None or entry.modified_date > modified_date:
-                attribution = entry.attribution
-        return attribution
-
-
-class JitenonKokugoExporter(_JitenonExporter):
-    pass
-
-
-class JitenonYojiExporter(_JitenonExporter):
-    pass
-
-
-class JitenonKotowazaExporter(_JitenonExporter):
-    pass
-
-
-class _MonokakidoExporter(Exporter):
-    def _get_revision(self, entries):
-        timestamp = datetime.now().strftime("%Y年%m月%d日作成")
-        return timestamp
-
-
-class Smk8Exporter(_MonokakidoExporter):
-    def _get_attribution(self, entries):
-        return "© Sanseido Co., LTD. 2020"
-
-
-class Daijirin2Exporter(_MonokakidoExporter):
-    def _get_attribution(self, entries):
-        return "© Sanseido Co., LTD. 2019"
-
-
-class Sankoku8Exporter(_MonokakidoExporter):
-    def _get_attribution(self, entries):
-        return "© Sanseido Co., LTD. 2021"

@@ -0,0 +1,18 @@
from bot.mdict.exporters.base.exporter import BaseExporter


class JitenonExporter(BaseExporter):
    def _get_revision(self, entries):
        modified_date = None
        for entry in entries:
            if modified_date is None or entry.modified_date > modified_date:
                modified_date = entry.modified_date
        revision = modified_date.strftime("%Y年%m月%d日閲覧")
        return revision

    def _get_attribution(self, entries):
        modified_date = None
        for entry in entries:
            if modified_date is None or entry.modified_date > modified_date:
                attribution = entry.attribution
        return attribution

@@ -0,0 +1,8 @@
from datetime import datetime

from bot.mdict.exporters.base.exporter import BaseExporter


class MonokakidoExporter(BaseExporter):
    def _get_revision(self, entries):
        timestamp = datetime.now().strftime("%Y年%m月%d日作成")
        return timestamp

@@ -0,0 +1,6 @@
from bot.mdict.exporters.base.monokakido import MonokakidoExporter


class Exporter(MonokakidoExporter):
    def _get_attribution(self, entries):
        return "© Sanseido Co., LTD. 2019"

@@ -1,20 +0,0 @@
from bot.targets import Targets

from bot.mdict.exporters.export import JitenonKokugoExporter
from bot.mdict.exporters.export import JitenonYojiExporter
from bot.mdict.exporters.export import JitenonKotowazaExporter
from bot.mdict.exporters.export import Smk8Exporter
from bot.mdict.exporters.export import Daijirin2Exporter
from bot.mdict.exporters.export import Sankoku8Exporter


def new_mdict_exporter(target):
    exporter_map = {
        Targets.JITENON_KOKUGO: JitenonKokugoExporter,
        Targets.JITENON_YOJI: JitenonYojiExporter,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter,
        Targets.SMK8: Smk8Exporter,
        Targets.DAIJIRIN2: Daijirin2Exporter,
        Targets.SANKOKU8: Sankoku8Exporter,
    }
    return exporter_map[target](target)

@@ -0,0 +1,5 @@
from bot.mdict.exporters.base.jitenon import JitenonExporter


class Exporter(JitenonExporter):
    pass

@@ -0,0 +1,5 @@
from bot.mdict.exporters.base.jitenon import JitenonExporter


class Exporter(JitenonExporter):
    pass

@@ -0,0 +1,5 @@
from bot.mdict.exporters.base.jitenon import JitenonExporter


class Exporter(JitenonExporter):
    pass

@@ -0,0 +1,6 @@
from bot.mdict.exporters.base.monokakido import MonokakidoExporter


class Exporter(MonokakidoExporter):
    def _get_attribution(self, entries):
        return "© Sanseido Co., LTD. 2021"

@@ -0,0 +1,6 @@
from bot.mdict.exporters.base.monokakido import MonokakidoExporter


class Exporter(MonokakidoExporter):
    def _get_attribution(self, entries):
        return "© Sanseido Co., LTD. 2020"

@@ -0,0 +1,20 @@
from bot.mdict.terms.base.terminator import BaseTerminator


class JitenonTerminator(BaseTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = None

    def _glossary(self, entry):
        if entry.entry_id in self._glossary_cache:
            return self._glossary_cache[entry.entry_id]
        glossary = self._glossary_maker.make_glossary(entry, self._media_dir)
        self._glossary_cache[entry.entry_id] = glossary
        return glossary

    def _link_glossary_parameters(self, entry):
        return []

    def _subentry_lists(self, entry):
        return []

@@ -2,7 +2,7 @@ import re
 from abc import abstractmethod, ABC


-class Terminator(ABC):
+class BaseTerminator(ABC):
     def __init__(self, target):
         self._target = target
         self._glossary_cache = {}
@@ -72,12 +72,12 @@ class Terminator(ABC):
     @abstractmethod
     def _glossary(self, entry):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _link_glossary_parameters(self, entry):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _subentry_lists(self, entry):
-        pass
+        raise NotImplementedError

@@ -1,8 +1,8 @@
-from bot.mdict.terms.terminator import Terminator
+from bot.mdict.terms.base.terminator import BaseTerminator
 from bot.mdict.glossary.daijirin2 import make_glossary


-class Daijirin2Terminator(Terminator):
+class Terminator(BaseTerminator):
     def _glossary(self, entry):
         if entry.entry_id in self._glossary_cache:
             return self._glossary_cache[entry.entry_id]

@@ -1,20 +0,0 @@
from bot.targets import Targets

from bot.mdict.terms.jitenon import JitenonKokugoTerminator
from bot.mdict.terms.jitenon import JitenonYojiTerminator
from bot.mdict.terms.jitenon import JitenonKotowazaTerminator
from bot.mdict.terms.smk8 import Smk8Terminator
from bot.mdict.terms.daijirin2 import Daijirin2Terminator
from bot.mdict.terms.sankoku8 import Sankoku8Terminator


def new_terminator(target):
    terminator_map = {
        Targets.JITENON_KOKUGO: JitenonKokugoTerminator,
        Targets.JITENON_YOJI: JitenonYojiTerminator,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator,
        Targets.SMK8: Smk8Terminator,
        Targets.DAIJIRIN2: Daijirin2Terminator,
        Targets.SANKOKU8: Sankoku8Terminator,
    }
    return terminator_map[target](target)

@@ -1,42 +0,0 @@
from bot.mdict.terms.terminator import Terminator

from bot.mdict.glossary.jitenon import JitenonKokugoGlossary
from bot.mdict.glossary.jitenon import JitenonYojiGlossary
from bot.mdict.glossary.jitenon import JitenonKotowazaGlossary


class JitenonTerminator(Terminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = None

    def _glossary(self, entry):
        if entry.entry_id in self._glossary_cache:
            return self._glossary_cache[entry.entry_id]
        glossary = self._glossary_maker.make_glossary(entry, self._media_dir)
        self._glossary_cache[entry.entry_id] = glossary
        return glossary

    def _link_glossary_parameters(self, entry):
        return []

    def _subentry_lists(self, entry):
        return []


class JitenonKokugoTerminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonKokugoGlossary()


class JitenonYojiTerminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonYojiGlossary()


class JitenonKotowazaTerminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonKotowazaGlossary()

@@ -0,0 +1,8 @@
from bot.mdict.terms.base.jitenon import JitenonTerminator
from bot.mdict.glossary.jitenon import JitenonKokugoGlossary


class Terminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonKokugoGlossary()

@@ -0,0 +1,8 @@
from bot.mdict.terms.base.jitenon import JitenonTerminator
from bot.mdict.glossary.jitenon import JitenonKotowazaGlossary


class Terminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonKotowazaGlossary()

@@ -0,0 +1,8 @@
from bot.mdict.terms.base.jitenon import JitenonTerminator
from bot.mdict.glossary.jitenon import JitenonYojiGlossary


class Terminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonYojiGlossary()

@@ -1,8 +1,8 @@
-from bot.mdict.terms.terminator import Terminator
+from bot.mdict.terms.base.terminator import BaseTerminator
 from bot.mdict.glossary.sankoku8 import make_glossary


-class Sankoku8Terminator(Terminator):
+class Terminator(BaseTerminator):
     def _glossary(self, entry):
         if entry.entry_id in self._glossary_cache:
             return self._glossary_cache[entry.entry_id]

@@ -1,8 +1,8 @@
-from bot.mdict.terms.terminator import Terminator
+from bot.mdict.terms.base.terminator import BaseTerminator
 from bot.mdict.glossary.smk8 import make_glossary


-class Smk8Terminator(Terminator):
+class Terminator(BaseTerminator):
     def _glossary(self, entry):
         if entry.entry_id in self._glossary_cache:
             return self._glossary_cache[entry.entry_id]

@@ -1,24 +1,22 @@
-# pylint: disable=too-few-public-methods
 import json
 import os
 import shutil
 import copy
 from pathlib import Path
-from datetime import datetime
 from abc import ABC, abstractmethod
-from platformdirs import user_documents_dir, user_cache_dir

 import fastjsonschema
+from platformdirs import user_documents_dir, user_cache_dir

 from bot.data import load_yomichan_metadata
-from bot.yomichan.terms.factory import new_terminator
 from bot.data import load_yomichan_term_schema
+from bot.factory import new_yomichan_terminator


-class Exporter(ABC):
+class BaseExporter(ABC):
     def __init__(self, target):
         self._target = target
-        self._terminator = new_terminator(target)
+        self._terminator = new_yomichan_terminator(target)
         self._build_dir = None
         self._terms_per_file = 2000
@@ -36,11 +34,11 @@ class Exporter(ABC):
     @abstractmethod
     def _get_revision(self, entries):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _get_attribution(self, entries):
-        pass
+        raise NotImplementedError

     def _get_build_dir(self):
         if self._build_dir is not None:
@@ -118,10 +116,10 @@ class Exporter(ABC):
         build_dir = self._get_build_dir()
         max_i = int(len(terms) / self._terms_per_file) + 1
         for i in range(max_i):
+            update = f"Writing terms to term bank {i+1}/{max_i}"
+            print(update, end='\r', flush=True)
             start = self._terms_per_file * i
             end = self._terms_per_file * (i + 1)
-            update = f"Writing terms to term banks {start} - {end}"
-            print(update, end='\r', flush=True)
             term_file = os.path.join(build_dir, f"term_bank_{i+1}.json")
             with open(term_file, "w", encoding='utf8') as f:
                 json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
@@ -142,8 +140,8 @@ class Exporter(ABC):
             json.dump(tags, f, indent=4, ensure_ascii=False)

     def __write_archive(self, filename):
-        print("Archiving data to ZIP file...")
         archive_format = "zip"
+        print(f"Archiving data to {archive_format.upper()} file...")
         out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
         if not Path(out_dir).is_dir():
             os.makedirs(out_dir)
@@ -154,58 +152,8 @@ class Exporter(ABC):
         base_filename = os.path.join(out_dir, filename)
         build_dir = self._get_build_dir()
         shutil.make_archive(base_filename, archive_format, build_dir)
-        print(f"Dictionary file saved to {out_filepath}")
+        print(f"Dictionary file saved to `{out_filepath}`")

     def __rm_build_dir(self):
         build_dir = self._get_build_dir()
         shutil.rmtree(build_dir)
-
-
-class _JitenonExporter(Exporter):
-    def _get_revision(self, entries):
-        modified_date = None
-        for entry in entries:
-            if modified_date is None or entry.modified_date > modified_date:
-                modified_date = entry.modified_date
-        revision = f"{self._target.value};{modified_date}"
-        return revision
-
-    def _get_attribution(self, entries):
-        modified_date = None
-        for entry in entries:
-            if modified_date is None or entry.modified_date > modified_date:
-                attribution = entry.attribution
-        return attribution
-
-
-class JitenonKokugoExporter(_JitenonExporter):
-    pass
-
-
-class JitenonYojiExporter(_JitenonExporter):
-    pass
-
-
-class JitenonKotowazaExporter(_JitenonExporter):
-    pass
-
-
-class _MonokakidoExporter(Exporter):
-    def _get_revision(self, entries):
-        timestamp = datetime.now().strftime("%Y-%m-%d")
-        return f"{self._target.value};{timestamp}"
-
-
-class Smk8Exporter(_MonokakidoExporter):
-    def _get_attribution(self, entries):
-        return "© Sanseido Co., LTD. 2020"
-
-
-class Daijirin2Exporter(_MonokakidoExporter):
-    def _get_attribution(self, entries):
-        return "© Sanseido Co., LTD. 2019"
-
-
-class Sankoku8Exporter(_MonokakidoExporter):
-    def _get_attribution(self, entries):
-        return "© Sanseido Co., LTD. 2021"

@@ -0,0 +1,18 @@
from bot.yomichan.exporters.base.exporter import BaseExporter


class JitenonExporter(BaseExporter):
    def _get_revision(self, entries):
        modified_date = None
        for entry in entries:
            if modified_date is None or entry.modified_date > modified_date:
                modified_date = entry.modified_date
        revision = f"{self._target.value};{modified_date}"
        return revision

    def _get_attribution(self, entries):
        modified_date = None
        for entry in entries:
            if modified_date is None or entry.modified_date > modified_date:
                attribution = entry.attribution
        return attribution

@@ -0,0 +1,8 @@
from datetime import datetime

from bot.yomichan.exporters.base.exporter import BaseExporter


class MonokakidoExporter(BaseExporter):
    def _get_revision(self, entries):
        timestamp = datetime.now().strftime("%Y-%m-%d")
        return f"{self._target.value};{timestamp}"

@@ -0,0 +1,6 @@
from bot.yomichan.exporters.base.monokakido import MonokakidoExporter


class Exporter(MonokakidoExporter):
    def _get_attribution(self, entries):
        return "© Sanseido Co., LTD. 2019"

@@ -1,20 +0,0 @@
from bot.targets import Targets

from bot.yomichan.exporters.export import JitenonKokugoExporter
from bot.yomichan.exporters.export import JitenonYojiExporter
from bot.yomichan.exporters.export import JitenonKotowazaExporter
from bot.yomichan.exporters.export import Smk8Exporter
from bot.yomichan.exporters.export import Daijirin2Exporter
from bot.yomichan.exporters.export import Sankoku8Exporter


def new_yomi_exporter(target):
    exporter_map = {
        Targets.JITENON_KOKUGO: JitenonKokugoExporter,
        Targets.JITENON_YOJI: JitenonYojiExporter,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter,
        Targets.SMK8: Smk8Exporter,
        Targets.DAIJIRIN2: Daijirin2Exporter,
        Targets.SANKOKU8: Sankoku8Exporter,
    }
    return exporter_map[target](target)

@@ -0,0 +1,5 @@
from bot.yomichan.exporters.base.jitenon import JitenonExporter


class Exporter(JitenonExporter):
    pass

@@ -0,0 +1,5 @@
from bot.yomichan.exporters.base.jitenon import JitenonExporter


class Exporter(JitenonExporter):
    pass

@@ -0,0 +1,5 @@
from bot.yomichan.exporters.base.jitenon import JitenonExporter


class Exporter(JitenonExporter):
    pass

@@ -0,0 +1,6 @@
from bot.yomichan.exporters.base.monokakido import MonokakidoExporter


class Exporter(MonokakidoExporter):
    def _get_attribution(self, entries):
        return "© Sanseido Co., LTD. 2021"

@@ -0,0 +1,6 @@
from bot.yomichan.exporters.base.monokakido import MonokakidoExporter


class Exporter(MonokakidoExporter):
    def _get_attribution(self, entries):
        return "© Sanseido Co., LTD. 2020"

@@ -1,9 +1,10 @@
 import re
 import os
-from bs4 import BeautifulSoup
 from functools import cache
 from pathlib import Path

+from bs4 import BeautifulSoup
+
 import bot.yomichan.glossary.icons as Icons
 from bot.soup import delete_soup_nodes
 from bot.data import load_yomichan_name_conversion

@@ -0,0 +1,26 @@
from bot.yomichan.terms.base.terminator import BaseTerminator


class JitenonTerminator(BaseTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = None

    def _definition_tags(self, entry):
        return None

    def _glossary(self, entry):
        if entry.entry_id in self._glossary_cache:
            return self._glossary_cache[entry.entry_id]
        glossary = self._glossary_maker.make_glossary(entry, self._image_dir)
        self._glossary_cache[entry.entry_id] = glossary
        return glossary

    def _sequence(self, entry):
        return entry.entry_id

    def _link_glossary_parameters(self, entry):
        return []

    def _subentry_lists(self, entry):
        return []

@@ -2,7 +2,7 @@ from abc import abstractmethod, ABC
 from bot.data import load_yomichan_inflection_categories


-class Terminator(ABC):
+class BaseTerminator(ABC):
     def __init__(self, target):
         self._target = target
         self._glossary_cache = {}
@@ -66,28 +66,28 @@ class Terminator(ABC):
     @abstractmethod
     def _definition_tags(self, entry):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _inflection_rules(self, entry, expression):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _glossary(self, entry):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _sequence(self, entry):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _term_tags(self, entry):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _link_glossary_parameters(self, entry):
-        pass
+        raise NotImplementedError

     @abstractmethod
     def _subentry_lists(self, entry):
-        pass
+        raise NotImplementedError

@@ -1,11 +1,10 @@
 from bot.entries.daijirin2.phrase_entry import PhraseEntry

-from bot.yomichan.terms.terminator import Terminator
+from bot.yomichan.terms.base.terminator import BaseTerminator
 from bot.yomichan.glossary.daijirin2 import make_glossary
 from bot.yomichan.grammar import sudachi_rules, tags_to_rules


-class Daijirin2Terminator(Terminator):
+class Terminator(BaseTerminator):
     def _definition_tags(self, entry):
         return ""

@@ -1,20 +0,0 @@
from bot.targets import Targets

from bot.yomichan.terms.jitenon import JitenonKokugoTerminator
from bot.yomichan.terms.jitenon import JitenonYojiTerminator
from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
from bot.yomichan.terms.smk8 import Smk8Terminator
from bot.yomichan.terms.daijirin2 import Daijirin2Terminator
from bot.yomichan.terms.sankoku8 import Sankoku8Terminator


def new_terminator(target):
    terminator_map = {
        Targets.JITENON_KOKUGO: JitenonKokugoTerminator,
        Targets.JITENON_YOJI: JitenonYojiTerminator,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator,
        Targets.SMK8: Smk8Terminator,
        Targets.DAIJIRIN2: Daijirin2Terminator,
        Targets.SANKOKU8: Sankoku8Terminator,
    }
    return terminator_map[target](target)

@@ -1,68 +0,0 @@
from bot.yomichan.grammar import sudachi_rules
from bot.yomichan.terms.terminator import Terminator

from bot.yomichan.glossary.jitenon import JitenonKokugoGlossary
from bot.yomichan.glossary.jitenon import JitenonYojiGlossary
from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary


class JitenonTerminator(Terminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = None

    def _definition_tags(self, entry):
        return None

    def _glossary(self, entry):
        if entry.entry_id in self._glossary_cache:
            return self._glossary_cache[entry.entry_id]
        glossary = self._glossary_maker.make_glossary(entry, self._image_dir)
        self._glossary_cache[entry.entry_id] = glossary
        return glossary

    def _sequence(self, entry):
        return entry.entry_id

    def _link_glossary_parameters(self, entry):
        return []

    def _subentry_lists(self, entry):
        return []


class JitenonKokugoTerminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonKokugoGlossary()

    def _inflection_rules(self, entry, expression):
        return sudachi_rules(expression)

    def _term_tags(self, entry):
        return ""


class JitenonYojiTerminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonYojiGlossary()

    def _inflection_rules(self, entry, expression):
        return ""

    def _term_tags(self, entry):
        tags = entry.kanken_level.split("/")
        return " ".join(tags)


class JitenonKotowazaTerminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonKotowazaGlossary()

    def _inflection_rules(self, entry, expression):
        return sudachi_rules(expression)

    def _term_tags(self, entry):
        return ""

@@ -0,0 +1,15 @@
from bot.yomichan.grammar import sudachi_rules
from bot.yomichan.glossary.jitenon import JitenonKokugoGlossary
from bot.yomichan.terms.base.jitenon import JitenonTerminator


class Terminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonKokugoGlossary()

    def _inflection_rules(self, entry, expression):
        return sudachi_rules(expression)

    def _term_tags(self, entry):
        return ""

@@ -0,0 +1,15 @@
from bot.yomichan.grammar import sudachi_rules
from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
from bot.yomichan.terms.base.jitenon import JitenonTerminator


class Terminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonKotowazaGlossary()

    def _inflection_rules(self, entry, expression):
        return sudachi_rules(expression)

    def _term_tags(self, entry):
        return ""

@@ -0,0 +1,15 @@
from bot.yomichan.glossary.jitenon import JitenonYojiGlossary
from bot.yomichan.terms.base.jitenon import JitenonTerminator


class Terminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonYojiGlossary()

    def _inflection_rules(self, entry, expression):
        return ""

    def _term_tags(self, entry):
        tags = entry.kanken_level.split("/")
        return " ".join(tags)

@@ -1,11 +1,10 @@
 from bot.entries.sankoku8.phrase_entry import PhraseEntry

-from bot.yomichan.terms.terminator import Terminator
+from bot.yomichan.terms.base.terminator import BaseTerminator
 from bot.yomichan.glossary.sankoku8 import make_glossary
 from bot.yomichan.grammar import sudachi_rules, tags_to_rules


-class Sankoku8Terminator(Terminator):
+class Terminator(BaseTerminator):
     def _definition_tags(self, entry):
         return ""

@@ -1,12 +1,11 @@
 from bot.entries.smk8.kanji_entry import KanjiEntry
 from bot.entries.smk8.phrase_entry import PhraseEntry

-from bot.yomichan.terms.terminator import Terminator
+from bot.yomichan.terms.base.terminator import BaseTerminator
 from bot.yomichan.glossary.smk8 import make_glossary
 from bot.yomichan.grammar import sudachi_rules, tags_to_rules


-class Smk8Terminator(Terminator):
+class Terminator(BaseTerminator):
     def __init__(self, target):
         super().__init__(target)

@@ -21,7 +21,7 @@ import sys
 import argparse
 import subprocess

 from bot.targets import Targets
-from bot.crawlers.factory import new_crawler
+from bot.factory import new_crawler


 def filename(f):

@@ -1,5 +1,7 @@
 #!/bin/sh

+export PYTHONPYCACHEPREFIX=/tmp/pycache
+
 python -m unittest discover -s tests
 python jitenbot.py jitenon-kokugo