Organize Yomichan export logic into classes

This commit is contained in:
stephenmk 2023-04-22 16:49:29 -05:00
parent e73c4d3d7f
commit 4721eed4c6
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
4 changed files with 162 additions and 94 deletions

View file

@ -3,7 +3,6 @@ from bs4 import BeautifulSoup
import bot.scraper as Scraper
import bot.yomichan.export as YomichanExport
from bot.entries.jitenon_kotowaza import JitenonKotowaza
from bot.entries.jitenon_yoji import JitenonYoji
@ -14,48 +13,67 @@ def run_all():
def jitenon_yoji():
seq_to_entries = {}
print("Scraping jitenon-yoji...")
entry_id_to_entry_path = {}
jitenon = Scraper.Jitenon()
gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
gojuon_href = gojuon_a['href']
kana_doc = jitenon.scrape(gojuon_href)
kana_doc, _ = jitenon.scrape(gojuon_href)
kana_soup = BeautifulSoup(kana_doc, features="html.parser")
for kana_a in kana_soup.select(".word_box a", href=True):
kana_href = kana_a['href']
sequence = int(re.search(r"([0-9]+).html", kana_href).group(1))
if sequence in seq_to_entries:
entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
if entry_id in entry_id_to_entry_path:
continue
yoji_doc = jitenon.scrape(kana_href)
entry = JitenonYoji(sequence)
entry.add_document(yoji_doc)
seq_to_entries[sequence] = entry
entries = seq_to_entries.values()
YomichanExport.jitenon_yoji(entries)
_, entry_path = jitenon.scrape(kana_href)
entry_id_to_entry_path[entry_id] = entry_path
entries_len = len(entry_id_to_entry_path)
print(f"Finished scraping {entries_len} entries")
entries = []
items = entry_id_to_entry_path.items()
for idx, (entry_id, entry_path) in enumerate(items):
update = f"Reading entry {idx+1}/{entries_len}"
print(update, end='\r', flush=True)
entry = JitenonYoji(entry_id)
entry.add_document(entry_path)
entries.append(entry)
print()
exporter = YomichanExport.JitenonYojiExporter()
exporter.export(entries)
def jitenon_kotowaza():
seq_to_entries = {}
print("Scraping jitenon-kotowaza...")
entry_id_to_entry_path = {}
jitenon = Scraper.Jitenon()
gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
gojuon_href = gojuon_a['href']
kana_doc = jitenon.scrape(gojuon_href)
kana_doc, _ = jitenon.scrape(gojuon_href)
kana_soup = BeautifulSoup(kana_doc, features="html.parser")
for kana_a in kana_soup.select(".word_box a", href=True):
kana_href = kana_a['href']
m = re.search(r"([0-9]+).php", kana_href)
if m:
sequence = int(m.group(1))
else:
if not m:
continue
if sequence in seq_to_entries:
entry_id = int(m.group(1))
if entry_id in entry_id_to_entry_path:
continue
kotowaza_doc = jitenon.scrape(kana_href)
entry = JitenonKotowaza(sequence)
entry.add_document(kotowaza_doc)
seq_to_entries[sequence] = entry
entries = seq_to_entries.values()
YomichanExport.jitenon_kotowaza(entries)
_, entry_path = jitenon.scrape(kana_href)
entry_id_to_entry_path[entry_id] = entry_path
entries_len = len(entry_id_to_entry_path)
print(f"Finished scraping {entries_len} entries")
entries = []
items = entry_id_to_entry_path.items()
for idx, (entry_id, entry_path) in enumerate(items):
update = f"Reading entry {idx+1}/{entries_len}"
print(update, end='\r', flush=True)
entry = JitenonKotowaza(entry_id)
entry.add_document(entry_path)
entries.append(entry)
print()
exporter = YomichanExport.JitenonKotowazaExporter()
exporter.export(entries)

View file

@ -15,7 +15,9 @@ class Jitenon:
for column in self.columns.values():
setattr(self, column[0], column[1])
def add_document(self, html):
def add_document(self, path):
with open(path, "r") as f:
html = f.read()
yoji_soup = BeautifulSoup(html, features="html5lib")
self.__set_modified_date(html)
self.attribution = yoji_soup.find(class_="copyright").text

View file

@ -28,13 +28,14 @@ class Scraper():
url = urlparse(urlstring, scheme='https://', allow_fragments=True)
self.__validate_url(url)
cache_path = self.__cache_path(url)
cache_contents = self.__read_cache(cache_path)
if cache_contents is not None:
return cache_contents
html = self.__get(urlstring)
with open(cache_path, "w") as f:
f.write(html)
return html
html = self.__read_cache(cache_path)
if html is None:
html = self.__get(urlstring)
with open(cache_path, "w") as f:
f.write(html)
else:
print("Discovering cached files...", end='\r', flush=True)
return html, cache_path
def __set_session(self):
retry_strategy = Retry(
@ -106,4 +107,4 @@ class Scraper():
class Jitenon(Scraper):
def __init__(self):
self.domain = r"jitenon\.jp"
Scraper.__init__(self)
super().__init__()

View file

@ -8,74 +8,121 @@ from platformdirs import user_documents_dir, user_cache_dir
import bot.data as Data
def jitenon_yoji(entries):
__jitenon(entries, "jitenon-yoji")
class Exporter:
def __init__(self):
    """Start with no build directory and the default term-bank size."""
    # Upper bound on the number of terms written to one term bank file.
    self._terms_per_file = 2000
    # Filled in lazily by _get_build_dir(); None until first requested.
    self._build_dir = None
def export(self, entries):
    """Export ``entries`` as a Yomichan dictionary.

    The metadata stored under ``self._name`` supplies the index and the
    tags. The index is stamped with the revision and attribution computed
    from ``entries`` before the dictionary files are written.
    """
    dictionary_meta = Data.yomichan_metadata()[self._name]
    index = dictionary_meta["index"]
    index["revision"] = self._get_revision(entries)
    index["attribution"] = self._get_attribution(entries)
    tags = dictionary_meta["tags"]
    self.__make_dictionary(self.__get_terms(entries), index, tags)
def jitenon_kotowaza(entries):
__jitenon(entries, "jitenon-kotowaza")
def _get_build_dir(self):
    """Return the build directory path, creating the directory on first use.

    The directory lives in the "jitenbot" user cache directory and is named
    after the current timestamp. A leftover directory with the same name is
    removed first. The path is remembered in ``self._build_dir``.
    """
    if self._build_dir is None:
        stamp = datetime.now().strftime("%Y%m%d%H%M%S")
        fresh_dir = os.path.join(user_cache_dir("jitenbot"), f"build_{stamp}")
        if Path(fresh_dir).is_dir():
            shutil.rmtree(fresh_dir)
        os.makedirs(fresh_dir)
        self._build_dir = fresh_dir
    return self._build_dir
def __get_terms(self, entries):
    """Collect the Yomichan terms of every entry into one flat list.

    A progress counter is printed on a single, repeatedly overwritten line
    while the entries are processed.
    """
    total = len(entries)
    collected = []
    for position, entry in enumerate(entries, start=1):
        progress = f"Creating Yomichan terms for entry {position}/{total}"
        print(progress, end='\r', flush=True)
        collected.extend(entry.yomichan_terms())
    # Move past the progress line.
    print()
    return collected
def __jitenon(entries, name):
terms, modified_date, attribution = __terms(entries)
meta = Data.yomichan_metadata()
def __make_dictionary(self, terms, index, tags):
    """Write all dictionary files and pack them into the output archive.

    Writes the term banks, the index and the tag bank into the build
    directory, then archives that directory under ``index["title"]``.
    The build directory is removed afterwards, including when one of the
    steps raises, so failed exports do not pile up in the cache directory.
    """
    print(f"Exporting {len(terms)} Yomichan terms to zip file...")
    try:
        self.__write_term_banks(terms)
        self.__write_index(index)
        self.__write_tag_bank(tags)
        self.__write_archive(index["title"])
    finally:
        self.__rm_build_dir()
index = meta[name]["index"]
index["revision"] = f"{name}.{modified_date}"
index["attribution"] = attribution
tags = meta[name]["tags"]
def __write_term_banks(self, terms):
    """Write ``terms`` to numbered ``term_bank_N.json`` files.

    Each file holds at most ``self._terms_per_file`` terms. At least one
    bank is always written; it contains an empty list when there are no
    terms.
    """
    build_dir = self._get_build_dir()
    per_file = self._terms_per_file
    # Ceiling division, so a term count that is an exact multiple of the
    # bank size does not produce a trailing empty bank.
    bank_count = max(1, (len(terms) + per_file - 1) // per_file)
    for i in range(bank_count):
        term_file = os.path.join(build_dir, f"term_bank_{i+1}.json")
        start = per_file * i
        end = start + per_file
        with open(term_file, "w", encoding='utf8') as f:
            json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
__create_zip(terms, index, tags)
def __write_index(self, index):
    """Serialize ``index`` to ``index.json`` inside the build directory."""
    index_path = os.path.join(self._get_build_dir(), "index.json")
    serialized = json.dumps(index, indent=4, ensure_ascii=False)
    with open(index_path, 'w', encoding='utf8') as index_fp:
        index_fp.write(serialized)
def __terms(entries):
terms = []
modified_date = None
attribution = ""
for entry in entries:
if modified_date is None or entry.modified_date > modified_date:
modified_date = entry.modified_date
attribution = entry.attribution
for term in entry.yomichan_terms():
terms.append(term)
return terms, modified_date, attribution
def __create_zip(terms, index, tags):
cache_dir = user_cache_dir("jitenbot")
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
build_directory = os.path.join(cache_dir, f"build_{timestamp}")
if Path(build_directory).is_dir():
shutil.rmtree(build_directory)
os.makedirs(build_directory)
terms_per_file = 1000
max_i = int(len(terms) / terms_per_file) + 1
for i in range(max_i):
term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
with open(term_file, "w", encoding='utf8') as f:
start = terms_per_file * i
end = terms_per_file * (i + 1)
json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
index_file = os.path.join(build_directory, "index.json")
with open(index_file, 'w', encoding='utf8') as f:
json.dump(index, f, indent=4, ensure_ascii=False)
if len(tags) > 0:
tag_file = os.path.join(build_directory, "tag_bank_1.json")
def __write_tag_bank(self, tags):
    """Serialize ``tags`` to ``tag_bank_1.json`` inside the build directory.

    Nothing is written (and no build directory is requested) when ``tags``
    is empty.
    """
    if len(tags) == 0:
        return
    tag_path = os.path.join(self._get_build_dir(), "tag_bank_1.json")
    serialized = json.dumps(tags, indent=4, ensure_ascii=False)
    with open(tag_path, 'w', encoding='utf8') as tag_fp:
        tag_fp.write(serialized)
zip_filename = index["title"]
zip_file = f"{zip_filename}.zip"
shutil.make_archive(zip_filename, "zip", build_directory)
def __write_archive(self, filename):
    """Pack the build directory into ``<filename>.zip``.

    The archive is placed in the "jitenbot" folder of the user's documents
    directory, which is created when missing. An existing archive file of
    the same name is deleted before the new one is written.
    """
    archive_format = "zip"
    out_dir = os.path.join(user_documents_dir(), "jitenbot")
    os.makedirs(out_dir, exist_ok=True)
    # make_archive() appends the format extension to the base name itself.
    base_filename = os.path.join(out_dir, filename)
    out_filepath = f"{base_filename}.{archive_format}"
    if Path(out_filepath).is_file():
        os.remove(out_filepath)
    shutil.make_archive(base_filename, archive_format, self._get_build_dir())
    print(f"Dictionary file exported to {out_filepath}")
out_dir = os.path.join(user_documents_dir(), "jitenbot")
out_file = os.path.join(out_dir, zip_file)
if not Path(out_dir).is_dir():
os.mkdir(out_dir)
elif Path(out_file).is_file():
os.remove(out_file)
shutil.move(zip_file, out_dir)
shutil.rmtree(build_directory)
def __rm_build_dir(self):
    """Delete the build directory and forget its path.

    Clearing ``self._build_dir`` lets _get_build_dir() create a fresh
    directory if this exporter is used again, instead of handing back the
    path that was just removed.
    """
    build_dir = self._get_build_dir()
    shutil.rmtree(build_dir)
    self._build_dir = None
class JitenonExporter(Exporter):
    """Exporter that derives revision and attribution from entry dates.

    Subclasses are expected to set ``self._name``; it is used as the prefix
    of the revision string.
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def _newest_entry(entries):
        """Return the entry with the greatest ``modified_date``.

        The first such entry wins ties. Returns None when ``entries`` is
        empty.
        """
        return max(entries, key=lambda entry: entry.modified_date, default=None)

    def _get_revision(self, entries):
        """Return ``"<name>.<date>"`` using the latest modification date.

        With no entries the date part is ``None``.
        """
        newest = self._newest_entry(entries)
        modified_date = None if newest is None else newest.modified_date
        revision = f"{self._name}.{modified_date}"
        return revision

    def _get_attribution(self, entries):
        """Return the attribution of the most recently modified entry.

        Returns an empty string when ``entries`` is empty.
        """
        newest = self._newest_entry(entries)
        return "" if newest is None else newest.attribution
class JitenonYojiExporter(JitenonExporter):
    """Exporter registered under the "jitenon-yoji" name."""

    def __init__(self):
        super().__init__()
        # Used for the metadata lookup and as the revision prefix.
        self._name = "jitenon-yoji"
class JitenonKotowazaExporter(JitenonExporter):
    """Exporter registered under the "jitenon-kotowaza" name."""

    def __init__(self):
        super().__init__()
        # Used for the metadata lookup and as the revision prefix.
        self._name = "jitenon-kotowaza"