Organize Yomichan export logic into classes
This commit is contained in:
parent
e73c4d3d7f
commit
4721eed4c6
|
@ -3,7 +3,6 @@ from bs4 import BeautifulSoup
|
|||
|
||||
import bot.scraper as Scraper
|
||||
import bot.yomichan.export as YomichanExport
|
||||
|
||||
from bot.entries.jitenon_kotowaza import JitenonKotowaza
|
||||
from bot.entries.jitenon_yoji import JitenonYoji
|
||||
|
||||
|
@ -14,48 +13,67 @@ def run_all():
|
|||
|
||||
|
||||
def jitenon_yoji():
|
||||
seq_to_entries = {}
|
||||
print("Scraping jitenon-yoji...")
|
||||
entry_id_to_entry_path = {}
|
||||
jitenon = Scraper.Jitenon()
|
||||
gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
|
||||
gojuon_doc, _ = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
|
||||
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
|
||||
for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
|
||||
gojuon_href = gojuon_a['href']
|
||||
kana_doc = jitenon.scrape(gojuon_href)
|
||||
kana_doc, _ = jitenon.scrape(gojuon_href)
|
||||
kana_soup = BeautifulSoup(kana_doc, features="html.parser")
|
||||
for kana_a in kana_soup.select(".word_box a", href=True):
|
||||
kana_href = kana_a['href']
|
||||
sequence = int(re.search(r"([0-9]+).html", kana_href).group(1))
|
||||
if sequence in seq_to_entries:
|
||||
entry_id = int(re.search(r"([0-9]+).html", kana_href).group(1))
|
||||
if entry_id in entry_id_to_entry_path:
|
||||
continue
|
||||
yoji_doc = jitenon.scrape(kana_href)
|
||||
entry = JitenonYoji(sequence)
|
||||
entry.add_document(yoji_doc)
|
||||
seq_to_entries[sequence] = entry
|
||||
entries = seq_to_entries.values()
|
||||
YomichanExport.jitenon_yoji(entries)
|
||||
_, entry_path = jitenon.scrape(kana_href)
|
||||
entry_id_to_entry_path[entry_id] = entry_path
|
||||
entries_len = len(entry_id_to_entry_path)
|
||||
print(f"Finished scraping {entries_len} entries")
|
||||
entries = []
|
||||
items = entry_id_to_entry_path.items()
|
||||
for idx, (entry_id, entry_path) in enumerate(items):
|
||||
update = f"Reading entry {idx+1}/{entries_len}"
|
||||
print(update, end='\r', flush=True)
|
||||
entry = JitenonYoji(entry_id)
|
||||
entry.add_document(entry_path)
|
||||
entries.append(entry)
|
||||
print()
|
||||
exporter = YomichanExport.JitenonYojiExporter()
|
||||
exporter.export(entries)
|
||||
|
||||
|
||||
def jitenon_kotowaza():
|
||||
seq_to_entries = {}
|
||||
print("Scraping jitenon-kotowaza...")
|
||||
entry_id_to_entry_path = {}
|
||||
jitenon = Scraper.Jitenon()
|
||||
gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
|
||||
gojuon_doc, _ = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
|
||||
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
|
||||
for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
|
||||
gojuon_href = gojuon_a['href']
|
||||
kana_doc = jitenon.scrape(gojuon_href)
|
||||
kana_doc, _ = jitenon.scrape(gojuon_href)
|
||||
kana_soup = BeautifulSoup(kana_doc, features="html.parser")
|
||||
for kana_a in kana_soup.select(".word_box a", href=True):
|
||||
kana_href = kana_a['href']
|
||||
m = re.search(r"([0-9]+).php", kana_href)
|
||||
if m:
|
||||
sequence = int(m.group(1))
|
||||
else:
|
||||
if not m:
|
||||
continue
|
||||
if sequence in seq_to_entries:
|
||||
entry_id = int(m.group(1))
|
||||
if entry_id in entry_id_to_entry_path:
|
||||
continue
|
||||
kotowaza_doc = jitenon.scrape(kana_href)
|
||||
entry = JitenonKotowaza(sequence)
|
||||
entry.add_document(kotowaza_doc)
|
||||
seq_to_entries[sequence] = entry
|
||||
entries = seq_to_entries.values()
|
||||
YomichanExport.jitenon_kotowaza(entries)
|
||||
_, entry_path = jitenon.scrape(kana_href)
|
||||
entry_id_to_entry_path[entry_id] = entry_path
|
||||
entries_len = len(entry_id_to_entry_path)
|
||||
print(f"Finished scraping {entries_len} entries")
|
||||
entries = []
|
||||
items = entry_id_to_entry_path.items()
|
||||
for idx, (entry_id, entry_path) in enumerate(items):
|
||||
update = f"Reading entry {idx+1}/{entries_len}"
|
||||
print(update, end='\r', flush=True)
|
||||
entry = JitenonKotowaza(entry_id)
|
||||
entry.add_document(entry_path)
|
||||
entries.append(entry)
|
||||
print()
|
||||
exporter = YomichanExport.JitenonKotowazaExporter()
|
||||
exporter.export(entries)
|
||||
|
|
|
@ -15,7 +15,9 @@ class Jitenon:
|
|||
for column in self.columns.values():
|
||||
setattr(self, column[0], column[1])
|
||||
|
||||
def add_document(self, html):
|
||||
def add_document(self, path):
|
||||
with open(path, "r") as f:
|
||||
html = f.read()
|
||||
yoji_soup = BeautifulSoup(html, features="html5lib")
|
||||
self.__set_modified_date(html)
|
||||
self.attribution = yoji_soup.find(class_="copyright").text
|
||||
|
|
|
@ -28,13 +28,14 @@ class Scraper():
|
|||
url = urlparse(urlstring, scheme='https://', allow_fragments=True)
|
||||
self.__validate_url(url)
|
||||
cache_path = self.__cache_path(url)
|
||||
cache_contents = self.__read_cache(cache_path)
|
||||
if cache_contents is not None:
|
||||
return cache_contents
|
||||
html = self.__get(urlstring)
|
||||
with open(cache_path, "w") as f:
|
||||
f.write(html)
|
||||
return html
|
||||
html = self.__read_cache(cache_path)
|
||||
if html is None:
|
||||
html = self.__get(urlstring)
|
||||
with open(cache_path, "w") as f:
|
||||
f.write(html)
|
||||
else:
|
||||
print("Discovering cached files...", end='\r', flush=True)
|
||||
return html, cache_path
|
||||
|
||||
def __set_session(self):
|
||||
retry_strategy = Retry(
|
||||
|
@ -106,4 +107,4 @@ class Scraper():
|
|||
class Jitenon(Scraper):
|
||||
def __init__(self):
|
||||
self.domain = r"jitenon\.jp"
|
||||
Scraper.__init__(self)
|
||||
super().__init__()
|
||||
|
|
|
@ -8,74 +8,121 @@ from platformdirs import user_documents_dir, user_cache_dir
|
|||
import bot.data as Data
|
||||
|
||||
|
||||
def jitenon_yoji(entries):
|
||||
__jitenon(entries, "jitenon-yoji")
|
||||
class Exporter:
|
||||
def __init__(self):
|
||||
self._build_dir = None
|
||||
self._terms_per_file = 2000
|
||||
|
||||
def export(self, entries):
    """Build the Yomichan dictionary for *entries* and write it out.

    The metadata for this exporter is looked up under ``self._name``; its
    index receives the revision and attribution derived from the entries
    before the dictionary files are generated.
    """
    dictionary_meta = Data.yomichan_metadata()[self._name]
    index = dictionary_meta["index"]
    index["revision"] = self._get_revision(entries)
    index["attribution"] = self._get_attribution(entries)
    tags = dictionary_meta["tags"]
    self.__make_dictionary(self.__get_terms(entries), index, tags)
|
||||
|
||||
def jitenon_kotowaza(entries):
|
||||
__jitenon(entries, "jitenon-kotowaza")
|
||||
def _get_build_dir(self):
|
||||
if self._build_dir is not None:
|
||||
return self._build_dir
|
||||
cache_dir = user_cache_dir("jitenbot")
|
||||
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
|
||||
build_directory = os.path.join(cache_dir, f"build_{timestamp}")
|
||||
if Path(build_directory).is_dir():
|
||||
shutil.rmtree(build_directory)
|
||||
os.makedirs(build_directory)
|
||||
self._build_dir = build_directory
|
||||
return self._build_dir
|
||||
|
||||
def __get_terms(self, entries):
|
||||
terms = []
|
||||
entries_len = len(entries)
|
||||
for idx, entry in enumerate(entries):
|
||||
update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}"
|
||||
print(update, end='\r', flush=True)
|
||||
for term in entry.yomichan_terms():
|
||||
terms.append(term)
|
||||
print()
|
||||
return terms
|
||||
|
||||
def __jitenon(entries, name):
|
||||
terms, modified_date, attribution = __terms(entries)
|
||||
meta = Data.yomichan_metadata()
|
||||
def __make_dictionary(self, terms, index, tags):
    """Write every dictionary file, archive them, then remove the build dir.

    The archive is named after ``index["title"]``.
    """
    term_count = len(terms)
    print(f"Exporting {term_count} Yomichan terms to zip file...")
    # Fill the build directory first; the archive is made from its contents.
    self.__write_term_banks(terms)
    self.__write_index(index)
    self.__write_tag_bank(tags)
    archive_name = index["title"]
    self.__write_archive(archive_name)
    self.__rm_build_dir()
|
||||
|
||||
index = meta[name]["index"]
|
||||
index["revision"] = f"{name}.{modified_date}"
|
||||
index["attribution"] = attribution
|
||||
tags = meta[name]["tags"]
|
||||
def __write_term_banks(self, terms):
|
||||
build_dir = self._get_build_dir()
|
||||
max_i = int(len(terms) / self._terms_per_file) + 1
|
||||
for i in range(max_i):
|
||||
term_file = os.path.join(build_dir, f"term_bank_{i+1}.json")
|
||||
with open(term_file, "w", encoding='utf8') as f:
|
||||
start = self._terms_per_file * i
|
||||
end = self._terms_per_file * (i + 1)
|
||||
json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
|
||||
|
||||
__create_zip(terms, index, tags)
|
||||
def __write_index(self, index):
|
||||
build_dir = self._get_build_dir()
|
||||
index_file = os.path.join(build_dir, "index.json")
|
||||
with open(index_file, 'w', encoding='utf8') as f:
|
||||
json.dump(index, f, indent=4, ensure_ascii=False)
|
||||
|
||||
|
||||
def __terms(entries):
|
||||
terms = []
|
||||
modified_date = None
|
||||
attribution = ""
|
||||
for entry in entries:
|
||||
if modified_date is None or entry.modified_date > modified_date:
|
||||
modified_date = entry.modified_date
|
||||
attribution = entry.attribution
|
||||
for term in entry.yomichan_terms():
|
||||
terms.append(term)
|
||||
return terms, modified_date, attribution
|
||||
|
||||
|
||||
def __create_zip(terms, index, tags):
|
||||
cache_dir = user_cache_dir("jitenbot")
|
||||
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
|
||||
build_directory = os.path.join(cache_dir, f"build_{timestamp}")
|
||||
if Path(build_directory).is_dir():
|
||||
shutil.rmtree(build_directory)
|
||||
os.makedirs(build_directory)
|
||||
|
||||
terms_per_file = 1000
|
||||
max_i = int(len(terms) / terms_per_file) + 1
|
||||
for i in range(max_i):
|
||||
term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
|
||||
with open(term_file, "w", encoding='utf8') as f:
|
||||
start = terms_per_file * i
|
||||
end = terms_per_file * (i + 1)
|
||||
json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
|
||||
|
||||
index_file = os.path.join(build_directory, "index.json")
|
||||
with open(index_file, 'w', encoding='utf8') as f:
|
||||
json.dump(index, f, indent=4, ensure_ascii=False)
|
||||
|
||||
if len(tags) > 0:
|
||||
tag_file = os.path.join(build_directory, "tag_bank_1.json")
|
||||
def __write_tag_bank(self, tags):
|
||||
if len(tags) == 0:
|
||||
return
|
||||
build_dir = self._get_build_dir()
|
||||
tag_file = os.path.join(build_dir, "tag_bank_1.json")
|
||||
with open(tag_file, 'w', encoding='utf8') as f:
|
||||
json.dump(tags, f, indent=4, ensure_ascii=False)
|
||||
|
||||
zip_filename = index["title"]
|
||||
zip_file = f"{zip_filename}.zip"
|
||||
shutil.make_archive(zip_filename, "zip", build_directory)
|
||||
def __write_archive(self, filename):
    """Zip the build directory into the user's "jitenbot" documents folder.

    The archive is written as ``<filename>.zip``; an existing file with
    that name is removed first.
    """
    archive_format = "zip"
    target_dir = os.path.join(user_documents_dir(), "jitenbot")
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    target_path = os.path.join(target_dir, f"{filename}.{archive_format}")
    if os.path.isfile(target_path):
        os.remove(target_path)
    # make_archive takes the path without extension and appends it itself.
    archive_base = os.path.join(target_dir, filename)
    shutil.make_archive(archive_base, archive_format, self._get_build_dir())
    print(f"Dictionary file exported to {target_path}")
|
||||
|
||||
out_dir = os.path.join(user_documents_dir(), "jitenbot")
|
||||
out_file = os.path.join(out_dir, zip_file)
|
||||
if not Path(out_dir).is_dir():
|
||||
os.mkdir(out_dir)
|
||||
elif Path(out_file).is_file():
|
||||
os.remove(out_file)
|
||||
shutil.move(zip_file, out_dir)
|
||||
shutil.rmtree(build_directory)
|
||||
def __rm_build_dir(self):
|
||||
build_dir = self._get_build_dir()
|
||||
shutil.rmtree(build_dir)
|
||||
|
||||
|
||||
class JitenonExporter(Exporter):
    """Exporter base for Jitenon dictionaries.

    Derives the index revision and attribution from the entry with the
    most recent ``modified_date``. Subclasses set ``self._name``.
    """

    def __init__(self):
        super().__init__()

    def _get_revision(self, entries):
        """Return ``"<name>.<latest modified_date>"`` for *entries*.

        The date part is ``None`` when *entries* is empty.
        """
        latest = max((entry.modified_date for entry in entries), default=None)
        return f"{self._name}.{latest}"

    def _get_attribution(self, entries):
        """Return the attribution of the most recently modified entry.

        On equal dates the earliest such entry wins. Returns an empty
        string when *entries* is empty.
        """
        newest = None
        for entry in entries:
            # Track the entry itself so the comparison date advances with
            # it; otherwise every entry would overwrite the result.
            if newest is None or entry.modified_date > newest.modified_date:
                newest = entry
        return "" if newest is None else newest.attribution
|
||||
|
||||
|
||||
class JitenonYojiExporter(JitenonExporter):
    """Exporter for the "jitenon-yoji" dictionary."""

    def __init__(self):
        super().__init__()
        # Key under which this dictionary's Yomichan metadata is looked up.
        self._name = "jitenon-yoji"
|
||||
|
||||
|
||||
class JitenonKotowazaExporter(JitenonExporter):
    """Exporter for the "jitenon-kotowaza" dictionary."""

    def __init__(self):
        super().__init__()
        # Key under which this dictionary's Yomichan metadata is looked up.
        self._name = "jitenon-kotowaza"
|
||||
|
|
Loading…
Reference in a new issue