diff --git a/bot/crawlers/base/crawler.py b/bot/crawlers/base/crawler.py
index 31c3bdc..bbbcb9b 100644
--- a/bot/crawlers/base/crawler.py
+++ b/bot/crawlers/base/crawler.py
@@ -21,7 +21,7 @@ class BaseCrawler(ABC):
         pages_len = len(self._page_map)
         items = self._page_map.items()
         for idx, (page_id, page_path) in enumerate(items):
-            update = f"Reading page {idx+1}/{pages_len}"
+            update = f"\tReading page {idx+1}/{pages_len}"
             print(update, end='\r', flush=True)
             entry = new_entry(self._target, page_id)
             with open(page_path, "r", encoding="utf-8") as f:
diff --git a/bot/crawlers/base/jitenon.py b/bot/crawlers/base/jitenon.py
index ddbf3e5..49e4626 100644
--- a/bot/crawlers/base/jitenon.py
+++ b/bot/crawlers/base/jitenon.py
@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 
+from bot.time import timestamp
 from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
 from bot.crawlers.base.crawler import BaseCrawler
 
@@ -10,7 +11,7 @@ class JitenonCrawler(BaseCrawler):
         self._gojuon_url = None
 
     def collect_pages(self, page_dir):
-        print("Scraping jitenon.jp")
+        print(f"{timestamp()} Scraping {self._gojuon_url}")
         jitenon = JitenonScraper()
         gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
         gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -26,4 +27,4 @@ class JitenonCrawler(BaseCrawler):
                 _, page_path = jitenon.scrape(page_link)
                 self._page_map[page_id] = page_path
         pages_len = len(self._page_map)
-        print(f"Finished scraping {pages_len} pages")
+        print(f"\n{timestamp()} Found {pages_len} entry pages")
diff --git a/bot/crawlers/base/monokakido.py b/bot/crawlers/base/monokakido.py
index 057f8d4..ca98545 100644
--- a/bot/crawlers/base/monokakido.py
+++ b/bot/crawlers/base/monokakido.py
@@ -1,4 +1,5 @@
 import os
+from bot.time import timestamp
 from bot.crawlers.base.crawler import BaseCrawler
 
 
@@ -8,7 +9,7 @@ class MonokakidoCrawler(BaseCrawler):
         self._page_id_pattern = r"^([0-9]+)\.xml$"
 
     def collect_pages(self, page_dir):
-        print(f"Searching for page files in `{page_dir}`")
+        print(f"{timestamp()} Searching for page files in `{page_dir}`")
         for pagefile in os.listdir(page_dir):
             page_id = self._parse_page_id(pagefile)
             if page_id is None or page_id == 0:
@@ -16,4 +17,4 @@ class MonokakidoCrawler(BaseCrawler):
             path = os.path.join(page_dir, pagefile)
             self._page_map[page_id] = path
         pages_len = len(self._page_map)
-        print(f"Found {pages_len} page files for processing")
+        print(f"{timestamp()} Found {pages_len} page files for processing")
diff --git a/bot/crawlers/jitenon_kokugo.py b/bot/crawlers/jitenon_kokugo.py
index 6d5cd66..e748ea1 100644
--- a/bot/crawlers/jitenon_kokugo.py
+++ b/bot/crawlers/jitenon_kokugo.py
@@ -1,6 +1,7 @@
 import re
 from bs4 import BeautifulSoup
 
+from bot.time import timestamp
 from bot.crawlers.base.crawler import BaseCrawler
 from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
 
@@ -12,6 +13,7 @@ class Crawler(BaseCrawler):
         self._page_id_pattern = r"word/p([0-9]+)$"
 
     def collect_pages(self, page_dir):
+        print(f"{timestamp()} Scraping {self._gojuon_url}")
         jitenon = JitenonScraper()
         gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
         gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -35,4 +37,4 @@ class Crawler(BaseCrawler):
                 _, page_path = jitenon.scrape(page_link)
                 self._page_map[page_id] = page_path
         pages_len = len(self._page_map)
-        print(f"Finished scraping {pages_len} pages")
+        print(f"\n{timestamp()} Found {pages_len} entry pages")
diff --git a/bot/crawlers/scrapers/scraper.py b/bot/crawlers/scrapers/scraper.py
index 113d090..eeb9534 100644
--- a/bot/crawlers/scrapers/scraper.py
+++ b/bot/crawlers/scrapers/scraper.py
@@ -2,6 +2,8 @@ import time
 import re
 import os
 import hashlib
+import random
+import math
 from datetime import datetime
 from urllib.parse import urlparse
 from pathlib import Path
@@ -12,11 +14,13 @@ from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
 from platformdirs import user_cache_dir
 
+from bot.time import timestamp
 from bot.data import load_config
 
 
 class BaseScraper(ABC):
     def __init__(self):
+        self.cache_count = 0
         self._config = load_config()
         self.netloc_re = self._get_netloc_re()
         self.__set_session()
@@ -31,7 +35,8 @@ class BaseScraper(ABC):
             with open(cache_path, "w", encoding="utf-8") as f:
                 f.write(html)
         else:
-            print("Discovering cached files...", end='\r', flush=True)
+            self.cache_count += 1
+            print(f"\tDiscovering cached file {self.cache_count}", end='\r', flush=True)
         return html, cache_path
 
     @abstractmethod
@@ -91,15 +96,14 @@ class BaseScraper(ABC):
     def __get(self, urlstring):
         delay = 10
         time.sleep(delay)
-        now = datetime.now().strftime("%H:%M:%S")
-        print(f"{now} scraping {urlstring} ...", end='')
+        print(f"{timestamp()} Scraping {urlstring} ...", end='')
         try:
             response = self.session.get(urlstring, timeout=10)
-            print("OK")
+            print(f"{timestamp()} OK")
             return response.text
-        except Exception:
-            print("failed")
-            print("resetting session and trying again")
+        except Exception as ex:
+            print(f"\tFailed: {str(ex)}")
+            print(f"{timestamp()} Resetting session and trying again")
             self.__set_session()
             response = self.session.get(urlstring, timeout=10)
             return response.text
diff --git a/bot/mdict/exporters/base/exporter.py b/bot/mdict/exporters/base/exporter.py
index 26dc662..37ed376 100644
--- a/bot/mdict/exporters/base/exporter.py
+++ b/bot/mdict/exporters/base/exporter.py
@@ -6,6 +6,7 @@ from pathlib import Path
 
 from platformdirs import user_documents_dir, user_cache_dir
 
+from bot.time import timestamp
 from bot.factory import new_mdict_terminator
 
 
@@ -32,7 +33,7 @@ class BaseExporter(ABC):
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
         build_directory = os.path.join(cache_dir, "mdict_build")
-        print(f"Initializing build directory `{build_directory}`")
+        print(f"{timestamp()} Initializing build directory `{build_directory}`")
         if Path(build_directory).is_dir():
             shutil.rmtree(build_directory)
         os.makedirs(build_directory)
@@ -43,7 +44,7 @@ class BaseExporter(ABC):
         build_dir = self._get_build_dir()
         build_media_dir = os.path.join(build_dir, self._target.value)
         if media_dir is not None:
-            print("Copying media files to build directory...")
+            print(f"{timestamp()} Copying media files to build directory...")
             shutil.copytree(media_dir, build_media_dir)
         else:
             os.makedirs(build_media_dir)
@@ -69,7 +70,7 @@ class BaseExporter(ABC):
 
     def _write_mdx_file(self, entries):
         terms = self._get_terms(entries)
-        print(f"Exporting {len(terms)} Mdict keys...")
+        print(f"{timestamp()} Exporting {len(terms)} Mdict keys...")
         out_dir = self._get_out_dir()
         out_file = os.path.join(out_dir, f"{self._target.value}.mdx")
         params = [
@@ -85,7 +86,7 @@ class BaseExporter(ABC):
         terms = []
         entries_len = len(entries)
         for idx, entry in enumerate(entries):
-            update = f"Creating Mdict terms for entry {idx+1}/{entries_len}"
+            update = f"\tCreating MDict terms for entry {idx+1}/{entries_len}"
             print(update, end='\r', flush=True)
             new_terms = self._terminator.make_terms(entry)
             for term in new_terms:
@@ -124,7 +125,7 @@ class BaseExporter(ABC):
             return self._out_dir
         out_dir = os.path.join(
             user_documents_dir(), "jitenbot", "mdict", self._target.value)
-        print(f"Initializing output directory `{out_dir}`")
+        print(f"{timestamp()} Initializing output directory `{out_dir}`")
         if Path(out_dir).is_dir():
             shutil.rmtree(out_dir)
         os.makedirs(out_dir)
diff --git a/bot/time.py b/bot/time.py
new file mode 100644
index 0000000..f8dae94
--- /dev/null
+++ b/bot/time.py
@@ -0,0 +1,5 @@
+import time
+
+
+def timestamp():
+    return time.strftime('%X')
diff --git a/bot/yomichan/exporters/base/exporter.py b/bot/yomichan/exporters/base/exporter.py
index 9389202..5e4e870 100644
--- a/bot/yomichan/exporters/base/exporter.py
+++ b/bot/yomichan/exporters/base/exporter.py
@@ -8,6 +8,7 @@ from abc import ABC, abstractmethod
 import fastjsonschema
 from platformdirs import user_documents_dir, user_cache_dir
 
+from bot.time import timestamp
 from bot.data import load_yomichan_metadata
 from bot.data import load_yomichan_term_schema
 from bot.factory import new_yomichan_terminator
@@ -45,7 +46,7 @@ class BaseExporter(ABC):
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
         build_directory = os.path.join(cache_dir, "yomichan_build")
-        print(f"Initializing build directory `{build_directory}`")
+        print(f"{timestamp()} Initializing build directory `{build_directory}`")
         if Path(build_directory).is_dir():
             shutil.rmtree(build_directory)
         os.makedirs(build_directory)
@@ -64,8 +65,9 @@ class BaseExporter(ABC):
         build_dir = self._get_build_dir()
         build_img_dir = os.path.join(build_dir, self._target.value)
         if image_dir is not None:
-            print("Copying media files to build directory...")
+            print(f"{timestamp()} Copying media files to build directory...")
             shutil.copytree(image_dir, build_img_dir)
+            print(f"{timestamp()} Finished copying files")
         else:
             os.makedirs(build_img_dir)
         self._terminator.set_image_dir(build_img_dir)
@@ -74,7 +76,7 @@ class BaseExporter(ABC):
         terms = []
         entries_len = len(entries)
         for idx, entry in enumerate(entries):
-            update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}"
+            update = f"\tCreating Yomichan terms for entry {idx+1}/{entries_len}"
             print(update, end='\r', flush=True)
             new_terms = self._terminator.make_terms(entry)
             for term in new_terms:
@@ -83,7 +85,7 @@ class BaseExporter(ABC):
         return terms
 
     def __validate_terms(self, terms):
-        print("Making a copy of term data for validation...")
+        print(f"{timestamp()} Making a copy of term data for validation...")
         terms_copy = copy.deepcopy(terms)  # because validator will alter data!
         term_count = len(terms_copy)
         log_dir = self.__get_invalid_term_dir()
@@ -91,7 +93,7 @@ class BaseExporter(ABC):
         validator = fastjsonschema.compile(schema)
         failure_count = 0
         for idx, term in enumerate(terms_copy):
-            update = f"Validating term {idx+1}/{term_count}"
+            update = f"\tValidating term {idx+1}/{term_count}"
             print(update, end='\r', flush=True)
             try:
                 validator([term])
@@ -100,9 +102,9 @@ class BaseExporter(ABC):
                 term_file = os.path.join(log_dir, f"{idx}.json")
                 with open(term_file, "w", encoding='utf8') as f:
                     json.dump([term], f, indent=4, ensure_ascii=False)
-        print(f"\nFinished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
+        print(f"\n{timestamp()} Finished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
         if failure_count > 0:
-            print(f"Invalid terms saved to `{log_dir}` for debugging")
+            print(f"{timestamp()} Invalid terms saved to `{log_dir}` for debugging")
 
     def __make_dictionary(self, terms, index, tags):
         self.__write_term_banks(terms)
@@ -112,11 +114,11 @@ class BaseExporter(ABC):
         self.__rm_build_dir()
 
     def __write_term_banks(self, terms):
-        print(f"Exporting {len(terms)} JSON terms")
+        print(f"{timestamp()} Exporting {len(terms)} JSON terms")
         build_dir = self._get_build_dir()
         max_i = int(len(terms) / self._terms_per_file) + 1
         for i in range(max_i):
-            update = f"Writing terms to term bank {i+1}/{max_i}"
+            update = f"\tWriting terms to term bank {i+1}/{max_i}"
             print(update, end='\r', flush=True)
             start = self._terms_per_file * i
             end = self._terms_per_file * (i + 1)
@@ -141,7 +143,7 @@ class BaseExporter(ABC):
 
     def __write_archive(self, filename):
         archive_format = "zip"
-        print(f"Archiving data to {archive_format.upper()} file...")
+        print(f"{timestamp()} Archiving data to {archive_format.upper()} file...")
         out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
         if not Path(out_dir).is_dir():
             os.makedirs(out_dir)
@@ -152,7 +154,7 @@ class BaseExporter(ABC):
         base_filename = os.path.join(out_dir, filename)
         build_dir = self._get_build_dir()
         shutil.make_archive(base_filename, archive_format, build_dir)
-        print(f"Dictionary file saved to `{out_filepath}`")
+        print(f"{timestamp()} Dictionary file saved to `{out_filepath}`")
 
     def __rm_build_dir(self):
         build_dir = self._get_build_dir()