Add timestamps to command line messages
This is a clumsy way of doing it (it would be better to have a wrapper function prepend the timestamp; a rough sketch of that idea follows below), but that will be taken care of when the logging logic is overhauled anyway.
This commit is contained in:
parent 8f30f9419d
commit b03978d1f7
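The commit message above alludes to a cleaner wrapper-based approach. The following is a minimal, hypothetical sketch of that idea; the `log` helper is an illustration only and is not part of this commit.

# Hypothetical sketch: a wrapper that prepends the timestamp to every console
# message, instead of repeating f"{timestamp()} ..." at each call site.
import time


def log(message, **kwargs):
    # Same HH:MM:SS-style formatting as the timestamp() helper added below.
    print(f"{time.strftime('%X')} {message}", **kwargs)


# A call site would then read:
#     log(f"Scraping {self._gojuon_url}")
# rather than:
#     print(f"{timestamp()} Scraping {self._gojuon_url}")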
@@ -21,7 +21,7 @@ class BaseCrawler(ABC):
         pages_len = len(self._page_map)
         items = self._page_map.items()
         for idx, (page_id, page_path) in enumerate(items):
-            update = f"Reading page {idx+1}/{pages_len}"
+            update = f"\tReading page {idx+1}/{pages_len}"
             print(update, end='\r', flush=True)
             entry = new_entry(self._target, page_id)
             with open(page_path, "r", encoding="utf-8") as f:
@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 
+from bot.time import timestamp
 from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
 from bot.crawlers.base.crawler import BaseCrawler
 
@@ -10,7 +11,7 @@ class JitenonCrawler(BaseCrawler):
         self._gojuon_url = None
 
     def collect_pages(self, page_dir):
-        print("Scraping jitenon.jp")
+        print(f"{timestamp()} Scraping {self._gojuon_url}")
         jitenon = JitenonScraper()
         gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
         gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -26,4 +27,4 @@ class JitenonCrawler(BaseCrawler):
             _, page_path = jitenon.scrape(page_link)
             self._page_map[page_id] = page_path
         pages_len = len(self._page_map)
-        print(f"Finished scraping {pages_len} pages")
+        print(f"\n{timestamp()} Found {pages_len} entry pages")
@@ -1,4 +1,5 @@
 import os
+from bot.time import timestamp
 from bot.crawlers.base.crawler import BaseCrawler
 
 
@@ -8,7 +9,7 @@ class MonokakidoCrawler(BaseCrawler):
         self._page_id_pattern = r"^([0-9]+)\.xml$"
 
     def collect_pages(self, page_dir):
-        print(f"Searching for page files in `{page_dir}`")
+        print(f"{timestamp()} Searching for page files in `{page_dir}`")
         for pagefile in os.listdir(page_dir):
             page_id = self._parse_page_id(pagefile)
             if page_id is None or page_id == 0:
@@ -16,4 +17,4 @@ class MonokakidoCrawler(BaseCrawler):
             path = os.path.join(page_dir, pagefile)
             self._page_map[page_id] = path
         pages_len = len(self._page_map)
-        print(f"Found {pages_len} page files for processing")
+        print(f"{timestamp()} Found {pages_len} page files for processing")
@@ -1,6 +1,7 @@
 import re
 from bs4 import BeautifulSoup
 
+from bot.time import timestamp
 from bot.crawlers.base.crawler import BaseCrawler
 from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
 
@@ -12,6 +13,7 @@ class Crawler(BaseCrawler):
         self._page_id_pattern = r"word/p([0-9]+)$"
 
     def collect_pages(self, page_dir):
+        print(f"{timestamp()} Scraping {self._gojuon_url}")
         jitenon = JitenonScraper()
         gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
         gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -35,4 +37,4 @@ class Crawler(BaseCrawler):
             _, page_path = jitenon.scrape(page_link)
             self._page_map[page_id] = page_path
         pages_len = len(self._page_map)
-        print(f"Finished scraping {pages_len} pages")
+        print(f"\n{timestamp()} Found {pages_len} entry pages")
@@ -2,6 +2,8 @@ import time
 import re
 import os
 import hashlib
+import random
+import math
 from datetime import datetime
 from urllib.parse import urlparse
 from pathlib import Path
@@ -12,11 +14,13 @@ from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
 from platformdirs import user_cache_dir
 
+from bot.time import timestamp
 from bot.data import load_config
 
 
 class BaseScraper(ABC):
     def __init__(self):
+        self.cache_count = 0
         self._config = load_config()
         self.netloc_re = self._get_netloc_re()
         self.__set_session()
@@ -31,7 +35,8 @@ class BaseScraper(ABC):
             with open(cache_path, "w", encoding="utf-8") as f:
                 f.write(html)
         else:
-            print("Discovering cached files...", end='\r', flush=True)
+            self.cache_count += 1
+            print(f"\tDiscovering cached file {self.cache_count}", end='\r', flush=True)
         return html, cache_path
 
     @abstractmethod
@@ -91,15 +96,14 @@ class BaseScraper(ABC):
     def __get(self, urlstring):
         delay = 10
         time.sleep(delay)
-        now = datetime.now().strftime("%H:%M:%S")
-        print(f"{now} scraping {urlstring} ...", end='')
+        print(f"{timestamp()} Scraping {urlstring} ...", end='')
         try:
             response = self.session.get(urlstring, timeout=10)
-            print("OK")
+            print(f"{timestamp()} OK")
             return response.text
-        except Exception:
-            print("failed")
-            print("resetting session and trying again")
+        except Exception as ex:
+            print(f"\tFailed: {str(ex)}")
+            print(f"{timestamp()} Resetting session and trying again")
             self.__set_session()
             response = self.session.get(urlstring, timeout=10)
             return response.text
@@ -6,6 +6,7 @@ from pathlib import Path
 
 from platformdirs import user_documents_dir, user_cache_dir
 
+from bot.time import timestamp
 from bot.factory import new_mdict_terminator
 
 
@@ -32,7 +33,7 @@ class BaseExporter(ABC):
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
         build_directory = os.path.join(cache_dir, "mdict_build")
-        print(f"Initializing build directory `{build_directory}`")
+        print(f"{timestamp()} Initializing build directory `{build_directory}`")
         if Path(build_directory).is_dir():
             shutil.rmtree(build_directory)
         os.makedirs(build_directory)
@@ -43,7 +44,7 @@ class BaseExporter(ABC):
         build_dir = self._get_build_dir()
         build_media_dir = os.path.join(build_dir, self._target.value)
         if media_dir is not None:
-            print("Copying media files to build directory...")
+            print(f"{timestamp()} Copying media files to build directory...")
             shutil.copytree(media_dir, build_media_dir)
         else:
             os.makedirs(build_media_dir)
@@ -69,7 +70,7 @@ class BaseExporter(ABC):
 
     def _write_mdx_file(self, entries):
         terms = self._get_terms(entries)
-        print(f"Exporting {len(terms)} Mdict keys...")
+        print(f"{timestamp()} Exporting {len(terms)} Mdict keys...")
         out_dir = self._get_out_dir()
         out_file = os.path.join(out_dir, f"{self._target.value}.mdx")
         params = [
@@ -85,7 +86,7 @@ class BaseExporter(ABC):
         terms = []
         entries_len = len(entries)
         for idx, entry in enumerate(entries):
-            update = f"Creating Mdict terms for entry {idx+1}/{entries_len}"
+            update = f"\tCreating MDict terms for entry {idx+1}/{entries_len}"
             print(update, end='\r', flush=True)
             new_terms = self._terminator.make_terms(entry)
             for term in new_terms:
@@ -124,7 +125,7 @@ class BaseExporter(ABC):
             return self._out_dir
         out_dir = os.path.join(
             user_documents_dir(), "jitenbot", "mdict", self._target.value)
-        print(f"Initializing output directory `{out_dir}`")
+        print(f"{timestamp()} Initializing output directory `{out_dir}`")
         if Path(out_dir).is_dir():
             shutil.rmtree(out_dir)
         os.makedirs(out_dir)
bot/time.py (new file, 5 additions)
@@ -0,0 +1,5 @@
+import time
+
+
+def timestamp():
+    return time.strftime('%X')
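Note that time.strftime('%X') returns the locale's time representation, which is typically HH:MM:SS, matching the datetime.now().strftime("%H:%M:%S") format the scraper used before. A minimal usage sketch (the URL and output are illustrative only):

from bot.time import timestamp

# Prints something like "14:03:59 Scraping https://example.com ..." on most locales.
print(f"{timestamp()} Scraping https://example.com ...", end='')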
@@ -8,6 +8,7 @@ from abc import ABC, abstractmethod
 import fastjsonschema
 from platformdirs import user_documents_dir, user_cache_dir
 
+from bot.time import timestamp
 from bot.data import load_yomichan_metadata
 from bot.data import load_yomichan_term_schema
 from bot.factory import new_yomichan_terminator
@@ -45,7 +46,7 @@ class BaseExporter(ABC):
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
         build_directory = os.path.join(cache_dir, "yomichan_build")
-        print(f"Initializing build directory `{build_directory}`")
+        print(f"{timestamp()} Initializing build directory `{build_directory}`")
         if Path(build_directory).is_dir():
             shutil.rmtree(build_directory)
         os.makedirs(build_directory)
@@ -64,8 +65,9 @@ class BaseExporter(ABC):
         build_dir = self._get_build_dir()
         build_img_dir = os.path.join(build_dir, self._target.value)
         if image_dir is not None:
-            print("Copying media files to build directory...")
+            print(f"{timestamp()} Copying media files to build directory...")
             shutil.copytree(image_dir, build_img_dir)
+            print(f"{timestamp()} Finished copying files")
         else:
             os.makedirs(build_img_dir)
         self._terminator.set_image_dir(build_img_dir)
@@ -74,7 +76,7 @@ class BaseExporter(ABC):
         terms = []
         entries_len = len(entries)
         for idx, entry in enumerate(entries):
-            update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}"
+            update = f"\tCreating Yomichan terms for entry {idx+1}/{entries_len}"
             print(update, end='\r', flush=True)
             new_terms = self._terminator.make_terms(entry)
             for term in new_terms:
@@ -83,7 +85,7 @@ class BaseExporter(ABC):
         return terms
 
     def __validate_terms(self, terms):
-        print("Making a copy of term data for validation...")
+        print(f"{timestamp()} Making a copy of term data for validation...")
         terms_copy = copy.deepcopy(terms) # because validator will alter data!
         term_count = len(terms_copy)
         log_dir = self.__get_invalid_term_dir()
@@ -91,7 +93,7 @@ class BaseExporter(ABC):
         validator = fastjsonschema.compile(schema)
         failure_count = 0
         for idx, term in enumerate(terms_copy):
-            update = f"Validating term {idx+1}/{term_count}"
+            update = f"\tValidating term {idx+1}/{term_count}"
             print(update, end='\r', flush=True)
             try:
                 validator([term])
@@ -100,9 +102,9 @@ class BaseExporter(ABC):
                 term_file = os.path.join(log_dir, f"{idx}.json")
                 with open(term_file, "w", encoding='utf8') as f:
                     json.dump([term], f, indent=4, ensure_ascii=False)
-        print(f"\nFinished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
+        print(f"\n{timestamp()} Finished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
         if failure_count > 0:
-            print(f"Invalid terms saved to `{log_dir}` for debugging")
+            print(f"{timestamp()} Invalid terms saved to `{log_dir}` for debugging")
 
     def __make_dictionary(self, terms, index, tags):
         self.__write_term_banks(terms)
@@ -112,11 +114,11 @@ class BaseExporter(ABC):
         self.__rm_build_dir()
 
     def __write_term_banks(self, terms):
-        print(f"Exporting {len(terms)} JSON terms")
+        print(f"{timestamp()} Exporting {len(terms)} JSON terms")
         build_dir = self._get_build_dir()
         max_i = int(len(terms) / self._terms_per_file) + 1
         for i in range(max_i):
-            update = f"Writing terms to term bank {i+1}/{max_i}"
+            update = f"\tWriting terms to term bank {i+1}/{max_i}"
             print(update, end='\r', flush=True)
             start = self._terms_per_file * i
             end = self._terms_per_file * (i + 1)
@@ -141,7 +143,7 @@ class BaseExporter(ABC):
 
     def __write_archive(self, filename):
         archive_format = "zip"
-        print(f"Archiving data to {archive_format.upper()} file...")
+        print(f"{timestamp()} Archiving data to {archive_format.upper()} file...")
         out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
         if not Path(out_dir).is_dir():
             os.makedirs(out_dir)
@@ -152,7 +154,7 @@ class BaseExporter(ABC):
         base_filename = os.path.join(out_dir, filename)
         build_dir = self._get_build_dir()
         shutil.make_archive(base_filename, archive_format, build_dir)
-        print(f"Dictionary file saved to `{out_filepath}`")
+        print(f"{timestamp()} Dictionary file saved to `{out_filepath}`")
 
     def __rm_build_dir(self):
         build_dir = self._get_build_dir()