From b03978d1f73293ad29caff9654e5fbe7579b0d60 Mon Sep 17 00:00:00 2001
From: stephenmk
Date: Fri, 28 Jul 2023 23:17:42 -0500
Subject: [PATCH] Add timestamps to command line messages

This is a clumsy way of doing it (since it would be better to have a
wrapper function append the timestamp), but that will be taken care of
when the logging logic is all overhauled anyway.
---
 bot/crawlers/base/crawler.py            |  2 +-
 bot/crawlers/base/jitenon.py            |  5 +++--
 bot/crawlers/base/monokakido.py         |  5 +++--
 bot/crawlers/jitenon_kokugo.py          |  4 +++-
 bot/crawlers/scrapers/scraper.py        | 18 +++++++++++-------
 bot/mdict/exporters/base/exporter.py    | 11 ++++++-----
 bot/time.py                             |  5 +++++
 bot/yomichan/exporters/base/exporter.py | 24 +++++++++++++-----------
 8 files changed, 45 insertions(+), 29 deletions(-)
 create mode 100644 bot/time.py

diff --git a/bot/crawlers/base/crawler.py b/bot/crawlers/base/crawler.py
index 31c3bdc..bbbcb9b 100644
--- a/bot/crawlers/base/crawler.py
+++ b/bot/crawlers/base/crawler.py
@@ -21,7 +21,7 @@ class BaseCrawler(ABC):
         pages_len = len(self._page_map)
         items = self._page_map.items()
         for idx, (page_id, page_path) in enumerate(items):
-            update = f"Reading page {idx+1}/{pages_len}"
+            update = f"\tReading page {idx+1}/{pages_len}"
             print(update, end='\r', flush=True)
             entry = new_entry(self._target, page_id)
             with open(page_path, "r", encoding="utf-8") as f:
diff --git a/bot/crawlers/base/jitenon.py b/bot/crawlers/base/jitenon.py
index ddbf3e5..49e4626 100644
--- a/bot/crawlers/base/jitenon.py
+++ b/bot/crawlers/base/jitenon.py
@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 
+from bot.time import timestamp
 from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
 from bot.crawlers.base.crawler import BaseCrawler
 
@@ -10,7 +11,7 @@ class JitenonCrawler(BaseCrawler):
         self._gojuon_url = None
 
     def collect_pages(self, page_dir):
-        print("Scraping jitenon.jp")
+        print(f"{timestamp()} Scraping {self._gojuon_url}")
         jitenon = JitenonScraper()
         gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
         gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -26,4 +27,4 @@
                 _, page_path = jitenon.scrape(page_link)
                 self._page_map[page_id] = page_path
         pages_len = len(self._page_map)
-        print(f"Finished scraping {pages_len} pages")
+        print(f"\n{timestamp()} Found {pages_len} entry pages")
diff --git a/bot/crawlers/base/monokakido.py b/bot/crawlers/base/monokakido.py
index 057f8d4..ca98545 100644
--- a/bot/crawlers/base/monokakido.py
+++ b/bot/crawlers/base/monokakido.py
@@ -1,4 +1,5 @@
 import os
 
+from bot.time import timestamp
 from bot.crawlers.base.crawler import BaseCrawler
 
@@ -8,7 +9,7 @@ class MonokakidoCrawler(BaseCrawler):
         self._page_id_pattern = r"^([0-9]+)\.xml$"
 
     def collect_pages(self, page_dir):
-        print(f"Searching for page files in `{page_dir}`")
+        print(f"{timestamp()} Searching for page files in `{page_dir}`")
         for pagefile in os.listdir(page_dir):
             page_id = self._parse_page_id(pagefile)
             if page_id is None or page_id == 0:
@@ -16,4 +17,4 @@
             path = os.path.join(page_dir, pagefile)
             self._page_map[page_id] = path
         pages_len = len(self._page_map)
-        print(f"Found {pages_len} page files for processing")
+        print(f"{timestamp()} Found {pages_len} page files for processing")
diff --git a/bot/crawlers/jitenon_kokugo.py b/bot/crawlers/jitenon_kokugo.py
index 6d5cd66..e748ea1 100644
--- a/bot/crawlers/jitenon_kokugo.py
+++ b/bot/crawlers/jitenon_kokugo.py
@@ -1,6 +1,7 @@
 import re
 
 from bs4 import BeautifulSoup
 
+from bot.time import timestamp
 from bot.crawlers.base.crawler import BaseCrawler
 from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
 
@@ -12,6 +13,7 @@ class Crawler(BaseCrawler):
         self._page_id_pattern = r"word/p([0-9]+)$"
 
     def collect_pages(self, page_dir):
+        print(f"{timestamp()} Scraping {self._gojuon_url}")
         jitenon = JitenonScraper()
         gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
         gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -35,4 +37,4 @@
                 _, page_path = jitenon.scrape(page_link)
                 self._page_map[page_id] = page_path
         pages_len = len(self._page_map)
-        print(f"Finished scraping {pages_len} pages")
+        print(f"\n{timestamp()} Found {pages_len} entry pages")
diff --git a/bot/crawlers/scrapers/scraper.py b/bot/crawlers/scrapers/scraper.py
index 113d090..eeb9534 100644
--- a/bot/crawlers/scrapers/scraper.py
+++ b/bot/crawlers/scrapers/scraper.py
@@ -2,6 +2,8 @@ import time
 import re
 import os
 import hashlib
+import random
+import math
 from datetime import datetime
 from urllib.parse import urlparse
 from pathlib import Path
@@ -12,11 +14,13 @@
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
 from platformdirs import user_cache_dir
 
+from bot.time import timestamp
 from bot.data import load_config
 
 class BaseScraper(ABC):
     def __init__(self):
+        self.cache_count = 0
         self._config = load_config()
         self.netloc_re = self._get_netloc_re()
         self.__set_session()
@@ -31,7 +35,8 @@
             with open(cache_path, "w", encoding="utf-8") as f:
                 f.write(html)
         else:
-            print("Discovering cached files...", end='\r', flush=True)
+            self.cache_count += 1
+            print(f"\tDiscovering cached file {self.cache_count}", end='\r', flush=True)
         return html, cache_path
 
     @abstractmethod
@@ -91,15 +96,14 @@
     def __get(self, urlstring):
         delay = 10
         time.sleep(delay)
-        now = datetime.now().strftime("%H:%M:%S")
-        print(f"{now} scraping {urlstring} ...", end='')
+        print(f"{timestamp()} Scraping {urlstring} ...", end='')
         try:
             response = self.session.get(urlstring, timeout=10)
-            print("OK")
+            print(f"{timestamp()} OK")
             return response.text
-        except Exception:
-            print("failed")
-            print("resetting session and trying again")
+        except Exception as ex:
+            print(f"\tFailed: {str(ex)}")
+            print(f"{timestamp()} Resetting session and trying again")
             self.__set_session()
             response = self.session.get(urlstring, timeout=10)
             return response.text
diff --git a/bot/mdict/exporters/base/exporter.py b/bot/mdict/exporters/base/exporter.py
index 26dc662..37ed376 100644
--- a/bot/mdict/exporters/base/exporter.py
+++ b/bot/mdict/exporters/base/exporter.py
@@ -6,6 +6,7 @@ from pathlib import Path
 
 from platformdirs import user_documents_dir, user_cache_dir
 
+from bot.time import timestamp
 from bot.factory import new_mdict_terminator
 
 
@@ -32,7 +33,7 @@
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
         build_directory = os.path.join(cache_dir, "mdict_build")
-        print(f"Initializing build directory `{build_directory}`")
+        print(f"{timestamp()} Initializing build directory `{build_directory}`")
         if Path(build_directory).is_dir():
             shutil.rmtree(build_directory)
         os.makedirs(build_directory)
@@ -43,7 +44,7 @@
         build_dir = self._get_build_dir()
         build_media_dir = os.path.join(build_dir, self._target.value)
         if media_dir is not None:
-            print("Copying media files to build directory...")
+            print(f"{timestamp()} Copying media files to build directory...")
             shutil.copytree(media_dir, build_media_dir)
         else:
             os.makedirs(build_media_dir)
@@ -69,7 +70,7 @@
 
     def _write_mdx_file(self, entries):
         terms = self._get_terms(entries)
-        print(f"Exporting {len(terms)} Mdict keys...")
+        print(f"{timestamp()} Exporting {len(terms)} Mdict keys...")
         out_dir = self._get_out_dir()
         out_file = os.path.join(out_dir, f"{self._target.value}.mdx")
         params = [
@@ -85,7 +86,7 @@
         terms = []
         entries_len = len(entries)
         for idx, entry in enumerate(entries):
-            update = f"Creating Mdict terms for entry {idx+1}/{entries_len}"
+            update = f"\tCreating MDict terms for entry {idx+1}/{entries_len}"
             print(update, end='\r', flush=True)
             new_terms = self._terminator.make_terms(entry)
             for term in new_terms:
@@ -124,7 +125,7 @@
             return self._out_dir
         out_dir = os.path.join(
             user_documents_dir(), "jitenbot", "mdict", self._target.value)
-        print(f"Initializing output directory `{out_dir}`")
+        print(f"{timestamp()} Initializing output directory `{out_dir}`")
         if Path(out_dir).is_dir():
             shutil.rmtree(out_dir)
         os.makedirs(out_dir)
diff --git a/bot/time.py b/bot/time.py
new file mode 100644
index 0000000..f8dae94
--- /dev/null
+++ b/bot/time.py
@@ -0,0 +1,5 @@
+import time
+
+
+def timestamp():
+    return time.strftime('%X')
diff --git a/bot/yomichan/exporters/base/exporter.py b/bot/yomichan/exporters/base/exporter.py
index 9389202..5e4e870 100644
--- a/bot/yomichan/exporters/base/exporter.py
+++ b/bot/yomichan/exporters/base/exporter.py
@@ -8,6 +8,7 @@ from abc import ABC, abstractmethod
 import fastjsonschema
 from platformdirs import user_documents_dir, user_cache_dir
 
+from bot.time import timestamp
 from bot.data import load_yomichan_metadata
 from bot.data import load_yomichan_term_schema
 from bot.factory import new_yomichan_terminator
@@ -45,7 +46,7 @@
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
         build_directory = os.path.join(cache_dir, "yomichan_build")
-        print(f"Initializing build directory `{build_directory}`")
+        print(f"{timestamp()} Initializing build directory `{build_directory}`")
         if Path(build_directory).is_dir():
             shutil.rmtree(build_directory)
         os.makedirs(build_directory)
@@ -64,8 +65,9 @@
         build_dir = self._get_build_dir()
         build_img_dir = os.path.join(build_dir, self._target.value)
         if image_dir is not None:
-            print("Copying media files to build directory...")
+            print(f"{timestamp()} Copying media files to build directory...")
             shutil.copytree(image_dir, build_img_dir)
+            print(f"{timestamp()} Finished copying files")
         else:
             os.makedirs(build_img_dir)
         self._terminator.set_image_dir(build_img_dir)
@@ -74,7 +76,7 @@
         terms = []
         entries_len = len(entries)
         for idx, entry in enumerate(entries):
-            update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}"
+            update = f"\tCreating Yomichan terms for entry {idx+1}/{entries_len}"
             print(update, end='\r', flush=True)
             new_terms = self._terminator.make_terms(entry)
             for term in new_terms:
@@ -83,7 +85,7 @@
         return terms
 
     def __validate_terms(self, terms):
-        print("Making a copy of term data for validation...")
+        print(f"{timestamp()} Making a copy of term data for validation...")
         terms_copy = copy.deepcopy(terms)  # because validator will alter data!
         term_count = len(terms_copy)
         log_dir = self.__get_invalid_term_dir()
@@ -91,7 +93,7 @@
         validator = fastjsonschema.compile(schema)
         failure_count = 0
         for idx, term in enumerate(terms_copy):
-            update = f"Validating term {idx+1}/{term_count}"
+            update = f"\tValidating term {idx+1}/{term_count}"
             print(update, end='\r', flush=True)
             try:
                 validator([term])
@@ -100,9 +102,9 @@
                 term_file = os.path.join(log_dir, f"{idx}.json")
                 with open(term_file, "w", encoding='utf8') as f:
                     json.dump([term], f, indent=4, ensure_ascii=False)
-        print(f"\nFinished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
+        print(f"\n{timestamp()} Finished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
         if failure_count > 0:
-            print(f"Invalid terms saved to `{log_dir}` for debugging")
+            print(f"{timestamp()} Invalid terms saved to `{log_dir}` for debugging")
 
     def __make_dictionary(self, terms, index, tags):
         self.__write_term_banks(terms)
@@ -112,11 +114,11 @@
         self.__rm_build_dir()
 
     def __write_term_banks(self, terms):
-        print(f"Exporting {len(terms)} JSON terms")
+        print(f"{timestamp()} Exporting {len(terms)} JSON terms")
         build_dir = self._get_build_dir()
         max_i = int(len(terms) / self._terms_per_file) + 1
         for i in range(max_i):
-            update = f"Writing terms to term bank {i+1}/{max_i}"
+            update = f"\tWriting terms to term bank {i+1}/{max_i}"
             print(update, end='\r', flush=True)
             start = self._terms_per_file * i
             end = self._terms_per_file * (i + 1)
@@ -141,7 +143,7 @@
 
     def __write_archive(self, filename):
         archive_format = "zip"
-        print(f"Archiving data to {archive_format.upper()} file...")
+        print(f"{timestamp()} Archiving data to {archive_format.upper()} file...")
         out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
         if not Path(out_dir).is_dir():
             os.makedirs(out_dir)
@@ -152,7 +154,7 @@
         base_filename = os.path.join(out_dir, filename)
         build_dir = self._get_build_dir()
         shutil.make_archive(base_filename, archive_format, build_dir)
-        print(f"Dictionary file saved to `{out_filepath}`")
+        print(f"{timestamp()} Dictionary file saved to `{out_filepath}`")
 
     def __rm_build_dir(self):
         build_dir = self._get_build_dir()
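
The commit message above notes that a wrapper function appending the timestamp would be cleaner than editing every print() call site. As a rough sketch only, not part of this patch and with the hypothetical helper name log(), such a wrapper could sit alongside timestamp() in bot/time.py:

    import time


    def timestamp():
        # Locale-appropriate time string, e.g. "23:17:42"
        return time.strftime('%X')


    def log(message, **kwargs):
        # Hypothetical wrapper: prepend the current time to a console message
        # and forward keyword arguments (end, flush, ...) to print().
        print(f"{timestamp()} {message}", **kwargs)

A call such as print(f"{timestamp()} Scraping {self._gojuon_url}") would then reduce to log(f"Scraping {self._gojuon_url}"), keeping the timestamp format in one place for the planned logging overhaul.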