Add timestamps to command line messages

This is a clumsy way of doing it (since it would be better to have a
wrapper function append the timestamp), but that will be taken care of
when the logging logic is all overhauled anyway.
stephenmk 2023-07-28 23:17:42 -05:00
parent 8f30f9419d
commit b03978d1f7
GPG key ID: B6DA730DB06235F1
8 changed files with 45 additions and 29 deletions
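
The "wrapper function" approach mentioned in the commit message could look roughly like the sketch below. This is only an illustration of that idea, not code from this commit: the log() helper and its transient flag are hypothetical, while the diffs that follow keep plain timestamp() + print() calls.

import time

def timestamp():
    # Locale's time representation, e.g. "23:17:42" (same as the new bot/time.py below)
    return time.strftime('%X')

def log(message, transient=False):
    # Hypothetical wrapper: prepends the timestamp so call sites don't repeat it.
    # transient=True emulates the carriage-return progress updates seen in the diffs.
    end = '\r' if transient else '\n'
    print(f"{timestamp()} {message}", end=end, flush=True)

log("Scraping started")                     # prints e.g. "23:17:42 Scraping started"
log("Reading page 1/100", transient=True)   # progress line, overwritten by the next update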


@@ -21,7 +21,7 @@ class BaseCrawler(ABC):
pages_len = len(self._page_map)
items = self._page_map.items()
for idx, (page_id, page_path) in enumerate(items):
- update = f"Reading page {idx+1}/{pages_len}"
+ update = f"\tReading page {idx+1}/{pages_len}"
print(update, end='\r', flush=True)
entry = new_entry(self._target, page_id)
with open(page_path, "r", encoding="utf-8") as f:


@@ -1,5 +1,6 @@
from bs4 import BeautifulSoup
+ from bot.time import timestamp
from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
from bot.crawlers.base.crawler import BaseCrawler
@@ -10,7 +11,7 @@ class JitenonCrawler(BaseCrawler):
self._gojuon_url = None
def collect_pages(self, page_dir):
- print("Scraping jitenon.jp")
+ print(f"{timestamp()} Scraping {self._gojuon_url}")
jitenon = JitenonScraper()
gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -26,4 +27,4 @@ class JitenonCrawler(BaseCrawler):
_, page_path = jitenon.scrape(page_link)
self._page_map[page_id] = page_path
pages_len = len(self._page_map)
- print(f"Finished scraping {pages_len} pages")
+ print(f"\n{timestamp()} Found {pages_len} entry pages")


@@ -1,4 +1,5 @@
import os
+ from bot.time import timestamp
from bot.crawlers.base.crawler import BaseCrawler
@@ -8,7 +9,7 @@ class MonokakidoCrawler(BaseCrawler):
self._page_id_pattern = r"^([0-9]+)\.xml$"
def collect_pages(self, page_dir):
- print(f"Searching for page files in `{page_dir}`")
+ print(f"{timestamp()} Searching for page files in `{page_dir}`")
for pagefile in os.listdir(page_dir):
page_id = self._parse_page_id(pagefile)
if page_id is None or page_id == 0:
@@ -16,4 +17,4 @@ class MonokakidoCrawler(BaseCrawler):
path = os.path.join(page_dir, pagefile)
self._page_map[page_id] = path
pages_len = len(self._page_map)
- print(f"Found {pages_len} page files for processing")
+ print(f"{timestamp()} Found {pages_len} page files for processing")


@@ -1,6 +1,7 @@
import re
from bs4 import BeautifulSoup
+ from bot.time import timestamp
from bot.crawlers.base.crawler import BaseCrawler
from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
@@ -12,6 +13,7 @@ class Crawler(BaseCrawler):
self._page_id_pattern = r"word/p([0-9]+)$"
def collect_pages(self, page_dir):
+ print(f"{timestamp()} Scraping {self._gojuon_url}")
jitenon = JitenonScraper()
gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -35,4 +37,4 @@ class Crawler(BaseCrawler):
_, page_path = jitenon.scrape(page_link)
self._page_map[page_id] = page_path
pages_len = len(self._page_map)
- print(f"Finished scraping {pages_len} pages")
+ print(f"\n{timestamp()} Found {pages_len} entry pages")


@@ -2,6 +2,8 @@ import time
import re
import os
import hashlib
+ import random
+ import math
from datetime import datetime
from urllib.parse import urlparse
from pathlib import Path
@@ -12,11 +14,13 @@ from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from platformdirs import user_cache_dir
+ from bot.time import timestamp
from bot.data import load_config
class BaseScraper(ABC):
def __init__(self):
+ self.cache_count = 0
self._config = load_config()
self.netloc_re = self._get_netloc_re()
self.__set_session()
@@ -31,7 +35,8 @@ class BaseScraper(ABC):
with open(cache_path, "w", encoding="utf-8") as f:
f.write(html)
else:
- print("Discovering cached files...", end='\r', flush=True)
+ self.cache_count += 1
+ print(f"\tDiscovering cached file {self.cache_count}", end='\r', flush=True)
return html, cache_path
@abstractmethod
@@ -91,15 +96,14 @@ class BaseScraper(ABC):
def __get(self, urlstring):
delay = 10
time.sleep(delay)
- now = datetime.now().strftime("%H:%M:%S")
- print(f"{now} scraping {urlstring} ...", end='')
+ print(f"{timestamp()} Scraping {urlstring} ...", end='')
try:
response = self.session.get(urlstring, timeout=10)
- print("OK")
+ print(f"{timestamp()} OK")
return response.text
- except Exception:
- print("failed")
- print("resetting session and trying again")
+ except Exception as ex:
+ print(f"\tFailed: {str(ex)}")
+ print(f"{timestamp()} Resetting session and trying again")
self.__set_session()
response = self.session.get(urlstring, timeout=10)
return response.text


@@ -6,6 +6,7 @@ from pathlib import Path
from platformdirs import user_documents_dir, user_cache_dir
+ from bot.time import timestamp
from bot.factory import new_mdict_terminator
@@ -32,7 +33,7 @@ class BaseExporter(ABC):
return self._build_dir
cache_dir = user_cache_dir("jitenbot")
build_directory = os.path.join(cache_dir, "mdict_build")
- print(f"Initializing build directory `{build_directory}`")
+ print(f"{timestamp()} Initializing build directory `{build_directory}`")
if Path(build_directory).is_dir():
shutil.rmtree(build_directory)
os.makedirs(build_directory)
@@ -43,7 +44,7 @@ class BaseExporter(ABC):
build_dir = self._get_build_dir()
build_media_dir = os.path.join(build_dir, self._target.value)
if media_dir is not None:
- print("Copying media files to build directory...")
+ print(f"{timestamp()} Copying media files to build directory...")
shutil.copytree(media_dir, build_media_dir)
else:
os.makedirs(build_media_dir)
@@ -69,7 +70,7 @@ class BaseExporter(ABC):
def _write_mdx_file(self, entries):
terms = self._get_terms(entries)
- print(f"Exporting {len(terms)} Mdict keys...")
+ print(f"{timestamp()} Exporting {len(terms)} Mdict keys...")
out_dir = self._get_out_dir()
out_file = os.path.join(out_dir, f"{self._target.value}.mdx")
params = [
@@ -85,7 +86,7 @@ class BaseExporter(ABC):
terms = []
entries_len = len(entries)
for idx, entry in enumerate(entries):
- update = f"Creating Mdict terms for entry {idx+1}/{entries_len}"
+ update = f"\tCreating MDict terms for entry {idx+1}/{entries_len}"
print(update, end='\r', flush=True)
new_terms = self._terminator.make_terms(entry)
for term in new_terms:
@@ -124,7 +125,7 @@ class BaseExporter(ABC):
return self._out_dir
out_dir = os.path.join(
user_documents_dir(), "jitenbot", "mdict", self._target.value)
- print(f"Initializing output directory `{out_dir}`")
+ print(f"{timestamp()} Initializing output directory `{out_dir}`")
if Path(out_dir).is_dir():
shutil.rmtree(out_dir)
os.makedirs(out_dir)

bot/time.py (new file)

@@ -0,0 +1,5 @@
+ import time
+
+
+ def timestamp():
+     return time.strftime('%X')
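
For reference, time.strftime('%X') formats the current local time using the locale's time representation (e.g. "23:17:42"), so the calls added in this commit print lines like the example below. The URL here is only a placeholder.

from bot.time import timestamp
print(f"{timestamp()} Scraping https://example.com ...")  # prints e.g. "23:17:42 Scraping https://example.com ..."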


@@ -8,6 +8,7 @@ from abc import ABC, abstractmethod
import fastjsonschema
from platformdirs import user_documents_dir, user_cache_dir
+ from bot.time import timestamp
from bot.data import load_yomichan_metadata
from bot.data import load_yomichan_term_schema
from bot.factory import new_yomichan_terminator
@@ -45,7 +46,7 @@ class BaseExporter(ABC):
return self._build_dir
cache_dir = user_cache_dir("jitenbot")
build_directory = os.path.join(cache_dir, "yomichan_build")
- print(f"Initializing build directory `{build_directory}`")
+ print(f"{timestamp()} Initializing build directory `{build_directory}`")
if Path(build_directory).is_dir():
shutil.rmtree(build_directory)
os.makedirs(build_directory)
@@ -64,8 +65,9 @@ class BaseExporter(ABC):
build_dir = self._get_build_dir()
build_img_dir = os.path.join(build_dir, self._target.value)
if image_dir is not None:
- print("Copying media files to build directory...")
+ print(f"{timestamp()} Copying media files to build directory...")
shutil.copytree(image_dir, build_img_dir)
+ print(f"{timestamp()} Finished copying files")
else:
os.makedirs(build_img_dir)
self._terminator.set_image_dir(build_img_dir)
@@ -74,7 +76,7 @@ class BaseExporter(ABC):
terms = []
entries_len = len(entries)
for idx, entry in enumerate(entries):
- update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}"
+ update = f"\tCreating Yomichan terms for entry {idx+1}/{entries_len}"
print(update, end='\r', flush=True)
new_terms = self._terminator.make_terms(entry)
for term in new_terms:
@@ -83,7 +85,7 @@ class BaseExporter(ABC):
return terms
def __validate_terms(self, terms):
- print("Making a copy of term data for validation...")
+ print(f"{timestamp()} Making a copy of term data for validation...")
terms_copy = copy.deepcopy(terms) # because validator will alter data!
term_count = len(terms_copy)
log_dir = self.__get_invalid_term_dir()
@@ -91,7 +93,7 @@ class BaseExporter(ABC):
validator = fastjsonschema.compile(schema)
failure_count = 0
for idx, term in enumerate(terms_copy):
- update = f"Validating term {idx+1}/{term_count}"
+ update = f"\tValidating term {idx+1}/{term_count}"
print(update, end='\r', flush=True)
try:
validator([term])
@@ -100,9 +102,9 @@ class BaseExporter(ABC):
term_file = os.path.join(log_dir, f"{idx}.json")
with open(term_file, "w", encoding='utf8') as f:
json.dump([term], f, indent=4, ensure_ascii=False)
- print(f"\nFinished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
+ print(f"\n{timestamp()} Finished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
if failure_count > 0:
- print(f"Invalid terms saved to `{log_dir}` for debugging")
+ print(f"{timestamp()} Invalid terms saved to `{log_dir}` for debugging")
def __make_dictionary(self, terms, index, tags):
self.__write_term_banks(terms)
@@ -112,11 +114,11 @@ class BaseExporter(ABC):
self.__rm_build_dir()
def __write_term_banks(self, terms):
- print(f"Exporting {len(terms)} JSON terms")
+ print(f"{timestamp()} Exporting {len(terms)} JSON terms")
build_dir = self._get_build_dir()
max_i = int(len(terms) / self._terms_per_file) + 1
for i in range(max_i):
- update = f"Writing terms to term bank {i+1}/{max_i}"
+ update = f"\tWriting terms to term bank {i+1}/{max_i}"
print(update, end='\r', flush=True)
start = self._terms_per_file * i
end = self._terms_per_file * (i + 1)
@@ -141,7 +143,7 @@ class BaseExporter(ABC):
def __write_archive(self, filename):
archive_format = "zip"
- print(f"Archiving data to {archive_format.upper()} file...")
+ print(f"{timestamp()} Archiving data to {archive_format.upper()} file...")
out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
if not Path(out_dir).is_dir():
os.makedirs(out_dir)
@@ -152,7 +154,7 @@ class BaseExporter(ABC):
base_filename = os.path.join(out_dir, filename)
build_dir = self._get_build_dir()
shutil.make_archive(base_filename, archive_format, build_dir)
- print(f"Dictionary file saved to `{out_filepath}`")
+ print(f"{timestamp()} Dictionary file saved to `{out_filepath}`")
def __rm_build_dir(self):
build_dir = self._get_build_dir()