Add timestamps to command-line messages

This is a clumsy way of doing it (a wrapper function that prepends the
timestamp would be cleaner), but that will be taken care of when the
logging logic is overhauled anyway.
stephenmk 2023-07-28 23:17:42 -05:00
parent 8f30f9419d
commit b03978d1f7
8 changed files with 45 additions and 29 deletions
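
A follow-up in the spirit of the commit message's suggestion might look
roughly like the sketch below; the `log` name and signature are
hypothetical, and the eventual overhaul could just as well adopt Python's
standard `logging` module:

    import time

    def log(message, end='\n'):
        # Hypothetical wrapper: prepend the timestamp here once instead of
        # interpolating timestamp() into every print() call site.
        print(f"{time.strftime('%X')} {message}", end=end, flush=True)

    log("Scraping https://example.com ...")  # e.g. "23:17:42 Scraping https://example.com ..."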


@@ -21,7 +21,7 @@ class BaseCrawler(ABC):
         pages_len = len(self._page_map)
         items = self._page_map.items()
         for idx, (page_id, page_path) in enumerate(items):
-            update = f"Reading page {idx+1}/{pages_len}"
+            update = f"\tReading page {idx+1}/{pages_len}"
             print(update, end='\r', flush=True)
             entry = new_entry(self._target, page_id)
             with open(page_path, "r", encoding="utf-8") as f:
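
For context, the `end='\r'` pattern used throughout these messages keeps
progress on a single line: the carriage return moves the cursor back to
column 0 so each update overwrites the last, and the new leading `\t`
indents the progress line under the timestamped status lines. A minimal
standalone demo (the count and delay are illustrative only):

    import time

    pages_len = 50
    for idx in range(pages_len):
        # '\r' rewinds to the start of the line; flush=True forces the
        # partial line out immediately despite the missing newline.
        print(f"\tReading page {idx+1}/{pages_len}", end='\r', flush=True)
        time.sleep(0.02)
    print()  # advance past the progress line once the loop finishes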


@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
+from bot.time import timestamp
 from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
 from bot.crawlers.base.crawler import BaseCrawler
@@ -10,7 +11,7 @@ class JitenonCrawler(BaseCrawler):
         self._gojuon_url = None

     def collect_pages(self, page_dir):
-        print("Scraping jitenon.jp")
+        print(f"{timestamp()} Scraping {self._gojuon_url}")
         jitenon = JitenonScraper()
         gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
         gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -26,4 +27,4 @@ class JitenonCrawler(BaseCrawler):
             _, page_path = jitenon.scrape(page_link)
             self._page_map[page_id] = page_path
         pages_len = len(self._page_map)
-        print(f"Finished scraping {pages_len} pages")
+        print(f"\n{timestamp()} Found {pages_len} entry pages")


@@ -1,4 +1,5 @@
 import os
+from bot.time import timestamp
 from bot.crawlers.base.crawler import BaseCrawler
@@ -8,7 +9,7 @@ class MonokakidoCrawler(BaseCrawler):
         self._page_id_pattern = r"^([0-9]+)\.xml$"

     def collect_pages(self, page_dir):
-        print(f"Searching for page files in `{page_dir}`")
+        print(f"{timestamp()} Searching for page files in `{page_dir}`")
         for pagefile in os.listdir(page_dir):
             page_id = self._parse_page_id(pagefile)
             if page_id is None or page_id == 0:
@@ -16,4 +17,4 @@ class MonokakidoCrawler(BaseCrawler):
             path = os.path.join(page_dir, pagefile)
             self._page_map[page_id] = path
         pages_len = len(self._page_map)
-        print(f"Found {pages_len} page files for processing")
+        print(f"{timestamp()} Found {pages_len} page files for processing")


@@ -1,6 +1,7 @@
 import re
 from bs4 import BeautifulSoup
+from bot.time import timestamp
 from bot.crawlers.base.crawler import BaseCrawler
 from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
@@ -12,6 +13,7 @@ class Crawler(BaseCrawler):
         self._page_id_pattern = r"word/p([0-9]+)$"

     def collect_pages(self, page_dir):
+        print(f"{timestamp()} Scraping {self._gojuon_url}")
         jitenon = JitenonScraper()
         gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
         gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -35,4 +37,4 @@ class Crawler(BaseCrawler):
             _, page_path = jitenon.scrape(page_link)
             self._page_map[page_id] = page_path
         pages_len = len(self._page_map)
-        print(f"Finished scraping {pages_len} pages")
+        print(f"\n{timestamp()} Found {pages_len} entry pages")


@@ -2,6 +2,8 @@ import time
 import re
 import os
 import hashlib
+import random
+import math
 from datetime import datetime
 from urllib.parse import urlparse
 from pathlib import Path
@@ -12,11 +14,13 @@ from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
 from platformdirs import user_cache_dir

+from bot.time import timestamp
 from bot.data import load_config


 class BaseScraper(ABC):
     def __init__(self):
+        self.cache_count = 0
         self._config = load_config()
         self.netloc_re = self._get_netloc_re()
         self.__set_session()
@@ -31,7 +35,8 @@ class BaseScraper(ABC):
             with open(cache_path, "w", encoding="utf-8") as f:
                 f.write(html)
         else:
-            print("Discovering cached files...", end='\r', flush=True)
+            self.cache_count += 1
+            print(f"\tDiscovering cached file {self.cache_count}", end='\r', flush=True)
         return html, cache_path

     @abstractmethod
@@ -91,15 +96,14 @@ class BaseScraper(ABC):
     def __get(self, urlstring):
         delay = 10
         time.sleep(delay)
-        now = datetime.now().strftime("%H:%M:%S")
-        print(f"{now} scraping {urlstring} ...", end='')
+        print(f"{timestamp()} Scraping {urlstring} ...", end='')
         try:
             response = self.session.get(urlstring, timeout=10)
-            print("OK")
+            print(f"{timestamp()} OK")
             return response.text
-        except Exception:
-            print("failed")
-            print("resetting session and trying again")
+        except Exception as ex:
+            print(f"\tFailed: {str(ex)}")
+            print(f"{timestamp()} Resetting session and trying again")
             self.__set_session()
             response = self.session.get(urlstring, timeout=10)
             return response.text
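
Note that the second `session.get` in the `except` branch is unguarded, so
a repeated failure still propagates to the caller. A hedged sketch of a
bounded-retry alternative (the helper name and `max_attempts` parameter
are hypothetical, not part of this commit):

    import time

    def get_with_retries(make_session, urlstring, max_attempts=2):
        # Hypothetical helper: rebuild the session between attempts and
        # re-raise only once the final attempt has failed.
        session = make_session()
        for attempt in range(1, max_attempts + 1):
            try:
                return session.get(urlstring, timeout=10).text
            except Exception as ex:
                print(f"\tFailed: {ex}")
                if attempt == max_attempts:
                    raise
                print(f"{time.strftime('%X')} Resetting session and trying again")
                session = make_session()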


@@ -6,6 +6,7 @@ from pathlib import Path
 from platformdirs import user_documents_dir, user_cache_dir

+from bot.time import timestamp
 from bot.factory import new_mdict_terminator
@@ -32,7 +33,7 @@ class BaseExporter(ABC):
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
         build_directory = os.path.join(cache_dir, "mdict_build")
-        print(f"Initializing build directory `{build_directory}`")
+        print(f"{timestamp()} Initializing build directory `{build_directory}`")
         if Path(build_directory).is_dir():
             shutil.rmtree(build_directory)
         os.makedirs(build_directory)
@@ -43,7 +44,7 @@ class BaseExporter(ABC):
         build_dir = self._get_build_dir()
         build_media_dir = os.path.join(build_dir, self._target.value)
         if media_dir is not None:
-            print("Copying media files to build directory...")
+            print(f"{timestamp()} Copying media files to build directory...")
             shutil.copytree(media_dir, build_media_dir)
         else:
             os.makedirs(build_media_dir)
@@ -69,7 +70,7 @@ class BaseExporter(ABC):
     def _write_mdx_file(self, entries):
         terms = self._get_terms(entries)
-        print(f"Exporting {len(terms)} Mdict keys...")
+        print(f"{timestamp()} Exporting {len(terms)} Mdict keys...")
         out_dir = self._get_out_dir()
         out_file = os.path.join(out_dir, f"{self._target.value}.mdx")
         params = [
@@ -85,7 +86,7 @@ class BaseExporter(ABC):
         terms = []
         entries_len = len(entries)
         for idx, entry in enumerate(entries):
-            update = f"Creating Mdict terms for entry {idx+1}/{entries_len}"
+            update = f"\tCreating MDict terms for entry {idx+1}/{entries_len}"
             print(update, end='\r', flush=True)
             new_terms = self._terminator.make_terms(entry)
             for term in new_terms:
@@ -124,7 +125,7 @@ class BaseExporter(ABC):
             return self._out_dir
         out_dir = os.path.join(
             user_documents_dir(), "jitenbot", "mdict", self._target.value)
-        print(f"Initializing output directory `{out_dir}`")
+        print(f"{timestamp()} Initializing output directory `{out_dir}`")
         if Path(out_dir).is_dir():
             shutil.rmtree(out_dir)
         os.makedirs(out_dir)

bot/time.py (new file, +5)

@@ -0,0 +1,5 @@
+import time
+
+
+def timestamp():
+    return time.strftime('%X')
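
For reference, `time.strftime('%X')` formats the current local time using
the locale's preferred time representation, so the exact output depends on
the active locale:

    >>> import time
    >>> time.strftime('%X')   # e.g. under the default C locale
    '23:17:42'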


@@ -8,6 +8,7 @@ from abc import ABC, abstractmethod
 import fastjsonschema
 from platformdirs import user_documents_dir, user_cache_dir

+from bot.time import timestamp
 from bot.data import load_yomichan_metadata
 from bot.data import load_yomichan_term_schema
 from bot.factory import new_yomichan_terminator
@@ -45,7 +46,7 @@ class BaseExporter(ABC):
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
         build_directory = os.path.join(cache_dir, "yomichan_build")
-        print(f"Initializing build directory `{build_directory}`")
+        print(f"{timestamp()} Initializing build directory `{build_directory}`")
         if Path(build_directory).is_dir():
             shutil.rmtree(build_directory)
         os.makedirs(build_directory)
@@ -64,8 +65,9 @@ class BaseExporter(ABC):
         build_dir = self._get_build_dir()
         build_img_dir = os.path.join(build_dir, self._target.value)
         if image_dir is not None:
-            print("Copying media files to build directory...")
+            print(f"{timestamp()} Copying media files to build directory...")
             shutil.copytree(image_dir, build_img_dir)
+            print(f"{timestamp()} Finished copying files")
         else:
             os.makedirs(build_img_dir)
         self._terminator.set_image_dir(build_img_dir)
@@ -74,7 +76,7 @@ class BaseExporter(ABC):
         terms = []
         entries_len = len(entries)
         for idx, entry in enumerate(entries):
-            update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}"
+            update = f"\tCreating Yomichan terms for entry {idx+1}/{entries_len}"
             print(update, end='\r', flush=True)
             new_terms = self._terminator.make_terms(entry)
             for term in new_terms:
@@ -83,7 +85,7 @@ class BaseExporter(ABC):
         return terms

     def __validate_terms(self, terms):
-        print("Making a copy of term data for validation...")
+        print(f"{timestamp()} Making a copy of term data for validation...")
         terms_copy = copy.deepcopy(terms)  # because validator will alter data!
         term_count = len(terms_copy)
         log_dir = self.__get_invalid_term_dir()
@@ -91,7 +93,7 @@ class BaseExporter(ABC):
         validator = fastjsonschema.compile(schema)
         failure_count = 0
         for idx, term in enumerate(terms_copy):
-            update = f"Validating term {idx+1}/{term_count}"
+            update = f"\tValidating term {idx+1}/{term_count}"
             print(update, end='\r', flush=True)
             try:
                 validator([term])
@@ -100,9 +102,9 @@ class BaseExporter(ABC):
                 term_file = os.path.join(log_dir, f"{idx}.json")
                 with open(term_file, "w", encoding='utf8') as f:
                     json.dump([term], f, indent=4, ensure_ascii=False)
-        print(f"\nFinished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
+        print(f"\n{timestamp()} Finished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
         if failure_count > 0:
-            print(f"Invalid terms saved to `{log_dir}` for debugging")
+            print(f"{timestamp()} Invalid terms saved to `{log_dir}` for debugging")

     def __make_dictionary(self, terms, index, tags):
         self.__write_term_banks(terms)
@@ -112,11 +114,11 @@ class BaseExporter(ABC):
         self.__rm_build_dir()

     def __write_term_banks(self, terms):
-        print(f"Exporting {len(terms)} JSON terms")
+        print(f"{timestamp()} Exporting {len(terms)} JSON terms")
         build_dir = self._get_build_dir()
         max_i = int(len(terms) / self._terms_per_file) + 1
         for i in range(max_i):
-            update = f"Writing terms to term bank {i+1}/{max_i}"
+            update = f"\tWriting terms to term bank {i+1}/{max_i}"
             print(update, end='\r', flush=True)
             start = self._terms_per_file * i
             end = self._terms_per_file * (i + 1)
@@ -141,7 +143,7 @@ class BaseExporter(ABC):
     def __write_archive(self, filename):
         archive_format = "zip"
-        print(f"Archiving data to {archive_format.upper()} file...")
+        print(f"{timestamp()} Archiving data to {archive_format.upper()} file...")
         out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
         if not Path(out_dir).is_dir():
             os.makedirs(out_dir)
@@ -152,7 +154,7 @@ class BaseExporter(ABC):
         base_filename = os.path.join(out_dir, filename)
         build_dir = self._get_build_dir()
         shutil.make_archive(base_filename, archive_format, build_dir)
-        print(f"Dictionary file saved to `{out_filepath}`")
+        print(f"{timestamp()} Dictionary file saved to `{out_filepath}`")

     def __rm_build_dir(self):
         build_dir = self._get_build_dir()
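
For reference on the archive step above: `shutil.make_archive(base_name,
format, root_dir)` creates `base_name.zip` from the contents of `root_dir`
and returns the full path of the new archive, which is presumably where
the `out_filepath` variable in the trimmed context comes from:

    import shutil

    # Creates "/tmp/example.zip" from everything under "/tmp/build" and
    # returns the archive's full path.
    out_filepath = shutil.make_archive("/tmp/example", "zip", "/tmp/build")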