Add timestamps to command line messages
This is a clumsy way of doing it (it would be better to have a wrapper function prepend the timestamp; a rough sketch of that idea follows below), but that will be taken care of when the logging logic is overhauled anyway.
This commit is contained in:
parent 8f30f9419d
commit b03978d1f7
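The commit message above alludes to a cleaner wrapper-based approach. The following is a minimal, hypothetical sketch of that idea; the `log` helper is an illustration only and is not part of this commit.

# Hypothetical sketch: a wrapper that prepends the timestamp to every console
# message, instead of repeating f"{timestamp()} ..." at each call site.
import time


def log(message, **kwargs):
    # Same HH:MM:SS-style formatting as the timestamp() helper added below.
    print(f"{time.strftime('%X')} {message}", **kwargs)


# A call site would then read:
#     log(f"Scraping {self._gojuon_url}")
# rather than:
#     print(f"{timestamp()} Scraping {self._gojuon_url}")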
@@ -21,7 +21,7 @@ class BaseCrawler(ABC):
         pages_len = len(self._page_map)
         items = self._page_map.items()
         for idx, (page_id, page_path) in enumerate(items):
-            update = f"Reading page {idx+1}/{pages_len}"
+            update = f"\tReading page {idx+1}/{pages_len}"
             print(update, end='\r', flush=True)
             entry = new_entry(self._target, page_id)
             with open(page_path, "r", encoding="utf-8") as f:
@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 
+from bot.time import timestamp
 from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
 from bot.crawlers.base.crawler import BaseCrawler
 
@@ -10,7 +11,7 @@ class JitenonCrawler(BaseCrawler):
         self._gojuon_url = None
 
     def collect_pages(self, page_dir):
-        print("Scraping jitenon.jp")
+        print(f"{timestamp()} Scraping {self._gojuon_url}")
         jitenon = JitenonScraper()
         gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
         gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -26,4 +27,4 @@ class JitenonCrawler(BaseCrawler):
             _, page_path = jitenon.scrape(page_link)
             self._page_map[page_id] = page_path
         pages_len = len(self._page_map)
-        print(f"Finished scraping {pages_len} pages")
+        print(f"\n{timestamp()} Found {pages_len} entry pages")
@@ -1,4 +1,5 @@
 import os
+from bot.time import timestamp
 from bot.crawlers.base.crawler import BaseCrawler
 
 
@@ -8,7 +9,7 @@ class MonokakidoCrawler(BaseCrawler):
         self._page_id_pattern = r"^([0-9]+)\.xml$"
 
     def collect_pages(self, page_dir):
-        print(f"Searching for page files in `{page_dir}`")
+        print(f"{timestamp()} Searching for page files in `{page_dir}`")
         for pagefile in os.listdir(page_dir):
             page_id = self._parse_page_id(pagefile)
             if page_id is None or page_id == 0:
@@ -16,4 +17,4 @@ class MonokakidoCrawler(BaseCrawler):
             path = os.path.join(page_dir, pagefile)
             self._page_map[page_id] = path
         pages_len = len(self._page_map)
-        print(f"Found {pages_len} page files for processing")
+        print(f"{timestamp()} Found {pages_len} page files for processing")
@@ -1,6 +1,7 @@
 import re
 from bs4 import BeautifulSoup
 
+from bot.time import timestamp
 from bot.crawlers.base.crawler import BaseCrawler
 from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
 
@@ -12,6 +13,7 @@ class Crawler(BaseCrawler):
         self._page_id_pattern = r"word/p([0-9]+)$"
 
     def collect_pages(self, page_dir):
+        print(f"{timestamp()} Scraping {self._gojuon_url}")
         jitenon = JitenonScraper()
         gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
         gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -35,4 +37,4 @@ class Crawler(BaseCrawler):
             _, page_path = jitenon.scrape(page_link)
             self._page_map[page_id] = page_path
         pages_len = len(self._page_map)
-        print(f"Finished scraping {pages_len} pages")
+        print(f"\n{timestamp()} Found {pages_len} entry pages")
@@ -2,6 +2,8 @@ import time
 import re
 import os
 import hashlib
+import random
+import math
 from datetime import datetime
 from urllib.parse import urlparse
 from pathlib import Path
@@ -12,11 +14,13 @@ from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
 from platformdirs import user_cache_dir
 
+from bot.time import timestamp
 from bot.data import load_config
 
 
 class BaseScraper(ABC):
     def __init__(self):
+        self.cache_count = 0
         self._config = load_config()
         self.netloc_re = self._get_netloc_re()
         self.__set_session()
@@ -31,7 +35,8 @@ class BaseScraper(ABC):
             with open(cache_path, "w", encoding="utf-8") as f:
                 f.write(html)
         else:
-            print("Discovering cached files...", end='\r', flush=True)
+            self.cache_count += 1
+            print(f"\tDiscovering cached file {self.cache_count}", end='\r', flush=True)
         return html, cache_path
 
     @abstractmethod
@@ -91,15 +96,14 @@ class BaseScraper(ABC):
     def __get(self, urlstring):
         delay = 10
         time.sleep(delay)
-        now = datetime.now().strftime("%H:%M:%S")
-        print(f"{now} scraping {urlstring} ...", end='')
+        print(f"{timestamp()} Scraping {urlstring} ...", end='')
         try:
             response = self.session.get(urlstring, timeout=10)
-            print("OK")
+            print(f"{timestamp()} OK")
             return response.text
-        except Exception:
-            print("failed")
-            print("resetting session and trying again")
+        except Exception as ex:
+            print(f"\tFailed: {str(ex)}")
+            print(f"{timestamp()} Resetting session and trying again")
             self.__set_session()
             response = self.session.get(urlstring, timeout=10)
             return response.text
@@ -6,6 +6,7 @@ from pathlib import Path
 
 from platformdirs import user_documents_dir, user_cache_dir
 
+from bot.time import timestamp
 from bot.factory import new_mdict_terminator
 
 
@@ -32,7 +33,7 @@ class BaseExporter(ABC):
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
         build_directory = os.path.join(cache_dir, "mdict_build")
-        print(f"Initializing build directory `{build_directory}`")
+        print(f"{timestamp()} Initializing build directory `{build_directory}`")
         if Path(build_directory).is_dir():
             shutil.rmtree(build_directory)
         os.makedirs(build_directory)
@@ -43,7 +44,7 @@ class BaseExporter(ABC):
         build_dir = self._get_build_dir()
         build_media_dir = os.path.join(build_dir, self._target.value)
         if media_dir is not None:
-            print("Copying media files to build directory...")
+            print(f"{timestamp()} Copying media files to build directory...")
             shutil.copytree(media_dir, build_media_dir)
         else:
             os.makedirs(build_media_dir)
@@ -69,7 +70,7 @@ class BaseExporter(ABC):
 
     def _write_mdx_file(self, entries):
         terms = self._get_terms(entries)
-        print(f"Exporting {len(terms)} Mdict keys...")
+        print(f"{timestamp()} Exporting {len(terms)} Mdict keys...")
         out_dir = self._get_out_dir()
         out_file = os.path.join(out_dir, f"{self._target.value}.mdx")
         params = [
@@ -85,7 +86,7 @@ class BaseExporter(ABC):
         terms = []
         entries_len = len(entries)
         for idx, entry in enumerate(entries):
-            update = f"Creating Mdict terms for entry {idx+1}/{entries_len}"
+            update = f"\tCreating MDict terms for entry {idx+1}/{entries_len}"
             print(update, end='\r', flush=True)
             new_terms = self._terminator.make_terms(entry)
             for term in new_terms:
@@ -124,7 +125,7 @@ class BaseExporter(ABC):
             return self._out_dir
         out_dir = os.path.join(
             user_documents_dir(), "jitenbot", "mdict", self._target.value)
-        print(f"Initializing output directory `{out_dir}`")
+        print(f"{timestamp()} Initializing output directory `{out_dir}`")
         if Path(out_dir).is_dir():
             shutil.rmtree(out_dir)
         os.makedirs(out_dir)
bot/time.py (new file, 5 additions)
@@ -0,0 +1,5 @@
+import time
+
+
+def timestamp():
+    return time.strftime('%X')
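Note that time.strftime('%X') returns the locale's time representation, which is typically HH:MM:SS, matching the datetime.now().strftime("%H:%M:%S") format the scraper used before. A minimal usage sketch (the URL and output are illustrative only):

from bot.time import timestamp

# Prints something like "14:03:59 Scraping https://example.com ..." on most locales.
print(f"{timestamp()} Scraping https://example.com ...", end='')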
@@ -8,6 +8,7 @@ from abc import ABC, abstractmethod
 import fastjsonschema
 from platformdirs import user_documents_dir, user_cache_dir
 
+from bot.time import timestamp
 from bot.data import load_yomichan_metadata
 from bot.data import load_yomichan_term_schema
 from bot.factory import new_yomichan_terminator
@@ -45,7 +46,7 @@ class BaseExporter(ABC):
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
         build_directory = os.path.join(cache_dir, "yomichan_build")
-        print(f"Initializing build directory `{build_directory}`")
+        print(f"{timestamp()} Initializing build directory `{build_directory}`")
         if Path(build_directory).is_dir():
             shutil.rmtree(build_directory)
         os.makedirs(build_directory)
@@ -64,8 +65,9 @@ class BaseExporter(ABC):
         build_dir = self._get_build_dir()
         build_img_dir = os.path.join(build_dir, self._target.value)
         if image_dir is not None:
-            print("Copying media files to build directory...")
+            print(f"{timestamp()} Copying media files to build directory...")
             shutil.copytree(image_dir, build_img_dir)
+            print(f"{timestamp()} Finished copying files")
         else:
             os.makedirs(build_img_dir)
         self._terminator.set_image_dir(build_img_dir)
@@ -74,7 +76,7 @@ class BaseExporter(ABC):
         terms = []
         entries_len = len(entries)
         for idx, entry in enumerate(entries):
-            update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}"
+            update = f"\tCreating Yomichan terms for entry {idx+1}/{entries_len}"
             print(update, end='\r', flush=True)
             new_terms = self._terminator.make_terms(entry)
             for term in new_terms:
@@ -83,7 +85,7 @@ class BaseExporter(ABC):
         return terms
 
     def __validate_terms(self, terms):
-        print("Making a copy of term data for validation...")
+        print(f"{timestamp()} Making a copy of term data for validation...")
         terms_copy = copy.deepcopy(terms) # because validator will alter data!
         term_count = len(terms_copy)
         log_dir = self.__get_invalid_term_dir()
@@ -91,7 +93,7 @@ class BaseExporter(ABC):
         validator = fastjsonschema.compile(schema)
         failure_count = 0
         for idx, term in enumerate(terms_copy):
-            update = f"Validating term {idx+1}/{term_count}"
+            update = f"\tValidating term {idx+1}/{term_count}"
             print(update, end='\r', flush=True)
             try:
                 validator([term])
@@ -100,9 +102,9 @@ class BaseExporter(ABC):
                 term_file = os.path.join(log_dir, f"{idx}.json")
                 with open(term_file, "w", encoding='utf8') as f:
                     json.dump([term], f, indent=4, ensure_ascii=False)
-        print(f"\nFinished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
+        print(f"\n{timestamp()} Finished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
         if failure_count > 0:
-            print(f"Invalid terms saved to `{log_dir}` for debugging")
+            print(f"{timestamp()} Invalid terms saved to `{log_dir}` for debugging")
 
     def __make_dictionary(self, terms, index, tags):
         self.__write_term_banks(terms)
@@ -112,11 +114,11 @@ class BaseExporter(ABC):
         self.__rm_build_dir()
 
     def __write_term_banks(self, terms):
-        print(f"Exporting {len(terms)} JSON terms")
+        print(f"{timestamp()} Exporting {len(terms)} JSON terms")
         build_dir = self._get_build_dir()
         max_i = int(len(terms) / self._terms_per_file) + 1
         for i in range(max_i):
-            update = f"Writing terms to term bank {i+1}/{max_i}"
+            update = f"\tWriting terms to term bank {i+1}/{max_i}"
             print(update, end='\r', flush=True)
             start = self._terms_per_file * i
             end = self._terms_per_file * (i + 1)
@@ -141,7 +143,7 @@ class BaseExporter(ABC):
 
     def __write_archive(self, filename):
         archive_format = "zip"
-        print(f"Archiving data to {archive_format.upper()} file...")
+        print(f"{timestamp()} Archiving data to {archive_format.upper()} file...")
         out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
         if not Path(out_dir).is_dir():
             os.makedirs(out_dir)
@@ -152,7 +154,7 @@ class BaseExporter(ABC):
         base_filename = os.path.join(out_dir, filename)
         build_dir = self._get_build_dir()
         shutil.make_archive(base_filename, archive_format, build_dir)
-        print(f"Dictionary file saved to `{out_filepath}`")
+        print(f"{timestamp()} Dictionary file saved to `{out_filepath}`")
 
     def __rm_build_dir(self):
         build_dir = self._get_build_dir()