Add timestamps to command line messages
This is a clumsy way of doing it (since it would be better to have a wrapper function append the timestamp), but that will be taken care of when the logging logic is all overhauled anyway.
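The wrapper function mentioned above could be a single logging helper that prepends the timestamp itself, so individual call sites stop formatting it by hand. A minimal sketch of that idea (hypothetical module and names, not part of this commit):

    # bot/log.py -- hypothetical helper, not part of this commit
    import time

    def log(message, **print_kwargs):
        # Prepend the current local time to every console message.
        print(f"{time.strftime('%X')} {message}", **print_kwargs)

    # Call sites would then read, e.g.:
    #     log(f"Scraping {urlstring} ...", end='')
    #     log("Copying media files to build directory...")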
parent 8f30f9419d
commit b03978d1f7
@@ -21,7 +21,7 @@ class BaseCrawler(ABC):
         pages_len = len(self._page_map)
         items = self._page_map.items()
         for idx, (page_id, page_path) in enumerate(items):
-            update = f"Reading page {idx+1}/{pages_len}"
+            update = f"\tReading page {idx+1}/{pages_len}"
             print(update, end='\r', flush=True)
             entry = new_entry(self._target, page_id)
             with open(page_path, "r", encoding="utf-8") as f:
@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 
+from bot.time import timestamp
 from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
 from bot.crawlers.base.crawler import BaseCrawler
 
@@ -10,7 +11,7 @@ class JitenonCrawler(BaseCrawler):
         self._gojuon_url = None
 
     def collect_pages(self, page_dir):
-        print("Scraping jitenon.jp")
+        print(f"{timestamp()} Scraping {self._gojuon_url}")
         jitenon = JitenonScraper()
         gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
         gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -26,4 +27,4 @@ class JitenonCrawler(BaseCrawler):
             _, page_path = jitenon.scrape(page_link)
             self._page_map[page_id] = page_path
         pages_len = len(self._page_map)
-        print(f"Finished scraping {pages_len} pages")
+        print(f"\n{timestamp()} Found {pages_len} entry pages")
@@ -1,4 +1,5 @@
 import os
+from bot.time import timestamp
 from bot.crawlers.base.crawler import BaseCrawler
 
 
@@ -8,7 +9,7 @@ class MonokakidoCrawler(BaseCrawler):
         self._page_id_pattern = r"^([0-9]+)\.xml$"
 
     def collect_pages(self, page_dir):
-        print(f"Searching for page files in `{page_dir}`")
+        print(f"{timestamp()} Searching for page files in `{page_dir}`")
         for pagefile in os.listdir(page_dir):
             page_id = self._parse_page_id(pagefile)
             if page_id is None or page_id == 0:
@@ -16,4 +17,4 @@ class MonokakidoCrawler(BaseCrawler):
             path = os.path.join(page_dir, pagefile)
             self._page_map[page_id] = path
         pages_len = len(self._page_map)
-        print(f"Found {pages_len} page files for processing")
+        print(f"{timestamp()} Found {pages_len} page files for processing")
@@ -1,6 +1,7 @@
 import re
 from bs4 import BeautifulSoup
 
+from bot.time import timestamp
 from bot.crawlers.base.crawler import BaseCrawler
 from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
 
@@ -12,6 +13,7 @@ class Crawler(BaseCrawler):
         self._page_id_pattern = r"word/p([0-9]+)$"
 
     def collect_pages(self, page_dir):
+        print(f"{timestamp()} Scraping {self._gojuon_url}")
         jitenon = JitenonScraper()
         gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
         gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -35,4 +37,4 @@ class Crawler(BaseCrawler):
             _, page_path = jitenon.scrape(page_link)
             self._page_map[page_id] = page_path
         pages_len = len(self._page_map)
-        print(f"Finished scraping {pages_len} pages")
+        print(f"\n{timestamp()} Found {pages_len} entry pages")
@@ -2,6 +2,8 @@ import time
 import re
 import os
 import hashlib
+import random
+import math
 from datetime import datetime
 from urllib.parse import urlparse
 from pathlib import Path
@@ -12,11 +14,13 @@ from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
 from platformdirs import user_cache_dir
 
+from bot.time import timestamp
 from bot.data import load_config
 
 
 class BaseScraper(ABC):
     def __init__(self):
+        self.cache_count = 0
         self._config = load_config()
         self.netloc_re = self._get_netloc_re()
         self.__set_session()
@@ -31,7 +35,8 @@ class BaseScraper(ABC):
             with open(cache_path, "w", encoding="utf-8") as f:
                 f.write(html)
         else:
-            print("Discovering cached files...", end='\r', flush=True)
+            self.cache_count += 1
+            print(f"\tDiscovering cached file {self.cache_count}", end='\r', flush=True)
         return html, cache_path
 
     @abstractmethod
@@ -91,15 +96,14 @@ class BaseScraper(ABC):
     def __get(self, urlstring):
         delay = 10
         time.sleep(delay)
-        now = datetime.now().strftime("%H:%M:%S")
-        print(f"{now} scraping {urlstring} ...", end='')
+        print(f"{timestamp()} Scraping {urlstring} ...", end='')
         try:
             response = self.session.get(urlstring, timeout=10)
-            print("OK")
+            print(f"{timestamp()} OK")
             return response.text
-        except Exception:
-            print("failed")
-            print("resetting session and trying again")
+        except Exception as ex:
+            print(f"\tFailed: {str(ex)}")
+            print(f"{timestamp()} Resetting session and trying again")
             self.__set_session()
             response = self.session.get(urlstring, timeout=10)
             return response.text
@@ -6,6 +6,7 @@ from pathlib import Path
 
 from platformdirs import user_documents_dir, user_cache_dir
 
+from bot.time import timestamp
 from bot.factory import new_mdict_terminator
 
 
@@ -32,7 +33,7 @@ class BaseExporter(ABC):
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
         build_directory = os.path.join(cache_dir, "mdict_build")
-        print(f"Initializing build directory `{build_directory}`")
+        print(f"{timestamp()} Initializing build directory `{build_directory}`")
         if Path(build_directory).is_dir():
             shutil.rmtree(build_directory)
         os.makedirs(build_directory)
@@ -43,7 +44,7 @@ class BaseExporter(ABC):
         build_dir = self._get_build_dir()
         build_media_dir = os.path.join(build_dir, self._target.value)
         if media_dir is not None:
-            print("Copying media files to build directory...")
+            print(f"{timestamp()} Copying media files to build directory...")
             shutil.copytree(media_dir, build_media_dir)
         else:
             os.makedirs(build_media_dir)
@@ -69,7 +70,7 @@ class BaseExporter(ABC):
 
     def _write_mdx_file(self, entries):
         terms = self._get_terms(entries)
-        print(f"Exporting {len(terms)} Mdict keys...")
+        print(f"{timestamp()} Exporting {len(terms)} Mdict keys...")
         out_dir = self._get_out_dir()
         out_file = os.path.join(out_dir, f"{self._target.value}.mdx")
         params = [
@@ -85,7 +86,7 @@ class BaseExporter(ABC):
         terms = []
         entries_len = len(entries)
         for idx, entry in enumerate(entries):
-            update = f"Creating Mdict terms for entry {idx+1}/{entries_len}"
+            update = f"\tCreating MDict terms for entry {idx+1}/{entries_len}"
             print(update, end='\r', flush=True)
             new_terms = self._terminator.make_terms(entry)
             for term in new_terms:
@@ -124,7 +125,7 @@ class BaseExporter(ABC):
             return self._out_dir
         out_dir = os.path.join(
             user_documents_dir(), "jitenbot", "mdict", self._target.value)
-        print(f"Initializing output directory `{out_dir}`")
+        print(f"{timestamp()} Initializing output directory `{out_dir}`")
         if Path(out_dir).is_dir():
             shutil.rmtree(out_dir)
         os.makedirs(out_dir)
bot/time.py (new file, 5 lines)
@@ -0,0 +1,5 @@
+import time
+
+
+def timestamp():
+    return time.strftime('%X')
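For reference, time.strftime('%X') formats the current local time in the locale's preferred representation (typically HH:MM:SS), so the new helper can be dropped in front of any existing message. A usage sketch with illustrative output:

    from bot.time import timestamp

    print(f"{timestamp()} Copying media files to build directory...")
    # e.g. prints: 14:05:31 Copying media files to build directory...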
@@ -8,6 +8,7 @@ from abc import ABC, abstractmethod
 import fastjsonschema
 from platformdirs import user_documents_dir, user_cache_dir
 
+from bot.time import timestamp
 from bot.data import load_yomichan_metadata
 from bot.data import load_yomichan_term_schema
 from bot.factory import new_yomichan_terminator
@@ -45,7 +46,7 @@ class BaseExporter(ABC):
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
         build_directory = os.path.join(cache_dir, "yomichan_build")
-        print(f"Initializing build directory `{build_directory}`")
+        print(f"{timestamp()} Initializing build directory `{build_directory}`")
         if Path(build_directory).is_dir():
             shutil.rmtree(build_directory)
         os.makedirs(build_directory)
@@ -64,8 +65,9 @@ class BaseExporter(ABC):
         build_dir = self._get_build_dir()
         build_img_dir = os.path.join(build_dir, self._target.value)
         if image_dir is not None:
-            print("Copying media files to build directory...")
+            print(f"{timestamp()} Copying media files to build directory...")
             shutil.copytree(image_dir, build_img_dir)
+            print(f"{timestamp()} Finished copying files")
         else:
             os.makedirs(build_img_dir)
         self._terminator.set_image_dir(build_img_dir)
@@ -74,7 +76,7 @@ class BaseExporter(ABC):
         terms = []
         entries_len = len(entries)
         for idx, entry in enumerate(entries):
-            update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}"
+            update = f"\tCreating Yomichan terms for entry {idx+1}/{entries_len}"
             print(update, end='\r', flush=True)
             new_terms = self._terminator.make_terms(entry)
             for term in new_terms:
@@ -83,7 +85,7 @@ class BaseExporter(ABC):
         return terms
 
     def __validate_terms(self, terms):
-        print("Making a copy of term data for validation...")
+        print(f"{timestamp()} Making a copy of term data for validation...")
         terms_copy = copy.deepcopy(terms) # because validator will alter data!
         term_count = len(terms_copy)
         log_dir = self.__get_invalid_term_dir()
@@ -91,7 +93,7 @@ class BaseExporter(ABC):
         validator = fastjsonschema.compile(schema)
         failure_count = 0
         for idx, term in enumerate(terms_copy):
-            update = f"Validating term {idx+1}/{term_count}"
+            update = f"\tValidating term {idx+1}/{term_count}"
             print(update, end='\r', flush=True)
             try:
                 validator([term])
@@ -100,9 +102,9 @@ class BaseExporter(ABC):
                 term_file = os.path.join(log_dir, f"{idx}.json")
                 with open(term_file, "w", encoding='utf8') as f:
                     json.dump([term], f, indent=4, ensure_ascii=False)
-        print(f"\nFinished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
+        print(f"\n{timestamp()} Finished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
         if failure_count > 0:
-            print(f"Invalid terms saved to `{log_dir}` for debugging")
+            print(f"{timestamp()} Invalid terms saved to `{log_dir}` for debugging")
 
     def __make_dictionary(self, terms, index, tags):
         self.__write_term_banks(terms)
@@ -112,11 +114,11 @@ class BaseExporter(ABC):
         self.__rm_build_dir()
 
     def __write_term_banks(self, terms):
-        print(f"Exporting {len(terms)} JSON terms")
+        print(f"{timestamp()} Exporting {len(terms)} JSON terms")
         build_dir = self._get_build_dir()
         max_i = int(len(terms) / self._terms_per_file) + 1
         for i in range(max_i):
-            update = f"Writing terms to term bank {i+1}/{max_i}"
+            update = f"\tWriting terms to term bank {i+1}/{max_i}"
             print(update, end='\r', flush=True)
             start = self._terms_per_file * i
             end = self._terms_per_file * (i + 1)
@@ -141,7 +143,7 @@ class BaseExporter(ABC):
 
     def __write_archive(self, filename):
         archive_format = "zip"
-        print(f"Archiving data to {archive_format.upper()} file...")
+        print(f"{timestamp()} Archiving data to {archive_format.upper()} file...")
         out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
         if not Path(out_dir).is_dir():
             os.makedirs(out_dir)
@@ -152,7 +154,7 @@ class BaseExporter(ABC):
         base_filename = os.path.join(out_dir, filename)
         build_dir = self._get_build_dir()
         shutil.make_archive(base_filename, archive_format, build_dir)
-        print(f"Dictionary file saved to `{out_filepath}`")
+        print(f"{timestamp()} Dictionary file saved to `{out_filepath}`")
 
     def __rm_build_dir(self):
         build_dir = self._get_build_dir()