Use standard platform directories for cache, config, and output data

stephenmk 2023-04-22 13:37:34 -05:00
parent 071144c808
commit 90eb5dc285
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
5 changed files with 77 additions and 10 deletions

bot/data.py (new file)

@@ -0,0 +1,43 @@
+import os
+import sys
+import json
+from pathlib import Path
+from platformdirs import user_config_dir
+
+
+def config():
+    config_dir = user_config_dir("jitenbot")
+    if not Path(config_dir).is_dir():
+        os.makedirs(config_dir)
+    config_file = os.path.join(config_dir, "config.json")
+    if Path(config_file).is_file():
+        with open(config_file, "r") as f:
+            config = json.load(f)
+    else:
+        config = __default_config()
+        with open(config_file, "w") as f:
+            json.dump(config, f, indent=4)
+    return config
+
+
+def yomichan_inflection_categories():
+    file_name = "yomichan_inflection_categories.json"
+    data = __load_json(file_name)
+    return data
+
+
+def __default_config():
+    file_name = "default_config.json"
+    data = __load_json(file_name)
+    return data
+
+
+def __load_json(file_name):
+    file_path = os.path.join("data", file_name)
+    if not Path(file_path).is_file():
+        print(f"Missing data file: {file_path}")
+        sys.exit(1)
+    with open(file_path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    return data
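
The new config() helper is self-healing: on first run it copies the bundled
defaults from data/default_config.json into the platform config directory, and
on later runs it reads back whatever the user has saved there. A minimal sketch
of the behavior, assuming platformdirs' documented defaults (actual paths vary
by OS and environment; run from the repository root so the bundled data/ files
resolve):

    from platformdirs import user_config_dir
    import bot.data as Data

    # platformdirs resolves the conventional per-user location, e.g.
    #   Linux: ~/.config/jitenbot (honors XDG_CONFIG_HOME)
    #   macOS: ~/Library/Application Support/jitenbot
    print(user_config_dir("jitenbot"))

    # First call writes config.json from the defaults; later calls
    # return the user's (possibly edited) copy.
    config = Data.config()
    print(config["http-request-headers"])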


@@ -2,18 +2,24 @@ import time
 import requests
 import re
 import os
 import json
 import hashlib
+from datetime import datetime
+from pathlib import Path
+from platformdirs import user_cache_dir
 from urllib.parse import urlparse
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
-from datetime import datetime
+
+import bot.data as Data
 
 
 class Scraper():
+    __CONFIG = None
+
     def __init__(self):
+        if self.__CONFIG is None:
+            self.__CONFIG = Data.config()
         pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$"
         self.netloc_re = re.compile(pattern)
         self.__set_session()
@@ -38,9 +44,7 @@ class Scraper():
             allowed_methods=["HEAD", "GET", "OPTIONS"]
         )
         adapter = HTTPAdapter(max_retries=retry_strategy)
-        with open("config.json", "r") as f:
-            config = json.load(f)
-        headers = config["http-request-headers"]
+        headers = self.__CONFIG["http-request-headers"]
         self.session = requests.Session()
         self.session.mount("https://", adapter)
         self.session.headers.update(headers)
@@ -54,7 +58,9 @@ class Scraper():
             raise Exception(f"Invalid URL: {url.geturl()}")
 
     def __cache_path(self, url):
-        cache_dir = os.path.join("webcache", self.__class__.__name__.lower())
+        class_name = self.__class__.__name__.lower()
+        cache_dir = user_cache_dir("jitenbot")
+        cache_dir = os.path.join(cache_dir, class_name)
         netloc_match = self.netloc_re.match(url.netloc)
         if netloc_match.group(1) is not None:
             subdomain = netloc_match.group(1)
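
With __cache_path now rooted in user_cache_dir, downloaded pages move out of
the relative webcache folder and into a per-user cache tree namespaced by
scraper subclass. A rough sketch of where files land (the subclass name below
is hypothetical):

    import os
    from platformdirs import user_cache_dir

    # A hypothetical Jitenon(Scraper) subclass would cache under e.g.
    # ~/.cache/jitenbot/jitenon on Linux or
    # ~/Library/Caches/jitenbot/jitenon on macOS.
    class_name = "jitenon"  # stands in for self.__class__.__name__.lower()
    print(os.path.join(user_cache_dir("jitenbot"), class_name))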


@@ -3,6 +3,9 @@ import os
 import shutil
 import uuid
 from pathlib import Path
+from datetime import datetime
+from platformdirs import user_documents_dir, user_cache_dir
+
 
 
 def jitenon_yoji(entries):
@@ -54,8 +57,12 @@ def __terms(entries):
 def __create_zip(terms, index, tags=[]):
-    build_directory = str(uuid.uuid4())
-    os.mkdir(build_directory)
+    cache_dir = user_cache_dir("jitenbot")
+    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+    build_directory = os.path.join(cache_dir, f"build_{timestamp}")
+    if Path(build_directory).is_dir():
+        shutil.rmtree(build_directory)
+    os.makedirs(build_directory)
     terms_per_file = 1000
     max_i = int(len(terms) / terms_per_file) + 1
@@ -78,7 +85,8 @@ def __create_zip(terms, index, tags=[]):
     zip_filename = index["title"]
     zip_file = f"{zip_filename}.zip"
     shutil.make_archive(zip_filename, "zip", build_directory)
-    out_dir = "output"
+    out_dir = os.path.join(user_documents_dir(), "jitenbot")
     out_file = os.path.join(out_dir, zip_file)
     if not Path(out_dir).is_dir():
         os.mkdir(out_dir)
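
__create_zip now builds into a timestamped directory under the per-user cache
tree, deleting any leftover directory of the same name first, and the finished
archive is written beneath the user's documents folder rather than a relative
output directory. A small sketch of the destination path, using a hypothetical
dictionary title:

    import os
    from platformdirs import user_documents_dir

    zip_file = "example-title.zip"  # hypothetical index["title"] + ".zip"
    out_dir = os.path.join(user_documents_dir(), "jitenbot")
    # e.g. ~/Documents/jitenbot/example-title.zip on Linux
    print(os.path.join(out_dir, zip_file))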

data/yomichan_inflection_categories.json (new file)

@@ -0,0 +1,10 @@
+{
+    "sudachi": {
+        "sahen": ["サ行", "サ行変格", "ザ行変格", "文語サ行変格"],
+        "godan": ["五段", "文語四段", "文語上二段", "文語下二段", "マス", "ヤス", "デス"],
+        "ichidan": ["上一段", "下一段", "文語上一段", "文語下一段", "レル"],
+        "keiyoushi": ["形容詞", "ナイ", "タイ", "ラシイ"],
+        "kahen": ["カ行変格"],
+        "sudachi": []
+    }
+}
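
This new data file groups Sudachi conjugation-type labels under Yomichan
inflection categories, and bot/data.py exposes it through
yomichan_inflection_categories(). For example (run from the repository root so
the data/ path resolves):

    import bot.data as Data

    categories = Data.yomichan_inflection_categories()
    # -> ["五段", "文語四段", "文語上二段", "文語下二段", "マス", "ヤス", "デス"]
    print(categories["sudachi"]["godan"])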