Use standard platform directories for cache, config, and output data

stephenmk 2023-04-22 13:37:34 -05:00
parent 071144c808
commit 90eb5dc285
5 changed files with 77 additions and 10 deletions
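
For context, the three platformdirs helpers adopted in this commit resolve to OS-standard locations. A minimal sketch of where the directories land (the paths shown are typical Linux defaults; macOS and Windows resolve elsewhere):

# Requires the platformdirs package introduced by this commit.
from platformdirs import user_cache_dir, user_config_dir, user_documents_dir

print(user_config_dir("jitenbot"))   # e.g. ~/.config/jitenbot  (config.json)
print(user_cache_dir("jitenbot"))    # e.g. ~/.cache/jitenbot   (web cache, zip build dirs)
print(user_documents_dir())          # e.g. ~/Documents         (zip output, under jitenbot/)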

bot/data.py (new file)

@@ -0,0 +1,43 @@
import os
import sys
import json
from pathlib import Path
from platformdirs import user_config_dir


def config():
    config_dir = user_config_dir("jitenbot")
    if not Path(config_dir).is_dir():
        os.makedirs(config_dir)
    config_file = os.path.join(config_dir, "config.json")
    if Path(config_file).is_file():
        with open(config_file, "r") as f:
            config = json.load(f)
    else:
        config = __default_config()
        with open(config_file, "w") as f:
            json.dump(config, f, indent=4)
    return config


def yomichan_inflection_categories():
    file_name = "yomichan_inflection_categories.json"
    data = __load_json(file_name)
    return data


def __default_config():
    file_name = "default_config.json"
    data = __load_json(file_name)
    return data


def __load_json(file_name):
    file_path = os.path.join("data", file_name)
    if not Path(file_path).is_file():
        print(f"Missing data file: {file_path}")
        sys.exit(1)
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data
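
Callers are expected to go through config() instead of opening config.json directly: the first run seeds the user config directory from data/default_config.json, and subsequent runs read the saved copy. A minimal usage sketch (run from the repository root, since __load_json resolves data/ relative to the working directory):

import bot.data as Data

config = Data.config()  # first run writes defaults to <config dir>/config.json
headers = config["http-request-headers"]  # the key the scraper reads below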

bot/scraper.py

@@ -2,18 +2,24 @@ import time
 import requests
 import re
 import os
-import json
 import hashlib
+from datetime import datetime
 from pathlib import Path
+from platformdirs import user_cache_dir
 from urllib.parse import urlparse
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
-from datetime import datetime
+
+import bot.data as Data


 class Scraper():
+    __CONFIG = None
+
     def __init__(self):
+        if self.__CONFIG is None:
+            self.__CONFIG = Data.config()
         pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$"
         self.netloc_re = re.compile(pattern)
         self.__set_session()

@@ -38,9 +44,7 @@ class Scraper():
             allowed_methods=["HEAD", "GET", "OPTIONS"]
         )
         adapter = HTTPAdapter(max_retries=retry_strategy)
-        with open("config.json", "r") as f:
-            config = json.load(f)
-        headers = config["http-request-headers"]
+        headers = self.__CONFIG["http-request-headers"]
         self.session = requests.Session()
         self.session.mount("https://", adapter)
         self.session.headers.update(headers)

@@ -54,7 +58,9 @@ class Scraper():
             raise Exception(f"Invalid URL: {url.geturl()}")

     def __cache_path(self, url):
-        cache_dir = os.path.join("webcache", self.__class__.__name__.lower())
+        class_name = self.__class__.__name__.lower()
+        cache_dir = user_cache_dir("jitenbot")
+        cache_dir = os.path.join(cache_dir, class_name)
         netloc_match = self.netloc_re.match(url.netloc)
         if netloc_match.group(1) is not None:
             subdomain = netloc_match.group(1)
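
Net effect: cached pages move from a relative webcache/ folder into a per-class directory under the platform cache dir. A rough illustration of the new prefix ("jitenon" is a stand-in for a lowercased Scraper subclass name; the rest of __cache_path lies outside this hunk):

import os
from platformdirs import user_cache_dir

class_name = "jitenon"  # hypothetical Scraper subclass name, lowercased
print(os.path.join(user_cache_dir("jitenbot"), class_name))
# e.g. ~/.cache/jitenbot/jitenon on Linux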

bot/yomichan.py

@@ -3,6 +3,9 @@ import os
 import shutil
 import uuid
 from pathlib import Path
+from datetime import datetime
+from platformdirs import user_documents_dir, user_cache_dir


 def jitenon_yoji(entries):

@@ -54,8 +57,12 @@ def __terms(entries):
 def __create_zip(terms, index, tags=[]):
-    build_directory = str(uuid.uuid4())
-    os.mkdir(build_directory)
+    cache_dir = user_cache_dir("jitenbot")
+    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+    build_directory = os.path.join(cache_dir, f"build_{timestamp}")
+    if Path(build_directory).is_dir():
+        shutil.rmtree(build_directory)
+    os.makedirs(build_directory)

     terms_per_file = 1000
     max_i = int(len(terms) / terms_per_file) + 1

@@ -78,7 +85,8 @@ def __create_zip(terms, index, tags=[]):
     zip_filename = index["title"]
     zip_file = f"{zip_filename}.zip"
     shutil.make_archive(zip_filename, "zip", build_directory)
-    out_dir = "output"
+    out_dir = os.path.join(user_documents_dir(), "jitenbot")
     out_file = os.path.join(out_dir, zip_file)
     if not Path(out_dir).is_dir():
         os.mkdir(out_dir)
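
Build directories are likewise timestamped under the cache dir rather than random UUID folders in the working directory; a leftover directory with the same name is cleared before the build starts. For example:

import os
from datetime import datetime
from platformdirs import user_cache_dir

timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
print(os.path.join(user_cache_dir("jitenbot"), f"build_{timestamp}"))
# e.g. ~/.cache/jitenbot/build_20230422133734 on Linux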

data/yomichan_inflection_categories.json (new file)

@@ -0,0 +1,10 @@
{
    "sudachi": {
        "sahen": ["サ行", "サ行変格", "ザ行変格", "文語サ行変格"],
        "godan": ["五段", "文語四段", "文語上二段", "文語下二段", "マス", "ヤス", "デス"],
        "ichidan": ["上一段", "下一段", "文語上一段", "文語下一段", "レル"],
        "keiyoushi": ["形容詞", "ナイ", "タイ", "ラシイ"],
        "kahen": ["カ行変格"],
        "sudachi": []
    }
}
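
These lists are served by bot.data.yomichan_inflection_categories() above; a hedged sketch of a reverse lookup from a Sudachi conjugation type to its category name (the lookup logic is illustrative, not code from this commit, and again assumes the repository root as working directory):

import bot.data as Data

categories = Data.yomichan_inflection_categories()["sudachi"]

def category_of(conjugation_type):
    # Return the first category whose list contains the given type.
    for name, types in categories.items():
        if conjugation_type in types:
            return name
    return None

print(category_of("上一段"))  # -> ichidan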