Use standard platform directories for cache, config, and output data
This commit is contained in:
parent
071144c808
commit
90eb5dc285
43
bot/data.py
Normal file
43
bot/data.py
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from platformdirs import user_config_dir
|
||||||
|
|
||||||
|
|
||||||
|
def config():
    """Load the jitenbot user configuration, creating it on first use.

    The configuration lives in ``config.json`` inside the directory
    returned by ``user_config_dir("jitenbot")``. When that file exists it
    is parsed and returned. Otherwise the bundled defaults (from
    ``__default_config()``) are written to that location and returned.

    Returns:
        The parsed JSON configuration (callers index it by key, e.g.
        ``"http-request-headers"``).
    """
    config_dir = user_config_dir("jitenbot")
    # exist_ok avoids a crash if the directory appears between a separate
    # existence check and the creation call.
    os.makedirs(config_dir, exist_ok=True)
    config_file = os.path.join(config_dir, "config.json")

    if Path(config_file).is_file():
        # Read as UTF-8 explicitly, matching how the bundled JSON data files
        # are read, instead of relying on the platform's locale encoding.
        with open(config_file, "r", encoding="utf-8") as f:
            return json.load(f)

    # First run: seed the user's config file with the bundled defaults.
    settings = __default_config()
    with open(config_file, "w", encoding="utf-8") as f:
        json.dump(settings, f, indent=4)
    return settings
|
||||||
|
|
||||||
|
|
||||||
|
def yomichan_inflection_categories():
    """Return the parsed contents of ``yomichan_inflection_categories.json``."""
    return __load_json("yomichan_inflection_categories.json")
|
||||||
|
|
||||||
|
|
||||||
|
def __default_config():
    """Return the parsed contents of the bundled ``default_config.json``."""
    return __load_json("default_config.json")
|
||||||
|
|
||||||
|
|
||||||
|
def __load_json(file_name):
    """Parse and return a JSON file from the ``data`` directory.

    Args:
        file_name: Name of the file inside the ``data`` directory.

    Returns:
        The object produced by ``json.load`` for that file.

    Raises:
        SystemExit: With exit status 1 when the file does not exist; a
            "Missing data file" message is written to stderr first.
    """
    # NOTE(review): "data" is resolved against the current working
    # directory, so this only works when the program is started from the
    # directory that contains "data" - confirm that is intended.
    file_path = os.path.join("data", file_name)
    if not Path(file_path).is_file():
        # Diagnostics belong on stderr so they are not mixed into stdout.
        print(f"Missing data file: {file_path}", file=sys.stderr)
        sys.exit(1)
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)
|
|
@ -2,18 +2,24 @@ import time
|
||||||
import requests
|
import requests
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
import json
|
|
||||||
import hashlib
|
import hashlib
|
||||||
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from platformdirs import user_cache_dir
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from requests.adapters import HTTPAdapter
|
from requests.adapters import HTTPAdapter
|
||||||
from requests.packages.urllib3.util.retry import Retry
|
from requests.packages.urllib3.util.retry import Retry
|
||||||
from datetime import datetime
|
|
||||||
|
import bot.data as Data
|
||||||
|
|
||||||
|
|
||||||
class Scraper():
|
class Scraper():
|
||||||
|
__CONFIG = None
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
if self.__CONFIG is None:
|
||||||
|
self.__CONFIG = Data.config()
|
||||||
pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$"
|
pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$"
|
||||||
self.netloc_re = re.compile(pattern)
|
self.netloc_re = re.compile(pattern)
|
||||||
self.__set_session()
|
self.__set_session()
|
||||||
|
@ -38,9 +44,7 @@ class Scraper():
|
||||||
allowed_methods=["HEAD", "GET", "OPTIONS"]
|
allowed_methods=["HEAD", "GET", "OPTIONS"]
|
||||||
)
|
)
|
||||||
adapter = HTTPAdapter(max_retries=retry_strategy)
|
adapter = HTTPAdapter(max_retries=retry_strategy)
|
||||||
with open("config.json", "r") as f:
|
headers = self.__CONFIG["http-request-headers"]
|
||||||
config = json.load(f)
|
|
||||||
headers = config["http-request-headers"]
|
|
||||||
self.session = requests.Session()
|
self.session = requests.Session()
|
||||||
self.session.mount("https://", adapter)
|
self.session.mount("https://", adapter)
|
||||||
self.session.headers.update(headers)
|
self.session.headers.update(headers)
|
||||||
|
@ -54,7 +58,9 @@ class Scraper():
|
||||||
raise Exception(f"Invalid URL: {url.geturl()}")
|
raise Exception(f"Invalid URL: {url.geturl()}")
|
||||||
|
|
||||||
def __cache_path(self, url):
|
def __cache_path(self, url):
|
||||||
cache_dir = os.path.join("webcache", self.__class__.__name__.lower())
|
class_name = self.__class__.__name__.lower()
|
||||||
|
cache_dir = user_cache_dir("jitenbot")
|
||||||
|
cache_dir = os.path.join(cache_dir, class_name)
|
||||||
netloc_match = self.netloc_re.match(url.netloc)
|
netloc_match = self.netloc_re.match(url.netloc)
|
||||||
if netloc_match.group(1) is not None:
|
if netloc_match.group(1) is not None:
|
||||||
subdomain = netloc_match.group(1)
|
subdomain = netloc_match.group(1)
|
||||||
|
|
|
@ -3,6 +3,9 @@ import os
|
||||||
import shutil
|
import shutil
|
||||||
import uuid
|
import uuid
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from platformdirs import user_documents_dir, user_cache_dir
|
||||||
|
|
||||||
|
|
||||||
def jitenon_yoji(entries):
|
def jitenon_yoji(entries):
|
||||||
|
@ -54,8 +57,12 @@ def __terms(entries):
|
||||||
|
|
||||||
|
|
||||||
def __create_zip(terms, index, tags=[]):
|
def __create_zip(terms, index, tags=[]):
|
||||||
build_directory = str(uuid.uuid4())
|
cache_dir = user_cache_dir("jitenbot")
|
||||||
os.mkdir(build_directory)
|
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
|
||||||
|
build_directory = os.path.join(cache_dir, f"build_{timestamp}")
|
||||||
|
if Path(build_directory).is_dir():
|
||||||
|
shutil.rmtree(build_directory)
|
||||||
|
os.makedirs(build_directory)
|
||||||
|
|
||||||
terms_per_file = 1000
|
terms_per_file = 1000
|
||||||
max_i = int(len(terms) / terms_per_file) + 1
|
max_i = int(len(terms) / terms_per_file) + 1
|
||||||
|
@ -78,7 +85,8 @@ def __create_zip(terms, index, tags=[]):
|
||||||
zip_filename = index["title"]
|
zip_filename = index["title"]
|
||||||
zip_file = f"{zip_filename}.zip"
|
zip_file = f"{zip_filename}.zip"
|
||||||
shutil.make_archive(zip_filename, "zip", build_directory)
|
shutil.make_archive(zip_filename, "zip", build_directory)
|
||||||
out_dir = "output"
|
|
||||||
|
out_dir = os.path.join(user_documents_dir(), "jitenbot")
|
||||||
out_file = os.path.join(out_dir, zip_file)
|
out_file = os.path.join(out_dir, zip_file)
|
||||||
if not Path(out_dir).is_dir():
|
if not Path(out_dir).is_dir():
|
||||||
os.mkdir(out_dir)
|
os.mkdir(out_dir)
|
||||||
|
|
10
data/yomichan_inflection_categories.json
Normal file
10
data/yomichan_inflection_categories.json
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
{
|
||||||
|
"sudachi": {
|
||||||
|
"sahen": ["サ行", "サ行変格", "ザ行変格", "文語サ行変格"],
|
||||||
|
"godan": ["五段", "文語四段", "文語上二段", "文語下二段", "マス", "ヤス", "デス"],
|
||||||
|
"ichidan": ["上一段", "下一段", "文語上一段", "文語下一段", "レル"],
|
||||||
|
"keiyoushi": ["形容詞", "ナイ", "タイ", "ラシイ"],
|
||||||
|
"kahen": ["カ行変格"],
|
||||||
|
"sudachi": []
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue