jitenbot/bot/crawlers/scrapers/scraper.py

import time
import re
import os
import hashlib
import random
import math
from datetime import datetime
from urllib.parse import urlparse
from pathlib import Path
from abc import ABC, abstractmethod
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from platformdirs import user_cache_dir
from bot.time import timestamp
from bot.data import load_config


class BaseScraper(ABC):
    def __init__(self):
        self.cache_count = 0
        self._config = load_config()
        self.netloc_re = self._get_netloc_re()
        self.__set_session()

    def scrape(self, urlstring):
        """Return the page HTML and its cache path, fetching only on a cache miss."""
        url = urlparse(urlstring, scheme='https', allow_fragments=True)
        self.__validate_url(url)
        cache_path = self.__cache_path(url)
        html = self.__read_cache(cache_path)
        if html is None:
            html = self.__get(urlstring)
            with open(cache_path, "w", encoding="utf-8") as f:
                f.write(html)
        else:
            self.cache_count += 1
            print(f"\tDiscovering cached file {self.cache_count}", end='\r', flush=True)
        return html, cache_path

    @abstractmethod
    def _get_netloc_re(self):
        raise NotImplementedError

    def __set_session(self):
        # retry transient HTTP failures on idempotent methods with backoff
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        headers = self._config["http-request-headers"]
        self.session = requests.Session()
        self.session.mount("https://", adapter)
        self.session.headers.update(headers)

    def __validate_url(self, url):
        valid = False
        if self.netloc_re.match(url.netloc):
            valid = True
        # may add more validators later
        if not valid:
            raise Exception(f"Invalid URL: {url.geturl()}")

    def __cache_path(self, url):
        # cache files are grouped by scraper class name and, when the URL
        # includes a subdomain, by that subdomain as well
        class_name = self.__class__.__name__.lower()
        cache_dir = user_cache_dir("jitenbot")
        cache_dir = os.path.join(cache_dir, class_name)
        netloc_match = self.netloc_re.match(url.netloc)
        if netloc_match.group(1) is not None:
            subdomain = netloc_match.group(1)
            cache_dir = os.path.join(cache_dir, subdomain)
        paths = re.findall(r"/([^/]+)", url.path)
        if len(paths) < 1:
            raise Exception(f"Invalid path in URL: {url.geturl()}")
        for x in paths[:len(paths)-1]:
            cache_dir = os.path.join(cache_dir, x)
        if not Path(cache_dir).is_dir():
            os.makedirs(cache_dir)
        basename = paths[-1].replace(".", "_")
        urlstring_hash = hashlib.md5(url.geturl().encode()).hexdigest()
        filename = f"{basename}-{urlstring_hash}.html"
        cache_path = os.path.join(cache_dir, filename)
        return cache_path
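
    # Worked example (hypothetical values): for a scraper whose netloc pattern
    # captures "www" from "www.example.com", the URL
    # "https://www.example.com/entries/1" would be cached at
    # <user_cache_dir>/jitenbot/<scraper class name>/www/entries/1-<md5 of the URL>.html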

    def __read_cache(self, cache_path):
        if Path(cache_path).is_file():
            with open(cache_path, "r", encoding="utf-8") as f:
                file_contents = f.read()
        else:
            file_contents = None
        return file_contents

    def __get(self, urlstring):
        # pause before each request to avoid hammering the remote host
        delay = 10
        time.sleep(delay)
        print(f"{timestamp()} Scraping {urlstring} ...", end='')
        try:
            response = self.session.get(urlstring, timeout=10)
            print(f"{timestamp()} OK")
            return response.text
        except Exception as ex:
            print(f"\tFailed: {str(ex)}")
            print(f"{timestamp()} Resetting session and trying again")
            self.__set_session()
            response = self.session.get(urlstring, timeout=10)
            return response.text
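

# Example subclass (an illustrative sketch only; "ExampleScraper", the host
# pattern, and the URL below are hypothetical and not part of jitenbot):
#
#     class ExampleScraper(BaseScraper):
#         def _get_netloc_re(self):
#             # group(1) optionally captures a subdomain, which
#             # __cache_path uses as an extra cache subdirectory
#             return re.compile(r"^(?:([a-z0-9-]+)\.)?example\.com$")
#
#     scraper = ExampleScraper()
#     html, cache_path = scraper.scrape("https://www.example.com/entries/1")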