20 lines
683 B
Python
20 lines
683 B
Python
import os
|
|
from bot.crawlers.base.crawler import BaseCrawler
|
|
|
|
|
|
class MonokakidoCrawler(BaseCrawler):
|
|
def __init__(self, target):
|
|
super().__init__(target)
|
|
self._page_id_pattern = r"^([0-9]+)\.xml$"
|
|
|
|
def collect_pages(self, page_dir):
|
|
print(f"Searching for page files in `{page_dir}`")
|
|
for pagefile in os.listdir(page_dir):
|
|
page_id = self._parse_page_id(pagefile)
|
|
if page_id is None or page_id == 0:
|
|
continue
|
|
path = os.path.join(page_dir, pagefile)
|
|
self._page_map[page_id] = path
|
|
pages_len = len(self._page_map)
|
|
print(f"Found {pages_len} page files for processing")
|