Warn user when corrupted page data is downloaded from jitenon

This commit is contained in:
stephenmk 2023-05-06 22:07:06 -05:00
parent c737f10885
commit ef878143d7
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
2 changed files with 18 additions and 3 deletions

View file

@ -28,7 +28,13 @@ class Crawler(ABC):
entry = new_entry(self._target, page_id) entry = new_entry(self._target, page_id)
with open(page_path, "r", encoding="utf-8") as f: with open(page_path, "r", encoding="utf-8") as f:
page = f.read() page = f.read()
entry.set_page(page) try:
entry.set_page(page)
except ValueError as err:
print(err)
print("Try deleting and redownloading file:")
print(f"\t{page_path}\n")
continue
self._entries.append(entry) self._entries.append(entry)
print() print()

View file

@ -23,8 +23,10 @@ class _JitenonEntry(Entry):
def set_page(self, page): def set_page(self, page):
soup = BeautifulSoup(page, features="html5lib") soup = BeautifulSoup(page, features="html5lib")
self.__set_modified_date(page) self.__set_modified_date(page)
self.attribution = soup.find(class_="copyright").text self.__set_attribution(soup)
table = soup.find(class_="kanjirighttb") table = soup.find(class_="kanjirighttb")
if table is None:
raise ValueError("Error: table data not found in page.")
rows = table.find("tbody").find_all("tr") rows = table.find("tbody").find_all("tr")
colname = "" colname = ""
for row in rows: for row in rows:
@ -63,11 +65,18 @@ class _JitenonEntry(Entry):
def __set_modified_date(self, page):
    """Record the page's ``dateModified`` value as ``self.modified_date``.

    Searches the raw page text for a ``"dateModified": "YYYY-MM-DD``
    pair and stores the parsed value as a ``date``.  When the pattern
    is not present the method returns without assigning
    ``self.modified_date``.

    Args:
        page: Raw text of the downloaded page.

    Raises:
        ValueError: If the captured digits match the pattern but do not
            form a valid calendar date (raised by ``strptime``).
    """
    m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
    if m is None:
        # No modification date in this page; leave the attribute alone.
        return
    self.modified_date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
def __set_attribution(self, soup):
    """Store the text of the page's copyright element in ``self.attribution``.

    Args:
        soup: Parsed page exposing ``find(class_=...)``; the result is
            expected to be either ``None`` or an object with a ``.text``
            attribute.

    When ``soup.find(class_="copyright")`` returns ``None`` the
    attribution is set to an empty string.
    """
    # Query the tree once and reuse the result instead of searching the
    # document a second time just to read ``.text``.
    copyright_tag = soup.find(class_="copyright")
    self.attribution = "" if copyright_tag is None else copyright_tag.text
def __set_column(self, colname, colval): def __set_column(self, colname, colval):
attr_name = self._COLUMNS[colname][0] attr_name = self._COLUMNS[colname][0]
attr_value = getattr(self, attr_name) attr_value = getattr(self, attr_name)