From ef878143d7177b7ed42cd611ca12476154273cec Mon Sep 17 00:00:00 2001
From: stephenmk
Date: Sat, 6 May 2023 22:07:06 -0500
Subject: [PATCH] Warn user when corrupted page data is downloaded from jitenon

---
 bot/crawlers/crawlers.py |  8 +++++++-
 bot/entries/jitenon.py   | 13 +++++++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/bot/crawlers/crawlers.py b/bot/crawlers/crawlers.py
index 4df33a8..c7bf8ea 100644
--- a/bot/crawlers/crawlers.py
+++ b/bot/crawlers/crawlers.py
@@ -28,7 +28,13 @@ class Crawler(ABC):
             entry = new_entry(self._target, page_id)
             with open(page_path, "r", encoding="utf-8") as f:
                 page = f.read()
-            entry.set_page(page)
+            try:
+                entry.set_page(page)
+            except ValueError as err:
+                print(err)
+                print("Try deleting and redownloading file:")
+                print(f"\t{page_path}\n")
+                continue
             self._entries.append(entry)
         print()
 
diff --git a/bot/entries/jitenon.py b/bot/entries/jitenon.py
index d97a41b..fd9fcd2 100644
--- a/bot/entries/jitenon.py
+++ b/bot/entries/jitenon.py
@@ -23,8 +23,10 @@ class _JitenonEntry(Entry):
     def set_page(self, page):
         soup = BeautifulSoup(page, features="html5lib")
         self.__set_modified_date(page)
-        self.attribution = soup.find(class_="copyright").text
+        self.__set_attribution(soup)
         table = soup.find(class_="kanjirighttb")
+        if table is None:
+            raise ValueError("Error: table data not found in page.")
         rows = table.find("tbody").find_all("tr")
         colname = ""
         for row in rows:
@@ -63,11 +65,18 @@ class _JitenonEntry(Entry):
 
     def __set_modified_date(self, page):
         m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
-        if not m:
+        if m is None:
             return
         date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
         self.modified_date = date
 
+    def __set_attribution(self, soup):
+        attribution = soup.find(class_="copyright")
+        if attribution is not None:
+            self.attribution = soup.find(class_="copyright").text
+        else:
+            self.attribution = ""
+
     def __set_column(self, colname, colval):
         attr_name = self._COLUMNS[colname][0]
         attr_value = getattr(self, attr_name)