Warn user when corrupted page data is downloaded from jitenon
This commit is contained in:
parent
c737f10885
commit
ef878143d7
|
@ -28,7 +28,13 @@ class Crawler(ABC):
|
|||
entry = new_entry(self._target, page_id)
|
||||
with open(page_path, "r", encoding="utf-8") as f:
|
||||
page = f.read()
|
||||
entry.set_page(page)
|
||||
try:
|
||||
entry.set_page(page)
|
||||
except ValueError as err:
|
||||
print(err)
|
||||
print("Try deleting and redownloading file:")
|
||||
print(f"\t{page_path}\n")
|
||||
continue
|
||||
self._entries.append(entry)
|
||||
print()
|
||||
|
||||
|
|
|
@ -23,8 +23,10 @@ class _JitenonEntry(Entry):
|
|||
def set_page(self, page):
|
||||
soup = BeautifulSoup(page, features="html5lib")
|
||||
self.__set_modified_date(page)
|
||||
self.attribution = soup.find(class_="copyright").text
|
||||
self.__set_attribution(soup)
|
||||
table = soup.find(class_="kanjirighttb")
|
||||
if table is None:
|
||||
raise ValueError("Error: table data not found in page.")
|
||||
rows = table.find("tbody").find_all("tr")
|
||||
colname = ""
|
||||
for row in rows:
|
||||
|
@ -63,11 +65,18 @@ class _JitenonEntry(Entry):
|
|||
|
||||
def __set_modified_date(self, page):
    """Extract the JSON-LD "dateModified" stamp from the raw page HTML
    and store it on self.modified_date as a datetime.date.

    If the page carries no dateModified entry, the attribute is left
    untouched (not every jitenon page includes one).
    """
    m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
    if m is None:
        # Missing stamp is expected on some pages; skip silently.
        return
    # Group 1 is the ISO YYYY-MM-DD portion matched above.
    date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
    self.modified_date = date
|
||||
|
||||
def __set_attribution(self, soup):
    """Set self.attribution from the page's copyright notice.

    Falls back to the empty string when the parsed page has no element
    with class "copyright" (e.g. a corrupted or truncated download).
    """
    attribution = soup.find(class_="copyright")
    # Reuse the node already found above instead of searching the soup
    # a second time (the original called soup.find twice).
    self.attribution = attribution.text if attribution is not None else ""
|
||||
|
||||
def __set_column(self, colname, colval):
|
||||
attr_name = self._COLUMNS[colname][0]
|
||||
attr_value = getattr(self, attr_name)
|
||||
|
|
Loading…
Reference in a new issue