Warn user when corrupted page data is downloaded from jitenon
This commit is contained in:
parent
c737f10885
commit
ef878143d7
|
@ -28,7 +28,13 @@ class Crawler(ABC):
|
||||||
entry = new_entry(self._target, page_id)
|
entry = new_entry(self._target, page_id)
|
||||||
with open(page_path, "r", encoding="utf-8") as f:
|
with open(page_path, "r", encoding="utf-8") as f:
|
||||||
page = f.read()
|
page = f.read()
|
||||||
entry.set_page(page)
|
try:
|
||||||
|
entry.set_page(page)
|
||||||
|
except ValueError as err:
|
||||||
|
print(err)
|
||||||
|
print("Try deleting and redownloading file:")
|
||||||
|
print(f"\t{page_path}\n")
|
||||||
|
continue
|
||||||
self._entries.append(entry)
|
self._entries.append(entry)
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
|
|
@ -23,8 +23,10 @@ class _JitenonEntry(Entry):
|
||||||
def set_page(self, page):
|
def set_page(self, page):
|
||||||
soup = BeautifulSoup(page, features="html5lib")
|
soup = BeautifulSoup(page, features="html5lib")
|
||||||
self.__set_modified_date(page)
|
self.__set_modified_date(page)
|
||||||
self.attribution = soup.find(class_="copyright").text
|
self.__set_attribution(soup)
|
||||||
table = soup.find(class_="kanjirighttb")
|
table = soup.find(class_="kanjirighttb")
|
||||||
|
if table is None:
|
||||||
|
raise ValueError("Error: table data not found in page.")
|
||||||
rows = table.find("tbody").find_all("tr")
|
rows = table.find("tbody").find_all("tr")
|
||||||
colname = ""
|
colname = ""
|
||||||
for row in rows:
|
for row in rows:
|
||||||
|
@ -63,11 +65,18 @@ class _JitenonEntry(Entry):
|
||||||
|
|
||||||
def __set_modified_date(self, page):
|
def __set_modified_date(self, page):
|
||||||
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
|
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
|
||||||
if not m:
|
if m is None:
|
||||||
return
|
return
|
||||||
date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
|
date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
|
||||||
self.modified_date = date
|
self.modified_date = date
|
||||||
|
|
||||||
|
def __set_attribution(self, soup):
    """Store the page's copyright notice in ``self.attribution``.

    Searches *soup* for the first element with class ``copyright`` and
    keeps its ``.text``. When no such element is found, the attribution
    is set to an empty string instead of raising.
    """
    # Look the element up once and reuse the result; a second
    # ``soup.find`` just to read ``.text`` would walk the document again.
    attribution = soup.find(class_="copyright")
    self.attribution = attribution.text if attribution is not None else ""
|
||||||
|
|
||||||
def __set_column(self, colname, colval):
|
def __set_column(self, colname, colval):
|
||||||
attr_name = self._COLUMNS[colname][0]
|
attr_name = self._COLUMNS[colname][0]
|
||||||
attr_value = getattr(self, attr_name)
|
attr_value = getattr(self, attr_name)
|
||||||
|
|
Loading…
Reference in a new issue