Move preprocessing logic for yomichan markup to entry classes

This commit is contained in:
stephenmk 2023-04-10 17:33:10 -05:00
parent 83a182e682
commit f26270cf7e
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
2 changed files with 31 additions and 29 deletions

View file

@ -24,9 +24,10 @@ class Jitenon:
colname = "" colname = ""
for row in rows: for row in rows:
colname = row.th.text if row.th is not None else colname colname = row.th.text if row.th is not None else colname
colval = row.td.text colval = self.__clean(row.td.text)
self.__set_column(colname, colval) self.__set_column(colname, colval)
gloss = YomichanSoup.make_gloss(table) # note: modifies table self.__prepare_yomichan_soup(table)
gloss = YomichanSoup.make_gloss(table)
self.yomichan_glossary = [gloss] self.yomichan_glossary = [gloss]
def __set_modified_date(self, html): def __set_modified_date(self, html):
@ -36,10 +37,16 @@ class Jitenon:
date = datetime.strptime(m.group(1), '%Y-%m-%d').date() date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
self.modified_date = date self.modified_date = date
def __clean(self, text):
    """Normalise a scraped text value.

    Every newline, comma and space character is deleted wherever it
    occurs in the string; whatever leading or trailing whitespace is
    still left afterwards (tabs, for example) is then trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Each target is a single character and all are replaced by nothing,
    # so the removals are independent of one another.
    for unwanted in ("\n", ",", " "):
        text = text.replace(unwanted, "")
    return text.strip()
def __set_column(self, colname, colval): def __set_column(self, colname, colval):
attr_name = self.columns[colname][0] attr_name = self.columns[colname][0]
attr_value = getattr(self, attr_name) attr_value = getattr(self, attr_name)
colval = colval.replace("\n", "").replace(",", "").strip()
if isinstance(attr_value, str): if isinstance(attr_value, str):
setattr(self, attr_name, colval) setattr(self, attr_name, colval)
elif isinstance(attr_value, list): elif isinstance(attr_value, list):
@ -47,7 +54,23 @@ class Jitenon:
setattr(self, attr_name, [colval]) setattr(self, attr_name, [colval])
else: else:
attr_value.append(colval) attr_value.append(colval)
setattr(self, attr_name, attr_value) # setattr(self, attr_name, attr_value)
def __prepare_yomichan_soup(self, soup):
    """Rewrite parts of a parsed markup tree in place.

    Three adjustments are made to ``soup``:

    * every ``<a>`` whose text matches one of the patterns below gets its
      ``href`` replaced by a ``?query=...&wildcards=off`` link built from
      the first capture group;
    * every ``<p>`` is renamed to ``<span>``;
    * every ``<th>`` is given a fixed centring ``style`` attribute.

    Args:
        soup: Parsed markup exposing ``find_all`` (presumably a
            BeautifulSoup tag -- confirm against the caller).

    Returns:
        None. ``soup`` is mutated.
    """
    # NOTE(review): as written, any text matched by the second pattern is
    # also matched by the first, so the second never decides the outcome.
    # Confirm whether delimiter characters are missing from these patterns.
    link_patterns = (
        r"^(.+)[ぁ-ヿ、\s]+$",
        r"^(.+)[ぁ-ヿ、\s]+[ぁ-ヿ、\s][ぁ-ヿ、\s]+$",
    )
    for anchor in soup.find_all("a"):
        label = anchor.text
        found = None
        # Patterns are tried in order; the first hit wins.
        for regex in link_patterns:
            found = re.search(regex, label)
            if found is not None:
                break
        if found is not None:
            anchor['href'] = f"?query={found.group(1)}&wildcards=off"

    for paragraph in soup.find_all("p"):
        paragraph.name = "span"

    header_style = "vertical-align: middle; text-align: center;"
    for header_cell in soup.find_all("th"):
        header_cell['style'] = header_style
def _headwords(self): def _headwords(self):
words = [] words = []
@ -63,7 +86,7 @@ class Jitenon:
return words return words
def __yomikatas(self): def __yomikatas(self):
yomikata = self.yomikata.replace(" ", "") yomikata = self.yomikata
m = re.search(r"^[ぁ-ヿ、]+$", yomikata) m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
if m: if m:
return [yomikata] return [yomikata]
@ -81,12 +104,11 @@ class Jitenon:
yomikatas.append(alt.strip()) yomikatas.append(alt.strip())
return yomikatas return yomikatas
print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n") print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
return "" return [""]
def __ikei_headwords(self): def __ikei_headwords(self):
ikei_headwords = [] ikei_headwords = []
for val in self.ikei: for val in self.ikei:
val = val.replace(" ", "")
m = re.search(r"^([^]+)([ぁ-ヿ、]+)$", val) m = re.search(r"^([^]+)([ぁ-ヿ、]+)$", val)
if m: if m:
headword = [m.group(1), m.group(2)] headword = [m.group(1), m.group(2)]

View file

@ -1,9 +1,7 @@
import re
from css_parser import parseStyle from css_parser import parseStyle
def make_gloss(soup): def make_gloss(soup):
__preprocess_soup(soup)
structured_content = __get_markup_structure(soup) structured_content = __get_markup_structure(soup)
return { return {
"type": "structured-content", "type": "structured-content",
@ -11,35 +9,17 @@ def make_gloss(soup):
} }
def __preprocess_soup(soup):
    """Adjust a parsed markup tree in place before it is converted.

    * ``<a>`` elements whose text matches one of the patterns below have
      their ``href`` set to ``?query=<group 1>&wildcards=off``.
    * ``<p>`` elements are renamed to ``<span>``.
    * ``<th>`` elements receive a fixed centring ``style`` attribute.

    Args:
        soup: Parsed markup exposing ``find_all`` (presumably a
            BeautifulSoup tag -- confirm against the caller).

    Returns:
        None. ``soup`` is mutated.
    """
    # NOTE(review): every string the second pattern matches is already
    # matched by the first, so the second is never the deciding one.
    # Confirm whether delimiter characters are missing from these patterns.
    kana_suffix_patterns = (
        r"^(.+)[ぁ-ヿ、\s]+$",
        r"^(.+)[ぁ-ヿ、\s]+[ぁ-ヿ、\s][ぁ-ヿ、\s]+$",
    )
    for link in soup.find_all("a"):
        # Lazily evaluate the searches so later patterns are only tried
        # when the earlier ones fail.
        attempts = (re.search(rx, link.text) for rx in kana_suffix_patterns)
        first_hit = next((hit for hit in attempts if hit), None)
        if first_hit is not None:
            link['href'] = f"?query={first_hit.group(1)}&wildcards=off"

    for para in soup.find_all("p"):
        para.name = "span"

    for heading in soup.find_all("th"):
        heading['style'] = "vertical-align: middle; text-align: center;"
def __get_markup_structure(soup): def __get_markup_structure(soup):
node = {} node = {"tag": soup.name}
content = [] content = []
for child in soup.children: for child in soup.children:
if child.name is None: if child.name is None:
text = child.text.strip() text = child.text.replace("\n", "")
if text != "": if text != "":
content.append(text) content.append(text)
else: else:
content.append(__get_markup_structure(child)) content.append(__get_markup_structure(child))
node["tag"] = soup.name
attributes = __get_attributes(soup.attrs) attributes = __get_attributes(soup.attrs)
for key, val in attributes.items(): for key, val in attributes.items():
node[key] = val node[key] = val