Move preprocessing logic for yomichan markup to entry classes
This commit is contained in:
parent
83a182e682
commit
f26270cf7e
|
@ -24,9 +24,10 @@ class Jitenon:
|
||||||
colname = ""
|
colname = ""
|
||||||
for row in rows:
|
for row in rows:
|
||||||
colname = row.th.text if row.th is not None else colname
|
colname = row.th.text if row.th is not None else colname
|
||||||
colval = row.td.text
|
colval = self.__clean(row.td.text)
|
||||||
self.__set_column(colname, colval)
|
self.__set_column(colname, colval)
|
||||||
gloss = YomichanSoup.make_gloss(table) # note: modifies table
|
self.__prepare_yomichan_soup(table)
|
||||||
|
gloss = YomichanSoup.make_gloss(table)
|
||||||
self.yomichan_glossary = [gloss]
|
self.yomichan_glossary = [gloss]
|
||||||
|
|
||||||
def __set_modified_date(self, html):
|
def __set_modified_date(self, html):
|
||||||
|
@ -36,10 +37,16 @@ class Jitenon:
|
||||||
date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
|
date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
|
||||||
self.modified_date = date
|
self.modified_date = date
|
||||||
|
|
||||||
|
def __clean(self, text):
|
||||||
|
text = text.replace("\n", "")
|
||||||
|
text = text.replace(",", "、")
|
||||||
|
text = text.replace(" ", "")
|
||||||
|
text = text.strip()
|
||||||
|
return text
|
||||||
|
|
||||||
def __set_column(self, colname, colval):
|
def __set_column(self, colname, colval):
|
||||||
attr_name = self.columns[colname][0]
|
attr_name = self.columns[colname][0]
|
||||||
attr_value = getattr(self, attr_name)
|
attr_value = getattr(self, attr_name)
|
||||||
colval = colval.replace("\n", "").replace(",", "、").strip()
|
|
||||||
if isinstance(attr_value, str):
|
if isinstance(attr_value, str):
|
||||||
setattr(self, attr_name, colval)
|
setattr(self, attr_name, colval)
|
||||||
elif isinstance(attr_value, list):
|
elif isinstance(attr_value, list):
|
||||||
|
@ -47,7 +54,23 @@ class Jitenon:
|
||||||
setattr(self, attr_name, [colval])
|
setattr(self, attr_name, [colval])
|
||||||
else:
|
else:
|
||||||
attr_value.append(colval)
|
attr_value.append(colval)
|
||||||
setattr(self, attr_name, attr_value)
|
# setattr(self, attr_name, attr_value)
|
||||||
|
|
||||||
|
def __prepare_yomichan_soup(self, soup):
|
||||||
|
patterns = [
|
||||||
|
r"^(.+)([ぁ-ヿ、\s]+)$",
|
||||||
|
r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$"
|
||||||
|
]
|
||||||
|
for a in soup.find_all("a"):
|
||||||
|
for pattern in patterns:
|
||||||
|
m = re.search(pattern, a.text)
|
||||||
|
if m:
|
||||||
|
a['href'] = f"?query={m.group(1)}&wildcards=off"
|
||||||
|
break
|
||||||
|
for p in soup.find_all("p"):
|
||||||
|
p.name = "span"
|
||||||
|
for th in soup.find_all("th"):
|
||||||
|
th['style'] = "vertical-align: middle; text-align: center;"
|
||||||
|
|
||||||
def _headwords(self):
|
def _headwords(self):
|
||||||
words = []
|
words = []
|
||||||
|
@ -63,7 +86,7 @@ class Jitenon:
|
||||||
return words
|
return words
|
||||||
|
|
||||||
def __yomikatas(self):
|
def __yomikatas(self):
|
||||||
yomikata = self.yomikata.replace(" ", "")
|
yomikata = self.yomikata
|
||||||
m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
|
m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
|
||||||
if m:
|
if m:
|
||||||
return [yomikata]
|
return [yomikata]
|
||||||
|
@ -81,12 +104,11 @@ class Jitenon:
|
||||||
yomikatas.append(alt.strip())
|
yomikatas.append(alt.strip())
|
||||||
return yomikatas
|
return yomikatas
|
||||||
print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
|
print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
|
||||||
return ""
|
return [""]
|
||||||
|
|
||||||
def __ikei_headwords(self):
|
def __ikei_headwords(self):
|
||||||
ikei_headwords = []
|
ikei_headwords = []
|
||||||
for val in self.ikei:
|
for val in self.ikei:
|
||||||
val = val.replace(" ", "")
|
|
||||||
m = re.search(r"^([^(]+)(([ぁ-ヿ、]+))$", val)
|
m = re.search(r"^([^(]+)(([ぁ-ヿ、]+))$", val)
|
||||||
if m:
|
if m:
|
||||||
headword = [m.group(1), m.group(2)]
|
headword = [m.group(1), m.group(2)]
|
||||||
|
|
|
@ -1,9 +1,7 @@
|
||||||
import re
|
|
||||||
from css_parser import parseStyle
|
from css_parser import parseStyle
|
||||||
|
|
||||||
|
|
||||||
def make_gloss(soup):
|
def make_gloss(soup):
|
||||||
__preprocess_soup(soup)
|
|
||||||
structured_content = __get_markup_structure(soup)
|
structured_content = __get_markup_structure(soup)
|
||||||
return {
|
return {
|
||||||
"type": "structured-content",
|
"type": "structured-content",
|
||||||
|
@ -11,35 +9,17 @@ def make_gloss(soup):
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def __preprocess_soup(soup):
|
|
||||||
patterns = [
|
|
||||||
r"^(.+)([ぁ-ヿ、\s]+)$",
|
|
||||||
r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$"
|
|
||||||
]
|
|
||||||
for a in soup.find_all("a"):
|
|
||||||
for pattern in patterns:
|
|
||||||
m = re.search(pattern, a.text)
|
|
||||||
if m:
|
|
||||||
a['href'] = f"?query={m.group(1)}&wildcards=off"
|
|
||||||
break
|
|
||||||
for p in soup.find_all("p"):
|
|
||||||
p.name = "span"
|
|
||||||
for th in soup.find_all("th"):
|
|
||||||
th['style'] = "vertical-align: middle; text-align: center;"
|
|
||||||
|
|
||||||
|
|
||||||
def __get_markup_structure(soup):
|
def __get_markup_structure(soup):
|
||||||
node = {}
|
node = {"tag": soup.name}
|
||||||
content = []
|
content = []
|
||||||
for child in soup.children:
|
for child in soup.children:
|
||||||
if child.name is None:
|
if child.name is None:
|
||||||
text = child.text.strip()
|
text = child.text.replace("\n", "")
|
||||||
if text != "":
|
if text != "":
|
||||||
content.append(text)
|
content.append(text)
|
||||||
else:
|
else:
|
||||||
content.append(__get_markup_structure(child))
|
content.append(__get_markup_structure(child))
|
||||||
|
|
||||||
node["tag"] = soup.name
|
|
||||||
attributes = __get_attributes(soup.attrs)
|
attributes = __get_attributes(soup.attrs)
|
||||||
for key, val in attributes.items():
|
for key, val in attributes.items():
|
||||||
node[key] = val
|
node[key] = val
|
||||||
|
|
Loading…
Reference in a new issue