88 lines
2.5 KiB
Python
88 lines
2.5 KiB
Python
import re
|
||
from css_parser import parseStyle
|
||
|
||
|
||
def make_gloss(soup):
    """Build a structured-content gloss from a parsed markup tree.

    The tree is first normalized in place by ``__preprocess_soup`` and is
    then converted into nested nodes by ``__get_markup_structure``.

    Args:
        soup: Parsed markup tree. It is modified in place.

    Returns:
        A dict with ``"type"`` set to ``"structured-content"`` and
        ``"content"`` holding the converted tree.
    """
    __preprocess_soup(soup)
    structured_content = __get_markup_structure(soup)
    return {
        "type": "structured-content",
        "content": structured_content,
    }
|
||
|
||
|
||
def __preprocess_soup(soup):
    """Normalize a parsed markup tree in place before conversion.

    Three adjustments are made:

    * every ``<a>`` whose text matches one of the link patterns gets an
      ``href`` of the form ``?query=<captured prefix>&wildcards=off``;
    * every ``<p>`` is renamed to ``span``;
    * every ``<th>`` gets a fixed centering ``style`` attribute.

    Args:
        soup: Parsed markup tree exposing ``find_all``. Modified in place.

    Returns:
        None.
    """
    # NOTE(review): as written, any text matched by the second pattern is
    # also matched by the first, so the second one is never selected. The
    # patterns may have lost characters somewhere upstream (possibly
    # full-width parentheses) -- confirm against the original data.
    link_patterns = (
        r"^(.+)([ぁ-ヿ、\s]+)$",
        r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$",
    )
    for anchor in soup.find_all("a"):
        link_text = anchor.text
        for pattern in link_patterns:
            match = re.search(pattern, link_text)
            if match:
                # Only the first matching pattern decides the query term.
                anchor["href"] = f"?query={match.group(1)}&wildcards=off"
                break

    for paragraph in soup.find_all("p"):
        paragraph.name = "span"

    for header_cell in soup.find_all("th"):
        header_cell["style"] = "vertical-align: middle; text-align: center;"
|
||
|
||
|
||
def __get_markup_structure(soup):
    """Recursively convert a markup element into a structured-content node.

    Children whose ``name`` is ``None`` are treated as text: their text is
    stripped and kept only when non-empty. All other children are converted
    recursively.

    Args:
        soup: Element exposing ``name``, ``attrs`` and ``children``.

    Returns:
        A dict with a ``"tag"`` key, followed by any attributes selected by
        ``__get_attributes``. ``"content"`` is omitted when there is no
        content, holds the single item when there is exactly one, and is a
        list otherwise.
    """
    content = []
    for child in soup.children:
        if child.name is None:
            text = child.text.strip()
            if text:
                content.append(text)
        else:
            content.append(__get_markup_structure(child))

    # "tag" goes first so the key order of the node stays tag, attributes,
    # content.
    node = {"tag": soup.name}
    node.update(__get_attributes(soup.attrs))

    if len(content) == 1:
        # A lone child is stored directly rather than in a one-item list.
        node["content"] = content[0]
    elif content:
        node["content"] = content

    return node
|
||
|
||
|
||
def __get_attributes(attrs):
    """Select and convert the supported attributes of an element.

    Only ``href``, ``rowspan``, ``colspan`` and ``style`` are carried over;
    every other attribute is dropped.

    Args:
        attrs: Mapping of attribute names to values.

    Returns:
        A dict that may contain, in this order: ``"href"`` (unchanged),
        ``"rowSpan"`` and ``"colSpan"`` (converted with ``int``), and
        ``"style"`` (the result of ``__get_style`` on the inline style).

    Raises:
        ValueError: If a ``rowspan`` or ``colspan`` value cannot be parsed
            by ``int``.
    """
    attributes = {}
    if "href" in attrs:
        attributes["href"] = attrs["href"]
    if "rowspan" in attrs:
        attributes["rowSpan"] = int(attrs["rowspan"])
    if "colspan" in attrs:
        attributes["colSpan"] = int(attrs["colspan"])
    if "style" in attrs:
        attributes["style"] = __get_style(attrs["style"])
    return attributes
|
||
|
||
|
||
def __get_style(inline_style_string):
    """Convert an inline CSS string into a dict of supported style values.

    The string is parsed with ``parseStyle``; each supported property whose
    parsed value is not the empty string is copied into the result.

    Args:
        inline_style_string: Contents of an element's ``style`` attribute.

    Returns:
        A dict with any of the keys ``fontStyle``, ``fontWeight``,
        ``fontSize``, ``textDecorationLine``, ``verticalAlign``,
        ``textAlign`` and ``listStyleType``, in that order.
    """
    # (output key, attribute read from the parsed declaration). The two
    # names differ only for text-decoration, which is emitted as
    # "textDecorationLine".
    property_map = (
        ("fontStyle", "fontStyle"),
        ("fontWeight", "fontWeight"),
        ("fontSize", "fontSize"),
        ("textDecorationLine", "textDecoration"),
        ("verticalAlign", "verticalAlign"),
        ("textAlign", "textAlign"),
        ("listStyleType", "listStyleType"),
    )
    parsed_style = parseStyle(inline_style_string)
    style = {}
    for output_key, css_attribute in property_map:
        value = getattr(parsed_style, css_attribute)
        if value != "":
            style[output_key] = value
    return style
|