jitenbot/bot/yomichan/html_gloss.py
2023-04-11 12:01:23 -05:00

74 lines
2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from css_parser import parseStyle
def make_gloss(soup):
    """Wrap the markup tree rooted at *soup* as a structured-content gloss.

    The node is converted by ``__get_markup_structure`` and the result is
    placed under the "content" key of a dict whose "type" is
    "structured-content".
    """
    return {
        "type": "structured-content",
        "content": __get_markup_structure(soup),
    }
def __get_markup_structure(soup):
    """Recursively convert a markup node into a nested dict.

    The returned dict always carries the node's name under "tag", followed
    by whatever ``__get_attributes`` extracts from ``soup.attrs``.  Children
    are collected in document order: a child whose ``name`` is ``None`` is
    treated as text (cleaned with ``__clean`` and dropped when empty), any
    other child is converted recursively.

    The "content" key is omitted when there are no children, holds the
    single child directly when there is exactly one, and holds the list of
    children otherwise.
    """
    children = []
    for child in soup.children:
        if child.name is not None:
            # Named child: descend into it.
            children.append(__get_markup_structure(child))
            continue
        # Unnamed child: presumably a text node; keep only non-empty text.
        cleaned = __clean(child.text)
        if cleaned:
            children.append(cleaned)

    node = {"tag": soup.name}
    node.update(__get_attributes(soup.attrs))

    # A lone child is stored unwrapped rather than as a one-element list.
    if len(children) == 1:
        node["content"] = children[0]
    elif len(children) > 1:
        node["content"] = children
    return node
def __clean(text):
    """Return *text* with every "/" removed and outer whitespace stripped."""
    # NOTE(review): the file viewer flagged ambiguous Unicode characters in
    # this file -- confirm the literals below are the intended characters.
    return text.replace("/", "").strip()
def __get_attributes(attrs):
    """Pick the supported attributes out of *attrs* and normalise them.

    Only "href", "rowspan", "colspan" and "style" are looked at; every other
    key is ignored.  "href" is copied unchanged, "rowspan"/"colspan" are
    converted with ``int`` and emitted as "rowSpan"/"colSpan", and "style"
    is passed through ``__get_style``.  Keys missing from *attrs* are simply
    absent from the result.
    """
    # (source key, output key, converter), in output order.
    conversions = (
        ("href", "href", lambda value: value),
        ("rowspan", "rowSpan", int),
        ("colspan", "colSpan", int),
        # Wrapped in a lambda so the helper is only looked up when needed.
        ("style", "style", lambda value: __get_style(value)),
    )
    return {
        out_key: convert(attrs[src_key])
        for src_key, out_key, convert in conversions
        if src_key in attrs
    }
def __get_style(inline_style_string):
    """Translate an inline CSS string into a dict of supported style keys.

    The string is parsed with ``parseStyle``; each supported property whose
    parsed value is not the empty string is copied into the result.  Output
    keys match the property names, except that ``textDecoration`` is emitted
    as "textDecorationLine".
    """
    # (attribute read from the parsed declaration, key written to the result)
    property_map = (
        ("fontStyle", "fontStyle"),
        ("fontWeight", "fontWeight"),
        ("fontSize", "fontSize"),
        ("textDecoration", "textDecorationLine"),
        ("verticalAlign", "verticalAlign"),
        ("textAlign", "textAlign"),
        ("listStyleType", "listStyleType"),
    )
    declaration = parseStyle(inline_style_string)
    style = {}
    for css_name, out_key in property_map:
        value = getattr(declaration, css_name)
        if value != "":
            style[out_key] = value
    return style