2023-04-23 02:01:52 +00:00
|
|
|
import re
|
2023-04-08 03:05:36 +00:00
|
|
|
from css_parser import parseStyle
|
|
|
|
|
|
|
|
|
2023-04-10 20:20:33 +00:00
|
|
|
def make_gloss(soup):
    """Build a structured-content gloss from a parsed markup tree.

    The tree rooted at ``soup`` is converted into nested node dictionaries
    by ``__get_page_structure``; only the root node's "content" value is
    kept, so the root's own tag and attributes are not part of the result.

    Returns:
        A dict with "type" set to "structured-content" and "content" set to
        the root node's content.

    Raises:
        KeyError: if the root node has no "content" entry (the converter
            omits that key when the root has no non-blank children).
    """
    root = __get_page_structure(soup)
    gloss = {"type": "structured-content"}
    gloss["content"] = root["content"]
    return gloss
|
|
|
|
|
|
|
|
|
2023-05-01 22:31:28 +00:00
|
|
|
def __get_page_structure(soup):
    """Recursively convert an element into a nested node dictionary.

    Children whose ``name`` is None are treated as text: their text is
    stripped and kept only when non-empty. Every other child is converted
    recursively into its own node dictionary.

    Returns:
        A dict holding "tag" (the element name), followed by the entries
        produced by ``__get_attributes`` for the element's attributes, and
        finally "content":
          * omitted when there are no children to keep,
          * the single child itself when there is exactly one,
          * a list of children otherwise.
    """
    node = {"tag": soup.name}

    children = []
    for child in soup.children:
        if child.name is not None:
            children.append(__get_page_structure(child))
            continue
        stripped = child.text.strip()
        if stripped:
            children.append(stripped)

    # Attributes are merged after "tag" and before "content" so that the
    # key order of the resulting dictionary stays tag -> attributes -> content.
    node.update(__get_attributes(soup.attrs))

    if len(children) == 1:
        # A lone child is stored directly rather than in a one-item list.
        node["content"] = children[0]
    elif children:
        node["content"] = children

    return node
|
|
|
|
|
|
|
|
|
|
|
|
def __get_attributes(attrs):
    """Select and convert the element attributes that are kept on a node.

    Args:
        attrs: mapping of attribute name to value for one element.

    Returns:
        A new dict containing, in this order and only when present in
        ``attrs``:
          * the recognised attributes listed in the conversion table below
            (renamed and converted as the table specifies),
          * "style": the dict returned by ``__get_style`` when non-empty,
          * "data": a dict of every ``data-*`` attribute with the "data-"
            prefix removed, when there is at least one.
        Attributes that match none of these are dropped.

    Raises:
        ValueError: if a span or size attribute cannot be converted by
            ``int`` / ``float``.
    """
    # (source attribute, output key, converter); None copies the value as is.
    # NOTE(review): bool() is applied to the raw value, so any non-empty
    # string (including "false") becomes True -- confirm callers pass bools.
    conversions = (
        ("href", "href", None),
        ("rowspan", "rowSpan", int),
        ("colspan", "colSpan", int),
        ("height", "height", float),
        ("width", "width", float),
        ("sizeUnits", "sizeUnits", None),
        ("appearance", "appearance", None),
        ("title", "title", None),
        ("collapsible", "collapsible", bool),
        ("collapsed", "collapsed", bool),
        ("background", "background", bool),
        ("path", "path", None),
    )

    result = {}
    for source_key, target_key, convert in conversions:
        if source_key not in attrs:
            continue
        raw = attrs[source_key]
        result[target_key] = raw if convert is None else convert(raw)

    if "style" in attrs:
        parsed = __get_style(attrs["style"])
        if parsed:
            result["style"] = parsed

    data = {
        name.removeprefix("data-"): attrs[name]
        for name in attrs.keys()
        if name.startswith("data-")
    }
    if data:
        result["data"] = data

    return result
|
|
|
|
|
|
|
|
|
|
|
|
def __get_style(inline_style_string):
    """Convert an inline CSS declaration string into a style dictionary.

    The string is parsed with ``parseStyle`` and a fixed set of properties
    is copied into the result when their parsed value is not the empty
    string. ``textDecoration`` is stored under the key "textDecorationLine";
    the other copied properties keep their name.

    The four margin properties are handled separately: a margin is kept only
    when its value contains a number directly followed by "em", and it is
    stored as a float holding that number (the unit is dropped). Margins in
    any other form are omitted.

    Args:
        inline_style_string: contents of an element's ``style`` attribute.

    Returns:
        A dict of the recognised style properties; empty when none apply.
    """
    style = {}
    parsed_style = parseStyle(inline_style_string)

    # (property read from the parsed declaration, key written to the result)
    copied_properties = (
        ("fontStyle", "fontStyle"),
        ("fontWeight", "fontWeight"),
        ("fontSize", "fontSize"),
        ("textDecoration", "textDecorationLine"),
        ("verticalAlign", "verticalAlign"),
        ("textAlign", "textAlign"),
        ("listStyleType", "listStyleType"),
    )
    for prop, key in copied_properties:
        value = getattr(parsed_style, prop)
        if value != "":
            style[key] = value

    # The optional leading sign is part of the captured number: without it a
    # negative margin such as "-0.5em" would be stored as +0.5.
    em_length = re.compile(r"([-+]?(?:\d+(?:\.\d*)?|\.\d+))em")
    for key in ("marginTop", "marginRight", "marginBottom", "marginLeft"):
        match = em_length.search(getattr(parsed_style, key))
        if match:
            style[key] = float(match.group(1))

    return style
|