jitenbot/yomichan.py

128 lines
3.8 KiB
Python
Raw Normal View History

import json
import os
import shutil
import uuid
import re
from pathlib import Path
from css_parser import parseStyle
def create_zip(terms, index, tags=[]):
build_directory = str(uuid.uuid4())
os.mkdir(build_directory)
terms_per_file = 1000
max_i = int(len(terms) / terms_per_file) + 1
for i in range(max_i):
term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
with open(term_file, "w", encoding='utf8') as f:
start = terms_per_file * i
end = terms_per_file * (i + 1)
json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
index_file = os.path.join(build_directory, "index.json")
with open(index_file, 'w', encoding='utf8') as f:
json.dump(index, f, indent=4, ensure_ascii=False)
if len(tags) > 0:
tag_file = os.path.join(build_directory, "tag_bank_1.json")
with open(tag_file, 'w', encoding='utf8') as f:
json.dump(tags, f, indent=4, ensure_ascii=False)
zip_filename = index["title"]
zip_file = f"{zip_filename}.zip"
shutil.make_archive(zip_filename, "zip", build_directory)
out_dir = "output"
out_file = os.path.join(out_dir, zip_file)
if not Path(out_dir).is_dir():
os.mkdir(out_dir)
elif Path(out_file).is_file():
os.remove(out_file)
shutil.move(zip_file, out_dir)
shutil.rmtree(build_directory)
def soup_to_gloss(soup):
__sanitize_soup(soup)
structured_content = __get_markup_structure(soup)
return {
"type": "structured-content",
"content": structured_content
}
def __sanitize_soup(soup):
patterns = [
r"^(.+)[ぁ-ヿ]+$",
r"^(.+)[ぁ-ヿ]+[ぁ-ヿ][ぁ-ヿ]+$"
]
for a in soup.find_all("a"):
for pattern in patterns:
m = re.search(pattern, a.text)
if m:
a['href'] = f"?query={m.group(1)}&wildcards=off"
break
for p in soup.find_all("p"):
p.name = "span"
for th in soup.find_all("th"):
th['style'] = "vertical-align: middle; text-align: center;"
def __get_markup_structure(soup):
node = {}
content = []
for child in soup.children:
if child.name is None:
text = child.text.strip()
if text != "":
content.append(text)
else:
content.append(__get_markup_structure(child))
node["tag"] = soup.name
attributes = __get_attributes(soup.attrs)
for key, val in attributes.items():
node[key] = val
if len(content) == 0:
pass
elif len(content) == 1:
node["content"] = content[0]
else:
node["content"] = content
return node
def __get_attributes(attrs):
attributes = {}
if "href" in attrs:
attributes["href"] = attrs["href"]
if "rowspan" in attrs:
attributes["rowSpan"] = int(attrs["rowspan"])
if "colspan" in attrs:
attributes["colSpan"] = int(attrs["colspan"])
if "style" in attrs:
attributes["style"] = __get_style(attrs["style"])
return attributes
def __get_style(inline_style_string):
style = {}
parsedStyle = parseStyle(inline_style_string)
if parsedStyle.fontStyle != "":
style["fontStyle"] = parsedStyle.fontStyle
if parsedStyle.fontWeight != "":
style["fontWeight"] = parsedStyle.fontWeight
if parsedStyle.fontSize != "":
style["fontSize"] = parsedStyle.fontSize
if parsedStyle.textDecoration != "":
style["textDecorationLine"] = parsedStyle.textDecoration
if parsedStyle.verticalAlign != "":
style["verticalAlign"] = parsedStyle.verticalAlign
if parsedStyle.textAlign != "":
style["textAlign"] = parsedStyle.textAlign
if parsedStyle.listStyleType != "":
style["listStyleType"] = parsedStyle.listStyleType
return style