sankoku8 bugfix: account for images in headwords
Affects ビャンビャンめん, さんだつ (簒奪), and some words with hyphens such as J-POP and CD-ROM.
This commit is contained in:
parent
4eb7e12f37
commit
a5bb8d6f40
|
@ -67,6 +67,7 @@ class BaseEntry(SanseidoEntry):
|
||||||
def _find_expressions(self, soup):
    """Collect the expressions found in every headword node of *soup*.

    Each node named ``self._hyouki_name`` is first passed through
    ``self._fill_alts`` and then handed to ``parse_hyouki_soup``; all
    expressions produced are returned in document order.
    """
    found = []
    for hyouki_node in soup.find_all(self._hyouki_name):
        # Must run before parsing so the parser sees the filled-in nodes.
        self._fill_alts(hyouki_node)
        found.extend(parse_hyouki_soup(hyouki_node, [""]))
    return found
|
||||||
|
@ -95,3 +96,9 @@ class BaseEntry(SanseidoEntry):
|
||||||
]
|
]
|
||||||
for name in unused_nodes:
|
for name in unused_nodes:
|
||||||
Soup.delete_soup_nodes(soup, name)
|
Soup.delete_soup_nodes(soup, name)
|
||||||
|
|
||||||
|
@staticmethod
def _fill_alts(soup):
    """Set the string content of each ``img`` in *soup* to its ``alt`` text.

    Images that have no ``alt`` attribute are left untouched.
    """
    for image in soup.find_all("img"):
        if not image.has_attr("alt"):
            continue
        image.string = image.attrs["alt"]
|
||||||
|
|
|
@ -4,9 +4,17 @@ from bs4 import BeautifulSoup
|
||||||
from bot.data import get_adobe_glyph
|
from bot.data import get_adobe_glyph
|
||||||
|
|
||||||
|
|
||||||
|
# Image "src" paths of gaiji (characters shipped as SVG images) mapped to
# the character that is written into the image's "alt" attribute.
__GAIJI = {
|
||||||
|
"svg-gaiji/byan.svg": "𰻞",
|
||||||
|
"svg-gaiji/G16EF.svg": "簒",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def preprocess_page(page):
    """Parse *page* as XML, normalise its glyphs and images, and strip it.

    The clean-up passes run in a fixed order: glyph codes are replaced,
    known gaiji images receive alt text, tatehyphen images are replaced,
    and finally the result of ``__strip_page`` is returned.
    """
    document = BeautifulSoup(page, features="xml")
    # Each pass mutates the parsed tree in place.
    __replace_glyph_codes(document)
    __add_image_alt_text(document)
    __replace_tatehyphen(document)
    return __strip_page(document)
|
||||||
|
|
||||||
|
@ -20,6 +28,21 @@ def __replace_glyph_codes(soup):
|
||||||
geta.replace_with(glyph)
|
geta.replace_with(glyph)
|
||||||
|
|
||||||
|
|
||||||
|
def __add_image_alt_text(soup):
    """Give known gaiji images an ``alt`` attribute.

    Every ``img`` whose ``src`` is a key of ``__GAIJI`` gets the mapped
    value as its ``alt``; images without a ``src``, or with a ``src`` not
    in the table, are left unchanged.
    """
    for image in soup.find_all("img"):
        if image.has_attr("src"):
            source = image.attrs["src"]
            if source in __GAIJI:
                image.attrs["alt"] = __GAIJI[source]
|
||||||
|
|
||||||
|
|
||||||
|
def __replace_tatehyphen(soup):
    """Replace each tatehyphen image in *soup* with the text "−".

    Matches ``img`` elements whose ``src`` is ``svg-gaiji/tatehyphen.svg``.
    """
    tatehyphen_attrs = {"src": "svg-gaiji/tatehyphen.svg"}
    for image in soup.find_all("img", tatehyphen_attrs):
        # Put the character inside the tag, then drop the tag itself so
        # only the text remains in the tree.
        image.string = "−"
        image.unwrap()
|
||||||
|
|
||||||
|
|
||||||
def __strip_page(soup):
|
def __strip_page(soup):
|
||||||
koumoku = soup.find(["項目"])
|
koumoku = soup.find(["項目"])
|
||||||
if koumoku is not None:
|
if koumoku is not None:
|
||||||
|
|
Loading…
Reference in a new issue