61 lines
2.2 KiB
Python
61 lines
2.2 KiB
Python
from abc import abstractmethod
|
|
from bs4 import BeautifulSoup
|
|
|
|
from bot.entries.base.entry import Entry
|
|
import bot.entries.base.expressions as Expressions
|
|
|
|
|
|
class SanseidoEntry(Entry):
    """Entry whose page markup may contain embedded subentries.

    When a page is assigned through ``set_page``, every element whose tag is
    reported by ``_get_subentry_parameters`` is cut out of the markup and
    wrapped in its own entry object; only the remaining markup is stored on
    this entry.

    Subclasses must implement ``_get_subentry_parameters``.
    """

    def set_page(self, page):
        """Strip embedded subentries from *page* and store what is left.

        Args:
            page: Markup for this entry, in a form accepted by
                ``BeautifulSoup(page, features="xml")``.
        """
        self._page = self._decompose_subentries(page)

    def get_page_soup(self):
        """Return the stored page parsed by BeautifulSoup with "xml" features.

        A new soup object is built on every call.
        """
        return BeautifulSoup(self._page, "xml")

    def get_global_identifier(self):
        """Return an identifier of the form ``@<target>-<parent>-<child>``.

        ``<parent>`` is ``entry_id[0]`` zero-padded to six digits and
        ``<child>`` is ``entry_id[1]`` as four uppercase hexadecimal digits.
        ``<target>`` is ``self.target.value`` (presumably an enum value --
        confirm against the ``Entry`` base class).
        """
        parent_part = format(self.entry_id[0], '06')
        # A format spec is used instead of ``hex(...).lstrip('0x')``:
        # ``lstrip`` removes a *set of characters*, not a prefix, so that
        # spelling only worked thanks to the later zero-padding.
        child_part = format(self.entry_id[1], '04X')
        return f"@{self.target.value}-{parent_part}-{child_part}"

    def _decompose_subentries(self, page):
        """Extract subentry elements from *page* and return the remainder.

        For each ``(subentry_class, tags, subentry_list)`` triple yielded by
        ``_get_subentry_parameters``, every element named by one of ``tags``
        is renamed to ``項目``, turned into a ``subentry_class`` instance
        (which receives the element's markup via ``set_page``), appended to
        ``subentry_list`` and removed from the document. The mapping
        ``SUBENTRY_ID_TO_ENTRY_ID`` is updated so the subentry ID points back
        at this entry's ID.

        Args:
            page: Markup to parse with BeautifulSoup's "xml" features.

        Returns:
            The serialized document after all subentry elements were removed.

        Raises:
            KeyError: If a matched element has no ``id`` attribute.
        """
        soup = BeautifulSoup(page, features="xml")
        for subentry_class, tags, subentry_list in self._get_subentry_parameters():
            for tag in tags:
                # Re-query after every removal: decomposing an element also
                # drops any same-named elements nested inside it, so a list
                # collected up front could contain already-destroyed nodes.
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(self.target, subentry_id)
                    subentry_page = tag_soup.decode()
                    subentry.set_page(subentry_page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @abstractmethod
    def _get_subentry_parameters(self):
        """Return an iterable of ``(subentry_class, tags, subentry_list)``.

        ``subentry_class`` is called as ``subentry_class(target, entry_id)``,
        ``tags`` is an iterable of element names to extract, and
        ``subentry_list`` is the list that receives the created subentries.
        """
        raise NotImplementedError

    def _add_variant_expressions(self, headwords):
        """Apply the variant-expression helpers to every value in *headwords*.

        Each value of the ``headwords`` mapping is passed, in order, to
        ``add_variant_kanji``, ``add_fullwidth``, ``remove_iteration_mark``
        and ``add_iteration_mark`` from the ``Expressions`` module. Their
        return values are ignored, so they presumably modify the expression
        collections in place -- confirm against that module.
        """
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    @staticmethod
    def id_string_to_entry_id(id_string):
        """Convert an ID string into a ``(parent, child)`` tuple of ints.

        ``"123"`` becomes ``(123, 0)``; ``"123-1A"`` becomes ``(123, 26)``
        because the part after the hyphen is read as hexadecimal.

        Args:
            id_string: Either ``"<decimal>"`` or ``"<decimal>-<hex>"``.

        Returns:
            A two-element tuple ``(parent_id, child_id)``.

        Raises:
            ValueError: If the string has more than one hyphen or a part is
                not a valid number in its base.
        """
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        if len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        raise ValueError(f"Invalid entry ID: {id_string}")