import re from datetime import datetime, date from bs4 import BeautifulSoup import yomichan as Yomichan import util as Util class JitenonYoji: columns = { "四字熟語": ["yojijukugo", ""], "読み方": ["yomikata", ""], "意味": ["imi", ""], "出典": ["shutten", ""], "漢検級": ["kankenkyuu", ""], "場面用途": ["bamenyouto", ""], "異形": ["ikei", []], "類義語": ["ruigigo", []], } def __init__(self, sequence): self.sequence = sequence self.yomichan_glossary = [""] self.modified_date = date(1970, 1, 1) self.attribution = "" for column in self.columns.values(): setattr(self, column[0], column[1]) def add_document(self, html): yoji_soup = BeautifulSoup(html, features="html5lib") self.__set_modified_date(html) self.attribution = yoji_soup.find(class_="copyright").text table = yoji_soup.find(class_="kanjirighttb") rows = table.find("tbody").find_all("tr") colname = "" for row in rows: colname = row.th.text if row.th is not None else colname colval = row.td.decode_contents() self.__set_column(colname, colval) self.yomichan_glossary = [Yomichan.soup_to_gloss(table)] def yomichan_terms(self): terms = [] for idx, headword in enumerate(self.__headwords()): (yoji, reading) = headword definition_tags = None inflection_rules = "" score = -idx glossary = self.yomichan_glossary sequence = self.sequence term_tags = "" term = [ yoji, reading, definition_tags, inflection_rules, score, glossary, sequence, term_tags ] terms.append(term) return terms def __set_modified_date(self, html): m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html) if not m: return date = datetime.strptime(m.group(1), '%Y-%m-%d').date() self.modified_date = date def __set_column(self, colname, colval): attr_name = self.columns[colname][0] attr_value = getattr(self, attr_name) colval = colval.replace("\n", "").replace(",", "、").strip() if isinstance(attr_value, str): setattr(self, attr_name, colval) elif isinstance(attr_value, list): if len(attr_value) == 0: setattr(self, attr_name, [colval]) else: attr_value.append(colval) setattr(self, attr_name, attr_value) def __headwords(self): words = [] for yomikata in self.__yomikatas(): headword = [self.yojijukugo, yomikata] if headword in words: words.remove(headword) words.append(headword) for headword in self.__ikei_headwords(): if headword in words: words.remove(headword) words.append(headword) return words def __yomikatas(self): m = re.search(r"^[ぁ-ヿ]+$", self.yomikata) if m: return [self.yomikata] m = re.search(r"^([ぁ-ヿ]+)
", self.yomikata) if m: return [m.group(1)] m = re.search(r"^[ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+$", self.yomikata) if m: return Util.expand_shouryaku(self.yomikata) m = re.search(r"^([ぁ-ヿ]+)(([ぁ-ヿ/\s]+))$", self.yomikata) if m: yomikatas = [m.group(1)] alts = m.group(2).split("/") for alt in alts: yomikatas.append(alt.strip()) return yomikatas raise Exception(f"Invalid 読み方 format: {self.yomikata}\n{self}") def __ikei_headwords(self): ikei_headwords = [] for val in self.ikei: m = re.search(r"^([^(]+)(([ぁ-ヿ]+))$", val) if m: headword = [m.group(1), m.group(2)] ikei_headwords.append(headword) else: raise Exception(f"Invalid 異形 format: {val}\n{self}") return ikei_headwords def __str__(self): colvals = [str(self.sequence)] for attr in self.columns.values(): attr_val = getattr(self, attr[0]) if isinstance(attr_val, str): colvals.append(attr_val) elif isinstance(attr_val, list): colvals.append(";".join(attr_val)) return ",".join(colvals)