Organize yomichan term creation logic into separate classes

This commit is contained in:
stephenmk 2023-04-22 20:26:54 -05:00
parent 7d7e32ba45
commit 13f07c9000
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
10 changed files with 232 additions and 144 deletions

View file

@ -3,10 +3,10 @@ from bs4 import BeautifulSoup
import bot.scraper as Scraper
from bot.entries.jitenon_kotowaza import JitenonKotowazaEntry
from bot.entries.jitenon import JitenonKotowazaEntry
from bot.yomichan.export import JitenonKotowazaExporter
from bot.entries.jitenon_yoji import JitenonYojiEntry
from bot.entries.jitenon import JitenonYojiEntry
from bot.yomichan.export import JitenonYojiExporter
@ -15,14 +15,14 @@ class Crawler():
self._crawl_map = {}
self.__entries = []
def make_entries(self):
def read_entries(self):
entries_len = len(self._crawl_map)
items = self._crawl_map.items()
for idx, (entry_id, entry_path) in enumerate(items):
update = f"Reading entry {idx+1}/{entries_len}"
print(update, end='\r', flush=True)
entry = self._entry_class(entry_id)
entry.add_document(entry_path)
entry.set_markup(entry_path)
self.__entries.append(entry)
print()

View file

@ -2,35 +2,52 @@ import re
from datetime import datetime, date
from bs4 import BeautifulSoup
import bot.yomichan.html_gloss as YomichanGloss
import bot.util as Util
class JitenonEntry:
def __init__(self, sequence):
self.sequence = sequence
self.yomichan_glossary = [""]
def __init__(self, entry_id):
self.entry_id = entry_id
self.markup = ""
self.modified_date = date(1970, 1, 1)
self.attribution = ""
for column in self.columns.values():
for column in self.COLUMNS.values():
setattr(self, column[0], column[1])
self._headwords = None
def add_document(self, path):
def set_markup(self, path):
with open(path, "r") as f:
html = f.read()
yoji_soup = BeautifulSoup(html, features="html5lib")
soup = BeautifulSoup(html, features="html5lib")
self.__set_modified_date(html)
self.attribution = yoji_soup.find(class_="copyright").text
table = yoji_soup.find(class_="kanjirighttb")
self.attribution = soup.find(class_="copyright").text
table = soup.find(class_="kanjirighttb")
rows = table.find("tbody").find_all("tr")
colname = ""
for row in rows:
colname = row.th.text if row.th is not None else colname
colval = self.__clean(row.td.text)
colval = self.__clean_text(row.td.text)
self.__set_column(colname, colval)
self.__prepare_yomichan_soup(table)
gloss = YomichanGloss.make_gloss(table)
self.yomichan_glossary = [gloss]
self.markup = table.decode()
def get_headwords(self):
if self._headwords is not None:
return self._headwords
self._set_headwords()
return self._headwords
def _set_headwords(self):
headwords = {}
for yomikata in self.__yomikatas():
headwords[yomikata] = [self.expression]
ikei_headwords = self.__ikei_headwords()
for reading, expressions in ikei_headwords.items():
if reading not in headwords:
headwords[reading] = []
for expression in expressions:
if expression not in headwords[reading]:
headwords[reading].append(expression)
self._headwords = headwords
def __set_modified_date(self, html):
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
@ -39,15 +56,8 @@ class JitenonEntry:
date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
self.modified_date = date
def __clean(self, text):
text = text.replace("\n", "")
text = text.replace(",", "")
text = text.replace(" ", "")
text = text.strip()
return text
def __set_column(self, colname, colval):
attr_name = self.columns[colname][0]
attr_name = self.COLUMNS[colname][0]
attr_value = getattr(self, attr_name)
if isinstance(attr_value, str):
setattr(self, attr_name, colval)
@ -57,35 +67,6 @@ class JitenonEntry:
else:
attr_value.append(colval)
def __prepare_yomichan_soup(self, soup):
patterns = [
r"^(.+)[ぁ-ヿ、\s]+$",
r"^(.+)[ぁ-ヿ、\s]+[ぁ-ヿ、\s][ぁ-ヿ、\s]+$"
]
for a in soup.find_all("a"):
for pattern in patterns:
m = re.search(pattern, a.text)
if m:
a['href'] = f"?query={m.group(1)}&wildcards=off"
break
for p in soup.find_all("p"):
p.name = "span"
for th in soup.find_all("th"):
th['style'] = "vertical-align: middle; text-align: center;"
def _headwords(self):
words = []
for yomikata in self.__yomikatas():
headword = [self.expression, yomikata]
if headword in words:
words.remove(headword)
words.append(headword)
for headword in self.__ikei_headwords():
if headword in words:
words.remove(headword)
words.append(headword)
return words
def __yomikatas(self):
yomikata = self.yomikata
m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
@ -108,22 +89,73 @@ class JitenonEntry:
return [""]
def __ikei_headwords(self):
ikei_headwords = []
ikei_headwords = {}
for val in self.ikei:
m = re.search(r"^([^]+)([ぁ-ヿ、]+)$", val)
if m:
headword = [m.group(1), m.group(2)]
ikei_headwords.append(headword)
else:
if not m:
print(f"Invalid 異形 format: {val}\n{self}\n")
continue
expression = m.group(1)
reading = m.group(2)
if reading not in ikei_headwords:
ikei_headwords[reading] = []
if expression not in ikei_headwords[reading]:
ikei_headwords[reading].append(expression)
return ikei_headwords
@staticmethod
def __clean_text(text):
text = text.replace("\n", "")
text = text.replace(",", "")
text = text.replace(" ", "")
text = text.strip()
return text
def __str__(self):
colvals = [str(self.sequence)]
for attr in self.columns.values():
colvals = [str(self.entry_id)]
for attr in self.COLUMNS.values():
attr_val = getattr(self, attr[0])
if isinstance(attr_val, str):
colvals.append(attr_val)
elif isinstance(attr_val, list):
colvals.append("".join(attr_val))
return ",".join(colvals)
class JitenonYojiEntry(JitenonEntry):
    """Jitenon entry whose table uses the yoji (四字熟語) column set."""

    # Table heading -> [attribute name, default value]. The base class
    # creates one attribute per row and fills it from the matching heading.
    # A string default marks a single-valued column, a list default a
    # multi-valued one.
    COLUMNS = {
        "四字熟語": ["expression", ""],
        "読み方": ["yomikata", ""],
        "意味": ["imi", ""],
        "出典": ["shutten", ""],
        "漢検級": ["kankenkyuu", ""],
        "場面用途": ["bamenyouto", ""],
        "異形": ["ikei", []],
        "類義語": ["ruigigo", []],
    }

    def __init__(self, sequence):
        """Create an empty entry; ``sequence`` is passed on as the entry id."""
        super().__init__(sequence)
class JitenonKotowazaEntry(JitenonEntry):
    """Jitenon entry whose table uses the kotowaza (言葉) column set."""

    # Table heading -> [attribute name, default value]. A string default
    # marks a single-valued column, a list default a multi-valued one.
    COLUMNS = {
        "言葉": ["expression", ""],
        "読み方": ["yomikata", ""],
        "意味": ["imi", ""],
        "出典": ["shutten", ""],
        "例文": ["reibun", ""],
        "異形": ["ikei", []],
        "類句": ["ruiku", []],
    }

    def __init__(self, sequence):
        """Create an empty entry; ``sequence`` is passed on as the entry id."""
        super().__init__(sequence)

    def _set_headwords(self):
        """Populate ``self._headwords`` (reading -> list of expressions).

        One entry lists two expressions joined by "・" in a single field;
        it is special-cased with a hand-written mapping. Every other entry
        uses the generic logic of the base class.
        """
        if self.expression != "金棒引き・鉄棒引き":
            super()._set_headwords()
            return
        self._headwords = {"かなぼうひき": ["金棒引き", "鉄棒引き"]}

View file

@ -1,41 +0,0 @@
from bot.entries.jitenon import JitenonEntry
import bot.yomichan.grammar as Grammar
class JitenonKotowazaEntry(JitenonEntry):
columns = {
"言葉": ["expression", ""],
"読み方": ["yomikata", ""],
"意味": ["imi", ""],
"出典": ["shutten", ""],
"例文": ["reibun", ""],
"異形": ["ikei", []],
"類句": ["ruiku", []],
}
def __init__(self, sequence):
super().__init__(sequence)
def yomichan_terms(self):
terms = []
for idx, headword in enumerate(self._headwords()):
(expression, reading) = headword
definition_tags = None
inflection_rules = Grammar.sudachi_rules(expression)
score = -idx
glossary = self.yomichan_glossary
sequence = self.sequence
term_tags = ""
term = [
expression, reading, definition_tags, inflection_rules,
score, glossary, sequence, term_tags
]
terms.append(term)
return terms
def _headwords(self):
if self.expression == "金棒引き・鉄棒引き":
return [["金棒引き", "かなぼうひき"],
["鉄棒引き", "かなぼうひき"]]
else:
return super()._headwords()

View file

@ -1,38 +0,0 @@
from bot.entries.jitenon import JitenonEntry
class JitenonYojiEntry(JitenonEntry):
columns = {
"四字熟語": ["expression", ""],
"読み方": ["yomikata", ""],
"意味": ["imi", ""],
"出典": ["shutten", ""],
"漢検級": ["kankenkyuu", ""],
"場面用途": ["bamenyouto", ""],
"異形": ["ikei", []],
"類義語": ["ruigigo", []],
}
def __init__(self, sequence):
super().__init__(sequence)
def yomichan_terms(self):
terms = []
for idx, headword in enumerate(self._headwords()):
(expression, reading) = headword
definition_tags = None
inflection_rules = ""
score = -idx
glossary = self.yomichan_glossary
sequence = self.sequence
term_tags = self.__term_tags()
term = [
expression, reading, definition_tags, inflection_rules,
score, glossary, sequence, term_tags
]
terms.append(term)
return terms
def __term_tags(self):
tags = self.kankenkyuu.replace(" ", "").split("/")
return " ".join(tags)

View file

@ -5,7 +5,10 @@ from pathlib import Path
from datetime import datetime
from platformdirs import user_documents_dir, user_cache_dir
import bot.data as Data
from bot.data import yomichan_metadata
from bot.yomichan.terms.jitenon import JitenonYojiTerminator
from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
class Exporter:
@ -14,7 +17,7 @@ class Exporter:
self._terms_per_file = 2000
def export(self, entries):
meta = Data.yomichan_metadata()
meta = yomichan_metadata()
index = meta[self._name]["index"]
index["revision"] = self._get_revision(entries)
index["attribution"] = self._get_attribution(entries)
@ -40,7 +43,8 @@ class Exporter:
for idx, entry in enumerate(entries):
update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}"
print(update, end='\r', flush=True)
for term in entry.yomichan_terms():
new_terms = self._terminator.make_terms(entry)
for term in new_terms:
terms.append(term)
print()
return terms
@ -120,9 +124,11 @@ class JitenonYojiExporter(JitenonExporter):
def __init__(self):
super().__init__()
self._name = "jitenon-yoji"
self._terminator = JitenonYojiTerminator()
class JitenonKotowazaExporter(JitenonExporter):
def __init__(self):
super().__init__()
self._name = "jitenon-kotowaza"
self._terminator = JitenonKotowazaTerminator()

View file

@ -0,0 +1,25 @@
import re
from bs4 import BeautifulSoup
from bot.yomichan.glossary.gloss import make_gloss
def make_glossary(entry):
    """Build the Yomichan glossary for a Jitenon entry.

    Parses ``entry.markup``, rewrites the parsed tree (link targets, ``<p>``
    tags, ``<th>`` styling) and hands the first table to ``make_gloss``.

    Returns:
        A one-element list containing the gloss produced by ``make_gloss``.
    """
    document = BeautifulSoup(entry.markup, "html5lib")

    # Tried in order; the first pattern that matches a link's text wins.
    # NOTE(review): as written, any text matching the second pattern also
    # matches the first, so the second can never be selected. The patterns
    # may have lost characters (e.g. full-width parentheses) somewhere
    # upstream -- confirm against the repository.
    link_patterns = (
        r"^(.+)[ぁ-ヿ、\s]+$",
        r"^(.+)[ぁ-ヿ、\s]+[ぁ-ヿ、\s][ぁ-ヿ、\s]+$",
    )

    # Point each matching link at a dictionary query for its captured prefix.
    for anchor in document.find_all("a"):
        attempts = (re.search(regex, anchor.text) for regex in link_patterns)
        hit = next((m for m in attempts if m), None)
        if hit is not None:
            anchor['href'] = f"?query={hit.group(1)}&wildcards=off"

    # Paragraphs become inline spans.
    for paragraph in document.find_all("p"):
        paragraph.name = "span"

    # Header cells are centered both ways.
    for header_cell in document.find_all("th"):
        header_cell['style'] = "vertical-align: middle; text-align: center;"

    return [make_gloss(document.table)]

View file

@ -0,0 +1,50 @@
import bot.yomichan.grammar as Grammar
from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.jitenon import make_glossary
class JitenonTerminator(Terminator):
    """Term-building hooks shared by the Jitenon dictionaries.

    Construction is inherited unchanged from ``Terminator``.
    """

    def _definition_tags(self, entry):
        """Jitenon terms carry no definition tags."""
        return None

    def _glossary(self, entry):
        """Return the glossary for *entry*, building it at most once per id.

        Glossaries are stored in ``self.glossary_cache`` under
        ``entry.entry_id``, so every term made from the same entry shares
        one glossary object.
        """
        cache = self.glossary_cache
        key = entry.entry_id
        if key not in cache:
            cache[key] = make_glossary(entry)
        return cache[key]

    def _sequence(self, entry):
        """Use the entry id as the term's sequence number."""
        return entry.entry_id

    def _link_glossary_parameters(self, entry):
        """Jitenon entries produce no link-only terms."""
        return []

    def _subentry_lists(self, entry):
        """Jitenon entries have no subentries."""
        return []
class JitenonYojiTerminator(JitenonTerminator):
    """Term hooks for Jitenon yoji entries.

    Construction is inherited unchanged from the parent class.
    """

    def _inflection_rules(self, entry, expression):
        """Yoji terms get no inflection rules."""
        return ""

    def _term_tags(self, entry):
        """Turn ``entry.kankenkyuu`` into a space-separated tag string.

        Spaces are stripped from the field, then it is split on "/" and the
        pieces are re-joined with single spaces.
        """
        levels = entry.kankenkyuu.replace(" ", "").split("/")
        return " ".join(levels)
class JitenonKotowazaTerminator(JitenonTerminator):
    """Term hooks for Jitenon kotowaza entries.

    Construction is inherited unchanged from the parent class.
    """

    def _inflection_rules(self, entry, expression):
        """Delegate to ``Grammar.sudachi_rules`` for the given expression."""
        rules = Grammar.sudachi_rules(expression)
        return rules

    def _term_tags(self, entry):
        """Kotowaza terms carry no term tags."""
        return ""

View file

@ -0,0 +1,54 @@
class Terminator:
    """Base class that turns dictionary entries into Yomichan term rows.

    Subclasses provide the per-dictionary hooks that ``make_terms`` calls:
    ``_definition_tags(entry)``, ``_inflection_rules(entry, expression)``,
    ``_glossary(entry)``, ``_sequence(entry)``,
    ``_link_glossary_parameters(entry)`` and ``_subentry_lists(entry)``.
    ``_term_tags(entry)`` is optional; the default yields an empty string.
    """

    def __init__(self):
        # Storage for subclasses that memoize glossaries; this class itself
        # never reads it.
        self.glossary_cache = {}

    def make_terms(self, entry):
        """Return the list of term rows for *entry* and its subentries.

        Each row is the 8-element list
        ``[expression, reading, definition_tags, inflection_rules, score,
        glossary, sequence, term_tags]``.

        For every (reading, expression) pair from ``entry.get_headwords()``
        one main row is produced, followed by one extra row per non-empty
        subentry group from ``_link_glossary_parameters``; the extra rows
        use a glossary made only of links to those subentries. Rows for the
        subentries themselves (built recursively) are appended last.

        ``score`` is the negated count of rows already collected in this
        call, so earlier rows rank higher.
        """
        terms = []
        for reading, expressions in entry.get_headwords().items():
            for expression in expressions:
                definition_tags = self._definition_tags(entry)
                inflection_rules = self._inflection_rules(entry, expression)
                sequence = self._sequence(entry)
                # Ask the subclass hook rather than hard-coding "": the
                # hook was previously defined by subclasses but never
                # called, so their tags were silently dropped.
                term_tags = self._term_tags(entry)
                terms.append([
                    expression, reading, definition_tags, inflection_rules,
                    -len(terms), self._glossary(entry), sequence, term_tags
                ])
                for params in self._link_glossary_parameters(entry):
                    # Link rows carry their own definition tags.
                    subentries, link_definition_tags = params
                    if not subentries:
                        continue
                    terms.append([
                        expression, reading, link_definition_tags,
                        inflection_rules, -len(terms),
                        self.__links_glossary(subentries), sequence,
                        term_tags
                    ])
        for subentries in self._subentry_lists(entry):
            for subentry in subentries:
                terms.extend(self.make_terms(subentry))
        return terms

    def _term_tags(self, entry):
        """Return the term-tag string for *entry*.

        Default implementation: no tags. This keeps subclasses that do not
        override the hook producing the empty string they produced before.
        """
        return ""

    @staticmethod
    def __links_glossary(subentries):
        """Build a glossary made of one structured-content link per subentry.

        Each link shows the subentry's first expression and targets a
        dictionary query for it.
        """
        first_expressions = (s.get_first_expression() for s in subentries)
        return [
            {
                "type": "structured-content",
                "content": {
                    "tag": "a",
                    "href": f"?query={exp}&wildcards=off",
                    "content": exp,
                }
            }
            for exp in first_expressions
        ]

View file

@ -44,7 +44,7 @@ def main():
crawler_class = crawlers[args.target]
crawler = crawler_class()
crawler.crawl()
crawler.make_entries()
crawler.read_entries()
crawler.make_yomichan_dictionary()