Organize yomichan term creation logic into separate classes
This commit is contained in:
parent
7d7e32ba45
commit
13f07c9000
|
@ -3,10 +3,10 @@ from bs4 import BeautifulSoup
|
|||
|
||||
import bot.scraper as Scraper
|
||||
|
||||
from bot.entries.jitenon_kotowaza import JitenonKotowazaEntry
|
||||
from bot.entries.jitenon import JitenonKotowazaEntry
|
||||
from bot.yomichan.export import JitenonKotowazaExporter
|
||||
|
||||
from bot.entries.jitenon_yoji import JitenonYojiEntry
|
||||
from bot.entries.jitenon import JitenonYojiEntry
|
||||
from bot.yomichan.export import JitenonYojiExporter
|
||||
|
||||
|
||||
|
@ -15,14 +15,14 @@ class Crawler():
|
|||
self._crawl_map = {}
|
||||
self.__entries = []
|
||||
|
||||
def make_entries(self):
|
||||
def read_entries(self):
|
||||
entries_len = len(self._crawl_map)
|
||||
items = self._crawl_map.items()
|
||||
for idx, (entry_id, entry_path) in enumerate(items):
|
||||
update = f"Reading entry {idx+1}/{entries_len}"
|
||||
print(update, end='\r', flush=True)
|
||||
entry = self._entry_class(entry_id)
|
||||
entry.add_document(entry_path)
|
||||
entry.set_markup(entry_path)
|
||||
self.__entries.append(entry)
|
||||
print()
|
||||
|
||||
|
|
|
@ -2,35 +2,52 @@ import re
|
|||
from datetime import datetime, date
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import bot.yomichan.html_gloss as YomichanGloss
|
||||
import bot.util as Util
|
||||
|
||||
|
||||
class JitenonEntry:
|
||||
def __init__(self, sequence):
|
||||
self.sequence = sequence
|
||||
self.yomichan_glossary = [""]
|
||||
def __init__(self, entry_id):
|
||||
self.entry_id = entry_id
|
||||
self.markup = ""
|
||||
self.modified_date = date(1970, 1, 1)
|
||||
self.attribution = ""
|
||||
for column in self.columns.values():
|
||||
for column in self.COLUMNS.values():
|
||||
setattr(self, column[0], column[1])
|
||||
self._headwords = None
|
||||
|
||||
def add_document(self, path):
|
||||
def set_markup(self, path):
|
||||
with open(path, "r") as f:
|
||||
html = f.read()
|
||||
yoji_soup = BeautifulSoup(html, features="html5lib")
|
||||
soup = BeautifulSoup(html, features="html5lib")
|
||||
self.__set_modified_date(html)
|
||||
self.attribution = yoji_soup.find(class_="copyright").text
|
||||
table = yoji_soup.find(class_="kanjirighttb")
|
||||
self.attribution = soup.find(class_="copyright").text
|
||||
table = soup.find(class_="kanjirighttb")
|
||||
rows = table.find("tbody").find_all("tr")
|
||||
colname = ""
|
||||
for row in rows:
|
||||
colname = row.th.text if row.th is not None else colname
|
||||
colval = self.__clean(row.td.text)
|
||||
colval = self.__clean_text(row.td.text)
|
||||
self.__set_column(colname, colval)
|
||||
self.__prepare_yomichan_soup(table)
|
||||
gloss = YomichanGloss.make_gloss(table)
|
||||
self.yomichan_glossary = [gloss]
|
||||
self.markup = table.decode()
|
||||
|
||||
def get_headwords(self):
|
||||
if self._headwords is not None:
|
||||
return self._headwords
|
||||
self._set_headwords()
|
||||
return self._headwords
|
||||
|
||||
def _set_headwords(self):
|
||||
headwords = {}
|
||||
for yomikata in self.__yomikatas():
|
||||
headwords[yomikata] = [self.expression]
|
||||
ikei_headwords = self.__ikei_headwords()
|
||||
for reading, expressions in ikei_headwords.items():
|
||||
if reading not in headwords:
|
||||
headwords[reading] = []
|
||||
for expression in expressions:
|
||||
if expression not in headwords[reading]:
|
||||
headwords[reading].append(expression)
|
||||
self._headwords = headwords
|
||||
|
||||
def __set_modified_date(self, html):
|
||||
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
|
||||
|
@ -39,15 +56,8 @@ class JitenonEntry:
|
|||
date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
|
||||
self.modified_date = date
|
||||
|
||||
def __clean(self, text):
|
||||
text = text.replace("\n", "")
|
||||
text = text.replace(",", "、")
|
||||
text = text.replace(" ", "")
|
||||
text = text.strip()
|
||||
return text
|
||||
|
||||
def __set_column(self, colname, colval):
|
||||
attr_name = self.columns[colname][0]
|
||||
attr_name = self.COLUMNS[colname][0]
|
||||
attr_value = getattr(self, attr_name)
|
||||
if isinstance(attr_value, str):
|
||||
setattr(self, attr_name, colval)
|
||||
|
@ -57,35 +67,6 @@ class JitenonEntry:
|
|||
else:
|
||||
attr_value.append(colval)
|
||||
|
||||
def __prepare_yomichan_soup(self, soup):
|
||||
patterns = [
|
||||
r"^(.+)([ぁ-ヿ、\s]+)$",
|
||||
r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$"
|
||||
]
|
||||
for a in soup.find_all("a"):
|
||||
for pattern in patterns:
|
||||
m = re.search(pattern, a.text)
|
||||
if m:
|
||||
a['href'] = f"?query={m.group(1)}&wildcards=off"
|
||||
break
|
||||
for p in soup.find_all("p"):
|
||||
p.name = "span"
|
||||
for th in soup.find_all("th"):
|
||||
th['style'] = "vertical-align: middle; text-align: center;"
|
||||
|
||||
def _headwords(self):
|
||||
words = []
|
||||
for yomikata in self.__yomikatas():
|
||||
headword = [self.expression, yomikata]
|
||||
if headword in words:
|
||||
words.remove(headword)
|
||||
words.append(headword)
|
||||
for headword in self.__ikei_headwords():
|
||||
if headword in words:
|
||||
words.remove(headword)
|
||||
words.append(headword)
|
||||
return words
|
||||
|
||||
def __yomikatas(self):
|
||||
yomikata = self.yomikata
|
||||
m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
|
||||
|
@ -108,22 +89,73 @@ class JitenonEntry:
|
|||
return [""]
|
||||
|
||||
def __ikei_headwords(self):
|
||||
ikei_headwords = []
|
||||
ikei_headwords = {}
|
||||
for val in self.ikei:
|
||||
m = re.search(r"^([^(]+)(([ぁ-ヿ、]+))$", val)
|
||||
if m:
|
||||
headword = [m.group(1), m.group(2)]
|
||||
ikei_headwords.append(headword)
|
||||
else:
|
||||
if not m:
|
||||
print(f"Invalid 異形 format: {val}\n{self}\n")
|
||||
continue
|
||||
expression = m.group(1)
|
||||
reading = m.group(2)
|
||||
if reading not in ikei_headwords:
|
||||
ikei_headwords[reading] = []
|
||||
if expression not in ikei_headwords[reading]:
|
||||
ikei_headwords[reading].append(expression)
|
||||
return ikei_headwords
|
||||
|
||||
@staticmethod
|
||||
def __clean_text(text):
|
||||
text = text.replace("\n", "")
|
||||
text = text.replace(",", "、")
|
||||
text = text.replace(" ", "")
|
||||
text = text.strip()
|
||||
return text
|
||||
|
||||
def __str__(self):
|
||||
colvals = [str(self.sequence)]
|
||||
for attr in self.columns.values():
|
||||
colvals = [str(self.entry_id)]
|
||||
for attr in self.COLUMNS.values():
|
||||
attr_val = getattr(self, attr[0])
|
||||
if isinstance(attr_val, str):
|
||||
colvals.append(attr_val)
|
||||
elif isinstance(attr_val, list):
|
||||
colvals.append(";".join(attr_val))
|
||||
return ",".join(colvals)
|
||||
|
||||
|
||||
class JitenonYojiEntry(JitenonEntry):
|
||||
COLUMNS = {
|
||||
"四字熟語": ["expression", ""],
|
||||
"読み方": ["yomikata", ""],
|
||||
"意味": ["imi", ""],
|
||||
"出典": ["shutten", ""],
|
||||
"漢検級": ["kankenkyuu", ""],
|
||||
"場面用途": ["bamenyouto", ""],
|
||||
"異形": ["ikei", []],
|
||||
"類義語": ["ruigigo", []],
|
||||
}
|
||||
|
||||
def __init__(self, sequence):
|
||||
super().__init__(sequence)
|
||||
|
||||
|
||||
class JitenonKotowazaEntry(JitenonEntry):
|
||||
COLUMNS = {
|
||||
"言葉": ["expression", ""],
|
||||
"読み方": ["yomikata", ""],
|
||||
"意味": ["imi", ""],
|
||||
"出典": ["shutten", ""],
|
||||
"例文": ["reibun", ""],
|
||||
"異形": ["ikei", []],
|
||||
"類句": ["ruiku", []],
|
||||
}
|
||||
|
||||
def __init__(self, sequence):
|
||||
super().__init__(sequence)
|
||||
|
||||
def _set_headwords(self):
|
||||
if self.expression == "金棒引き・鉄棒引き":
|
||||
self._headwords = {
|
||||
"かなぼうひき": ["金棒引き", "鉄棒引き"]
|
||||
}
|
||||
else:
|
||||
super()._set_headwords()
|
||||
|
|
|
@ -1,41 +0,0 @@
|
|||
from bot.entries.jitenon import JitenonEntry
|
||||
import bot.yomichan.grammar as Grammar
|
||||
|
||||
|
||||
class JitenonKotowazaEntry(JitenonEntry):
|
||||
columns = {
|
||||
"言葉": ["expression", ""],
|
||||
"読み方": ["yomikata", ""],
|
||||
"意味": ["imi", ""],
|
||||
"出典": ["shutten", ""],
|
||||
"例文": ["reibun", ""],
|
||||
"異形": ["ikei", []],
|
||||
"類句": ["ruiku", []],
|
||||
}
|
||||
|
||||
def __init__(self, sequence):
|
||||
super().__init__(sequence)
|
||||
|
||||
def yomichan_terms(self):
|
||||
terms = []
|
||||
for idx, headword in enumerate(self._headwords()):
|
||||
(expression, reading) = headword
|
||||
definition_tags = None
|
||||
inflection_rules = Grammar.sudachi_rules(expression)
|
||||
score = -idx
|
||||
glossary = self.yomichan_glossary
|
||||
sequence = self.sequence
|
||||
term_tags = ""
|
||||
term = [
|
||||
expression, reading, definition_tags, inflection_rules,
|
||||
score, glossary, sequence, term_tags
|
||||
]
|
||||
terms.append(term)
|
||||
return terms
|
||||
|
||||
def _headwords(self):
|
||||
if self.expression == "金棒引き・鉄棒引き":
|
||||
return [["金棒引き", "かなぼうひき"],
|
||||
["鉄棒引き", "かなぼうひき"]]
|
||||
else:
|
||||
return super()._headwords()
|
|
@ -1,38 +0,0 @@
|
|||
from bot.entries.jitenon import JitenonEntry
|
||||
|
||||
|
||||
class JitenonYojiEntry(JitenonEntry):
|
||||
columns = {
|
||||
"四字熟語": ["expression", ""],
|
||||
"読み方": ["yomikata", ""],
|
||||
"意味": ["imi", ""],
|
||||
"出典": ["shutten", ""],
|
||||
"漢検級": ["kankenkyuu", ""],
|
||||
"場面用途": ["bamenyouto", ""],
|
||||
"異形": ["ikei", []],
|
||||
"類義語": ["ruigigo", []],
|
||||
}
|
||||
|
||||
def __init__(self, sequence):
|
||||
super().__init__(sequence)
|
||||
|
||||
def yomichan_terms(self):
|
||||
terms = []
|
||||
for idx, headword in enumerate(self._headwords()):
|
||||
(expression, reading) = headword
|
||||
definition_tags = None
|
||||
inflection_rules = ""
|
||||
score = -idx
|
||||
glossary = self.yomichan_glossary
|
||||
sequence = self.sequence
|
||||
term_tags = self.__term_tags()
|
||||
term = [
|
||||
expression, reading, definition_tags, inflection_rules,
|
||||
score, glossary, sequence, term_tags
|
||||
]
|
||||
terms.append(term)
|
||||
return terms
|
||||
|
||||
def __term_tags(self):
|
||||
tags = self.kankenkyuu.replace(" ", "").split("/")
|
||||
return " ".join(tags)
|
|
@ -5,7 +5,10 @@ from pathlib import Path
|
|||
from datetime import datetime
|
||||
from platformdirs import user_documents_dir, user_cache_dir
|
||||
|
||||
import bot.data as Data
|
||||
from bot.data import yomichan_metadata
|
||||
|
||||
from bot.yomichan.terms.jitenon import JitenonYojiTerminator
|
||||
from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
|
||||
|
||||
|
||||
class Exporter:
|
||||
|
@ -14,7 +17,7 @@ class Exporter:
|
|||
self._terms_per_file = 2000
|
||||
|
||||
def export(self, entries):
|
||||
meta = Data.yomichan_metadata()
|
||||
meta = yomichan_metadata()
|
||||
index = meta[self._name]["index"]
|
||||
index["revision"] = self._get_revision(entries)
|
||||
index["attribution"] = self._get_attribution(entries)
|
||||
|
@ -40,7 +43,8 @@ class Exporter:
|
|||
for idx, entry in enumerate(entries):
|
||||
update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}"
|
||||
print(update, end='\r', flush=True)
|
||||
for term in entry.yomichan_terms():
|
||||
new_terms = self._terminator.make_terms(entry)
|
||||
for term in new_terms:
|
||||
terms.append(term)
|
||||
print()
|
||||
return terms
|
||||
|
@ -120,9 +124,11 @@ class JitenonYojiExporter(JitenonExporter):
|
|||
def __init__(self):
|
||||
super().__init__()
|
||||
self._name = "jitenon-yoji"
|
||||
self._terminator = JitenonYojiTerminator()
|
||||
|
||||
|
||||
class JitenonKotowazaExporter(JitenonExporter):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._name = "jitenon-kotowaza"
|
||||
self._terminator = JitenonKotowazaTerminator()
|
||||
|
|
25
bot/yomichan/glossary/jitenon.py
Normal file
25
bot/yomichan/glossary/jitenon.py
Normal file
|
@ -0,0 +1,25 @@
|
|||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from bot.yomichan.glossary.gloss import make_gloss
|
||||
|
||||
|
||||
def make_glossary(entry):
    """Build the Yomichan glossary list for a jitenon entry.

    Parses ``entry.markup`` (the HTML of the entry's table), rewrites it
    for display inside Yomichan and converts the table to a single
    structured-content gloss with ``make_gloss``.

    Args:
        entry: object exposing a ``markup`` string containing a ``<table>``.

    Returns:
        A one-element list holding the gloss produced by ``make_gloss``.
    """
    soup = BeautifulSoup(entry.markup, "html5lib")

    # Link texts look like "<expression>(<kana reading>)", optionally with
    # one nested parenthesised kana inside the reading.  The parentheses
    # must be matched literally so that group(1) is the bare expression;
    # with plain regex groups the second pattern was unreachable and the
    # first one chopped a kana off the query instead of the reading.
    # NOTE(review): both ASCII and full-width parentheses are accepted
    # because the exact form used by the scraped pages could not be
    # confirmed from this file — verify against real markup.
    patterns = (
        r"^(.+)[（(][ぁ-ヿ、\s]+[）)]$",
        r"^(.+)[（(][ぁ-ヿ、\s]+[（(][ぁ-ヿ、\s][）)][ぁ-ヿ、\s]+[）)]$",
    )
    for a in soup.find_all("a"):
        for pattern in patterns:
            m = re.search(pattern, a.text)
            if m:
                # Point the link at an in-app search for the expression.
                a['href'] = f"?query={m.group(1)}&wildcards=off"
                break

    # Paragraphs are turned into inline spans.
    for p in soup.find_all("p"):
        p.name = "span"

    # Centre the header cells of the table.
    for th in soup.find_all("th"):
        th['style'] = "vertical-align: middle; text-align: center;"

    gloss = make_gloss(soup.table)
    return [gloss]
|
50
bot/yomichan/terms/jitenon.py
Normal file
50
bot/yomichan/terms/jitenon.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
import bot.yomichan.grammar as Grammar
|
||||
from bot.yomichan.terms.terminator import Terminator
|
||||
from bot.yomichan.glossary.jitenon import make_glossary
|
||||
|
||||
|
||||
class JitenonTerminator(Terminator):
    """Term-building hooks shared by the jitenon dictionaries.

    The constructor is inherited unchanged from ``Terminator``.
    """

    def _definition_tags(self, entry):
        """Return ``None``: no definition tags are produced here."""
        return None

    def _glossary(self, entry):
        """Return the glossary for ``entry``, building it at most once.

        Results are memoised in ``self.glossary_cache`` under the entry's
        ``entry_id``.
        """
        cache = self.glossary_cache
        key = entry.entry_id
        if key not in cache:
            cache[key] = make_glossary(entry)
        return cache[key]

    def _sequence(self, entry):
        """Use the entry id as the term's sequence number."""
        return entry.entry_id

    def _link_glossary_parameters(self, entry):
        """Return an empty list: no link-only glossaries are produced."""
        return []

    def _subentry_lists(self, entry):
        """Return an empty list: no subentries are produced."""
        return []
|
||||
|
||||
|
||||
class JitenonYojiTerminator(JitenonTerminator):
    """Term hooks for the jitenon-yoji dictionary.

    The constructor is inherited unchanged from ``JitenonTerminator``.
    """

    def _inflection_rules(self, entry, expression):
        """Return an empty string: no inflection rules are assigned."""
        return ""

    def _term_tags(self, entry):
        """Return the entry's ``kankenkyuu`` value as space-separated tags.

        Spaces are removed first, then every "/" becomes a single space,
        which is the same as splitting on "/" and joining with " ".
        """
        compact = entry.kankenkyuu.replace(" ", "")
        return compact.replace("/", " ")
|
||||
|
||||
|
||||
class JitenonKotowazaTerminator(JitenonTerminator):
    """Term hooks for the jitenon-kotowaza dictionary.

    The constructor is inherited unchanged from ``JitenonTerminator``.
    """

    def _inflection_rules(self, entry, expression):
        """Return the rules ``Grammar.sudachi_rules`` gives for ``expression``."""
        rules = Grammar.sudachi_rules(expression)
        return rules

    def _term_tags(self, entry):
        """Return an empty string: no term tags are assigned."""
        return ""
|
54
bot/yomichan/terms/terminator.py
Normal file
54
bot/yomichan/terms/terminator.py
Normal file
|
@ -0,0 +1,54 @@
|
|||
class Terminator:
    """Base class that converts a dictionary entry into Yomichan term rows.

    Subclasses supply the per-dictionary pieces through these hooks:
    ``_definition_tags(entry)``, ``_inflection_rules(entry, expression)``,
    ``_glossary(entry)``, ``_sequence(entry)``,
    ``_link_glossary_parameters(entry)`` and ``_subentry_lists(entry)``.
    ``_term_tags(entry)`` is optional and defaults to an empty string.
    """

    def __init__(self):
        # Not read by this class; available to subclasses for memoising
        # the result of their ``_glossary`` hook.
        self.glossary_cache = {}

    def make_terms(self, entry):
        """Return the list of term rows for ``entry`` and its subentries.

        Each row is the 8-item list
        ``[expression, reading, definition_tags, inflection_rules,
        score, glossary, sequence, term_tags]``.  ``score`` is the
        negated number of rows created so far, so earlier rows score
        higher.

        Args:
            entry: object whose ``get_headwords()`` returns a mapping of
                reading -> list of expressions.

        Returns:
            A list of term rows (lists).
        """
        terms = []
        headwords = entry.get_headwords()
        for reading, expressions in headwords.items():
            for expression in expressions:
                definition_tags = self._definition_tags(entry)
                inflection_rules = self._inflection_rules(entry, expression)
                score = -len(terms)
                glossary = self._glossary(entry)
                sequence = self._sequence(entry)
                # Ask the subclass for its tags instead of hard-coding "",
                # which silently discarded every subclass's _term_tags().
                term_tags = self._term_tags(entry)
                terms.append([
                    expression, reading, definition_tags, inflection_rules,
                    score, glossary, sequence, term_tags
                ])

                # Extra rows whose glossary is only a list of links to
                # related entries; they reuse this headword's fields.
                # NOTE(review): the nesting of this loop inside the
                # headword loop was reconstructed from its use of
                # ``expression``/``reading`` — confirm against the file.
                for x in self._link_glossary_parameters(entry):
                    (subentries, definition_tags) = x
                    if not subentries:
                        continue
                    score = -len(terms)
                    glossary = self.__links_glossary(subentries)
                    terms.append([
                        expression, reading, definition_tags,
                        inflection_rules, score, glossary, sequence,
                        term_tags
                    ])

        # Subentries contribute their own rows, built recursively.
        for subentries in self._subentry_lists(entry):
            for subentry in subentries:
                terms.extend(self.make_terms(subentry))
        return terms

    def _term_tags(self, entry):
        """Return the term tags for ``entry``; empty unless overridden."""
        return ""

    @staticmethod
    def __links_glossary(subentries):
        """Return one structured-content link gloss per subentry.

        Each link searches for the subentry's ``get_first_expression()``.
        """
        glossary = []
        for subentry in subentries:
            exp = subentry.get_first_expression()
            gloss = {
                "type": "structured-content",
                "content": {
                    "tag": "a",
                    "href": f"?query={exp}&wildcards=off",
                    "content": exp,
                }
            }
            glossary.append(gloss)
        return glossary
|
|
@ -44,7 +44,7 @@ def main():
|
|||
crawler_class = crawlers[args.target]
|
||||
crawler = crawler_class()
|
||||
crawler.crawl()
|
||||
crawler.make_entries()
|
||||
crawler.read_entries()
|
||||
crawler.make_yomichan_dictionary()
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue