Add support for Shinmeikai 8th edition & Daijirin 4th edition
parent 0cfa3a19df
commit 5aa954bf2d
134 bot/crawlers.py
@@ -1,41 +1,59 @@
+import os
 import re
 from bs4 import BeautifulSoup
 
 import bot.scraper as Scraper
 
 from bot.entries.jitenon import JitenonKotowazaEntry
-from bot.yomichan.export import JitenonKotowazaExporter
-
 from bot.entries.jitenon import JitenonYojiEntry
+from bot.entries.smk8 import Smk8Entry
+from bot.entries.daijirin2 import Daijirin2Entry
+
+from bot.yomichan.export import JitenonKotowazaExporter
 from bot.yomichan.export import JitenonYojiExporter
+from bot.yomichan.export import Smk8Exporter
+from bot.yomichan.export import Daijirin2Exporter
 
 
-class Crawler():
-    def __init__(self):
-        self._crawl_map = {}
-        self.__entries = []
+class _Crawler():
+    def __init__(self, args):
+        self._page_dir = args.page_dir
+        self._image_dir = args.image_dir
+        self._page_map = {}
+        self._entries = []
 
-    def read_entries(self):
-        entries_len = len(self._crawl_map)
-        items = self._crawl_map.items()
-        for idx, (entry_id, entry_path) in enumerate(items):
-            update = f"Reading entry {idx+1}/{entries_len}"
+    def read_pages(self):
+        pages_len = len(self._page_map)
+        items = self._page_map.items()
+        for idx, (page_id, page_path) in enumerate(items):
+            update = f"Reading page {idx+1}/{pages_len}"
             print(update, end='\r', flush=True)
-            entry = self._entry_class(entry_id)
-            entry.set_markup(entry_path)
-            self.__entries.append(entry)
+            entry = self._entry_class(page_id)
+            with open(page_path, "r") as f:
+                page = f.read()
+            entry.set_page(page)
+            self._entries.append(entry)
         print()
 
     def make_yomichan_dictionary(self):
-        self._yomi_exporter.export(self.__entries)
+        self._yomi_exporter.export(self._entries, self._image_dir)
+
+    def _parse_page_id(self, page_link):
+        m = re.search(self._page_id_pattern, page_link)
+        if not m:
+            return None
+        page_id = int(m.group(1))
+        if page_id in self._page_map:
+            return None
+        return page_id
 
 
-class JitenonCrawler(Crawler):
-    def __init__(self):
-        super().__init__()
+class _JitenonCrawler(_Crawler):
+    def __init__(self, args):
+        super().__init__(args)
 
-    def crawl(self):
-        print(f"Scraping {self._name}...")
+    def collect_pages(self):
+        print("Scraping jitenon.jp")
         jitenon = Scraper.Jitenon()
         gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
         gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -44,40 +62,60 @@ class JitenonCrawler(Crawler):
             kana_doc, _ = jitenon.scrape(gojuon_href)
             kana_soup = BeautifulSoup(kana_doc, features="html.parser")
             for kana_a in kana_soup.select(".word_box a", href=True):
-                entry_link = kana_a['href']
-                entry_id = self.__parse_entry_id(entry_link)
-                if entry_id is None:
+                page_link = kana_a['href']
+                page_id = self._parse_page_id(page_link)
+                if page_id is None:
                     continue
-                _, entry_path = jitenon.scrape(entry_link)
-                self._crawl_map[entry_id] = entry_path
-        entries_len = len(self._crawl_map)
-        print(f"Finished scraping {entries_len} entries")
-
-    def __parse_entry_id(self, entry_link):
-        m = re.search(self._entry_id_pattern, entry_link)
-        if not m:
-            return None
-        entry_id = int(m.group(1))
-        if entry_id in self._crawl_map:
-            return None
-        return entry_id
+                _, page_path = jitenon.scrape(page_link)
+                self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
+        print(f"Finished scraping {pages_len} pages")
 
 
-class JitenonYojiCrawler(JitenonCrawler):
-    def __init__(self):
-        super().__init__()
+class JitenonYojiCrawler(_JitenonCrawler):
+    def __init__(self, args):
+        super().__init__(args)
         self._entry_class = JitenonYojiEntry
-        self._yomi_exporter = JitenonYojiExporter()
-        self._name = "jitenon-yoji"
+        self._yomi_exporter = JitenonYojiExporter(args.target)
         self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
-        self._entry_id_pattern = r"([0-9]+).html"
+        self._page_id_pattern = r"([0-9]+)\.html$"
 
 
-class JitenonKotowazaCrawler(JitenonCrawler):
-    def __init__(self):
-        super().__init__()
+class JitenonKotowazaCrawler(_JitenonCrawler):
+    def __init__(self, args):
+        super().__init__(args)
         self._entry_class = JitenonKotowazaEntry
-        self._yomi_exporter = JitenonKotowazaExporter()
-        self._name = "jitenon-kotowaza"
+        self._yomi_exporter = JitenonKotowazaExporter(args.target)
         self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
-        self._entry_id_pattern = r"([0-9]+).php"
+        self._page_id_pattern = r"([0-9]+)\.php$"
+
+
+class _MonokakidoCrawler(_Crawler):
+    def __init__(self, args):
+        super().__init__(args)
+        self._page_id_pattern = r"^([0-9]+)\.xml$"
+
+    def collect_pages(self):
+        print(f"Searching for page files in `{self._page_dir}`")
+        for pagefile in os.listdir(self._page_dir):
+            page_id = self._parse_page_id(pagefile)
+            if page_id is None or page_id == 0:
+                continue
+            path = os.path.join(self._page_dir, pagefile)
+            self._page_map[page_id] = path
+        pages_len = len(self._page_map)
+        print(f"Found {pages_len} page files for processing")
+
+
+class Smk8Crawler(_MonokakidoCrawler):
+    def __init__(self, args):
+        super().__init__(args)
+        self._entry_class = Smk8Entry
+        self._yomi_exporter = Smk8Exporter(args.target)
+
+
+class Daijirin2Crawler(_MonokakidoCrawler):
+    def __init__(self, args):
+        super().__init__(args)
+        self._entry_class = Daijirin2Entry
+        self._yomi_exporter = Daijirin2Exporter(args.target)
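For orientation, a minimal sketch of how these crawler classes are presumably driven. The argument object here is hypothetical; the real CLI/argparse wiring lives outside this diff:

    # Hypothetical wiring; attribute names match what the crawlers read above.
    from types import SimpleNamespace
    import bot.crawlers as Crawlers

    args = SimpleNamespace(target="smk8", page_dir="pages/", image_dir="images/")
    crawler = Crawlers.Smk8Crawler(args)
    crawler.collect_pages()              # find NNNNN.xml page files under page_dir
    crawler.read_pages()                 # parse each page file into an Smk8Entry
    crawler.make_yomichan_dictionary()   # export the entries (plus images)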
98 bot/data.py
@@ -2,11 +2,24 @@ import os
 import sys
 import json
 import csv
 from functools import cache
 from pathlib import Path
 
 from platformdirs import user_config_dir
 
 
+@cache
+def get_adobe_glyph(code):
+    adobe_glyphs = __load_adobe_glyphs()
+    override_adobe_glyphs = __load_override_adobe_glyphs()
+    if code in override_adobe_glyphs:
+        return override_adobe_glyphs[code]
+    if len(adobe_glyphs[code]) > 1:
+        raise Exception(f"Multiple glyphs available for code {code}")
+    return adobe_glyphs[code][0]
+
+
 @cache
 def load_config():
     config_dir = user_config_dir("jitenbot")
     if not Path(config_dir).is_dir():
@@ -22,18 +35,21 @@ def load_config():
     return config
 
 
+@cache
+def load_yomichan_inflection_categories():
+    file_name = "yomichan_inflection_categories.json"
+    data = __load_json(file_name)
+    return data
+
+
 @cache
 def load_yomichan_metadata():
     file_name = "yomichan_metadata.json"
     data = __load_json(file_name)
     return data
 
 
 @cache
 def load_variant_kanji():
     def loader(data, row):
         data[row[0]] = row[1]
@@ -43,12 +59,94 @@ def load_variant_kanji():
     return data
 
 
+@cache
+def load_smk8_phrase_readings():
+    def loader(data, row):
+        entry_id = (int(row[0]), int(row[1]))
+        reading = row[2]
+        data[entry_id] = reading
+    file_name = os.path.join("smk8", "phrase_readings.csv")
+    data = {}
+    __load_csv(file_name, loader, data)
+    return data
+
+
+@cache
+def load_daijirin2_phrase_readings():
+    def loader(data, row):
+        entry_id = (int(row[0]), int(row[1]))
+        reading = row[2]
+        data[entry_id] = reading
+    file_name = os.path.join("daijirin2", "phrase_readings.csv")
+    data = {}
+    __load_csv(file_name, loader, data)
+    return data
+
+
+@cache
+def load_daijirin2_kana_abbreviations():
+    def loader(data, row):
+        entry_id = (int(row[0]), int(row[1]))
+        abbreviations = []
+        for abbr in row[2:]:
+            if abbr.strip() != "":
+                abbreviations.append(abbr)
+        data[entry_id] = abbreviations
+    file_name = os.path.join("daijirin2", "kana_abbreviations.csv")
+    data = {}
+    __load_csv(file_name, loader, data)
+    return data
+
+
+@cache
+def load_smk8_yomichan_name_conversion():
+    file_name = os.path.join("smk8", "yomichan_name_conversion.json")
+    data = __load_json(file_name)
+    return data
+
+
+@cache
+def load_daijirin2_yomichan_name_conversion():
+    file_name = os.path.join("daijirin2", "yomichan_name_conversion.json")
+    data = __load_json(file_name)
+    return data
+
+
+@cache
+def __load_default_config():
+    file_name = "default_config.json"
+    data = __load_json(file_name)
+    return data
+
+
+@cache
+def __load_adobe_glyphs():
+    def loader(data, row):
+        if row[0].startswith("#"):
+            return
+        character = chr(int(row[0].split(" ")[0], 16))
+        code = int(row[2].removeprefix(" CID+"))
+        if code in data:
+            if character not in data[code]:
+                data[code].append(character)
+        else:
+            data[code] = [character]
+    file_name = os.path.join("adobe", "Adobe-Japan1_sequences.txt")
+    data = {}
+    __load_csv(file_name, loader, data, delim=';')
+    return data
+
+
+@cache
+def __load_override_adobe_glyphs():
+    file_name = os.path.join("adobe", "override_glyphs.json")
+    json_data = __load_json(file_name)
+    data = {}
+    for key, val in json_data.items():
+        data[int(key)] = val
+    return data
+
+
 def __load_json(file_name):
     file_path = os.path.join("data", file_name)
     if not Path(file_path).is_file():
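All of the new loaders funnel through the pre-existing `__load_csv`/`__load_json` helpers, which are not part of this diff. A minimal sketch of the CSV side, assuming it simply applies the loader callback row by row (the real helper may differ):

    import csv
    import os

    def load_csv_sketch(file_name, loader, data, delim=","):
        # Sketch of the __load_csv helper these loaders rely on.
        file_path = os.path.join("data", file_name)
        with open(file_path, newline="", encoding="utf-8") as f:
            for row in csv.reader(f, delimiter=delim):
                loader(data, row)

So a (made-up) row `53412,2,ひざがわらう` in `data/smk8/phrase_readings.csv` would yield `load_smk8_phrase_readings()[(53412, 2)] == "ひざがわらう"`, keyed by the same `(page_id, subentry_index)` tuple used as `entry_id` throughout `bot/entries/smk8.py`.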
272 bot/entries/daijirin2.py (new file)
@@ -0,0 +1,272 @@
import re
from bs4 import BeautifulSoup

import bot.expressions as Expressions
import bot.soup as Soup
from bot.data import load_daijirin2_phrase_readings
from bot.data import load_daijirin2_kana_abbreviations
from bot.entries.entry import Entry
from bot.entries.daijirin2_preprocess import preprocess_page


class _BaseDaijirin2Entry(Entry):
    ID_TO_ENTRY = {}
    SUBENTRY_ID_TO_ENTRY_ID = {}

    def __init__(self, entry_id):
        super().__init__(entry_id)
        if entry_id not in self.ID_TO_ENTRY:
            self.ID_TO_ENTRY[entry_id] = self
        else:
            raise Exception(f"Duplicate entry ID: {entry_id}")
        self.children = []
        self.phrases = []
        self._kana_abbreviations = load_daijirin2_kana_abbreviations()

    def set_page(self, page):
        page = self.__decompose_subentries(page)
        self._page = page

    def get_page_soup(self):
        soup = BeautifulSoup(self._page, "xml")
        return soup

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        for pos_group in soup.find_all("品詞G"):
            if pos_group.parent.name == "大語義":
                self._set_part_of_speech_tags(pos_group)
        return self._part_of_speech_tags

    def _set_part_of_speech_tags(self, el):
        pos_names = ["品詞", "品詞活用", "品詞行", "用法"]
        for child in el.children:
            if child.name is not None:
                self._set_part_of_speech_tags(child)
                continue
            pos = str(child)
            if el.name not in pos_names:
                continue
            elif pos in ["[", "]"]:
                continue
            elif pos in self._part_of_speech_tags:
                continue
            else:
                self._part_of_speech_tags.append(pos)

    def get_headwords(self):
        if self._headwords is not None:
            return self._headwords
        self._set_headwords()
        self._set_variant_headwords()
        return self._headwords

    def _set_regular_headwords(self, soup):
        self._fill_alts(soup)
        reading = soup.find("見出仮名").text
        expressions = []
        for el in soup.find_all("標準表記"):
            expression = self._clean_expression(el.text)
            if "—" in expression:
                kana_abbrs = self._kana_abbreviations[self.entry_id]
                for abbr in kana_abbrs:
                    expression = expression.replace("—", abbr, 1)
            expressions.append(expression)
        expressions = Expressions.expand_abbreviation_list(expressions)
        if len(expressions) == 0:
            expressions.append(reading)
        self._headwords = {reading: expressions}

    def _set_variant_headwords(self):
        for expressions in self._headwords.values():
            Expressions.add_variant_kanji(expressions, self._variant_kanji)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    def __decompose_subentries(self, page):
        soup = BeautifulSoup(page, features="xml")
        subentry_parameters = [
            [Daijirin2ChildEntry, ["子項目"], self.children],
            [Daijirin2PhraseEntry, ["句項目"], self.phrases],
        ]
        for x in subentry_parameters:
            subentry_class, tags, subentry_list = x
            for tag in tags:
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(subentry_id)
                    page = tag_soup.decode()
                    subentry.set_page(page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @staticmethod
    def id_string_to_entry_id(id_string):
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        elif len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        else:
            raise Exception(f"Invalid entry ID: {id_string}")

    @staticmethod
    def _delete_unused_nodes(soup):
        unused_nodes = [
            "漢字音logo", "活用分節", "連語句活用分節", "語構成",
            "表外字マーク", "ルビG"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)

    @staticmethod
    def _clean_expression(expression):
        for x in ["〈", "〉", "《", "》", " "]:
            expression = expression.replace(x, "")
        return expression

    @staticmethod
    def _fill_alts(soup):
        for gaiji in soup.find_all(class_="gaiji"):
            if gaiji.name == "img" and gaiji.has_attr("alt"):
                gaiji.name = "span"
                gaiji.string = gaiji.attrs["alt"]


class Daijirin2Entry(_BaseDaijirin2Entry):
    def __init__(self, page_id):
        entry_id = (page_id, 0)
        super().__init__(entry_id)

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)

    def _set_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        if soup.find("漢字見出") is not None:
            self._set_kanji_headwords(soup)
        elif soup.find("略語G") is not None:
            self._set_acronym_headwords(soup)
        else:
            self._set_regular_headwords(soup)

    def _set_kanji_headwords(self, soup):
        readings = []
        for el in soup.find_all("漢字音"):
            hira = Expressions.kata_to_hira(el.text)
            readings.append(hira)
        if soup.find("漢字音") is None:
            readings.append("")
        expressions = []
        for el in soup.find_all("漢字見出"):
            expressions.append(el.text)
        self._headwords = {}
        for reading in readings:
            self._headwords[reading] = expressions

    def _set_acronym_headwords(self, soup):
        expressions = []
        for el in soup.find_all("略語"):
            expression_parts = []
            for part in el.find_all(["欧字", "和字"]):
                expression_parts.append(part.text)
            expression = "".join(expression_parts)
            expressions.append(expression)
        self._headwords = {"": expressions}


class Daijirin2ChildEntry(_BaseDaijirin2Entry):
    def __init__(self, entry_id):
        super().__init__(entry_id)

    def _set_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        self._set_regular_headwords(soup)


class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
    def __init__(self, entry_id):
        super().__init__(entry_id)
        self.__phrase_readings = load_daijirin2_phrase_readings()

    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _set_headwords(self):
        soup = self.get_page_soup()
        headwords = {}
        expressions = self._find_expressions(soup)
        readings = self._find_readings()
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        self._headwords = headwords

    def _find_expressions(self, soup):
        self._delete_unused_nodes(soup)
        text = soup.find("句表記").text
        text = self._clean_expression(text)
        alternatives = self.__expand_alternatives(text)
        expressions = []
        for alt in alternatives:
            for exp in Expressions.expand_abbreviation(alt):
                expressions.append(exp)
        return expressions

    def _find_readings(self):
        text = self.__phrase_readings[self.entry_id]
        alternatives = self.__expand_alternatives(text)
        readings = []
        for alt in alternatives:
            for reading in Expressions.expand_abbreviation(alt):
                readings.append(reading)
        return readings

    @staticmethod
    def __expand_alternatives(expression):
        """Return a list of strings described by = notation.

        eg. "同じ穴の=狢（=狐・狸）" -> [
            "同じ穴の狢", "同じ穴の狐", "同じ穴の狸"
        ]
        eg. "聞くは=一時（=一旦）の恥、聞かぬは=末代（=一生）の恥" -> [
            "聞くは一時の恥、聞かぬは末代の恥",
            "聞くは一時の恥、聞かぬは一生の恥",
            "聞くは一旦の恥、聞かぬは末代の恥",
            "聞くは一旦の恥、聞かぬは一生の恥"
        ]
        """
        group_pattern = r"([^=]+)(=([^（]+)（=([^）]+)）)?"
        groups = re.findall(group_pattern, expression)
        expressions = [""]
        for group in groups:
            new_exps = []
            for expression in expressions:
                new_exps.append(expression + group[0])
            expressions = new_exps.copy()
            if group[1] == "":
                continue
            new_exps = []
            for expression in expressions:
                new_exps.append(expression + group[2])
            for expression in expressions:
                for alt in group[3].split("・"):
                    new_exps.append(expression + alt)
            expressions = new_exps.copy()
        return expressions
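A doctest-style check of the 「=」 expansion, taken directly from the docstring above (the name-mangled attribute access is only to reach the private static helper from outside the class):

    from bot.entries.daijirin2 import Daijirin2PhraseEntry

    expand = Daijirin2PhraseEntry._Daijirin2PhraseEntry__expand_alternatives
    assert expand("同じ穴の=狢（=狐・狸）") == ["同じ穴の狢", "同じ穴の狐", "同じ穴の狸"]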
56 bot/entries/daijirin2_preprocess.py (new file)
@@ -0,0 +1,56 @@
import re
from bs4 import BeautifulSoup

from bot.data import get_adobe_glyph


__GAIJI = {
    "gaiji/DJRK0002.svg": "𦬇",
    "gaiji/U芸E0102.svg": "芸",
}


def preprocess_page(page):
    soup = BeautifulSoup(page, features="xml")
    __replace_glyph_codes(soup)
    __add_gaiji_alt_text(soup)
    __replace_halfwidth_braces(soup)
    page = __strip_page(soup)
    return page


def __replace_glyph_codes(soup):
    for el in soup.find_all(style=True):
        m = re.search(r"^glyph:([0-9]+);?$", el.attrs["style"])
        if not m:
            continue
        del el.attrs["style"]
        if el.has_attr("alt"):
            el.string = el.attrs["alt"]
            continue
        code = int(m.group(1))
        for geta in el.find_all(string="〓"):
            glyph = get_adobe_glyph(code)
            geta.replace_with(glyph)


def __add_gaiji_alt_text(soup):
    for gaiji in soup.find_all(class_="gaiji"):
        src = gaiji.attrs["src"] if gaiji.has_attr("src") else ""
        if src in __GAIJI:
            gaiji.attrs["alt"] = __GAIJI[src]


def __replace_halfwidth_braces(soup):
    for x in soup.find_all("送り仮名省略"):
        for el in x.find_all(string="("):
            el.replace_with("（")
        for el in x.find_all(string=")"):
            el.replace_with("）")


def __strip_page(soup):
    koumoku = soup.find("項目")
    if koumoku is None:
        raise Exception(f"Primary 項目 not found in page:\n{soup.prettify()}")
    return koumoku.decode()
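A self-contained sketch of the glyph-code rewrite with the Adobe lookup stubbed out; the CID and glyph below are made up for illustration (a real run resolves them via bot.data.get_adobe_glyph):

    import re
    from bs4 import BeautifulSoup

    def replace_glyph_codes_sketch(page, lookup):
        # Same mechanism as __replace_glyph_codes above, minus the data files.
        soup = BeautifulSoup(page, features="xml")
        for el in soup.find_all(style=True):
            m = re.search(r"^glyph:([0-9]+);?$", el.attrs["style"])
            if not m:
                continue
            del el.attrs["style"]
            for geta in el.find_all(string="〓"):
                geta.replace_with(lookup(int(m.group(1))))
        return soup.decode()

    print(replace_glyph_codes_sketch(
        '<項目><span style="glyph:7961">〓</span></項目>',  # made-up CID
        lookup={7961: "瓣"}.get))  # the 〓 placeholder becomes the mapped glyph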
38 bot/entries/entry.py (new file)
@@ -0,0 +1,38 @@
from abc import ABC, abstractmethod
from bot.data import load_variant_kanji


class Entry(ABC):
    def __init__(self, entry_id):
        self.entry_id = entry_id
        self._page = None
        self._headwords = None
        self._part_of_speech_tags = None
        self._variant_kanji = load_variant_kanji()

    @abstractmethod
    def set_page(self, page):
        pass

    @abstractmethod
    def get_page_soup(self):
        pass

    @abstractmethod
    def get_headwords(self):
        pass

    @abstractmethod
    def get_part_of_speech_tags(self):
        pass

    def get_first_expression(self):
        headwords = self.get_headwords()
        expressions = next(iter(headwords.values()))
        expression = expressions[0]
        return expression

    def get_first_reading(self):
        headwords = self.get_headwords()
        reading = next(iter(headwords.keys()))
        return reading
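A minimal sketch of what a concrete subclass must provide (hypothetical toy dictionary, for illustration only; note that instantiating any Entry reads the variant-kanji data file at runtime):

    from bs4 import BeautifulSoup
    from bot.entries.entry import Entry

    class ToyEntry(Entry):
        def set_page(self, page):
            self._page = page

        def get_page_soup(self):
            return BeautifulSoup(self._page, "xml")

        def get_headwords(self):
            if self._headwords is None:
                soup = self.get_page_soup()
                # {reading: [expressions]} is the shape the exporters expect
                self._headwords = {soup.reading.text: [soup.expression.text]}
            return self._headwords

        def get_part_of_speech_tags(self):
            return []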
bot/entries/jitenon.py

@@ -2,29 +2,21 @@ import re
 from datetime import datetime, date
 from bs4 import BeautifulSoup
 
-from bot.data import load_variant_kanji
+from bot.entries.entry import Entry
 import bot.expressions as Expressions
 
 
-class JitenonEntry:
-    _VARIANT_KANJI = None
-
+class _JitenonEntry(Entry):
     def __init__(self, entry_id):
-        if self._VARIANT_KANJI is None:
-            self._VARIANT_KANJI = load_variant_kanji()
-        self.entry_id = entry_id
-        self.markup = ""
+        super().__init__(entry_id)
         self.modified_date = date(1970, 1, 1)
         self.attribution = ""
         for column in self._COLUMNS.values():
             setattr(self, column[0], column[1])
-        self._headwords = None
 
-    def set_markup(self, path):
-        with open(path, "r") as f:
-            html = f.read()
-        soup = BeautifulSoup(html, features="html5lib")
-        self.__set_modified_date(html)
+    def set_page(self, page):
+        soup = BeautifulSoup(page, features="html5lib")
+        self.__set_modified_date(page)
         self.attribution = soup.find(class_="copyright").text
         table = soup.find(class_="kanjirighttb")
         rows = table.find("tbody").find_all("tr")
@@ -33,7 +25,11 @@ class JitenonEntry:
             colname = row.th.text if row.th is not None else colname
             colval = self.__clean_text(row.td.text)
             self.__set_column(colname, colval)
-        self.markup = table.decode()
+        self._page = table.decode()
+
+    def get_page_soup(self):
+        soup = BeautifulSoup(self._page, "html5lib")
+        return soup
 
     def get_headwords(self):
         if self._headwords is not None:
@@ -42,16 +38,9 @@ class JitenonEntry:
         self._set_variant_headwords()
         return self._headwords
 
-    def get_first_expression(self):
-        headwords = self.get_headwords()
-        expressions = next(iter(headwords.values()))
-        expression = expressions[0]
-        return expression
-
-    def get_first_reading(self):
-        headwords = self.get_headwords()
-        reading = next(iter(headwords.keys()))
-        return reading
+    def get_part_of_speech_tags(self):
+        # Jitenon doesn't have any
+        return []
 
     def _set_headwords(self):
         headwords = {}
@@ -66,8 +55,8 @@ class JitenonEntry:
                 headwords[reading].append(expression)
         self._headwords = headwords
 
-    def __set_modified_date(self, html):
-        m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
+    def __set_modified_date(self, page):
+        m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
         if not m:
             return
         date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
@@ -94,7 +83,7 @@ class JitenonEntry:
             return [m.group(1)]
         m = re.search(r"^[ぁ-ヿ、]+（[ぁ-ヿ、]）[ぁ-ヿ、]+$", yomikata)
         if m:
-            return Expressions.expand_shouryaku(yomikata)
+            return Expressions.expand_abbreviation(yomikata)
         m = re.search(r"^([ぁ-ヿ、]+)（([ぁ-ヿ/\s、]+)）$", yomikata)
         if m:
             yomikatas = [m.group(1)]
@@ -139,7 +128,7 @@ class JitenonEntry:
         return ",".join(colvals)
 
 
-class JitenonYojiEntry(JitenonEntry):
+class JitenonYojiEntry(_JitenonEntry):
     _COLUMNS = {
         "四字熟語": ["expression", ""],
         "読み方": ["yomikata", ""],
@@ -151,15 +140,15 @@ class JitenonYojiEntry(JitenonEntry):
         "類義語": ["ruigigo", []],
     }
 
-    def __init__(self, sequence):
-        super().__init__(sequence)
+    def __init__(self, entry_id):
+        super().__init__(entry_id)
 
     def _set_variant_headwords(self):
         for expressions in self._headwords.values():
-            Expressions.add_variant_kanji(expressions, self._VARIANT_KANJI)
+            Expressions.add_variant_kanji(expressions, self._variant_kanji)
 
 
-class JitenonKotowazaEntry(JitenonEntry):
+class JitenonKotowazaEntry(_JitenonEntry):
     _COLUMNS = {
         "言葉": ["expression", ""],
         "読み方": ["yomikata", ""],
@@ -170,8 +159,8 @@ class JitenonKotowazaEntry(JitenonEntry):
         "類句": ["ruiku", []],
     }
 
-    def __init__(self, sequence):
-        super().__init__(sequence)
+    def __init__(self, entry_id):
+        super().__init__(entry_id)
 
     def _set_headwords(self):
         if self.expression == "金棒引き・鉄棒引き":
@@ -183,5 +172,5 @@ class JitenonKotowazaEntry(JitenonEntry):
 
     def _set_variant_headwords(self):
         for expressions in self._headwords.values():
-            Expressions.add_variant_kanji(expressions, self._VARIANT_KANJI)
+            Expressions.add_variant_kanji(expressions, self._variant_kanji)
             Expressions.add_fullwidth(expressions)
242 bot/entries/smk8.py (new file)
@@ -0,0 +1,242 @@
import re
from bs4 import BeautifulSoup

import bot.expressions as Expressions
import bot.soup as Soup
from bot.data import load_smk8_phrase_readings
from bot.entries.entry import Entry
from bot.entries.smk8_preprocess import preprocess_page


class _BaseSmk8Entry(Entry):
    ID_TO_ENTRY = {}
    SUBENTRY_ID_TO_ENTRY_ID = {}

    def __init__(self, entry_id):
        super().__init__(entry_id)
        if entry_id not in self.ID_TO_ENTRY:
            self.ID_TO_ENTRY[entry_id] = self
        else:
            raise Exception(f"Duplicate entry ID: {entry_id}")
        self.children = []
        self.phrases = []
        self.kanjis = []

    def set_page(self, page):
        page = self.__decompose_subentries(page)
        self._page = page

    def get_page_soup(self):
        soup = BeautifulSoup(self._page, "xml")
        return soup

    def get_headwords(self):
        if self._headwords is not None:
            return self._headwords
        self._set_headwords()
        self._set_variant_headwords()
        return self._headwords

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        headword_info = soup.find("見出要素")
        if headword_info is None:
            return self._part_of_speech_tags
        for tag in headword_info.find_all("品詞M"):
            if tag.text not in self._part_of_speech_tags:
                self._part_of_speech_tags.append(tag.text)
        return self._part_of_speech_tags

    def _set_variant_headwords(self):
        for expressions in self._headwords.values():
            Expressions.add_variant_kanji(expressions, self._variant_kanji)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    def _find_reading(self, soup):
        midasi_kana = soup.find("見出仮名")
        reading = midasi_kana.text
        for x in [" ", "・"]:
            reading = reading.replace(x, "")
        return reading

    def _find_expressions(self, soup):
        clean_expressions = []
        for expression in soup.find_all("標準表記"):
            clean_expression = self._clean_expression(expression.text)
            clean_expressions.append(clean_expression)
        expressions = Expressions.expand_abbreviation_list(clean_expressions)
        return expressions

    def __decompose_subentries(self, page):
        soup = BeautifulSoup(page, features="xml")
        subentry_parameters = [
            [Smk8ChildEntry, ["子項目F", "子項目"], self.children],
            [Smk8PhraseEntry, ["句項目F", "句項目"], self.phrases],
            [Smk8KanjiEntry, ["造語成分項目"], self.kanjis],
        ]
        for x in subentry_parameters:
            subentry_class, tags, subentry_list = x
            for tag in tags:
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(subentry_id)
                    page = tag_soup.decode()
                    subentry.set_page(page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @staticmethod
    def id_string_to_entry_id(id_string):
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        elif len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        else:
            raise Exception(f"Invalid entry ID: {id_string}")

    @staticmethod
    def _clean_expression(expression):
        for x in ["〈", "〉", "{", "}", "…", " "]:
            expression = expression.replace(x, "")
        return expression

    @staticmethod
    def _fill_alts(soup):
        for e in soup.find_all(["親見出仮名", "親見出表記"]):
            e.string = e.attrs["alt"]
        for gaiji in soup.find_all("外字"):
            gaiji.string = gaiji.img.attrs["alt"]


class Smk8Entry(_BaseSmk8Entry):
    def __init__(self, page_id):
        entry_id = (page_id, 0)
        super().__init__(entry_id)

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)

    def _set_headwords(self):
        soup = self.get_page_soup()
        Soup.delete_soup_nodes(soup, "表音表記")
        self._fill_alts(soup)
        reading = self._find_reading(soup)
        expressions = []
        if soup.find("見出部").find("標準表記") is None:
            expressions.append(reading)
        for expression in self._find_expressions(soup):
            if expression not in expressions:
                expressions.append(expression)
        self._headwords = {reading: expressions}


class Smk8ChildEntry(_BaseSmk8Entry):
    def __init__(self, entry_id):
        super().__init__(entry_id)

    def _set_headwords(self):
        soup = self.get_page_soup()
        Soup.delete_soup_nodes(soup, "表音表記")
        self._fill_alts(soup)
        reading = self._find_reading(soup)
        expressions = []
        if soup.find("子見出部").find("標準表記") is None:
            expressions.append(reading)
        for expression in self._find_expressions(soup):
            if expression not in expressions:
                expressions.append(expression)
        self._headwords = {reading: expressions}


class Smk8PhraseEntry(_BaseSmk8Entry):
    def __init__(self, entry_id):
        super().__init__(entry_id)
        self.__phrase_readings = load_smk8_phrase_readings()

    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _set_headwords(self):
        soup = self.get_page_soup()
        headwords = {}
        expressions = self._find_expressions(soup)
        readings = self._find_readings()
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        self._headwords = headwords

    def _find_expressions(self, soup):
        Soup.delete_soup_nodes(soup, "ルビG")
        self._fill_alts(soup)
        text = soup.find("標準表記").text
        text = self._clean_expression(text)
        alternatives = self.__expand_alternatives(text)
        expressions = []
        for alt in alternatives:
            for exp in Expressions.expand_abbreviation(alt):
                expressions.append(exp)
        return expressions

    def _find_readings(self):
        text = self.__phrase_readings[self.entry_id]
        alternatives = self.__expand_alternatives(text)
        readings = []
        for alt in alternatives:
            for reading in Expressions.expand_abbreviation(alt):
                readings.append(reading)
        return readings

    @staticmethod
    def __expand_alternatives(expression):
        """Return a list of strings described by △ notation.

        eg. "△金（時間・暇）に飽かして" -> [
            "金に飽かして", "時間に飽かして", "暇に飽かして"
        ]
        """
        m = re.search(r"△([^（]+)（([^）]+)）", expression)
        if not m:
            return [expression]
        alt_parts = [m.group(1)]
        for alt_part in m.group(2).split("・"):
            alt_parts.append(alt_part)
        alts = []
        for alt_part in alt_parts:
            alt_exp = re.sub(r"△[^（]+（[^）]+）", alt_part, expression)
            alts.append(alt_exp)
        return alts


class Smk8KanjiEntry(_BaseSmk8Entry):
    def __init__(self, entry_id):
        super().__init__(entry_id)

    def _set_headwords(self):
        soup = self.get_page_soup()
        self._fill_alts(soup)
        reading = self.__get_parent_reading()
        expressions = self._find_expressions(soup)
        self._headwords = {reading: expressions}

    def __get_parent_reading(self):
        parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
        parent = self.ID_TO_ENTRY[parent_id]
        reading = parent.get_first_reading()
        return reading
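A doctest-style check of the △ expansion, taken directly from the docstring above (mangled access reaches the private static helper from outside the class):

    from bot.entries.smk8 import Smk8PhraseEntry

    expand = Smk8PhraseEntry._Smk8PhraseEntry__expand_alternatives
    assert expand("△金（時間・暇）に飽かして") == [
        "金に飽かして", "時間に飽かして", "暇に飽かして"
    ]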
91 bot/entries/smk8_preprocess.py (new file)
@@ -0,0 +1,91 @@
import re
from bs4 import BeautifulSoup

from bot.data import get_adobe_glyph


__GAIJI = {
    "gaiji/5350.svg": "卐",
    "gaiji/62cb.svg": "抛",
    "gaiji/7be1.svg": "簒",
}


def preprocess_page(page):
    page = __strip_page(page)
    page = __replace_glyph_codes(page)
    page = __format_hyougai_marks(page)
    return page


def __strip_page(page):
    soup = BeautifulSoup(page, features="xml")
    koumoku = soup.find(["項目", "字音語参照項目"])
    if koumoku is not None:
        return koumoku.decode()
    else:
        raise Exception(f"Primary 項目 not found in page:\n{soup.prettify()}")


def __replace_glyph_codes(page):
    soup = BeautifulSoup(page, features="xml")
    for span in soup.find_all("span"):
        if "style" in span.attrs:
            m = re.search(r"^glyph:([0-9]+);$", span.attrs["style"])
            del span.attrs["style"]
            if m is None:
                continue
            code = int(m.group(1))
            for geta in span.find_all(string="〓"):
                glyph = get_adobe_glyph(code)
                geta.replace_with(glyph)
    for hyouki in soup.find_all("親見出表記"):
        if "alt" not in hyouki.attrs:
            continue
        alt = hyouki.attrs["alt"]
        codes = re.findall(r"{CID([0-9]+)}", alt)
        for code in codes:
            glyph = get_adobe_glyph(int(code))
            alt = alt.replace(f"{{CID{code}}}", glyph)
        hyouki.attrs["alt"] = alt
    for gaiji in soup.find_all("外字"):
        img = gaiji.img
        src = img.attrs["src"] if img.has_attr("src") else ""
        if src in __GAIJI:
            img.attrs["alt"] = __GAIJI[src]
    return soup.decode()


def __format_hyougai_marks(page):
    soup = BeautifulSoup(page, features="xml")
    for el in soup.find_all("外字"):
        el.string = "〓"
    text = soup.text
    for x in ["\n", "\t", " "]:
        text = text.replace(x, "")
    text = re.sub(r"〈([^〈]+)〉", r"\1", text)
    page = re.sub(r"〈([^〈]+)〉", r"␂\1␃", page)
    for mark in re.findall(r"《.", text):
        if mark[1] == "〓":
            page = page.replace("《", "<表外音訓/>", 1)
        else:
            page = re.sub(f"《([^{mark[1]}]*)({mark[1]})",
                          r"\1<表外音訓>\2</表外音訓>",
                          page, count=1)
    for mark in re.findall(r"〈.", text):
        if mark[1] == "〓":
            page = page.replace("〈", "<表外字/>", 1)
        else:
            page = re.sub(f"〈([^{mark[1]}]*)({mark[1]})",
                          r"\1<表外字>\2</表外字>",
                          page, count=1)
    page = page.replace("␂", "〈")
    page = page.replace("␃", "〉")
    soup = BeautifulSoup(page, features="xml")
    for el in soup.find_all("表外音訓"):
        if el.text == "":
            el.append(el.next_sibling)
    for el in soup.find_all("表外字"):
        if el.text == "":
            el.append(el.next_sibling)
    return soup.decode()
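A worked example of the per-mark rewrite above, using a made-up fragment (in SMK pages a lone 〈 flags the following kanji as a 表外字; paired 〈…〉 spans are shielded as ␂…␃ beforehand so they survive untouched):

    import re

    page = "〈摸索"  # made-up fragment: 〈 flags 摸
    mark = "摸"
    page = re.sub(f"〈([^{mark}]*)({mark})",
                  r"\1<表外字>\2</表外字>", page, count=1)
    assert page == "<表外字>摸</表外字>索"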
bot/expressions.py

@@ -1,12 +1,28 @@
 import re
 
-__WIDE_MAP = {i: i + 0xFEE0 for i in range(0x21, 0x7F)}
+__KATA_TO_HIRA_MAP = {
+    i: i - 96 for i in [
+        *range(0x30A1, 0x30F6),
+        *range(0x30FD, 0x30FE),
+    ]
+}
+
+
+__HALFWIDTH_TO_FULLWIDTH_MAP = {
+    i: i + 0xFEE0 for i in [
+        *range(0x21, 0x7F),
+    ]
+}
+
+
+def kata_to_hira(text):
+    hira = text.translate(__KATA_TO_HIRA_MAP)
+    return hira
 
 
 def add_fullwidth(expressions):
     for expression in expressions:
         if re.match(r"[A-Za-z0-9]", expression):
-            new_exp = expression.translate(__WIDE_MAP)
+            new_exp = expression.translate(__HALFWIDTH_TO_FULLWIDTH_MAP)
             if new_exp not in expressions:
                 expressions.append(new_exp)
@@ -23,23 +39,50 @@ def add_variant_kanji(expressions, variant_kanji):
             expressions.append(new_exp)
 
 
-def expand_shouryaku(shouryaku):
+def remove_iteration_mark(expressions):
+    iterated_kanji = r"(.)々"
+    for expression in expressions:
+        for char in re.findall(iterated_kanji, expression):
+            new_exp = expression.replace(f"{char}々", f"{char}{char}")
+            if new_exp not in expressions:
+                expressions.append(new_exp)
+
+
+def add_iteration_mark(expressions):
+    repeat_kanji = r"([^0-z０-ｚぁ-ヿ])\1"
+    for expression in expressions:
+        for char in re.findall(repeat_kanji, expression):
+            new_exp = expression.replace(f"{char}{char}", f"{char}々")
+            if new_exp not in expressions:
+                expressions.append(new_exp)
+
+
+def expand_abbreviation(abbreviated_expression):
     """Return a list of words described by a 省略 notation.
+
     eg. "有（り）合（わ）せ" -> [
         "有り合わせ", "有合わせ", "有り合せ", "有合せ"
     ]
     """
-    groups = re.findall(r"([^（]*)(（([^）]+)）)?", shouryaku)
-    forms = [""]
+    groups = re.findall(r"([^（]*)(（([^）]+)）)?", abbreviated_expression)
+    expressions = [""]
     for group in groups:
-        new_forms = []
-        for form in forms:
-            new_forms.append(form + group[0])
-        forms = new_forms.copy()
+        new_exps = []
+        for expression in expressions:
+            new_exps.append(expression + group[0])
+        expressions = new_exps.copy()
         if group[2] == '':
             continue
-        new_forms = []
-        for form in forms:
-            new_forms.append(form + group[2])
-        forms = new_forms.copy() + forms.copy()
-    return forms
+        new_exps = []
+        for expression in expressions:
+            new_exps.append(expression + group[2])
+        expressions = new_exps.copy() + expressions.copy()
+    return expressions
+
+
+def expand_abbreviation_list(expressions):
+    new_exps = []
+    for expression in expressions:
+        for new_exp in expand_abbreviation(expression):
+            if new_exp not in new_exps:
+                new_exps.append(new_exp)
+    return new_exps
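The 省略 expansion in action, per the docstring above (fullwidth parentheses mark optional kana):

    import bot.expressions as Expressions

    forms = Expressions.expand_abbreviation("有（り）合（わ）せ")
    assert forms == ["有り合わせ", "有合わせ", "有り合せ", "有合せ"]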
84 bot/icons.py (new file)
@@ -0,0 +1,84 @@
from bs4 import BeautifulSoup
from PIL import Image
from functools import cache


@cache
def calculate_ratio(path):
    if path.endswith(".svg"):
        ratio = __calculate_svg_ratio(path)
    else:
        ratio = __calculate_bitmap_ratio(path)
    return ratio


@cache
def make_rectangle(path, text, rect_stroke, rect_fill, text_fill):
    svg = __svg_text_rectangle(text, rect_stroke, rect_fill, text_fill)
    with open(path, "w", encoding="utf-8") as f:
        f.write(svg)


@cache
def make_monochrome_fill_rectangle(path, text):
    svg = __svg_masked_rectangle(text)
    with open(path, "w", encoding="utf-8") as f:
        f.write(svg)


def __calculate_svg_ratio(path):
    with open(path, "r") as f:
        xml = f.read()
    soup = BeautifulSoup(xml, "xml")
    svg = soup.svg
    if svg.has_attr("width") and svg.has_attr("height"):
        width = float(svg.attrs["width"])
        height = float(svg.attrs["height"])
        ratio = width / height
    elif svg.has_attr("viewBox"):
        _, _, width, height = svg.attrs["viewBox"].split(" ")
        ratio = float(width) / float(height)
    else:
        raise Exception(f"Cannot calculate ratio for SVG\n{svg.prettify()}")
    return ratio


def __calculate_bitmap_ratio(path):
    img = Image.open(path)
    img_w = img.size[0]
    img_h = img.size[1]
    ratio = img_w / img_h
    return ratio


def __svg_text_rectangle(text, rect_stroke, rect_fill, text_fill):
    height = 128
    width = len(text) * height
    svg = f"""
<svg lang='ja' width='{width}' height='{height}' viewBox='0 0 {width} {height}'
     xmlns='http://www.w3.org/2000/svg' version='1.1'>
  <rect width='{width}' height='{height}' ry='20' stroke='{rect_stroke}'
        fill='{rect_fill}' stroke-width='8'/>
  <text text-anchor='middle' x='50%' y='50%' dy='.35em'
        font-family='sans-serif' font-size='100px'
        fill='{text_fill}'>{text}</text>
</svg>"""
    return svg.strip()


def __svg_masked_rectangle(text):
    height = 128
    width = len(text) * height
    svg = f"""
<svg lang='ja' width='{width}' height='{height}' viewBox='0 0 {width} {height}'
     xmlns='http://www.w3.org/2000/svg' version='1.1'>
  <mask id='a'>
    <rect width='{width}' height='{height}' fill='white'/>
    <text text-anchor='middle' x='50%' y='50%' dy='.35em'
          font-family='sans-serif' font-size='100px'
          fill='black'>{text}</text>
  </mask>
  <rect width='{width}' height='{height}' ry='20'
        fill='black' mask='url(#a)'/>
</svg>"""
    return svg.strip()
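Usage sketch: Yomichan sizes inline images in em units, so the glossary converters normalize each icon so that its longer side is 1em (the file name below is hypothetical):

    import bot.icons as Icons

    ratio = Icons.calculate_ratio("icon.svg")   # width / height
    height = 1.0 if ratio > 1.0 else ratio
    width = ratio if ratio > 1.0 else 1.0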
bot/scraper.py

@@ -15,11 +15,8 @@ from bot.data import load_config
 
 
 class Scraper():
-    __CONFIG = None
-
     def __init__(self):
-        if self.__CONFIG is None:
-            self.__CONFIG = load_config()
+        self._config = load_config()
         pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$"
         self.netloc_re = re.compile(pattern)
         self.__set_session()
@@ -45,7 +42,7 @@ class Scraper():
             allowed_methods=["HEAD", "GET", "OPTIONS"]
         )
         adapter = HTTPAdapter(max_retries=retry_strategy)
-        headers = self.__CONFIG["http-request-headers"]
+        headers = self._config["http-request-headers"]
         self.session = requests.Session()
         self.session.mount("https://", adapter)
         self.session.headers.update(headers)
5 bot/soup.py (new file)
@@ -0,0 +1,5 @@
def delete_soup_nodes(soup, node_name):
    node = soup.find(node_name)
    while node is not None:
        node.decompose()
        node = soup.find(node_name)
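Quick usage example of the helper:

    from bs4 import BeautifulSoup
    from bot.soup import delete_soup_nodes

    soup = BeautifulSoup("<a><b>x</b><c/><b>y</b></a>", "xml")
    delete_soup_nodes(soup, "b")
    print(soup.decode())  # <a><c/></a>, preceded by the XML declaration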
bot/yomichan/export.py

@@ -9,14 +9,19 @@ from bot.data import load_yomichan_metadata
 
 from bot.yomichan.terms.jitenon import JitenonYojiTerminator
 from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
+from bot.yomichan.terms.smk8 import Smk8Terminator
+from bot.yomichan.terms.daijirin2 import Daijirin2Terminator
 
 
 class Exporter:
-    def __init__(self):
+    def __init__(self, name):
+        self._name = name
         self._build_dir = None
         self._terms_per_file = 2000
 
-    def export(self, entries):
+    def export(self, entries, image_dir):
+        if image_dir is not None:
+            self.__init_build_image_dir(image_dir)
         meta = load_yomichan_metadata()
         index = meta[self._name]["index"]
         index["revision"] = self._get_revision(entries)
@@ -29,14 +34,20 @@ class Exporter:
         if self._build_dir is not None:
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
-        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
-        build_directory = os.path.join(cache_dir, f"build_{timestamp}")
+        build_directory = os.path.join(cache_dir, "yomichan_build")
+        if Path(build_directory).is_dir():
+            shutil.rmtree(build_directory)
         os.makedirs(build_directory)
         self._build_dir = build_directory
         return self._build_dir
 
+    def __init_build_image_dir(self, image_dir):
+        print("Copying image files to build directory...")
+        build_dir = self._get_build_dir()
+        build_img_dir = os.path.join(build_dir, self._name)
+        shutil.copytree(image_dir, build_img_dir)
+        self._terminator.set_image_dir(build_img_dir)
+
     def __get_terms(self, entries):
         terms = []
         entries_len = len(entries)
@@ -101,15 +112,15 @@ class Exporter:
 
 
 class JitenonExporter(Exporter):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name):
+        super().__init__(name)
 
     def _get_revision(self, entries):
         modified_date = None
         for entry in entries:
             if modified_date is None or entry.modified_date > modified_date:
                 modified_date = entry.modified_date
-        revision = f"{self._name}.{modified_date}"
+        revision = f"{self._name};{modified_date}"
         return revision
 
     def _get_attribution(self, entries):
@@ -121,14 +132,38 @@ class JitenonExporter(Exporter):
 
 
 class JitenonYojiExporter(JitenonExporter):
-    def __init__(self):
-        super().__init__()
-        self._name = "jitenon-yoji"
-        self._terminator = JitenonYojiTerminator()
+    def __init__(self, name):
+        super().__init__(name)
+        self._terminator = JitenonYojiTerminator(name)
 
 
 class JitenonKotowazaExporter(JitenonExporter):
-    def __init__(self):
-        super().__init__()
-        self._name = "jitenon-kotowaza"
-        self._terminator = JitenonKotowazaTerminator()
+    def __init__(self, name):
+        super().__init__(name)
+        self._terminator = JitenonKotowazaTerminator(name)
+
+
+class Smk8Exporter(Exporter):
+    def __init__(self, name):
+        super().__init__(name)
+        self._terminator = Smk8Terminator(name)
+
+    def _get_revision(self, entries):
+        timestamp = datetime.now().strftime("%Y-%m-%d")
+        return f"{self._name};{timestamp}"
+
+    def _get_attribution(self, entries):
+        return "© Sanseido Co., LTD. 2020"
+
+
+class Daijirin2Exporter(Exporter):
+    def __init__(self, name):
+        super().__init__(name)
+        self._terminator = Daijirin2Terminator(name)
+
+    def _get_revision(self, entries):
+        timestamp = datetime.now().strftime("%Y-%m-%d")
+        return f"{self._name};{timestamp}"
+
+    def _get_attribution(self, entries):
+        return "© Sanseido Co., LTD. 2019"
238 bot/yomichan/glossary/daijirin2.py (new file)
@@ -0,0 +1,238 @@
import re
import os
from bs4 import BeautifulSoup
from functools import cache
from pathlib import Path

import bot.icons as Icons
from bot.soup import delete_soup_nodes
from bot.data import load_daijirin2_yomichan_name_conversion
from bot.yomichan.glossary.gloss import make_gloss
from bot.yomichan.glossary.name_conversion import convert_names


def make_glossary(entry, image_dir):
    soup = entry.get_page_soup()
    __add_rubies(soup)
    __hyperlink_parent_expression(soup, entry)
    __delete_unused_nodes(soup, image_dir)
    __clear_styles(soup)
    __set_data_class(soup)
    __convert_links(soup, entry)
    __convert_gaiji(soup, image_dir)
    __convert_graphics(soup, image_dir)
    __convert_logos(soup, image_dir)
    __convert_kanjion_logos(soup, image_dir)
    __convert_daigoginum(soup, image_dir)
    __convert_jundaigoginum(soup, image_dir)

    name_conversion = load_daijirin2_yomichan_name_conversion()
    convert_names(soup, name_conversion)

    gloss = make_gloss(soup.span)
    glossary = [gloss]
    return glossary


def __add_rubies(soup):
    for name in ["表外音訓", "表外字"]:
        for ruby in soup.find_all(name):
            ruby.name = "ruby"
            rt = ruby.find("表外字マーク")
            rt.name = "rt"
            ruby.append(rt)  # needs to be positioned after the text


def __hyperlink_parent_expression(soup, entry):
    if soup.find("親表記") is None:
        return
    parent_entry_id = entry.SUBENTRY_ID_TO_ENTRY_ID[entry.entry_id]
    parent_entry = entry.ID_TO_ENTRY[parent_entry_id]
    parent_expression = parent_entry.get_first_expression()
    for el in soup.find_all("親表記"):
        el.name = "a"
        el.attrs["href"] = f"?query={parent_expression}&wildcards=off"


def __delete_unused_nodes(soup, image_dir):
    if not __graphics_directory_exists(image_dir):
        delete_soup_nodes(soup, "カットG")
    for el in soup.find_all("logo"):
        next_sibling = el.next_sibling
        if next_sibling is None:
            continue
        elif next_sibling.name in ["漢字見出G", "漢字音G"]:
            el.decompose()
    for el in soup.find_all("漢字音G"):
        for child in el.find_all(string="・"):
            child.replace_with("")


@cache
def __graphics_directory_exists(image_dir):
    path = os.path.join(image_dir, "graphics")
    return Path(path).is_dir()


def __clear_styles(soup):
    for el in soup.select("[style]"):
        del el.attrs["style"]


def __set_data_class(soup):
    for el in soup.select("[class]"):
        el.attrs["data-class"] = el.attrs["class"]


def __convert_links(soup, entry):
    for el in soup.find_all("a"):
        href = el.attrs["href"]
        if re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href):
            ref_entry_id = entry.id_string_to_entry_id(href)
            ref_entry = entry.ID_TO_ENTRY[ref_entry_id]
            expression = ref_entry.get_first_expression()
            el.attrs["href"] = f"?query={expression}&wildcards=off"
        elif re.match(r"^(?:https?:|\?)[\w\W]*", href):
            pass
        else:
            raise Exception(f"Invalid href format: {href}")


def __convert_gaiji(soup, image_dir):
    for el in soup.find_all("img"):
        src = el.attrs["src"]
        if not src.startswith("gaiji"):
            continue
        path = image_dir
        for part in src.split("/"):
            if part.strip() == "":
                continue
            path = os.path.join(path, part)
        ratio = Icons.calculate_ratio(path)
        img = BeautifulSoup("<img/>", "xml").img
        img.attrs = {
            "height": 1.0 if ratio > 1.0 else ratio,
            "width": ratio if ratio > 1.0 else 1.0,
            "sizeUnits": "em",
            "collapsible": False,
            "collapsed": False,
            "background": False,
            "appearance": "monochrome",
            "title": el.attrs["alt"] if el.has_attr("alt") else "",
            "path": f"{os.path.basename(image_dir)}/{src}",
            "src": src,
        }
        el.name = "span"
        el.clear()
        el.append(img)
        el.attrs["style"] = "vertical-align: text-bottom;"


def __convert_graphics(soup, image_dir):
    for el in soup.find_all("img"):
        src = el.attrs["src"]
        if not src.startswith("graphics"):
            continue
        el.attrs = {
            "collapsible": True,
            "collapsed": True,
            "title": el.attrs["alt"] if el.has_attr("alt") else "",
            "path": f"{os.path.basename(image_dir)}/{src}",
            "src": src,
        }


def __convert_logos(soup, image_dir):
    for el in soup.find_all("logo"):
        filename = f"{el.text}-default.svg"
        path = os.path.join(image_dir, filename)
        Icons.make_rectangle(path, el.text, "black", "transparent", "black")
        ratio = Icons.calculate_ratio(path)
        img = BeautifulSoup("<img/>", "xml").img
        img.attrs = {
            "height": 1.0 if ratio > 1.0 else ratio,
            "width": ratio if ratio > 1.0 else 1.0,
            "sizeUnits": "em",
            "collapsible": False,
            "collapsed": False,
            "background": False,
            "appearance": "monochrome",
            "title": el.text,
            "path": f"{os.path.basename(image_dir)}/{filename}",
        }
        el.name = "span"
        el.clear()
        el.append(img)
        el.attrs["style"] = "vertical-align: text-bottom; margin-right: 0.25em;"


def __convert_kanjion_logos(soup, image_dir):
    for el in soup.find_all("漢字音logo"):
        filename = f"{el.text}-default.svg"
        path = os.path.join(image_dir, filename)
        Icons.make_rectangle(path, el.text, "black", "transparent", "black")
        ratio = Icons.calculate_ratio(path)
        img = BeautifulSoup("<img/>", "xml").img
        img.attrs = {
            "height": 1.0 if ratio > 1.0 else ratio,
            "width": ratio if ratio > 1.0 else 1.0,
            "sizeUnits": "em",
            "collapsible": False,
            "collapsed": False,
            "background": False,
            "appearance": "monochrome",
            "title": el.text,
            "path": f"{os.path.basename(image_dir)}/{filename}",
        }
        el.name = "span"
        el.clear()
        el.append(img)
        el.attrs["style"] = "vertical-align: text-bottom; margin-left: 0.25em;"


def __convert_daigoginum(soup, image_dir):
    for el in soup.find_all("大語義num"):
        filename = f"{el.text}-fill.svg"
        path = os.path.join(image_dir, filename)
        Icons.make_monochrome_fill_rectangle(path, el.text)
        ratio = Icons.calculate_ratio(path)
        img = BeautifulSoup("<img/>", "xml").img
        img.attrs = {
            "height": 1.0 if ratio > 1.0 else ratio,
            "width": ratio if ratio > 1.0 else 1.0,
            "sizeUnits": "em",
            "collapsible": False,
            "collapsed": False,
            "background": False,
            "appearance": "monochrome",
            "title": el.text,
            "path": f"{os.path.basename(image_dir)}/{filename}",
        }
        el.name = "span"
        el.clear()
        el.append(img)
        el.attrs["style"] = "vertical-align: text-bottom;"


def __convert_jundaigoginum(soup, image_dir):
    for el in soup.find_all("準大語義num"):
        filename = f"{el.text}-default.svg"
        path = os.path.join(image_dir, filename)
        Icons.make_rectangle(path, el.text, "black", "transparent", "black")
        ratio = Icons.calculate_ratio(path)
        img = BeautifulSoup("<img/>", "xml").img
        img.attrs = {
            "height": 1.0 if ratio > 1.0 else ratio,
            "width": ratio if ratio > 1.0 else 1.0,
            "sizeUnits": "em",
            "collapsible": False,
            "collapsed": False,
            "background": False,
            "appearance": "monochrome",
            "title": el.text,
            "path": f"{os.path.basename(image_dir)}/{filename}",
        }
        el.name = "span"
        el.clear()
        el.append(img)
        el.attrs["style"] = "vertical-align: text-bottom;"
|
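
Note (reviewer sketch, not part of the commit): the attribute dicts built by these helpers follow Yomichan's structured-content image schema. A minimal illustration of the span/img swap, assuming lxml is installed for the "xml" parser and using a made-up one-character icon with a 1:1 ratio:

# Illustrative only; the icon file and its ratio are assumptions.
from bs4 import BeautifulSoup

soup = BeautifulSoup("<entry><大語義num>一</大語義num></entry>", "xml")
el = soup.find("大語義num")
img = BeautifulSoup("<img/>", "xml").img
img.attrs = {
    "height": 1.0, "width": 1.0, "sizeUnits": "em",
    "collapsible": False, "collapsed": False, "background": False,
    "appearance": "monochrome", "title": el.text,
    "path": "daijirin2_images/一-fill.svg",  # hypothetical path
}
el.name = "span"
el.clear()
el.append(img)
el.attrs["style"] = "vertical-align: text-bottom;"
# The <大語義num> element is now a <span> wrapping a Yomichan image node.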
@@ -3,14 +3,14 @@ from css_parser import parseStyle


def make_gloss(soup):
    node = __get_markup_structure(soup)
    node = __get_page_structure(soup)
    return {
        "type": "structured-content",
        "content": node["content"],
    }


def __get_markup_structure(soup):
def __get_page_structure(soup):
    node = {"tag": soup.name}
    content = []
    for child in soup.children:

@@ -19,7 +19,7 @@ def __get_markup_structure(soup):
        if text != "":
            content.append(text)
        else:
            content.append(__get_markup_structure(child))
            content.append(__get_page_structure(child))

    attributes = __get_attributes(soup.attrs)
    for key, val in attributes.items():
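
Note (illustration, not part of the commit): __get_page_structure recursively converts a page's soup into the nested dict shape that Yomichan's structured-content format expects. Assuming the elided parts of the function collect text and child nodes as shown above, markup like <span>例<b>文</b></span> comes out roughly as:

# Hypothetical result, for illustration only:
node = {
    "tag": "span",
    "content": ["例", {"tag": "b", "content": ["文"]}],
}
# make_gloss then emits {"type": "structured-content", "content": node["content"]}.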
@@ -1,11 +1,10 @@
import re
from bs4 import BeautifulSoup

from bot.yomichan.glossary.gloss import make_gloss


def make_glossary(entry):
    soup = BeautifulSoup(entry.markup, "html5lib")
    soup = entry.get_page_soup()
    __replace_punctuation(soup)
    __add_internal_links(soup)
    __convert_paragraphs(soup)
101
bot/yomichan/glossary/name_conversion.py
Normal file
@@ -0,0 +1,101 @@
from bs4 import BeautifulSoup


def convert_names(soup, name_conversion):
    for child in soup.children:
        if child.name is None:
            continue
        else:
            convert_names(child, name_conversion)

        if child.name in name_conversion.keys():
            conversion = name_conversion[child.name]
            if "name" in conversion:
                child.attrs["data-name"] = child.name
                child.name = conversion["name"]
            if "style" in conversion:
                child.attrs["style"] = conversion["style"]
            if "procedures" in conversion:
                procedures = conversion["procedures"]
                __apply_name_conversion_procedures(child, procedures)
        else:
            child.attrs["data-name"] = child.name
            child.name = "span"


def __apply_name_conversion_procedures(soup, procedures):
    functions = {
        "has_class": __has_class,
        "has_parent": __has_parent,
        "has_previous_sibling": __has_previous_sibling,
        "replace": __replace,
        "wrap": __wrap,
        "add_ruby_text": __add_ruby_text,
    }
    for procedure in procedures:
        function = functions[procedure["procedure_name"]]
        parameters = procedure["parameters"]
        function(soup, **parameters)


def __has_class(soup, class_name, key, value):
    if not soup.has_attr("class"):
        return
    soup_classes = soup.attrs["class"].split(" ")
    if class_name not in soup_classes:
        return
    if key == "style":
        soup.attrs["style"] = value
    elif key == "name":
        soup.name = value
    else:
        raise Exception()


def __has_parent(soup, parent_name, key, value):
    if soup.find_parent(parent_name) is None:
        return
    if key == "style":
        soup.attrs["style"] = value
    elif key == "name":
        soup.name = value
    else:
        raise Exception()


def __has_previous_sibling(soup, name, key, value):
    sibling = soup.previous_sibling
    if sibling is None:
        return
    elif sibling.name is None:
        return
    elif sibling.has_attr("data-name"):
        previous_sibling_name = sibling.attrs["data-name"]
    else:
        previous_sibling_name = sibling.name
    if previous_sibling_name != name:
        return
    if key == "style":
        soup.attrs["style"] = value
    elif key == "name":
        soup.name = value
    else:
        raise Exception()


def __replace(soup, old, new):
    soup.string = soup.text.replace(old, new)


def __wrap(soup, l_wrap, r_wrap):
    if soup.text.strip() != "":
        soup.string = f"{l_wrap}{soup.text}{r_wrap}"


def __add_ruby_text(soup, mark, style):
    if style.strip() != "":
        markup = f"<rt><span style='{style}'>{mark}</span></rt>"
    else:
        markup = f"<rt>{mark}</rt>"
    rt_soup = BeautifulSoup(markup, "xml")
    soup.append(rt_soup.rt)
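
Note (usage sketch, not part of the commit): name-conversion tables like the JSON files added below drive convert_names. A minimal runnable example with a made-up <用例> tag (assumes lxml for the "xml" parser):

from bs4 import BeautifulSoup
from bot.yomichan.glossary.name_conversion import convert_names

soup = BeautifulSoup("<entry><用例>例文</用例></entry>", "xml")
name_conversion = {
    "entry": {"name": "div"},
    "用例": {
        "name": "span",
        "procedures": [
            {"procedure_name": "wrap",
             "parameters": {"l_wrap": "「", "r_wrap": "」"}},
        ],
    },
}
convert_names(soup, name_conversion)
# soup is now roughly:
# <div data-name="entry"><span data-name="用例">「例文」</span></div>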
151
bot/yomichan/glossary/smk8.py
Normal file
@@ -0,0 +1,151 @@
import re
import os
from bs4 import BeautifulSoup

import bot.icons as Icons
from bot.soup import delete_soup_nodes
from bot.data import load_smk8_yomichan_name_conversion
from bot.yomichan.glossary.gloss import make_gloss
from bot.yomichan.glossary.name_conversion import convert_names


def make_glossary(entry, image_dir):
    soup = entry.get_page_soup()
    __fill_alts(soup)
    __delete_unused_nodes(soup)
    __clear_styles(soup)
    __set_data_class(soup)
    __convert_links(soup, entry)
    __convert_priority_markers(soup)
    __convert_gaiji(soup, image_dir)
    __convert_rectangles(soup, image_dir)

    name_conversion = load_smk8_yomichan_name_conversion()
    convert_names(soup, name_conversion)

    gloss = make_gloss(soup.span)
    glossary = [gloss]
    return glossary


def __fill_alts(soup):
    for name in ["親見出仮名", "親見出表記"]:
        for el in soup.find_all(name):
            el.name = "a"
            alt = el.attrs["alt"]
            el.string = alt
            el.attrs["href"] = f"?query={alt}&wildcards=off"
            del el.attrs["alt"]


def __delete_unused_nodes(soup):
    for name in ["audio", "連濁"]:
        delete_soup_nodes(soup, name)


def __clear_styles(soup):
    for el in soup.select("[style]"):
        del el.attrs["style"]


def __set_data_class(soup):
    for el in soup.select("[class]"):
        el.attrs["data-class"] = el.attrs["class"]


def __convert_links(soup, entry):
    for el in soup.find_all("a"):
        href = el.attrs["href"]
        if href.startswith("$"):
            el.unwrap()
        elif re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href):
            ref_entry_id = entry.id_string_to_entry_id(href)
            ref_entry = entry.ID_TO_ENTRY[ref_entry_id]
            expression = ref_entry.get_first_expression()
            el.attrs["href"] = f"?query={expression}&wildcards=off"
        elif re.match(r"^(?:https?:|\?)[\w\W]*", href):
            pass
        else:
            raise Exception(f"Invalid href format: {href}")


def __convert_priority_markers(soup):
    style = "vertical-align: super; font-size: 0.6em"
    for el in soup.find_all("img", attrs={"alt": "*"}):
        el.name = "span"
        el.string = "*"
        el.attrs["style"] = style
    for el in soup.find_all("img", attrs={"alt": "⁑"}):
        el.name = "span"
        el.string = "**"
        el.attrs["style"] = style


def __convert_gaiji(soup, image_dir):
    for el in soup.find_all("img"):
        src = el.attrs["src"]
        path = image_dir
        for part in src.split("/"):
            if part.strip() == "":
                continue
            path = os.path.join(path, part)
        ratio = Icons.calculate_ratio(path)
        img = BeautifulSoup("<img/>", "xml").img
        img.attrs = {
            "height": 1.0 if ratio > 1.0 else ratio,
            "width": ratio if ratio > 1.0 else 1.0,
            "sizeUnits": "em",
            "collapsible": False,
            "collapsed": False,
            "background": False,
            "appearance": "monochrome",
            "title": el.attrs["alt"] if el.has_attr("alt") else "",
            "path": f"{os.path.basename(image_dir)}/{src}",
            "src": src,
        }
        el.name = "span"
        el.clear()
        el.append(img)
        el.attrs["style"] = "vertical-align: text-bottom;"


def __convert_rectangles(soup, image_dir):
    cls_to_appearance = {
        "default": "monochrome",
        "fill": "monochrome",
        "red": "auto",
        "redfill": "auto",
    }
    for el in soup.find_all("rect"):
        cls = el.attrs["class"] if el.has_attr("class") else "default"
        filename = f"{el.text}-{cls}.svg"
        path = os.path.join(image_dir, filename)
        __make_rectangle(path, el.text, cls)
        ratio = Icons.calculate_ratio(path)
        img = BeautifulSoup("<img/>", "xml").img
        img.attrs = {
            "height": 1.0 if ratio > 1.0 else ratio,
            "width": ratio if ratio > 1.0 else 1.0,
            "sizeUnits": "em",
            "collapsible": False,
            "collapsed": False,
            "background": False,
            "appearance": cls_to_appearance[cls],
            "title": el.text,
            "path": f"{os.path.basename(image_dir)}/{filename}",
        }
        el.name = "span"
        el.clear()
        el.append(img)
        el.attrs["style"] = "vertical-align: text-bottom; margin-right: 0.25em"


def __make_rectangle(path, text, cls):
    if cls == "fill":
        Icons.make_monochrome_fill_rectangle(path, text)
    elif cls == "red":
        Icons.make_rectangle(path, text, "red", "white", "red")
    elif cls == "redfill":
        Icons.make_rectangle(path, text, "red", "red", "white")
    else:
        Icons.make_rectangle(path, text, "black", "transparent", "black")
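
Note (illustration, not part of the commit): __convert_links rewrites intra-dictionary cross-references into Yomichan search links. Roughly, for a hypothetical reference id:

# <a href="12345">…</a> is resolved through entry.ID_TO_ENTRY and becomes
# <a href="?query=<first headword of entry 12345>&wildcards=off">…</a>.
# "$"-prefixed hrefs are unwrapped (the tag is dropped, its text kept),
# and http(s) / "?" hrefs pass through untouched.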
@@ -7,32 +7,29 @@ __U_KANA_LIST = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
                 "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]

__SUDACHI_DICTIONARY = None
__SUDACHI_INFLECTION_TYPES = None


def sudachi_rules(expression):
    global __SUDACHI_DICTIONARY
    global __SUDACHI_INFLECTION_TYPES
    if __SUDACHI_DICTIONARY is None:
        __SUDACHI_DICTIONARY = dictionary.Dictionary(dict="full").create()
    if __SUDACHI_INFLECTION_TYPES is None:
        categories = load_yomichan_inflection_categories()
        __SUDACHI_INFLECTION_TYPES = categories["sudachi"]
    sudachi_inflection_categories = categories["sudachi"]
    splitmode = tokenizer.Tokenizer.SplitMode.A
    tokens = __SUDACHI_DICTIONARY.tokenize(expression, splitmode)
    if len(tokens) == 0:
        return ""
    pos = tokens[len(tokens)-1].part_of_speech()[4]
    tags = pos.split("-")
    rules = tags_to_rules(expression, tags, __SUDACHI_INFLECTION_TYPES)
    rules = tags_to_rules(expression, tags, sudachi_inflection_categories)
    return rules


def tags_to_rules(expression, tags, inflection_types):
def tags_to_rules(expression, tags, inflection_categories):
    rules = set()
    exp_final_character = expression[len(expression)-1:]
    for tag in tags:
        if tag in inflection_types["sahen"]:
        if tag in inflection_categories["sahen"]:
            if expression.endswith("する"):
                rules.add("vs")
            elif expression.endswith("為る"):

@@ -41,20 +38,20 @@ def tags_to_rules(expression, tags, inflection_types):
                rules.add("vz")
            elif expression.endswith("す"):
                rules.add("v5")
        if tag in inflection_types["godan"]:
        if tag in inflection_categories["godan"]:
            if exp_final_character in __U_KANA_LIST:
                rules.add("v5")
        if tag in inflection_types["ichidan"]:
        if tag in inflection_categories["ichidan"]:
            if expression.endswith("る"):
                rules.add("v1")
        if tag in inflection_types["keiyoushi"]:
        if tag in inflection_categories["keiyoushi"]:
            if expression.endswith("い"):
                rules.add("adj-i")
        if tag in inflection_types["kahen"]:
        if tag in inflection_categories["kahen"]:
            if expression.endswith("くる"):
                rules.add("vk")
            elif expression.endswith("来る"):
                rules.add("vk")
        if tag in inflection_types["sudachi"]:
        if tag in inflection_categories["sudachi"]:
            return sudachi_rules(expression)
    return " ".join(list(rules))
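
Note (usage sketch, not part of the commit): with the per-dictionary inflection categories added further below, tags_to_rules maps a dictionary's part-of-speech tags to Yomichan deinflection rule identifiers:

from bot.data import load_yomichan_inflection_categories
from bot.yomichan.grammar import tags_to_rules

categories = load_yomichan_inflection_categories()["daijirin2"]
print(tags_to_rules("食べる", ["下一"], categories))  # "v1"
print(tags_to_rules("高い", ["形"], categories))      # "adj-i"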
53
bot/yomichan/terms/daijirin2.py
Normal file
@@ -0,0 +1,53 @@
from bot.data import load_yomichan_inflection_categories

from bot.entries.daijirin2 import Daijirin2PhraseEntry as PhraseEntry

from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.daijirin2 import make_glossary
from bot.yomichan.grammar import sudachi_rules, tags_to_rules


class Daijirin2Terminator(Terminator):
    def __init__(self, name):
        super().__init__(name)
        categories = load_yomichan_inflection_categories()
        self._inflection_categories = categories[name]

    def _definition_tags(self, entry):
        return ""

    def _inflection_rules(self, entry, expression):
        if isinstance(entry, PhraseEntry):
            return sudachi_rules(expression)
        pos_tags = entry.get_part_of_speech_tags()
        if len(pos_tags) > 0:
            rules = tags_to_rules(expression, pos_tags,
                                  self._inflection_categories)
        else:
            rules = sudachi_rules(expression)
        return rules

    def _glossary(self, entry):
        if entry.entry_id in self._glossary_cache:
            return self._glossary_cache[entry.entry_id]
        glossary = make_glossary(entry, self._image_dir)
        self._glossary_cache[entry.entry_id] = glossary
        return glossary

    def _sequence(self, entry):
        return entry.entry_id[0] * 100000 + entry.entry_id[1]

    def _term_tags(self, entry):
        return ""

    def _link_glossary_parameters(self, entry):
        return [
            [entry.children, "子"],
            [entry.phrases, "句"],
        ]

    def _subentry_lists(self, entry):
        return [
            entry.children,
            entry.phrases,
        ]
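
Note (illustration, not part of the commit): entry_id here is apparently a (page id, sub-item index) pair, so _sequence packs it into one sortable integer:

# e.g. entry_id == (1234, 7) -> 1234 * 100000 + 7 == 123400007,
# which keeps subentries sequenced directly after their parent page.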
@@ -4,8 +4,8 @@ from bot.yomichan.glossary.jitenon import make_glossary


class JitenonTerminator(Terminator):
    def __init__(self):
        super().__init__()
    def __init__(self, name):
        super().__init__(name)

    def _definition_tags(self, entry):
        return None

@@ -28,8 +28,8 @@ class JitenonTerminator(Terminator):


class JitenonYojiTerminator(JitenonTerminator):
    def __init__(self):
        super().__init__()
    def __init__(self, name):
        super().__init__(name)

    def _inflection_rules(self, entry, expression):
        return ""

@@ -40,8 +40,8 @@ class JitenonYojiTerminator(JitenonTerminator):


class JitenonKotowazaTerminator(JitenonTerminator):
    def __init__(self):
        super().__init__()
    def __init__(self, name):
        super().__init__(name)

    def _inflection_rules(self, entry, expression):
        return sudachi_rules(expression)
58
bot/yomichan/terms/smk8.py
Normal file
@@ -0,0 +1,58 @@
from bot.data import load_yomichan_inflection_categories

from bot.entries.smk8 import Smk8KanjiEntry as KanjiEntry
from bot.entries.smk8 import Smk8PhraseEntry as PhraseEntry

from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.smk8 import make_glossary
from bot.yomichan.grammar import sudachi_rules, tags_to_rules


class Smk8Terminator(Terminator):
    def __init__(self, name):
        super().__init__(name)
        categories = load_yomichan_inflection_categories()
        self._inflection_categories = categories[name]

    def _definition_tags(self, entry):
        if isinstance(entry, KanjiEntry):
            return "造"
        else:
            return ""

    def _inflection_rules(self, entry, expression):
        if isinstance(entry, PhraseEntry):
            return sudachi_rules(expression)
        elif isinstance(entry, KanjiEntry):
            return ""
        pos_tags = entry.get_part_of_speech_tags()
        if len(pos_tags) == 0:
            return sudachi_rules(expression)
        else:
            return tags_to_rules(expression, pos_tags, self._inflection_categories)

    def _glossary(self, entry):
        if entry.entry_id in self._glossary_cache:
            return self._glossary_cache[entry.entry_id]
        glossary = make_glossary(entry, self._image_dir)
        self._glossary_cache[entry.entry_id] = glossary
        return glossary

    def _sequence(self, entry):
        return entry.entry_id[0] * 100000 + entry.entry_id[1]

    def _term_tags(self, entry):
        return ""

    def _link_glossary_parameters(self, entry):
        return [
            [entry.children, "子"],
            [entry.phrases, "句"]
        ]

    def _subentry_lists(self, entry):
        return [
            entry.children,
            entry.phrases,
            entry.kanjis
        ]
@@ -1,6 +1,11 @@
class Terminator:
    def __init__(self):
    def __init__(self, name):
        self._name = name
        self._glossary_cache = {}
        self._image_dir = None

    def set_image_dir(self, image_dir):
        self._image_dir = image_dir

    def make_terms(self, entry):
        terms = []
14782
data/adobe/Adobe-Japan1_sequences.txt
Normal file
File diff suppressed because it is too large
14
data/adobe/override_glyphs.json
Normal file
@@ -0,0 +1,14 @@
{
    "8228": "Ø",
    "9772": "(",
    "9773": ")",
    "10078": "Т",
    "10079": "У",
    "10080": "Ф",
    "10081": "Х",
    "10082": "Ц",
    "10083": "Ч",
    "10084": "Ш",
    "12107": "〻",
    "12180": "⮗"
}
10963
data/daijirin2/kana_abbreviations.csv
Normal file
File diff suppressed because it is too large
8328
data/daijirin2/phrase_readings.csv
Normal file
File diff suppressed because it is too large
290
data/daijirin2/yomichan_name_conversion.json
Normal file
@@ -0,0 +1,290 @@
{
    "a": {},
    "br": {},
    "img": {},
    "div": {},
    "span": {},
    "ruby": {},
    "rt": {},
    "語構成": {
        "name": "span",
        "style": "margin-right: 0.5em;"
    },
    "熟語例G": {
        "name": "div"
    },
    "漢字音G": {
        "name": "ul"
    },
    "漢字音": {
        "name": "li"
    },
    "sup": {
        "name": "span",
        "style": "font-size: 0.6em; vertical-align: super;"
    },
    "p": {
        "name": "div",
        "style": "margin-top: 0.5em; margin-bottom: 0.5em;"
    },
    "カット": {
        "name": "div"
    },
    "中語義": {
        "name": "div"
    },
    "副義": {
        "name": "div"
    },
    "異字同訓解説": {
        "name": "div"
    },
    "異字同訓語義G": {
        "name": "div"
    },
    "細義": {
        "name": "div"
    },
    "単位名": {
        "name": "span",
        "style": "font-size: 0.6em; vertical-align: super;",
        "procedures": [
            {
                "procedure_name": "wrap",
                "parameters": {
                    "l_wrap": "(",
                    "r_wrap": ")"
                }
            }
        ]
    },
    "原籍": {
        "name": "span",
        "style": "font-size: 0.7em; vertical-align: super;",
        "procedures": [
            {
                "procedure_name": "wrap",
                "parameters": {
                    "l_wrap": "(",
                    "r_wrap": ")"
                }
            }
        ]
    },
    "句仮名": {
        "name": "span",
        "style": "font-size: 0.6em; vertical-align: super;"
    },
    "品詞行": {
        "name": "span",
        "style": "font-size: 0.6em; vertical-align: super;",
        "procedures": [
            {
                "procedure_name": "wrap",
                "parameters": {
                    "l_wrap": "(",
                    "r_wrap": ")"
                }
            }
        ]
    },
    "用例": {
        "name": "div"
    },
    "季語G": {
        "name": "div"
    },
    "補説G": {
        "name": "div",
        "procedures": [
            {
                "procedure_name": "has_previous_sibling",
                "parameters": {
                    "name": "語義Gnum",
                    "key": "name",
                    "value": "span"
                }
            },
            {
                "procedure_name": "has_previous_sibling",
                "parameters": {
                    "name": "アクセントG",
                    "key": "name",
                    "value": "span"
                }
            }
        ]
    },
    "語釈": {
        "name": "span",
        "procedures": [
            {
                "procedure_name": "has_previous_sibling",
                "parameters": {
                    "name": "補説G",
                    "key": "name",
                    "value": "div"
                }
            }
        ]
    },
    "品詞用法": {
        "name": "span",
        "style": "font-size: 0.6em; vertical-align: super;",
        "procedures": [
            {
                "procedure_name": "wrap",
                "parameters": {
                    "l_wrap": "(",
                    "r_wrap": ")"
                }
            }
        ]
    },
    "大語義": {
        "name": "div"
    },
    "文語形": {
        "name": "div"
    },
    "慣用G": {
        "name": "div",
        "style": "margin-top: 0.5em"
    },
    "歴史仮名": {
        "name": "span",
        "style": "font-size: 0.6em;",
        "procedures": [
            {
                "procedure_name": "wrap",
                "parameters": {
                    "l_wrap": "(",
                    "r_wrap": ")"
                }
            }
        ]
    },
    "派生G": {
        "name": "div",
        "style": "margin-top: 0.5em"
    },
    "準大語義": {
        "name": "div"
    },
    "見出部": {
        "name": "span"
    },
    "解説部": {
        "name": "div"
    },
    "語義G": {
        "name": "div"
    },
    "語義区切": {
        "name": "span",
        "style": "font-size: 0.7em; vertical-align: super;"
    },
    "返り点": {
        "name": "span",
        "style": "font-size: 0.5em; font-weight: normal; vertical-align: super;",
        "procedures": [
            {
                "procedure_name": "has_class",
                "parameters": {
                    "class_name": "熟語記号",
                    "key": "style",
                    "value": "vertical-align: baseline;"
                }
            }
        ]
    },
    "生没年": {
        "name": "span",
        "style": "font-size: 0.7em;"
    },
    "用法": {
        "name": "span",
        "style": "font-size: 0.7em; vertical-align: super;"
    },
    "異字同訓": {
        "name": "div",
        "style": "margin-top: 0.5em;"
    },
    "異字同訓仮名": {
        "name": "span",
        "style": "font-weight: bold;"
    },
    "異字同訓漢字": {
        "name": "span",
        "style": "font-weight: normal;"
    },
    "異字同訓表記": {
        "name": "span",
        "style": "font-weight: normal;"
    },
    "見出仮名": {
        "name": "span",
        "style": "font-weight: bold;"
    },
    "見出相当部": {
        "name": "span",
        "style": "font-weight: bold;"
    },
    "カットG": {
        "name": "div",
        "style": "margin-top: 0.5em;"
    },
    "sm": {
        "name": "span",
        "style": "font-size: 0.7em;"
    },
    "small": {
        "name": "span",
        "style": "font-size: 0.7em;"
    },
    "sub": {
        "name": "span",
        "style": "font-size: 0.7em; vertical-align: sub;"
    },
    "付記": {
        "name": "span",
        "procedures": [
            {
                "procedure_name": "wrap",
                "parameters": {
                    "l_wrap": "(",
                    "r_wrap": ")"
                }
            }
        ]
    },
    "アクセントG": {
        "name": "span",
        "style": "margin-left: 0.25em; margin-right: 0.25em; font-size: 0.7em; vertical-align: super;"
    },
    "i": {
        "name": "span",
        "style": "font-style: italic;"
    },
    "h1": {
        "name": "span",
        "style": "font-weight: bold;"
    },
    "読みG": {
        "name": "span",
        "style": "vertical-align: super; font-size: 0.6em;"
    },
    "ルビG": {
        "name": "span",
        "style": "vertical-align: super; font-size: 0.6em; font-weight: normal;",
        "procedures": [
            {
                "procedure_name": "wrap",
                "parameters": {
                    "l_wrap": "(",
                    "r_wrap": ")"
                }
            }
        ]
    }
}
1135
data/smk8/phrase_readings.csv
Normal file
File diff suppressed because it is too large
221
data/smk8/yomichan_name_conversion.json
Normal file
@@ -0,0 +1,221 @@
{
    "a": {},
    "br": {},
    "img": {},
    "div": {},
    "span": {},
    "ruby": {},
    "rt": {},
    "語義": {
        "name": "div"
    },
    "副義": {
        "name": "div"
    },
    "派生": {
        "name": "div"
    },
    "用例": {
        "name": "div"
    },
    "参照G": {
        "name": "div"
    },
    "用例G": {
        "name": "div"
    },
    "解説部": {
        "name": "div"
    },
    "大語義": {
        "name": "div"
    },
    "名詞形G": {
        "name": "div"
    },
    "可能形G": {
        "name": "div"
    },
    "派生SubG": {
        "name": "div"
    },
    "子解説部": {
        "name": "div"
    },
    "句解説部": {
        "name": "div"
    },
    "運用解説": {
        "name": "div"
    },
    "表記解説": {
        "name": "div"
    },
    "文法解説": {
        "name": "div"
    },
    "派生SubGF": {
        "name": "div"
    },
    "かぞえ方解説": {
        "name": "div"
    },
    "二分": {
        "name": "span",
        "style": "margin-right: 1.0em;"
    },
    "四分": {
        "name": "span",
        "style": "margin-right: 0.5em;"
    },
    "言換M": {
        "name": "span",
        "style": "font-size: 0.5em;"
    },
    "品詞用法": {
        "name": "span",
        "style": "font-size: 0.7em;"
    },
    "ルビG": {
        "name": "span",
        "style": "vertical-align: super; font-size: 0.65em"
    },
    "アクセント": {
        "name": "span",
        "style": "vertical-align: super; font-size: 0.7em;"
    },
    "アクセント組M": {
        "name": "span",
        "style": "vertical-align: super; font-size: 0.7em;"
    },
    "IT": {
        "name": "span",
        "style": "font-style: italic;"
    },
    "EXCLAMATION": {
        "name": "span",
        "style": "font-style: italic;"
    },
    "B": {
        "name": "span",
        "style": "font-weight: bold;"
    },
    "EM": {
        "name": "span",
        "style": "font-weight: bold;"
    },
    "出現形": {
        "name": "span",
        "style": "font-weight: bold;"
    },
    "見出仮名": {
        "name": "span",
        "style": "font-weight: bold;"
    },
    "基本構文em": {
        "name": "span",
        "style": "font-weight: bold;"
    },
    "ウ濁音参照": {
        "name": "span",
        "style": "font-weight: bold;"
    },
    "表外字": {
        "name": "ruby",
        "procedures": [
            {
                "procedure_name": "add_ruby_text",
                "parameters": {
                    "mark": "︿",
                    "style": "font-size: 2em;"
                }
            }
        ]
    },
    "表外音訓": {
        "name": "ruby",
        "procedures": [
            {
                "procedure_name": "add_ruby_text",
                "parameters": {
                    "mark": "︽",
                    "style": "font-size: 2em;"
                }
            }
        ]
    },
    "表音式": {
        "name": "ruby"
    },
    "表音表記": {
        "name": "rt",
        "procedures": [
            {
                "procedure_name": "replace",
                "parameters": {
                    "old": "(",
                    "new": ""
                }
            },
            {
                "procedure_name": "replace",
                "parameters": {
                    "old": ")",
                    "new": ""
                }
            }
        ]
    },
    "派生見出": {
        "name": "span",
        "style": "font-weight: bold;",
        "procedures": [
            {
                "procedure_name": "has_class",
                "parameters": {
                    "class_name": "normal",
                    "key": "style",
                    "value": "font-weight: normal;"
                }
            }
        ]
    },
    "品詞G": {
        "name": "span",
        "procedures": [
            {
                "procedure_name": "has_parent",
                "parameters": {
                    "parent_name": "品詞用法",
                    "key": "style",
                    "value": "font-size: 1.43em;"
                }
            }
        ]
    },
    "歴史仮名": {
        "name": "span",
        "style": "font-size: 0.6em; font-weight: normal;",
        "procedures": [
            {
                "procedure_name": "wrap",
                "parameters": {
                    "l_wrap": "(",
                    "r_wrap": ")"
                }
            }
        ]
    },
    "ルビ": {
        "name": "span",
        "procedures": [
            {
                "procedure_name": "wrap",
                "parameters": {
                    "l_wrap": "(",
                    "r_wrap": ")"
                }
            }
        ]
    }
}
@@ -1,19 +1,45 @@
俠,侠
俱,倶
儘,侭
凜,凛
剝,剥
𠮟,叱
吞,呑
啞,唖
噓,嘘
嚙,噛
囊,嚢
塡,填
姸,妍
屛,屏
屢,屡
拋,抛
搔,掻
摑,掴
攪,撹
潑,溌
瀆,涜
焰,焔
禱,祷
竜,龍
筓,笄
簞,箪
籠,篭
繡,繍
繫,繋
腁,胼
萊,莱
藪,薮
蟬,蝉
蠟,蝋
軀,躯
醬,醤
醱,醗
頰,頬
顚,顛
驒,騨
鶯,鴬
鷗,鴎
鷽,鴬
鹼,鹸
麴,麹
@@ -6,5 +6,21 @@
        "keiyoushi": ["形容詞", "ナイ", "タイ", "ラシイ"],
        "kahen": ["カ行変格"],
        "sudachi": []
    },
    "smk8": {
        "sahen": ["サ", "サ変型"],
        "godan": ["上二", "下二", "四", "五", "上二型", "下二型", "四段型", "五型", "特殊型"],
        "ichidan": ["上一", "下一", "上一型", "下一型"],
        "keiyoushi": ["形", "形型"],
        "kahen": ["カ"],
        "sudachi": ["連体"]
    },
    "daijirin2": {
        "sahen": ["サ変", "サ特活"],
        "godan": ["ナ変", "マ特活", "ラ特活", "上二", "下二", "五", "四"],
        "ichidan": ["上一", "下一"],
        "keiyoushi": ["形"],
        "kahen": ["カ変"],
        "sudachi": ["助動", "接尾", "枕詞", "連体", "連語"]
    }
}
@@ -24,5 +24,28 @@
            "url": "https://kotowaza.jitenon.jp/"
        },
        "tags": []
    },
    "smk8": {
        "index": {
            "title": "新明解国語辞典 第八版",
            "sequenced": true,
            "format": 3
        },
        "tags": [
            ["子", "name", 0, "子項目", 0],
            ["句", "expression", 0, "句項目", 0],
            ["造", "popular", 0, "造語成分項目", 0]
        ]
    },
    "daijirin2": {
        "index": {
            "title": "大辞林 第四版",
            "sequenced": true,
            "format": 3
        },
        "tags": [
            ["子", "name", 0, "子項目", 0],
            ["句", "expression", 0, "句項目", 0]
        ]
    }
}
68
jitenbot.py
@@ -16,47 +16,59 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
"""

import os
import argparse
from bot.crawlers import JitenonYojiCrawler
from bot.crawlers import JitenonKotowazaCrawler
from bot.crawlers import Smk8Crawler
from bot.crawlers import Daijirin2Crawler


crawlers = {
    "jitenon-yoji": JitenonYojiCrawler,
    "jitenon-kotowaza": JitenonKotowazaCrawler,
}
def directory(d):
    if not os.path.isdir(d):
        raise argparse.ArgumentTypeError(f"`{d}` is not a valid directory")
    elif not os.access(d, os.R_OK):
        raise argparse.ArgumentTypeError(f"Cannot access directory `{d}`")
    else:
        return d


def add_target_argument(parser):
    target_argument_params = {
        "choices": crawlers.keys(),
        "help": "Dictionary to convert."
    }
    parser.add_argument("target", **target_argument_params)


def make_parser():
    argument_parser_params = {
        "prog": "jitenbot",
        "description": "Convert Japanese dictionary files to new formats.",
    }
    parser = argparse.ArgumentParser(**argument_parser_params)
    return parser


def parse_args():
    parser = make_parser()
    add_target_argument(parser)
def parse_args(targets):
    parser = argparse.ArgumentParser(
        prog="jitenbot",
        description="Convert Japanese dictionary files to new formats.",
    )
    parser.add_argument(
        "target",
        choices=targets,
        help="name of dictionary to convert"
    )
    parser.add_argument(
        "-p", "--page-dir",
        help="path to directory containing XML page files",
        type=directory
    )
    parser.add_argument(
        "-i", "--image-dir",
        help="path to directory containing image files (gaiji, etc.)",
        type=directory
    )
    args = parser.parse_args()
    return args


def main():
    args = parse_args()
    crawlers = {
        "jitenon-yoji": JitenonYojiCrawler,
        "jitenon-kotowaza": JitenonKotowazaCrawler,
        "smk8": Smk8Crawler,
        "daijirin2": Daijirin2Crawler,
    }
    args = parse_args(crawlers.keys())
    crawler_class = crawlers[args.target]
    crawler = crawler_class()
    crawler.crawl()
    crawler.read_entries()
    crawler = crawler_class(args)
    crawler.collect_pages()
    crawler.read_pages()
    crawler.make_yomichan_dictionary()
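
Usage note (illustrative, hypothetical paths): the new --page-dir and --image-dir options are consumed by the crawler constructors above, e.g.

    python jitenbot.py smk8 --page-dir ./smk8/pages --image-dir ./smk8/images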
@@ -5,9 +5,12 @@ charset-normalizer==3.1.0
css-parser==1.0.8
html5lib==1.1
idna==3.4
requests==2.28.2
lxml==4.9.2
Pillow==9.5.0
platformdirs==3.5.0
requests==2.29.0
six==1.16.0
soupsieve==2.4
soupsieve==2.4.1
SudachiDict-full==20230110
SudachiPy==0.6.7
urllib3==1.26.15