Add support for Shinmeikai 8th edition & Daijirin 4th edition

parent 0cfa3a19df
commit 5aa954bf2d
134	bot/crawlers.py
@@ -1,41 +1,59 @@
+import os
 import re

 from bs4 import BeautifulSoup

 import bot.scraper as Scraper

 from bot.entries.jitenon import JitenonKotowazaEntry
-from bot.yomichan.export import JitenonKotowazaExporter

 from bot.entries.jitenon import JitenonYojiEntry
+from bot.entries.smk8 import Smk8Entry
+from bot.entries.daijirin2 import Daijirin2Entry

+from bot.yomichan.export import JitenonKotowazaExporter
 from bot.yomichan.export import JitenonYojiExporter
+from bot.yomichan.export import Smk8Exporter
+from bot.yomichan.export import Daijirin2Exporter


-class Crawler():
-    def __init__(self):
-        self._crawl_map = {}
-        self.__entries = []
+class _Crawler():
+    def __init__(self, args):
+        self._page_dir = args.page_dir
+        self._image_dir = args.image_dir
+        self._page_map = {}
+        self._entries = []

-    def read_entries(self):
-        entries_len = len(self._crawl_map)
-        items = self._crawl_map.items()
-        for idx, (entry_id, entry_path) in enumerate(items):
-            update = f"Reading entry {idx+1}/{entries_len}"
+    def read_pages(self):
+        pages_len = len(self._page_map)
+        items = self._page_map.items()
+        for idx, (page_id, page_path) in enumerate(items):
+            update = f"Reading page {idx+1}/{pages_len}"
             print(update, end='\r', flush=True)
-            entry = self._entry_class(entry_id)
-            entry.set_markup(entry_path)
-            self.__entries.append(entry)
+            entry = self._entry_class(page_id)
+            with open(page_path, "r") as f:
+                page = f.read()
+            entry.set_page(page)
+            self._entries.append(entry)
         print()

     def make_yomichan_dictionary(self):
-        self._yomi_exporter.export(self.__entries)
+        self._yomi_exporter.export(self._entries, self._image_dir)
+
+    def _parse_page_id(self, page_link):
+        m = re.search(self._page_id_pattern, page_link)
+        if not m:
+            return None
+        page_id = int(m.group(1))
+        if page_id in self._page_map:
+            return None
+        return page_id


-class JitenonCrawler(Crawler):
-    def __init__(self):
-        super().__init__()
+class _JitenonCrawler(_Crawler):
+    def __init__(self, args):
+        super().__init__(args)

-    def crawl(self):
-        print(f"Scraping {self._name}...")
+    def collect_pages(self):
+        print("Scraping jitenon.jp")
         jitenon = Scraper.Jitenon()
         gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
         gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -44,40 +62,60 @@ class JitenonCrawler(Crawler):
             kana_doc, _ = jitenon.scrape(gojuon_href)
             kana_soup = BeautifulSoup(kana_doc, features="html.parser")
             for kana_a in kana_soup.select(".word_box a", href=True):
-                entry_link = kana_a['href']
-                entry_id = self.__parse_entry_id(entry_link)
-                if entry_id is None:
+                page_link = kana_a['href']
+                page_id = self._parse_page_id(page_link)
+                if page_id is None:
                     continue
-                _, entry_path = jitenon.scrape(entry_link)
-                self._crawl_map[entry_id] = entry_path
-        entries_len = len(self._crawl_map)
-        print(f"Finished scraping {entries_len} entries")
-
-    def __parse_entry_id(self, entry_link):
-        m = re.search(self._entry_id_pattern, entry_link)
-        if not m:
-            return None
-        entry_id = int(m.group(1))
-        if entry_id in self._crawl_map:
-            return None
-        return entry_id
+                _, page_path = jitenon.scrape(page_link)
+                self._page_map[page_id] = page_path
+        pages_len = len(self._page_map)
+        print(f"Finished scraping {pages_len} pages")


-class JitenonYojiCrawler(JitenonCrawler):
-    def __init__(self):
-        super().__init__()
+class JitenonYojiCrawler(_JitenonCrawler):
+    def __init__(self, args):
+        super().__init__(args)
         self._entry_class = JitenonYojiEntry
-        self._yomi_exporter = JitenonYojiExporter()
-        self._name = "jitenon-yoji"
+        self._yomi_exporter = JitenonYojiExporter(args.target)
         self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
-        self._entry_id_pattern = r"([0-9]+).html"
+        self._page_id_pattern = r"([0-9]+)\.html$"


-class JitenonKotowazaCrawler(JitenonCrawler):
-    def __init__(self):
-        super().__init__()
+class JitenonKotowazaCrawler(_JitenonCrawler):
+    def __init__(self, args):
+        super().__init__(args)
         self._entry_class = JitenonKotowazaEntry
-        self._yomi_exporter = JitenonKotowazaExporter()
-        self._name = "jitenon-kotowaza"
+        self._yomi_exporter = JitenonKotowazaExporter(args.target)
         self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
-        self._entry_id_pattern = r"([0-9]+).php"
+        self._page_id_pattern = r"([0-9]+)\.php$"
+
+
+class _MonokakidoCrawler(_Crawler):
+    def __init__(self, args):
+        super().__init__(args)
+        self._page_id_pattern = r"^([0-9]+)\.xml$"
+
+    def collect_pages(self):
+        print(f"Searching for page files in `{self._page_dir}`")
+        for pagefile in os.listdir(self._page_dir):
+            page_id = self._parse_page_id(pagefile)
+            if page_id is None or page_id == 0:
+                continue
+            path = os.path.join(self._page_dir, pagefile)
+            self._page_map[page_id] = path
+        pages_len = len(self._page_map)
+        print(f"Found {pages_len} page files for processing")
+
+
+class Smk8Crawler(_MonokakidoCrawler):
+    def __init__(self, args):
+        super().__init__(args)
+        self._entry_class = Smk8Entry
+        self._yomi_exporter = Smk8Exporter(args.target)
+
+
+class Daijirin2Crawler(_MonokakidoCrawler):
+    def __init__(self, args):
+        super().__init__(args)
+        self._entry_class = Daijirin2Entry
+        self._yomi_exporter = Daijirin2Exporter(args.target)
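Note: a minimal sketch of how the reworked crawler API might be driven end to end. The argparse flags below are illustrative assumptions; the commit only shows that the crawlers read target, page_dir, and image_dir off the args object.

    import argparse

    import bot.crawlers as Crawlers

    # Hypothetical driver; flag names are assumptions, not from this commit.
    parser = argparse.ArgumentParser()
    parser.add_argument("target")        # e.g. "smk8" or "daijirin2"
    parser.add_argument("--page-dir")    # directory of NNN.xml page files
    parser.add_argument("--image-dir")   # dictionary image assets, if any
    args = parser.parse_args()

    crawler = Crawlers.Smk8Crawler(args)   # or Daijirin2Crawler(args)
    crawler.collect_pages()                # scan page_dir for page files
    crawler.read_pages()                   # parse each page into entry objects
    crawler.make_yomichan_dictionary()     # export entries (and images)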
98	bot/data.py
@@ -2,11 +2,24 @@ import os
 import sys
 import json
 import csv
+from functools import cache
 from pathlib import Path

 from platformdirs import user_config_dir


+@cache
+def get_adobe_glyph(code):
+    adobe_glyphs = __load_adobe_glyphs()
+    override_adobe_glyphs = __load_override_adobe_glyphs()
+    if code in override_adobe_glyphs:
+        return override_adobe_glyphs[code]
+    if len(adobe_glyphs[code]) > 1:
+        raise Exception(f"Multiple glyphs available for code {code}")
+    return adobe_glyphs[code][0]
+
+
+@cache
 def load_config():
     config_dir = user_config_dir("jitenbot")
     if not Path(config_dir).is_dir():
@@ -22,18 +35,21 @@ def load_config():
     return config


+@cache
 def load_yomichan_inflection_categories():
     file_name = "yomichan_inflection_categories.json"
     data = __load_json(file_name)
     return data


+@cache
 def load_yomichan_metadata():
     file_name = "yomichan_metadata.json"
     data = __load_json(file_name)
     return data


+@cache
 def load_variant_kanji():
     def loader(data, row):
         data[row[0]] = row[1]
@@ -43,12 +59,94 @@ def load_variant_kanji():
     return data


+@cache
+def load_smk8_phrase_readings():
+    def loader(data, row):
+        entry_id = (int(row[0]), int(row[1]))
+        reading = row[2]
+        data[entry_id] = reading
+    file_name = os.path.join("smk8", "phrase_readings.csv")
+    data = {}
+    __load_csv(file_name, loader, data)
+    return data
+
+
+@cache
+def load_daijirin2_phrase_readings():
+    def loader(data, row):
+        entry_id = (int(row[0]), int(row[1]))
+        reading = row[2]
+        data[entry_id] = reading
+    file_name = os.path.join("daijirin2", "phrase_readings.csv")
+    data = {}
+    __load_csv(file_name, loader, data)
+    return data
+
+
+@cache
+def load_daijirin2_kana_abbreviations():
+    def loader(data, row):
+        entry_id = (int(row[0]), int(row[1]))
+        abbreviations = []
+        for abbr in row[2:]:
+            if abbr.strip() != "":
+                abbreviations.append(abbr)
+        data[entry_id] = abbreviations
+    file_name = os.path.join("daijirin2", "kana_abbreviations.csv")
+    data = {}
+    __load_csv(file_name, loader, data)
+    return data
+
+
+@cache
+def load_smk8_yomichan_name_conversion():
+    file_name = os.path.join("smk8", "yomichan_name_conversion.json")
+    data = __load_json(file_name)
+    return data
+
+
+@cache
+def load_daijirin2_yomichan_name_conversion():
+    file_name = os.path.join("daijirin2", "yomichan_name_conversion.json")
+    data = __load_json(file_name)
+    return data
+
+
+@cache
 def __load_default_config():
     file_name = "default_config.json"
     data = __load_json(file_name)
     return data


+@cache
+def __load_adobe_glyphs():
+    def loader(data, row):
+        if row[0].startswith("#"):
+            return
+        character = chr(int(row[0].split(" ")[0], 16))
+        code = int(row[2].removeprefix(" CID+"))
+        if code in data:
+            if character not in data[code]:
+                data[code].append(character)
+        else:
+            data[code] = [character]
+    file_name = os.path.join("adobe", "Adobe-Japan1_sequences.txt")
+    data = {}
+    __load_csv(file_name, loader, data, delim=';')
+    return data
+
+
+@cache
+def __load_override_adobe_glyphs():
+    file_name = os.path.join("adobe", "override_glyphs.json")
+    json_data = __load_json(file_name)
+    data = {}
+    for key, val in json_data.items():
+        data[int(key)] = val
+    return data
+
+
 def __load_json(file_name):
     file_path = os.path.join("data", file_name)
     if not Path(file_path).is_file():
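Note: every loader in bot/data.py now shares the same shape: a functools.cache-decorated function (Python 3.9+), so each data file is read from disk at most once per process. A self-contained illustration of the pattern:

    from functools import cache

    @cache
    def load_example_table():
        print("reading from disk...")   # happens only on the first call
        return {"例": "れい"}           # illustrative payload

    load_example_table()   # prints, then caches the result
    load_example_table()   # returns the same dict object, no I/O

One caveat of the pattern: callers all receive the same mutable dict, so mutating a returned table would affect every later caller.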
272	bot/entries/daijirin2.py (new file)
@@ -0,0 +1,272 @@
+import re
+from bs4 import BeautifulSoup
+
+import bot.expressions as Expressions
+import bot.soup as Soup
+from bot.data import load_daijirin2_phrase_readings
+from bot.data import load_daijirin2_kana_abbreviations
+from bot.entries.entry import Entry
+from bot.entries.daijirin2_preprocess import preprocess_page
+
+
+class _BaseDaijirin2Entry(Entry):
+    ID_TO_ENTRY = {}
+    SUBENTRY_ID_TO_ENTRY_ID = {}
+
+    def __init__(self, entry_id):
+        super().__init__(entry_id)
+        if entry_id not in self.ID_TO_ENTRY:
+            self.ID_TO_ENTRY[entry_id] = self
+        else:
+            raise Exception(f"Duplicate entry ID: {entry_id}")
+        self.children = []
+        self.phrases = []
+        self._kana_abbreviations = load_daijirin2_kana_abbreviations()
+
+    def set_page(self, page):
+        page = self.__decompose_subentries(page)
+        self._page = page
+
+    def get_page_soup(self):
+        soup = BeautifulSoup(self._page, "xml")
+        return soup
+
+    def get_part_of_speech_tags(self):
+        if self._part_of_speech_tags is not None:
+            return self._part_of_speech_tags
+        self._part_of_speech_tags = []
+        soup = self.get_page_soup()
+        for pos_group in soup.find_all("品詞G"):
+            if pos_group.parent.name == "大語義":
+                self._set_part_of_speech_tags(pos_group)
+        return self._part_of_speech_tags
+
+    def _set_part_of_speech_tags(self, el):
+        pos_names = ["品詞", "品詞活用", "品詞行", "用法"]
+        for child in el.children:
+            if child.name is not None:
+                self._set_part_of_speech_tags(child)
+                continue
+            pos = str(child)
+            if el.name not in pos_names:
+                continue
+            elif pos in ["[", "]"]:
+                continue
+            elif pos in self._part_of_speech_tags:
+                continue
+            else:
+                self._part_of_speech_tags.append(pos)
+
+    def get_headwords(self):
+        if self._headwords is not None:
+            return self._headwords
+        self._set_headwords()
+        self._set_variant_headwords()
+        return self._headwords
+
+    def _set_regular_headwords(self, soup):
+        self._fill_alts(soup)
+        reading = soup.find("見出仮名").text
+        expressions = []
+        for el in soup.find_all("標準表記"):
+            expression = self._clean_expression(el.text)
+            if "—" in expression:
+                kana_abbrs = self._kana_abbreviations[self.entry_id]
+                for abbr in kana_abbrs:
+                    expression = expression.replace("—", abbr, 1)
+            expressions.append(expression)
+        expressions = Expressions.expand_abbreviation_list(expressions)
+        if len(expressions) == 0:
+            expressions.append(reading)
+        self._headwords = {reading: expressions}
+
+    def _set_variant_headwords(self):
+        for expressions in self._headwords.values():
+            Expressions.add_variant_kanji(expressions, self._variant_kanji)
+            Expressions.add_fullwidth(expressions)
+            Expressions.remove_iteration_mark(expressions)
+            Expressions.add_iteration_mark(expressions)
+
+    def __decompose_subentries(self, page):
+        soup = BeautifulSoup(page, features="xml")
+        subentry_parameters = [
+            [Daijirin2ChildEntry, ["子項目"], self.children],
+            [Daijirin2PhraseEntry, ["句項目"], self.phrases],
+        ]
+        for x in subentry_parameters:
+            subentry_class, tags, subentry_list = x
+            for tag in tags:
+                tag_soup = soup.find(tag)
+                while tag_soup is not None:
+                    tag_soup.name = "項目"
+                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
+                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
+                    subentry = subentry_class(subentry_id)
+                    page = tag_soup.decode()
+                    subentry.set_page(page)
+                    subentry_list.append(subentry)
+                    tag_soup.decompose()
+                    tag_soup = soup.find(tag)
+        return soup.decode()
+
+    @staticmethod
+    def id_string_to_entry_id(id_string):
+        parts = id_string.split("-")
+        if len(parts) == 1:
+            return (int(parts[0]), 0)
+        elif len(parts) == 2:
+            # subentries have a hexadecimal part
+            return (int(parts[0]), int(parts[1], 16))
+        else:
+            raise Exception(f"Invalid entry ID: {id_string}")
+
+    @staticmethod
+    def _delete_unused_nodes(soup):
+        unused_nodes = [
+            "漢字音logo", "活用分節", "連語句活用分節", "語構成",
+            "表外字マーク", "表外字マーク", "ルビG"
+        ]
+        for name in unused_nodes:
+            Soup.delete_soup_nodes(soup, name)
+
+    @staticmethod
+    def _clean_expression(expression):
+        for x in ["〈", "〉", "《", "》", " "]:
+            expression = expression.replace(x, "")
+        return expression
+
+    @staticmethod
+    def _fill_alts(soup):
+        for gaiji in soup.find_all(class_="gaiji"):
+            if gaiji.name == "img" and gaiji.has_attr("alt"):
+                gaiji.name = "span"
+                gaiji.string = gaiji.attrs["alt"]
+
+
+class Daijirin2Entry(_BaseDaijirin2Entry):
+    def __init__(self, page_id):
+        entry_id = (page_id, 0)
+        super().__init__(entry_id)
+
+    def set_page(self, page):
+        page = preprocess_page(page)
+        super().set_page(page)
+
+    def _set_headwords(self):
+        soup = self.get_page_soup()
+        self._delete_unused_nodes(soup)
+        if soup.find("漢字見出") is not None:
+            self._set_kanji_headwords(soup)
+        elif soup.find("略語G") is not None:
+            self._set_acronym_headwords(soup)
+        else:
+            self._set_regular_headwords(soup)
+
+    def _set_kanji_headwords(self, soup):
+        readings = []
+        for el in soup.find_all("漢字音"):
+            hira = Expressions.kata_to_hira(el.text)
+            readings.append(hira)
+        if soup.find("漢字音") is None:
+            readings.append("")
+        expressions = []
+        for el in soup.find_all("漢字見出"):
+            expressions.append(el.text)
+        self._headwords = {}
+        for reading in readings:
+            self._headwords[reading] = expressions
+
+    def _set_acronym_headwords(self, soup):
+        expressions = []
+        for el in soup.find_all("略語"):
+            expression_parts = []
+            for part in el.find_all(["欧字", "和字"]):
+                expression_parts.append(part.text)
+            expression = "".join(expression_parts)
+            expressions.append(expression)
+        self._headwords = {"": expressions}
+
+
+class Daijirin2ChildEntry(_BaseDaijirin2Entry):
+    def __init__(self, entry_id):
+        super().__init__(entry_id)
+
+    def _set_headwords(self):
+        soup = self.get_page_soup()
+        self._delete_unused_nodes(soup)
+        self._set_regular_headwords(soup)
+
+
+class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
+    def __init__(self, entry_id):
+        super().__init__(entry_id)
+        self.__phrase_readings = load_daijirin2_phrase_readings()
+
+    def get_part_of_speech_tags(self):
+        # phrases do not contain these tags
+        return []
+
+    def _set_headwords(self):
+        soup = self.get_page_soup()
+        headwords = {}
+        expressions = self._find_expressions(soup)
+        readings = self._find_readings()
+        for idx, expression in enumerate(expressions):
+            reading = readings[idx]
+            if reading in headwords:
+                headwords[reading].append(expression)
+            else:
+                headwords[reading] = [expression]
+        self._headwords = headwords
+
+    def _find_expressions(self, soup):
+        self._delete_unused_nodes(soup)
+        text = soup.find("句表記").text
+        text = self._clean_expression(text)
+        alternatives = self.__expand_alternatives(text)
+        expressions = []
+        for alt in alternatives:
+            for exp in Expressions.expand_abbreviation(alt):
+                expressions.append(exp)
+        return expressions
+
+    def _find_readings(self):
+        text = self.__phrase_readings[self.entry_id]
+        alternatives = self.__expand_alternatives(text)
+        readings = []
+        for alt in alternatives:
+            for reading in Expressions.expand_abbreviation(alt):
+                readings.append(reading)
+        return readings
+
+    @staticmethod
+    def __expand_alternatives(expression):
+        """Return a list of strings described by = notation.
+
+        eg. "同じ穴の=狢(=狐・狸)" -> [
+            "同じ穴の狢", "同じ穴の狐", "同じ穴の狸"
+        ]
+        eg. "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥" -> [
+            "聞くは一時の恥、聞かぬは末代の恥",
+            "聞くは一時の恥、聞かぬは一生の恥",
+            "聞くは一旦の恥、聞かぬは末代の恥",
+            "聞くは一旦の恥、聞かぬは一生の恥"
+        ]
+        """
+        group_pattern = r"([^=]+)(=([^(]+)(=([^(]+)))?"
+        groups = re.findall(group_pattern, expression)
+        expressions = [""]
+        for group in groups:
+            new_exps = []
+            for expression in expressions:
+                new_exps.append(expression + group[0])
+            expressions = new_exps.copy()
+            if group[1] == "":
+                continue
+            new_exps = []
+            for expression in expressions:
+                new_exps.append(expression + group[2])
+            for expression in expressions:
+                for alt in group[3].split("・"):
+                    new_exps.append(expression + alt)
+            expressions = new_exps.copy()
+        return expressions
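Note: the (page, subentry) ID scheme used by id_string_to_entry_id is worth seeing concretely; subentry suffixes are parsed as hexadecimal. The IDs below are made up for illustration:

    from bot.entries.daijirin2 import Daijirin2Entry

    # A bare page ID maps to (page, 0); "12345-0a" carries a hex subentry part.
    assert Daijirin2Entry.id_string_to_entry_id("12345") == (12345, 0)
    assert Daijirin2Entry.id_string_to_entry_id("12345-0a") == (12345, 10)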
56	bot/entries/daijirin2_preprocess.py (new file)
@@ -0,0 +1,56 @@
+import re
+from bs4 import BeautifulSoup
+
+from bot.data import get_adobe_glyph
+
+
+__GAIJI = {
+    "gaiji/DJRK0002.svg": "𦬇",
+    "gaiji/U芸E0102.svg": "芸",
+}
+
+
+def preprocess_page(page):
+    soup = BeautifulSoup(page, features="xml")
+    __replace_glyph_codes(soup)
+    __add_gaiji_alt_text(soup)
+    __replace_halfwidth_braces(soup)
+    page = __strip_page(soup)
+    return page
+
+
+def __replace_glyph_codes(soup):
+    for el in soup.find_all(style=True):
+        m = re.search(r"^glyph:([0-9]+);?$", el.attrs["style"])
+        if not m:
+            continue
+        del el.attrs["style"]
+        if el.has_attr("alt"):
+            el.string = el.attrs["alt"]
+            continue
+        code = int(m.group(1))
+        for geta in el.find_all(string="〓"):
+            glyph = get_adobe_glyph(code)
+            geta.replace_with(glyph)
+
+
+def __add_gaiji_alt_text(soup):
+    for gaiji in soup.find_all(class_="gaiji"):
+        src = gaiji.attrs["src"] if gaiji.has_attr("src") else ""
+        if src in __GAIJI:
+            gaiji.attrs["alt"] = __GAIJI[src]
+
+
+def __replace_halfwidth_braces(soup):
+    for x in soup.find_all("送り仮名省略"):
+        for el in x.find_all(string="("):
+            el.replace_with("(")
+        for el in x.find_all(string=")"):
+            el.replace_with(")")
+
+
+def __strip_page(soup):
+    koumoku = soup.find("項目")
+    if koumoku is None:
+        raise Exception(f"Primary 項目 not found in page:\n{soup.prettify()}")
+    return koumoku.decode()
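Note: the glyph-code replacement is easier to follow in isolation. A sketch of the idea, with a stand-in mapping instead of get_adobe_glyph (the CID value and character are invented for the example):

    from bs4 import BeautifulSoup

    fake_glyphs = {7923: "辻"}  # stand-in for the Adobe-Japan1 lookup

    soup = BeautifulSoup('<項目><span style="glyph:7923">〓</span></項目>', "xml")
    for el in soup.find_all(style=True):
        code = int(el.attrs["style"].removeprefix("glyph:").rstrip(";"))
        del el.attrs["style"]
        for geta in el.find_all(string="〓"):
            geta.replace_with(fake_glyphs[code])

    print(soup.decode())  # the 〓 placeholder is now 辻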
38	bot/entries/entry.py (new file)
@@ -0,0 +1,38 @@
+from abc import ABC, abstractmethod
+from bot.data import load_variant_kanji
+
+
+class Entry(ABC):
+    def __init__(self, entry_id):
+        self.entry_id = entry_id
+        self._page = None
+        self._headwords = None
+        self._part_of_speech_tags = None
+        self._variant_kanji = load_variant_kanji()
+
+    @abstractmethod
+    def set_page(self, page):
+        pass
+
+    @abstractmethod
+    def get_page_soup(self):
+        pass
+
+    @abstractmethod
+    def get_headwords(self):
+        pass
+
+    @abstractmethod
+    def get_part_of_speech_tags(self):
+        pass
+
+    def get_first_expression(self):
+        headwords = self.get_headwords()
+        expressions = next(iter(headwords.values()))
+        expression = expressions[0]
+        return expression
+
+    def get_first_reading(self):
+        headwords = self.get_headwords()
+        reading = next(iter(headwords.keys()))
+        return reading
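Note: a toy concrete subclass, to show the minimum surface the new ABC demands of a dictionary format. It is illustrative only; instantiating it needs the project's data files, since Entry.__init__ loads the variant-kanji table.

    from bs4 import BeautifulSoup

    from bot.entries.entry import Entry

    class MinimalEntry(Entry):
        def set_page(self, page):
            self._page = page

        def get_page_soup(self):
            return BeautifulSoup(self._page, "html.parser")

        def get_headwords(self):
            if self._headwords is None:
                self._headwords = {"よみ": ["表記"]}  # illustrative headword map
            return self._headwords

        def get_part_of_speech_tags(self):
            return []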
bot/entries/jitenon.py
@@ -2,29 +2,21 @@ import re
 from datetime import datetime, date
 from bs4 import BeautifulSoup

-from bot.data import load_variant_kanji
+from bot.entries.entry import Entry
 import bot.expressions as Expressions


-class JitenonEntry:
-    _VARIANT_KANJI = None
-
+class _JitenonEntry(Entry):
     def __init__(self, entry_id):
-        if self._VARIANT_KANJI is None:
-            self._VARIANT_KANJI = load_variant_kanji()
-        self.entry_id = entry_id
-        self.markup = ""
+        super().__init__(entry_id)
         self.modified_date = date(1970, 1, 1)
         self.attribution = ""
         for column in self._COLUMNS.values():
             setattr(self, column[0], column[1])
-        self._headwords = None

-    def set_markup(self, path):
-        with open(path, "r") as f:
-            html = f.read()
-        soup = BeautifulSoup(html, features="html5lib")
-        self.__set_modified_date(html)
+    def set_page(self, page):
+        soup = BeautifulSoup(page, features="html5lib")
+        self.__set_modified_date(page)
         self.attribution = soup.find(class_="copyright").text
         table = soup.find(class_="kanjirighttb")
         rows = table.find("tbody").find_all("tr")
@@ -33,7 +25,11 @@ class JitenonEntry:
             colname = row.th.text if row.th is not None else colname
             colval = self.__clean_text(row.td.text)
             self.__set_column(colname, colval)
-        self.markup = table.decode()
+        self._page = table.decode()
+
+    def get_page_soup(self):
+        soup = BeautifulSoup(self._page, "html5lib")
+        return soup

     def get_headwords(self):
         if self._headwords is not None:
@@ -42,16 +38,9 @@ class JitenonEntry:
         self._set_variant_headwords()
         return self._headwords

-    def get_first_expression(self):
-        headwords = self.get_headwords()
-        expressions = next(iter(headwords.values()))
-        expression = expressions[0]
-        return expression
-
-    def get_first_reading(self):
-        headwords = self.get_headwords()
-        reading = next(iter(headwords.keys()))
-        return reading
+    def get_part_of_speech_tags(self):
+        # Jitenon doesn't have any
+        return []

     def _set_headwords(self):
         headwords = {}
@@ -66,8 +55,8 @@ class JitenonEntry:
             headwords[reading].append(expression)
         self._headwords = headwords

-    def __set_modified_date(self, html):
-        m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
+    def __set_modified_date(self, page):
+        m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
         if not m:
             return
         date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
@@ -94,7 +83,7 @@ class JitenonEntry:
             return [m.group(1)]
        m = re.search(r"^[ぁ-ヿ、]+([ぁ-ヿ、])[ぁ-ヿ、]+$", yomikata)
         if m:
-            return Expressions.expand_shouryaku(yomikata)
+            return Expressions.expand_abbreviation(yomikata)
        m = re.search(r"^([ぁ-ヿ、]+)(([ぁ-ヿ/\s、]+))$", yomikata)
         if m:
             yomikatas = [m.group(1)]
@@ -139,7 +128,7 @@ class JitenonEntry:
         return ",".join(colvals)


-class JitenonYojiEntry(JitenonEntry):
+class JitenonYojiEntry(_JitenonEntry):
     _COLUMNS = {
         "四字熟語": ["expression", ""],
         "読み方": ["yomikata", ""],
@@ -151,15 +140,15 @@ class JitenonYojiEntry(JitenonEntry):
         "類義語": ["ruigigo", []],
     }

-    def __init__(self, sequence):
-        super().__init__(sequence)
+    def __init__(self, entry_id):
+        super().__init__(entry_id)

     def _set_variant_headwords(self):
         for expressions in self._headwords.values():
-            Expressions.add_variant_kanji(expressions, self._VARIANT_KANJI)
+            Expressions.add_variant_kanji(expressions, self._variant_kanji)


-class JitenonKotowazaEntry(JitenonEntry):
+class JitenonKotowazaEntry(_JitenonEntry):
     _COLUMNS = {
         "言葉": ["expression", ""],
         "読み方": ["yomikata", ""],
@@ -170,8 +159,8 @@ class JitenonKotowazaEntry(JitenonEntry):
         "類句": ["ruiku", []],
     }

-    def __init__(self, sequence):
-        super().__init__(sequence)
+    def __init__(self, entry_id):
+        super().__init__(entry_id)

     def _set_headwords(self):
         if self.expression == "金棒引き・鉄棒引き":
@@ -183,5 +172,5 @@ class JitenonKotowazaEntry(JitenonEntry):

     def _set_variant_headwords(self):
         for expressions in self._headwords.values():
-            Expressions.add_variant_kanji(expressions, self._VARIANT_KANJI)
+            Expressions.add_variant_kanji(expressions, self._variant_kanji)
             Expressions.add_fullwidth(expressions)
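Note: the _COLUMNS tables drive attribute setup generically; each scraped row's Japanese header selects an attribute name and default value. Reduced to its essentials:

    _COLUMNS = {
        "四字熟語": ["expression", ""],
        "読み方": ["yomikata", ""],
    }

    class Demo:
        def __init__(self):
            for column in _COLUMNS.values():
                setattr(self, column[0], column[1])   # same loop as _JitenonEntry

    d = Demo()
    print(repr(d.expression), repr(d.yomikata))   # '' ''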
242	bot/entries/smk8.py (new file)
@@ -0,0 +1,242 @@
+import re
+from bs4 import BeautifulSoup
+
+import bot.expressions as Expressions
+import bot.soup as Soup
+from bot.data import load_smk8_phrase_readings
+from bot.entries.entry import Entry
+from bot.entries.smk8_preprocess import preprocess_page
+
+
+class _BaseSmk8Entry(Entry):
+    ID_TO_ENTRY = {}
+    SUBENTRY_ID_TO_ENTRY_ID = {}
+
+    def __init__(self, entry_id):
+        super().__init__(entry_id)
+        if entry_id not in self.ID_TO_ENTRY:
+            self.ID_TO_ENTRY[entry_id] = self
+        else:
+            raise Exception(f"Duplicate entry ID: {entry_id}")
+        self.children = []
+        self.phrases = []
+        self.kanjis = []
+
+    def set_page(self, page):
+        page = self.__decompose_subentries(page)
+        self._page = page
+
+    def get_page_soup(self):
+        soup = BeautifulSoup(self._page, "xml")
+        return soup
+
+    def get_headwords(self):
+        if self._headwords is not None:
+            return self._headwords
+        self._set_headwords()
+        self._set_variant_headwords()
+        return self._headwords
+
+    def get_part_of_speech_tags(self):
+        if self._part_of_speech_tags is not None:
+            return self._part_of_speech_tags
+        self._part_of_speech_tags = []
+        soup = self.get_page_soup()
+        headword_info = soup.find("見出要素")
+        if headword_info is None:
+            return self._part_of_speech_tags
+        for tag in headword_info.find_all("品詞M"):
+            if tag.text not in self._part_of_speech_tags:
+                self._part_of_speech_tags.append(tag.text)
+        return self._part_of_speech_tags
+
+    def _set_variant_headwords(self):
+        for expressions in self._headwords.values():
+            Expressions.add_variant_kanji(expressions, self._variant_kanji)
+            Expressions.add_fullwidth(expressions)
+            Expressions.remove_iteration_mark(expressions)
+            Expressions.add_iteration_mark(expressions)
+
+    def _find_reading(self, soup):
+        midasi_kana = soup.find("見出仮名")
+        reading = midasi_kana.text
+        for x in [" ", "・"]:
+            reading = reading.replace(x, "")
+        return reading
+
+    def _find_expressions(self, soup):
+        clean_expressions = []
+        for expression in soup.find_all("標準表記"):
+            clean_expression = self._clean_expression(expression.text)
+            clean_expressions.append(clean_expression)
+        expressions = Expressions.expand_abbreviation_list(clean_expressions)
+        return expressions
+
+    def __decompose_subentries(self, page):
+        soup = BeautifulSoup(page, features="xml")
+        subentry_parameters = [
+            [Smk8ChildEntry, ["子項目F", "子項目"], self.children],
+            [Smk8PhraseEntry, ["句項目F", "句項目"], self.phrases],
+            [Smk8KanjiEntry, ["造語成分項目"], self.kanjis],
+        ]
+        for x in subentry_parameters:
+            subentry_class, tags, subentry_list = x
+            for tag in tags:
+                tag_soup = soup.find(tag)
+                while tag_soup is not None:
+                    tag_soup.name = "項目"
+                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
+                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
+                    subentry = subentry_class(subentry_id)
+                    page = tag_soup.decode()
+                    subentry.set_page(page)
+                    subentry_list.append(subentry)
+                    tag_soup.decompose()
+                    tag_soup = soup.find(tag)
+        return soup.decode()
+
+    @staticmethod
+    def id_string_to_entry_id(id_string):
+        parts = id_string.split("-")
+        if len(parts) == 1:
+            return (int(parts[0]), 0)
+        elif len(parts) == 2:
+            # subentries have a hexadecimal part
+            return (int(parts[0]), int(parts[1], 16))
+        else:
+            raise Exception(f"Invalid entry ID: {id_string}")
+
+    @staticmethod
+    def _clean_expression(expression):
+        for x in ["〈", "〉", "{", "}", "…", " "]:
+            expression = expression.replace(x, "")
+        return expression
+
+    @staticmethod
+    def _fill_alts(soup):
+        for e in soup.find_all(["親見出仮名", "親見出表記"]):
+            e.string = e.attrs["alt"]
+        for gaiji in soup.find_all("外字"):
+            gaiji.string = gaiji.img.attrs["alt"]
+
+
+class Smk8Entry(_BaseSmk8Entry):
+    def __init__(self, page_id):
+        entry_id = (page_id, 0)
+        super().__init__(entry_id)
+
+    def set_page(self, page):
+        page = preprocess_page(page)
+        super().set_page(page)
+
+    def _set_headwords(self):
+        soup = self.get_page_soup()
+        Soup.delete_soup_nodes(soup, "表音表記")
+        self._fill_alts(soup)
+        reading = self._find_reading(soup)
+        expressions = []
+        if soup.find("見出部").find("標準表記") is None:
+            expressions.append(reading)
+        for expression in self._find_expressions(soup):
+            if expression not in expressions:
+                expressions.append(expression)
+        self._headwords = {reading: expressions}
+
+
+class Smk8ChildEntry(_BaseSmk8Entry):
+    def __init__(self, entry_id):
+        super().__init__(entry_id)
+
+    def _set_headwords(self):
+        soup = self.get_page_soup()
+        Soup.delete_soup_nodes(soup, "表音表記")
+        self._fill_alts(soup)
+        reading = self._find_reading(soup)
+        expressions = []
+        if soup.find("子見出部").find("標準表記") is None:
+            expressions.append(reading)
+        for expression in self._find_expressions(soup):
+            if expression not in expressions:
+                expressions.append(expression)
+        self._headwords = {reading: expressions}
+
+
+class Smk8PhraseEntry(_BaseSmk8Entry):
+    def __init__(self, entry_id):
+        super().__init__(entry_id)
+        self.__phrase_readings = load_smk8_phrase_readings()
+
+    def get_part_of_speech_tags(self):
+        # phrases do not contain these tags
+        return []
+
+    def _set_headwords(self):
+        soup = self.get_page_soup()
+        headwords = {}
+        expressions = self._find_expressions(soup)
+        readings = self._find_readings()
+        for idx, expression in enumerate(expressions):
+            reading = readings[idx]
+            if reading in headwords:
+                headwords[reading].append(expression)
+            else:
+                headwords[reading] = [expression]
+        self._headwords = headwords
+
+    def _find_expressions(self, soup):
+        Soup.delete_soup_nodes(soup, "ルビG")
+        self._fill_alts(soup)
+        text = soup.find("標準表記").text
+        text = self._clean_expression(text)
+        alternatives = self.__expand_alternatives(text)
+        expressions = []
+        for alt in alternatives:
+            for exp in Expressions.expand_abbreviation(alt):
+                expressions.append(exp)
+        return expressions
+
+    def _find_readings(self):
+        text = self.__phrase_readings[self.entry_id]
+        alternatives = self.__expand_alternatives(text)
+        readings = []
+        for alt in alternatives:
+            for reading in Expressions.expand_abbreviation(alt):
+                readings.append(reading)
+        return readings
+
+    @staticmethod
+    def __expand_alternatives(expression):
+        """Return a list of strings described by △ notation.
+
+        eg. "△金(時間・暇)に飽かして" -> [
+            "金に飽かして", "時間に飽かして", "暇に飽かして"
+        ]
+        """
+        m = re.search(r"△([^(]+)(([^(]+))", expression)
+        if not m:
+            return [expression]
+        alt_parts = [m.group(1)]
+        for alt_part in m.group(2).split("・"):
+            alt_parts.append(alt_part)
+        alts = []
+        for alt_part in alt_parts:
+            alt_exp = re.sub(r"△[^(]+([^(]+)", alt_part, expression)
+            alts.append(alt_exp)
+        return alts
+
+
+class Smk8KanjiEntry(_BaseSmk8Entry):
+    def __init__(self, entry_id):
+        super().__init__(entry_id)
+
+    def _set_headwords(self):
+        soup = self.get_page_soup()
+        self._fill_alts(soup)
+        reading = self.__get_parent_reading()
+        expressions = self._find_expressions(soup)
+        self._headwords = {reading: expressions}
+
+    def __get_parent_reading(self):
+        parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
+        parent = self.ID_TO_ENTRY[parent_id]
+        reading = parent.get_first_reading()
+        return reading
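Note: both dictionaries split subentries off with the same pattern in __decompose_subentries: find a subentry tag, rename it to 項目, detach it as its own page, and repeat until none remain. A standalone sketch with illustrative tags and IDs:

    from bs4 import BeautifulSoup

    page = '<項目 id="1"><子項目 id="1-a">x</子項目><子項目 id="1-b">y</子項目></項目>'
    soup = BeautifulSoup(page, "xml")

    subpages = []
    tag_soup = soup.find("子項目")
    while tag_soup is not None:
        tag_soup.name = "項目"               # the subentry becomes a page root
        subpages.append(tag_soup.decode())
        tag_soup.decompose()                 # remove it from the parent page
        tag_soup = soup.find("子項目")

    print(len(subpages))   # 2 subentry pages split off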
91	bot/entries/smk8_preprocess.py (new file)
@@ -0,0 +1,91 @@
+import re
+from bs4 import BeautifulSoup
+
+from bot.data import get_adobe_glyph
+
+
+__GAIJI = {
+    "gaiji/5350.svg": "卐",
+    "gaiji/62cb.svg": "抛",
+    "gaiji/7be1.svg": "簒",
+}
+
+
+def preprocess_page(page):
+    page = __strip_page(page)
+    page = __replace_glyph_codes(page)
+    page = __format_hyougai_marks(page)
+    return page
+
+
+def __strip_page(page):
+    soup = BeautifulSoup(page, features="xml")
+    koumoku = soup.find(["項目", "字音語参照項目"])
+    if koumoku is not None:
+        return koumoku.decode()
+    else:
+        raise Exception(f"Primary 項目 not found in page:\n{soup.prettify()}")
+
+
+def __replace_glyph_codes(page):
+    soup = BeautifulSoup(page, features="xml")
+    for span in soup.find_all("span"):
+        if "style" in span.attrs:
+            m = re.search(r"^glyph:([0-9]+);$", span.attrs["style"])
+            del span.attrs["style"]
+            if m is None:
+                continue
+            code = int(m.group(1))
+            for geta in span.find_all(string="〓"):
+                glyph = get_adobe_glyph(code)
+                geta.replace_with(glyph)
+    for hyouki in soup.find_all("親見出表記"):
+        if "alt" not in hyouki.attrs:
+            continue
+        alt = hyouki.attrs["alt"]
+        codes = re.findall(r"{CID([0-9]+)}", alt)
+        for code in codes:
+            glyph = get_adobe_glyph(int(code))
+            alt = alt.replace(f"{{CID{code}}}", glyph)
+        hyouki.attrs["alt"] = alt
+    for gaiji in soup.find_all("外字"):
+        img = gaiji.img
+        src = img.attrs["src"] if img.has_attr("src") else ""
+        if src in __GAIJI:
+            img.attrs["alt"] = __GAIJI[src]
+    return soup.decode()
+
+
+def __format_hyougai_marks(page):
+    soup = BeautifulSoup(page, features="xml")
+    for el in soup.find_all("外字"):
+        el.string = "〓"
+    text = soup.text
+    for x in ["\n", "\t", " "]:
+        text = text.replace(x, "")
+    text = re.sub(r"〈([^〈]+)〉", r"\1", text)
+    page = re.sub(r"〈([^〈]+)〉", r"␂\1␃", page)
+    for mark in re.findall(r"《.", text):
+        if mark[1] == "〓":
+            page = page.replace("《", "<表外音訓/>", 1)
+        else:
+            page = re.sub(f"《([^{mark[1]}]*)({mark[1]})",
+                          r"\1<表外音訓>\2</表外音訓>",
+                          page, count=1)
+    for mark in re.findall(r"〈.", text):
+        if mark[1] == "〓":
+            page = page.replace("〈", "<表外字/>", 1)
+        else:
+            page = re.sub(f"〈([^{mark[1]}]*)({mark[1]})",
+                          r"\1<表外字>\2</表外字>",
+                          page, count=1)
+    page = page.replace("␂", "〈")
+    page = page.replace("␃", "〉")
+    soup = BeautifulSoup(page, features="xml")
+    for el in soup.find_all("表外音訓"):
+        if el.text == "":
+            el.append(el.next_sibling)
+    for el in soup.find_all("表外字"):
+        if el.text == "":
+            el.append(el.next_sibling)
+    return soup.decode()
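Note: the {CIDnnnn} alt-text replacement in __replace_glyph_codes, isolated. The mapping below stands in for get_adobe_glyph, and the CID value is invented for the example:

    import re

    glyphs = {7923: "辻"}   # stand-in for the Adobe-Japan1 lookup

    alt = "一{CID7923}"
    for code in re.findall(r"{CID([0-9]+)}", alt):
        alt = alt.replace(f"{{CID{code}}}", glyphs[int(code)])
    print(alt)   # 一辻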
bot/expressions.py
@@ -1,14 +1,30 @@
 import re

-__WIDE_MAP = {i: i + 0xFEE0 for i in range(0x21, 0x7F)}
+__KATA_TO_HIRA_MAP = {
+    i: i - 96 for i in [
+        *range(0x30A1, 0x30F6),
+        *range(0x30FD, 0x30FE),
+    ]
+}
+
+
+__HALFWIDTH_TO_FULLWIDTH_MAP = {
+    i: i + 0xFEE0 for i in [
+        *range(0x21, 0x7F),
+    ]
+}
+
+
+def kata_to_hira(text):
+    hira = text.translate(__KATA_TO_HIRA_MAP)
+    return hira


 def add_fullwidth(expressions):
     for expression in expressions:
-        if re.match(r"[A-Za-z0-9]", expression):
-            new_exp = expression.translate(__WIDE_MAP)
-            if new_exp not in expressions:
-                expressions.append(new_exp)
+        new_exp = expression.translate(__HALFWIDTH_TO_FULLWIDTH_MAP)
+        if new_exp not in expressions:
+            expressions.append(new_exp)


 def add_variant_kanji(expressions, variant_kanji):
@@ -23,23 +39,50 @@ def add_variant_kanji(expressions, variant_kanji):
         expressions.append(new_exp)


-def expand_shouryaku(shouryaku):
+def remove_iteration_mark(expressions):
+    iterated_kanji = r"(.)々"
+    for expression in expressions:
+        for char in re.findall(iterated_kanji, expression):
+            new_exp = expression.replace(f"{char}々", f"{char}{char}")
+            if new_exp not in expressions:
+                expressions.append(new_exp)
+
+
+def add_iteration_mark(expressions):
+    repeat_kanji = r"([^0-z0-zぁ-ヿ])\1"
+    for expression in expressions:
+        for char in re.findall(repeat_kanji, expression):
+            new_exp = expression.replace(f"{char}{char}", f"{char}々")
+            if new_exp not in expressions:
+                expressions.append(new_exp)
+
+
+def expand_abbreviation(abbreviated_expression):
     """Return a list of words described by a 省略 notation.
     eg. "有(り)合(わ)せ" -> [
         "有り合わせ", "有合わせ", "有り合せ", "有合せ"
     ]
     """
-    groups = re.findall(r"([^(]*)((([^(]+)))?", shouryaku)
-    forms = [""]
+    groups = re.findall(r"([^(]*)((([^(]+)))?", abbreviated_expression)
+    expressions = [""]
     for group in groups:
-        new_forms = []
-        for form in forms:
-            new_forms.append(form + group[0])
-        forms = new_forms.copy()
+        new_exps = []
+        for expression in expressions:
+            new_exps.append(expression + group[0])
+        expressions = new_exps.copy()
         if group[2] == '':
             continue
-        new_forms = []
-        for form in forms:
-            new_forms.append(form + group[2])
-        forms = new_forms.copy() + forms.copy()
-    return forms
+        new_exps = []
+        for expression in expressions:
+            new_exps.append(expression + group[2])
+        expressions = new_exps.copy() + expressions.copy()
+    return expressions
+
+
+def expand_abbreviation_list(expressions):
+    new_exps = []
+    for expression in expressions:
+        for new_exp in expand_abbreviation(expression):
+            if new_exp not in new_exps:
+                new_exps.append(new_exp)
+    return new_exps
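Note: the new iteration-mark helpers in action; each appends only forms not already in the list:

    import bot.expressions as Expressions

    exps = ["人々"]
    Expressions.remove_iteration_mark(exps)
    print(exps)   # ['人々', '人人']

    exps = ["時時"]
    Expressions.add_iteration_mark(exps)
    print(exps)   # ['時時', '時々']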
84	bot/icons.py (new file)
@@ -0,0 +1,84 @@
+from bs4 import BeautifulSoup
+from PIL import Image
+from functools import cache
+
+
+@cache
+def calculate_ratio(path):
+    if path.endswith(".svg"):
+        ratio = __calculate_svg_ratio(path)
+    else:
+        ratio = __calculate_bitmap_ratio(path)
+    return ratio
+
+
+@cache
+def make_rectangle(path, text, rect_stroke, rect_fill, text_fill):
+    svg = __svg_text_rectangle(text, rect_stroke, rect_fill, text_fill)
+    with open(path, "w", encoding="utf-8") as f:
+        f.write(svg)
+
+
+@cache
+def make_monochrome_fill_rectangle(path, text):
+    svg = __svg_masked_rectangle(text)
+    with open(path, "w", encoding="utf-8") as f:
+        f.write(svg)
+
+
+def __calculate_svg_ratio(path):
+    with open(path, "r") as f:
+        xml = f.read()
+    soup = BeautifulSoup(xml, "xml")
+    svg = soup.svg
+    if svg.has_attr("width") and svg.has_attr("height"):
+        width = float(svg.attrs["width"])
+        height = float(svg.attrs["height"])
+        ratio = width / height
+    elif svg.has_attr("viewBox"):
+        _, _, width, height = svg.attrs["viewBox"].split(" ")
+        ratio = float(width) / float(height)
+    else:
+        raise Exception(f"Cannot calculate ratio for SVG\n{svg.prettify()}")
+    return ratio
+
+
+def __calculate_bitmap_ratio(path):
+    img = Image.open(path)
+    img_w = img.size[0]
+    img_h = img.size[1]
+    ratio = img_w / img_h
+    return ratio
+
+
+def __svg_text_rectangle(text, rect_stroke, rect_fill, text_fill):
+    height = 128
+    width = len(text) * height
+    svg = f"""
+<svg lang='ja' width='{width}' height='{height}' viewBox='0 0 {width} {height}'
+     xmlns='http://www.w3.org/2000/svg' version='1.1'>
+  <rect width='{width}' height='{height}' ry='20' stroke='{rect_stroke}'
+        fill='{rect_fill}' stroke-width='8'/>
+  <text text-anchor='middle' x='50%' y='50%' dy='.35em'
+        font-family='sans-serif' font-size='100px'
+        fill='{text_fill}'>{text}</text>
+</svg>"""
+    return svg.strip()
+
+
+def __svg_masked_rectangle(text):
+    height = 128
+    width = len(text) * height
+    svg = f"""
+<svg lang='ja' width='{width}' height='{height}' viewBox='0 0 {width} {height}'
+     xmlns='http://www.w3.org/2000/svg' version='1.1'>
+  <mask id='a'>
+    <rect width='{width}' height='{height}' fill='white'/>
+    <text text-anchor='middle' x='50%' y='50%' dy='.35em'
+          font-family='sans-serif' font-size='100px'
+          fill='black'>{text}</text>
+  </mask>
+  <rect width='{width}' height='{height}' ry='20'
+        fill='black' mask='url(#a)'/>
+</svg>"""
+    return svg.strip()
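Note: usage sketch for the new icon helpers; the output path is illustrative. Since a badge is len(text) * 128 wide by 128 tall, a one-character badge is square and its ratio comes back as 1.0:

    import bot.icons as Icons

    Icons.make_rectangle("/tmp/badge.svg", "名", "black", "white", "black")
    print(Icons.calculate_ratio("/tmp/badge.svg"))   # 1.0 (128x128 for one character)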
bot/scraper.py
@@ -15,11 +15,8 @@ from bot.data import load_config


 class Scraper():
-    __CONFIG = None
-
     def __init__(self):
-        if self.__CONFIG is None:
-            self.__CONFIG = load_config()
+        self._config = load_config()
         pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$"
         self.netloc_re = re.compile(pattern)
         self.__set_session()
@@ -45,7 +42,7 @@ class Scraper():
             allowed_methods=["HEAD", "GET", "OPTIONS"]
         )
         adapter = HTTPAdapter(max_retries=retry_strategy)
-        headers = self.__CONFIG["http-request-headers"]
+        headers = self._config["http-request-headers"]
         self.session = requests.Session()
         self.session.mount("https://", adapter)
         self.session.headers.update(headers)
5	bot/soup.py (new file)
@@ -0,0 +1,5 @@
+def delete_soup_nodes(soup, node_name):
+    node = soup.find(node_name)
+    while node is not None:
+        node.decompose()
+        node = soup.find(node_name)
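Note: delete_soup_nodes removes every occurrence of a tag, not just the first, which is why callers can clean a document in one call:

    from bs4 import BeautifulSoup

    from bot.soup import delete_soup_nodes

    soup = BeautifulSoup("<a><ルビG>r1</ルビG><b/><ルビG>r2</ルビG></a>", "xml")
    delete_soup_nodes(soup, "ルビG")
    print(soup.decode())   # both ルビG nodes are gone; <a><b/></a> remains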
bot/yomichan/export.py
@@ -9,14 +9,19 @@ from bot.data import load_yomichan_metadata
 from bot.yomichan.terms.jitenon import JitenonYojiTerminator
 from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
+from bot.yomichan.terms.smk8 import Smk8Terminator
+from bot.yomichan.terms.daijirin2 import Daijirin2Terminator


 class Exporter:
-    def __init__(self):
+    def __init__(self, name):
+        self._name = name
         self._build_dir = None
         self._terms_per_file = 2000

-    def export(self, entries):
+    def export(self, entries, image_dir):
+        if image_dir is not None:
+            self.__init_build_image_dir(image_dir)
         meta = load_yomichan_metadata()
         index = meta[self._name]["index"]
         index["revision"] = self._get_revision(entries)
@@ -29,14 +34,20 @@ class Exporter:
         if self._build_dir is not None:
             return self._build_dir
         cache_dir = user_cache_dir("jitenbot")
-        build_directory = os.path.join(cache_dir, "yomichan_build")
+        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+        build_directory = os.path.join(cache_dir, f"build_{timestamp}")
         if Path(build_directory).is_dir():
             shutil.rmtree(build_directory)
         os.makedirs(build_directory)
         self._build_dir = build_directory
         return self._build_dir

+    def __init_build_image_dir(self, image_dir):
+        print("Copying image files to build directory...")
+        build_dir = self._get_build_dir()
+        build_img_dir = os.path.join(build_dir, self._name)
+        shutil.copytree(image_dir, build_img_dir)
+        self._terminator.set_image_dir(build_img_dir)
+
     def __get_terms(self, entries):
         terms = []
         entries_len = len(entries)
@@ -101,15 +112,15 @@ class Exporter:


 class JitenonExporter(Exporter):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name):
+        super().__init__(name)

     def _get_revision(self, entries):
         modified_date = None
         for entry in entries:
             if modified_date is None or entry.modified_date > modified_date:
                 modified_date = entry.modified_date
-        revision = f"{self._name}.{modified_date}"
+        revision = f"{self._name};{modified_date}"
         return revision

     def _get_attribution(self, entries):
@@ -121,14 +132,38 @@ class JitenonExporter(Exporter):


 class JitenonYojiExporter(JitenonExporter):
-    def __init__(self):
-        super().__init__()
-        self._name = "jitenon-yoji"
-        self._terminator = JitenonYojiTerminator()
+    def __init__(self, name):
+        super().__init__(name)
+        self._terminator = JitenonYojiTerminator(name)


 class JitenonKotowazaExporter(JitenonExporter):
-    def __init__(self):
-        super().__init__()
-        self._name = "jitenon-kotowaza"
-        self._terminator = JitenonKotowazaTerminator()
+    def __init__(self, name):
+        super().__init__(name)
+        self._terminator = JitenonKotowazaTerminator(name)
+
+
+class Smk8Exporter(Exporter):
+    def __init__(self, name):
+        super().__init__(name)
+        self._terminator = Smk8Terminator(name)
+
+    def _get_revision(self, entries):
+        timestamp = datetime.now().strftime("%Y-%m-%d")
+        return f"{self._name};{timestamp}"
+
+    def _get_attribution(self, entries):
+        return "© Sanseido Co., LTD. 2020"
+
+
+class Daijirin2Exporter(Exporter):
+    def __init__(self, name):
+        super().__init__(name)
+        self._terminator = Daijirin2Terminator(name)
+
+    def _get_revision(self, entries):
+        timestamp = datetime.now().strftime("%Y-%m-%d")
+        return f"{self._name};{timestamp}"
+
+    def _get_attribution(self, entries):
+        return "© Sanseido Co., LTD. 2019"
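
Exporters now receive the dictionary name from the crawler instead of hard-coding it, so one class can serve several metadata keys. A rough sketch of the intended call pattern, assuming `entries` and `image_dir` have been prepared by a crawler:

    from bot.yomichan.export import Smk8Exporter

    exporter = Smk8Exporter("smk8")      # the name selects metadata, tags, and inflection tables
    exporter.export(entries, image_dir)  # image_dir may be None for text-only dictionaries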
238 bot/yomichan/glossary/daijirin2.py Normal file
@@ -0,0 +1,238 @@
+import re
+import os
+from bs4 import BeautifulSoup
+from functools import cache
+from pathlib import Path
+
+import bot.icons as Icons
+from bot.soup import delete_soup_nodes
+from bot.data import load_daijirin2_yomichan_name_conversion
+from bot.yomichan.glossary.gloss import make_gloss
+from bot.yomichan.glossary.name_conversion import convert_names
+
+
+def make_glossary(entry, image_dir):
+    soup = entry.get_page_soup()
+    __add_rubies(soup)
+    __hyperlink_parent_expression(soup, entry)
+    __delete_unused_nodes(soup, image_dir)
+    __clear_styles(soup)
+    __set_data_class(soup)
+    __convert_links(soup, entry)
+    __convert_gaiji(soup, image_dir)
+    __convert_graphics(soup, image_dir)
+    __convert_logos(soup, image_dir)
+    __convert_kanjion_logos(soup, image_dir)
+    __convert_daigoginum(soup, image_dir)
+    __convert_jundaigoginum(soup, image_dir)
+
+    name_conversion = load_daijirin2_yomichan_name_conversion()
+    convert_names(soup, name_conversion)
+
+    gloss = make_gloss(soup.span)
+    glossary = [gloss]
+    return glossary
+
+
+def __add_rubies(soup):
+    for name in ["表外音訓", "表外字"]:
+        for ruby in soup.find_all(name):
+            ruby.name = "ruby"
+            rt = ruby.find("表外字マーク")
+            rt.name = "rt"
+            ruby.append(rt)  # needs to be positioned after the text
+
+
+def __hyperlink_parent_expression(soup, entry):
+    if soup.find("親表記") is None:
+        return
+    parent_entry_id = entry.SUBENTRY_ID_TO_ENTRY_ID[entry.entry_id]
+    parent_entry = entry.ID_TO_ENTRY[parent_entry_id]
+    parent_expression = parent_entry.get_first_expression()
+    for el in soup.find_all("親表記"):
+        el.name = "a"
+        el.attrs["href"] = f"?query={parent_expression}&wildcards=off"
+
+
+def __delete_unused_nodes(soup, image_dir):
+    if not __graphics_directory_exists(image_dir):
+        delete_soup_nodes(soup, "カットG")
+    for el in soup.find_all("logo"):
+        next_sibling = el.next_sibling
+        if next_sibling is None:
+            continue
+        elif next_sibling.name in ["漢字見出G", "漢字音G"]:
+            el.decompose()
+    for el in soup.find_all("漢字音G"):
+        for child in el.find_all(string="・"):
+            child.replace_with("")
+
+
+@cache
+def __graphics_directory_exists(image_dir):
+    path = os.path.join(image_dir, "graphics")
+    return Path(path).is_dir()
+
+
+def __clear_styles(soup):
+    for el in soup.select("[style]"):
+        del el.attrs["style"]
+
+
+def __set_data_class(soup):
+    for el in soup.select("[class]"):
+        el.attrs["data-class"] = el.attrs["class"]
+
+
+def __convert_links(soup, entry):
+    for el in soup.find_all("a"):
+        href = el.attrs["href"]
+        if re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href):
+            ref_entry_id = entry.id_string_to_entry_id(href)
+            ref_entry = entry.ID_TO_ENTRY[ref_entry_id]
+            expression = ref_entry.get_first_expression()
+            el.attrs["href"] = f"?query={expression}&wildcards=off"
+        elif re.match(r"^(?:https?:|\?)[\w\W]*", href):
+            pass
+        else:
+            raise Exception(f"Invalid href format: {href}")
+
+
+def __convert_gaiji(soup, image_dir):
+    for el in soup.find_all("img"):
+        src = el.attrs["src"]
+        if not src.startswith("gaiji"):
+            continue
+        path = image_dir
+        for part in src.split("/"):
+            if part.strip() == "":
+                continue
+            path = os.path.join(path, part)
+        ratio = Icons.calculate_ratio(path)
+        img = BeautifulSoup("<img/>", "xml").img
+        img.attrs = {
+            "height": 1.0 if ratio > 1.0 else ratio,
+            "width": ratio if ratio > 1.0 else 1.0,
+            "sizeUnits": "em",
+            "collapsible": False,
+            "collapsed": False,
+            "background": False,
+            "appearance": "monochrome",
+            "title": el.attrs["alt"] if el.has_attr("alt") else "",
+            "path": f"{os.path.basename(image_dir)}/{src}",
+            "src": src,
+        }
+        el.name = "span"
+        el.clear()
+        el.append(img)
+        el.attrs["style"] = "vertical-align: text-bottom;"
+
+
+def __convert_graphics(soup, image_dir):
+    for el in soup.find_all("img"):
+        src = el.attrs["src"]
+        if not src.startswith("graphics"):
+            continue
+        el.attrs = {
+            "collapsible": True,
+            "collapsed": True,
+            "title": el.attrs["alt"] if el.has_attr("alt") else "",
+            "path": f"{os.path.basename(image_dir)}/{src}",
+            "src": src,
+        }
+
+
+def __convert_logos(soup, image_dir):
+    for el in soup.find_all("logo"):
+        filename = f"{el.text}-default.svg"
+        path = os.path.join(image_dir, filename)
+        Icons.make_rectangle(path, el.text, "black", "transparent", "black")
+        ratio = Icons.calculate_ratio(path)
+        img = BeautifulSoup("<img/>", "xml").img
+        img.attrs = {
+            "height": 1.0 if ratio > 1.0 else ratio,
+            "width": ratio if ratio > 1.0 else 1.0,
+            "sizeUnits": "em",
+            "collapsible": False,
+            "collapsed": False,
+            "background": False,
+            "appearance": "monochrome",
+            "title": el.text,
+            "path": f"{os.path.basename(image_dir)}/{filename}",
+        }
+        el.name = "span"
+        el.clear()
+        el.append(img)
+        el.attrs["style"] = "vertical-align: text-bottom; margin-right: 0.25em;"
+
+
+def __convert_kanjion_logos(soup, image_dir):
+    for el in soup.find_all("漢字音logo"):
+        filename = f"{el.text}-default.svg"
+        path = os.path.join(image_dir, filename)
+        Icons.make_rectangle(path, el.text, "black", "transparent", "black")
+        ratio = Icons.calculate_ratio(path)
+        img = BeautifulSoup("<img/>", "xml").img
+        img.attrs = {
+            "height": 1.0 if ratio > 1.0 else ratio,
+            "width": ratio if ratio > 1.0 else 1.0,
+            "sizeUnits": "em",
+            "collapsible": False,
+            "collapsed": False,
+            "background": False,
+            "appearance": "monochrome",
+            "title": el.text,
+            "path": f"{os.path.basename(image_dir)}/{filename}",
+        }
+        el.name = "span"
+        el.clear()
+        el.append(img)
+        el.attrs["style"] = "vertical-align: text-bottom; margin-left: 0.25em;"
+
+
+def __convert_daigoginum(soup, image_dir):
+    for el in soup.find_all("大語義num"):
+        filename = f"{el.text}-fill.svg"
+        path = os.path.join(image_dir, filename)
+        Icons.make_monochrome_fill_rectangle(path, el.text)
+        ratio = Icons.calculate_ratio(path)
+        img = BeautifulSoup("<img/>", "xml").img
+        img.attrs = {
+            "height": 1.0 if ratio > 1.0 else ratio,
+            "width": ratio if ratio > 1.0 else 1.0,
+            "sizeUnits": "em",
+            "collapsible": False,
+            "collapsed": False,
+            "background": False,
+            "appearance": "monochrome",
+            "title": el.text,
+            "path": f"{os.path.basename(image_dir)}/{filename}",
+        }
+        el.name = "span"
+        el.clear()
+        el.append(img)
+        el.attrs["style"] = "vertical-align: text-bottom;"
+
+
+def __convert_jundaigoginum(soup, image_dir):
+    for el in soup.find_all("準大語義num"):
+        filename = f"{el.text}-default.svg"
+        path = os.path.join(image_dir, filename)
+        Icons.make_rectangle(path, el.text, "black", "transparent", "black")
+        ratio = Icons.calculate_ratio(path)
+        img = BeautifulSoup("<img/>", "xml").img
+        img.attrs = {
+            "height": 1.0 if ratio > 1.0 else ratio,
+            "width": ratio if ratio > 1.0 else 1.0,
+            "sizeUnits": "em",
+            "collapsible": False,
+            "collapsed": False,
+            "background": False,
+            "appearance": "monochrome",
+            "title": el.text,
+            "path": f"{os.path.basename(image_dir)}/{filename}",
+        }
+        el.name = "span"
+        el.clear()
+        el.append(img)
+        el.attrs["style"] = "vertical-align: text-bottom;"
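
The `img.attrs` dictionaries built above follow Yomichan's structured-content image format; after gloss conversion, a square gaiji glyph comes out roughly as the following node (the file name is hypothetical):

    {
        "tag": "img",
        "path": "daijirin2/gaiji/0001.svg",  # hypothetical glyph file
        "height": 1.0, "width": 1.0, "sizeUnits": "em",
        "collapsible": False, "collapsed": False,
        "background": False, "appearance": "monochrome",
    }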
bot/yomichan/glossary/gloss.py
@@ -3,14 +3,14 @@ from css_parser import parseStyle


 def make_gloss(soup):
-    node = __get_markup_structure(soup)
+    node = __get_page_structure(soup)
     return {
         "type": "structured-content",
         "content": node["content"],
     }


-def __get_markup_structure(soup):
+def __get_page_structure(soup):
     node = {"tag": soup.name}
     content = []
     for child in soup.children:
@@ -19,7 +19,7 @@ def __get_markup_structure(soup):
         if text != "":
             content.append(text)
         else:
-            content.append(__get_markup_structure(child))
+            content.append(__get_page_structure(child))

     attributes = __get_attributes(soup.attrs)
     for key, val in attributes.items():
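
The rename from "markup" structure to "page" structure mirrors the crawlers' switch to whole page files; the output shape is unchanged. For a trivial fragment, make_gloss produces roughly:

    from bs4 import BeautifulSoup
    from bot.yomichan.glossary.gloss import make_gloss

    soup = BeautifulSoup("<span><b>大</b>辞林</span>", "xml")
    gloss = make_gloss(soup.span)
    # roughly: {"type": "structured-content",
    #           "content": [{"tag": "b", "content": "大"}, "辞林"]}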
bot/yomichan/glossary/jitenon.py
@@ -1,11 +1,10 @@
 import re
-from bs4 import BeautifulSoup

 from bot.yomichan.glossary.gloss import make_gloss


 def make_glossary(entry):
-    soup = BeautifulSoup(entry.markup, "html5lib")
+    soup = entry.get_page_soup()
     __replace_punctuation(soup)
     __add_internal_links(soup)
     __convert_paragraphs(soup)
101 bot/yomichan/glossary/name_conversion.py Normal file
@@ -0,0 +1,101 @@
+from bs4 import BeautifulSoup
+
+
+def convert_names(soup, name_conversion):
+    for child in soup.children:
+        if child.name is None:
+            continue
+        else:
+            convert_names(child, name_conversion)
+
+        if child.name in name_conversion.keys():
+            conversion = name_conversion[child.name]
+            if "name" in conversion:
+                child.attrs["data-name"] = child.name
+                child.name = conversion["name"]
+            if "style" in conversion:
+                child.attrs["style"] = conversion["style"]
+            if "procedures" in conversion:
+                procedures = conversion["procedures"]
+                __apply_name_conversion_procedures(child, procedures)
+        else:
+            child.attrs["data-name"] = child.name
+            child.name = "span"
+
+
+def __apply_name_conversion_procedures(soup, procedures):
+    functions = {
+        "has_class": __has_class,
+        "has_parent": __has_parent,
+        "has_previous_sibling": __has_previous_sibling,
+        "replace": __replace,
+        "wrap": __wrap,
+        "add_ruby_text": __add_ruby_text,
+    }
+    for procedure in procedures:
+        function = functions[procedure["procedure_name"]]
+        parameters = procedure["parameters"]
+        function(soup, **parameters)
+
+
+def __has_class(soup, class_name, key, value):
+    if not soup.has_attr("class"):
+        return
+    soup_classes = soup.attrs["class"].split(" ")
+    if class_name not in soup_classes:
+        return
+    if key == "style":
+        soup.attrs["style"] = value
+    elif key == "name":
+        soup.name = value
+    else:
+        raise Exception()
+
+
+def __has_parent(soup, parent_name, key, value):
+    if soup.find_parent(parent_name) is None:
+        return
+    if key == "style":
+        soup.attrs["style"] = value
+    elif key == "name":
+        soup.name = value
+    else:
+        raise Exception()
+
+
+def __has_previous_sibling(soup, name, key, value):
+    sibling = soup.previous_sibling
+    if sibling is None:
+        return
+    elif sibling.name is None:
+        return
+    elif sibling.has_attr("data-name"):
+        previous_sibling_name = sibling.attrs["data-name"]
+    else:
+        previous_sibling_name = sibling.name
+    if previous_sibling_name != name:
+        return
+    if key == "style":
+        soup.attrs["style"] = value
+    elif key == "name":
+        soup.name = value
+    else:
+        raise Exception()
+
+
+def __replace(soup, old, new):
+    soup.string = soup.text.replace(old, new)
+
+
+def __wrap(soup, l_wrap, r_wrap):
+    if soup.text.strip() != "":
+        soup.string = f"{l_wrap}{soup.text}{r_wrap}"
+
+
+def __add_ruby_text(soup, mark, style):
+    if style.strip() != "":
+        markup = f"<rt><span style='{style}'>{mark}</span></rt>"
+    else:
+        markup = f"<rt>{mark}</rt>"
+    rt_soup = BeautifulSoup(markup, "xml")
+    soup.append(rt_soup.rt)
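
convert_names is driven by the per-dictionary JSON tables added further below; unknown elements fall through to plain spans tagged with data-name. A minimal hand-written table exercising the three supported keys (name, style, procedures):

    from bs4 import BeautifulSoup
    from bot.yomichan.glossary.name_conversion import convert_names

    table = {
        "span": {},
        "歴史仮名": {
            "name": "span",
            "style": "font-size: 0.6em;",
            "procedures": [
                {"procedure_name": "wrap",
                 "parameters": {"l_wrap": "(", "r_wrap": ")"}},
            ],
        },
    }
    soup = BeautifulSoup("<span><歴史仮名>ゑ</歴史仮名></span>", "xml")
    convert_names(soup, table)
    # roughly: <span><span data-name="歴史仮名" style="font-size: 0.6em;">(ゑ)</span></span>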
151 bot/yomichan/glossary/smk8.py Normal file
@@ -0,0 +1,151 @@
+import re
+import os
+from bs4 import BeautifulSoup
+
+import bot.icons as Icons
+from bot.soup import delete_soup_nodes
+from bot.data import load_smk8_yomichan_name_conversion
+from bot.yomichan.glossary.gloss import make_gloss
+from bot.yomichan.glossary.name_conversion import convert_names
+
+
+def make_glossary(entry, image_dir):
+    soup = entry.get_page_soup()
+    __fill_alts(soup)
+    __delete_unused_nodes(soup)
+    __clear_styles(soup)
+    __set_data_class(soup)
+    __convert_links(soup, entry)
+    __convert_priority_markers(soup)
+    __convert_gaiji(soup, image_dir)
+    __convert_rectangles(soup, image_dir)
+
+    name_conversion = load_smk8_yomichan_name_conversion()
+    convert_names(soup, name_conversion)
+
+    gloss = make_gloss(soup.span)
+    glossary = [gloss]
+    return glossary
+
+
+def __fill_alts(soup):
+    for name in ["親見出仮名", "親見出表記"]:
+        for el in soup.find_all(name):
+            el.name = "a"
+            alt = el.attrs["alt"]
+            el.string = alt
+            el.attrs["href"] = f"?query={alt}&wildcards=off"
+            del el.attrs["alt"]
+
+
+def __delete_unused_nodes(soup):
+    for name in ["audio", "連濁"]:
+        delete_soup_nodes(soup, name)
+
+
+def __clear_styles(soup):
+    for el in soup.select("[style]"):
+        del el.attrs["style"]
+
+
+def __set_data_class(soup):
+    for el in soup.select("[class]"):
+        el.attrs["data-class"] = el.attrs["class"]
+
+
+def __convert_links(soup, entry):
+    for el in soup.find_all("a"):
+        href = el.attrs["href"]
+        if href.startswith("$"):
+            el.unwrap()
+        elif re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href):
+            ref_entry_id = entry.id_string_to_entry_id(href)
+            ref_entry = entry.ID_TO_ENTRY[ref_entry_id]
+            expression = ref_entry.get_first_expression()
+            el.attrs["href"] = f"?query={expression}&wildcards=off"
+        elif re.match(r"^(?:https?:|\?)[\w\W]*", href):
+            pass
+        else:
+            raise Exception(f"Invalid href format: {href}")
+
+
+def __convert_priority_markers(soup):
+    style = "vertical-align: super; font-size: 0.6em"
+    for el in soup.find_all("img", attrs={"alt": "*"}):
+        el.name = "span"
+        el.string = "*"
+        el.attrs["style"] = style
+    for el in soup.find_all("img", attrs={"alt": "⁑"}):
+        el.name = "span"
+        el.string = "**"
+        el.attrs["style"] = style
+
+
+def __convert_gaiji(soup, image_dir):
+    for el in soup.find_all("img"):
+        src = el.attrs["src"]
+        path = image_dir
+        for part in src.split("/"):
+            if part.strip() == "":
+                continue
+            path = os.path.join(path, part)
+        ratio = Icons.calculate_ratio(path)
+        img = BeautifulSoup("<img/>", "xml").img
+        img.attrs = {
+            "height": 1.0 if ratio > 1.0 else ratio,
+            "width": ratio if ratio > 1.0 else 1.0,
+            "sizeUnits": "em",
+            "collapsible": False,
+            "collapsed": False,
+            "background": False,
+            "appearance": "monochrome",
+            "title": el.attrs["alt"] if el.has_attr("alt") else "",
+            "path": f"{os.path.basename(image_dir)}/{src}",
+            "src": src,
+        }
+        el.name = "span"
+        el.clear()
+        el.append(img)
+        el.attrs["style"] = "vertical-align: text-bottom;"
+
+
+def __convert_rectangles(soup, image_dir):
+    cls_to_appearance = {
+        "default": "monochrome",
+        "fill": "monochrome",
+        "red": "auto",
+        "redfill": "auto",
+    }
+    for el in soup.find_all("rect"):
+        cls = el.attrs["class"] if el.has_attr("class") else "default"
+        filename = f"{el.text}-{cls}.svg"
+        path = os.path.join(image_dir, filename)
+        __make_rectangle(path, el.text, cls)
+        ratio = Icons.calculate_ratio(path)
+        img = BeautifulSoup("<img/>", "xml").img
+        img.attrs = {
+            "height": 1.0 if ratio > 1.0 else ratio,
+            "width": ratio if ratio > 1.0 else 1.0,
+            "sizeUnits": "em",
+            "collapsible": False,
+            "collapsed": False,
+            "background": False,
+            "appearance": cls_to_appearance[cls],
+            "title": el.text,
+            "path": f"{os.path.basename(image_dir)}/{filename}",
+        }
+        el.name = "span"
+        el.clear()
+        el.append(img)
+        el.attrs["style"] = "vertical-align: text-bottom; margin-right: 0.25em"
+
+
+def __make_rectangle(path, text, cls):
+    if cls == "fill":
+        Icons.make_monochrome_fill_rectangle(path, text)
+    elif cls == "red":
+        Icons.make_rectangle(path, text, "red", "white", "red")
+    elif cls == "redfill":
+        Icons.make_rectangle(path, text, "red", "red", "white")
+    else:
+        Icons.make_rectangle(path, text, "black", "transparent", "black")
bot/yomichan/grammar.py
@@ -7,32 +7,29 @@ __U_KANA_LIST = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
                  "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]

 __SUDACHI_DICTIONARY = None
-__SUDACHI_INFLECTION_TYPES = None


 def sudachi_rules(expression):
     global __SUDACHI_DICTIONARY
-    global __SUDACHI_INFLECTION_TYPES
     if __SUDACHI_DICTIONARY is None:
         __SUDACHI_DICTIONARY = dictionary.Dictionary(dict="full").create()
-    if __SUDACHI_INFLECTION_TYPES is None:
-        categories = load_yomichan_inflection_categories()
-        __SUDACHI_INFLECTION_TYPES = categories["sudachi"]
+    categories = load_yomichan_inflection_categories()
+    sudachi_inflection_categories = categories["sudachi"]
     splitmode = tokenizer.Tokenizer.SplitMode.A
     tokens = __SUDACHI_DICTIONARY.tokenize(expression, splitmode)
     if len(tokens) == 0:
         return ""
     pos = tokens[len(tokens)-1].part_of_speech()[4]
     tags = pos.split("-")
-    rules = tags_to_rules(expression, tags, __SUDACHI_INFLECTION_TYPES)
+    rules = tags_to_rules(expression, tags, sudachi_inflection_categories)
     return rules


-def tags_to_rules(expression, tags, inflection_types):
+def tags_to_rules(expression, tags, inflection_categories):
     rules = set()
     exp_final_character = expression[len(expression)-1:]
     for tag in tags:
-        if tag in inflection_types["sahen"]:
+        if tag in inflection_categories["sahen"]:
             if expression.endswith("する"):
                 rules.add("vs")
             elif expression.endswith("為る"):
@@ -41,20 +38,20 @@ def tags_to_rules(expression, tags, inflection_types):
                 rules.add("vz")
             elif expression.endswith("す"):
                 rules.add("v5")
-        if tag in inflection_types["godan"]:
+        if tag in inflection_categories["godan"]:
             if exp_final_character in __U_KANA_LIST:
                 rules.add("v5")
-        if tag in inflection_types["ichidan"]:
+        if tag in inflection_categories["ichidan"]:
             if expression.endswith("る"):
                 rules.add("v1")
-        if tag in inflection_types["keiyoushi"]:
+        if tag in inflection_categories["keiyoushi"]:
             if expression.endswith("い"):
                 rules.add("adj-i")
-        if tag in inflection_types["kahen"]:
+        if tag in inflection_categories["kahen"]:
             if expression.endswith("くる"):
                 rules.add("vk")
             elif expression.endswith("来る"):
                 rules.add("vk")
-        if tag in inflection_types["sudachi"]:
+        if tag in inflection_categories["sudachi"]:
             return sudachi_rules(expression)
     return " ".join(list(rules))
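
The renamed inflection_categories argument now carries per-dictionary tag sets from the JSON shown further below. A quick illustration of the lookup, with an abridged category table:

    from bot.yomichan.grammar import tags_to_rules

    categories = {
        "sahen": [], "godan": [], "ichidan": ["下一"],
        "keiyoushi": ["形"], "kahen": [], "sudachi": [],
    }
    print(tags_to_rules("食べる", ["下一"], categories))  # v1
    print(tags_to_rules("高い", ["形"], categories))      # adj-i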
53 bot/yomichan/terms/daijirin2.py Normal file
@@ -0,0 +1,53 @@
+from bot.data import load_yomichan_inflection_categories
+
+from bot.entries.daijirin2 import Daijirin2PhraseEntry as PhraseEntry
+
+from bot.yomichan.terms.terminator import Terminator
+from bot.yomichan.glossary.daijirin2 import make_glossary
+from bot.yomichan.grammar import sudachi_rules, tags_to_rules
+
+
+class Daijirin2Terminator(Terminator):
+    def __init__(self, name):
+        super().__init__(name)
+        categories = load_yomichan_inflection_categories()
+        self._inflection_categories = categories[name]
+
+    def _definition_tags(self, entry):
+        return ""
+
+    def _inflection_rules(self, entry, expression):
+        if isinstance(entry, PhraseEntry):
+            return sudachi_rules(expression)
+        pos_tags = entry.get_part_of_speech_tags()
+        if len(pos_tags) > 0:
+            rules = tags_to_rules(expression, pos_tags,
+                                  self._inflection_categories)
+        else:
+            rules = sudachi_rules(expression)
+        return rules
+
+    def _glossary(self, entry):
+        if entry.entry_id in self._glossary_cache:
+            return self._glossary_cache[entry.entry_id]
+        glossary = make_glossary(entry, self._image_dir)
+        self._glossary_cache[entry.entry_id] = glossary
+        return glossary
+
+    def _sequence(self, entry):
+        return entry.entry_id[0] * 100000 + entry.entry_id[1]
+
+    def _term_tags(self, entry):
+        return ""
+
+    def _link_glossary_parameters(self, entry):
+        return [
+            [entry.children, "子"],
+            [entry.phrases, "句"],
+        ]
+
+    def _subentry_lists(self, entry):
+        return [
+            entry.children,
+            entry.phrases,
+        ]
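
_sequence packs the two-part entry ID into a single Yomichan sequence number, keeping subentries ordered directly after their parent page. For example:

    entry_id = (47362, 3)                  # (page number, subentry index) — illustrative values
    sequence = entry_id[0] * 100000 + entry_id[1]
    print(sequence)                        # 4736200003

This assumes the second component always stays below 100000.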
bot/yomichan/terms/jitenon.py
@@ -4,8 +4,8 @@ from bot.yomichan.glossary.jitenon import make_glossary


 class JitenonTerminator(Terminator):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name):
+        super().__init__(name)

     def _definition_tags(self, entry):
         return None
@@ -28,8 +28,8 @@ class JitenonTerminator(Terminator):


 class JitenonYojiTerminator(JitenonTerminator):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name):
+        super().__init__(name)

     def _inflection_rules(self, entry, expression):
         return ""
@@ -40,8 +40,8 @@ class JitenonYojiTerminator(JitenonTerminator):


 class JitenonKotowazaTerminator(JitenonTerminator):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name):
+        super().__init__(name)

     def _inflection_rules(self, entry, expression):
         return sudachi_rules(expression)
58 bot/yomichan/terms/smk8.py Normal file
@@ -0,0 +1,58 @@
+from bot.data import load_yomichan_inflection_categories
+
+from bot.entries.smk8 import Smk8KanjiEntry as KanjiEntry
+from bot.entries.smk8 import Smk8PhraseEntry as PhraseEntry
+
+from bot.yomichan.terms.terminator import Terminator
+from bot.yomichan.glossary.smk8 import make_glossary
+from bot.yomichan.grammar import sudachi_rules, tags_to_rules
+
+
+class Smk8Terminator(Terminator):
+    def __init__(self, name):
+        super().__init__(name)
+        categories = load_yomichan_inflection_categories()
+        self._inflection_categories = categories[name]
+
+    def _definition_tags(self, entry):
+        if isinstance(entry, KanjiEntry):
+            return "造"
+        else:
+            return ""
+
+    def _inflection_rules(self, entry, expression):
+        if isinstance(entry, PhraseEntry):
+            return sudachi_rules(expression)
+        elif isinstance(entry, KanjiEntry):
+            return ""
+        pos_tags = entry.get_part_of_speech_tags()
+        if len(pos_tags) == 0:
+            return sudachi_rules(expression)
+        else:
+            return tags_to_rules(expression, pos_tags, self._inflection_categories)
+
+    def _glossary(self, entry):
+        if entry.entry_id in self._glossary_cache:
+            return self._glossary_cache[entry.entry_id]
+        glossary = make_glossary(entry, self._image_dir)
+        self._glossary_cache[entry.entry_id] = glossary
+        return glossary
+
+    def _sequence(self, entry):
+        return entry.entry_id[0] * 100000 + entry.entry_id[1]
+
+    def _term_tags(self, entry):
+        return ""
+
+    def _link_glossary_parameters(self, entry):
+        return [
+            [entry.children, "子"],
+            [entry.phrases, "句"]
+        ]
+
+    def _subentry_lists(self, entry):
+        return [
+            entry.children,
+            entry.phrases,
+            entry.kanjis
+        ]
bot/yomichan/terms/terminator.py
@@ -1,6 +1,11 @@
 class Terminator:
-    def __init__(self):
+    def __init__(self, name):
+        self._name = name
         self._glossary_cache = {}
+        self._image_dir = None
+
+    def set_image_dir(self, image_dir):
+        self._image_dir = image_dir

     def make_terms(self, entry):
         terms = []
14782 data/adobe/Adobe-Japan1_sequences.txt Normal file
File diff suppressed because it is too large
14 data/adobe/override_glyphs.json Normal file
@@ -0,0 +1,14 @@
+{
+    "8228": "Ø",
+    "9772": "(",
+    "9773": ")",
+    "10078": "Т",
+    "10079": "У",
+    "10080": "Ф",
+    "10081": "Х",
+    "10082": "Ц",
+    "10083": "Ч",
+    "10084": "Ш",
+    "12107": "〻",
+    "12180": "⮗"
+}
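
These entries presumably override Adobe-Japan1 glyphs whose standard Unicode mappings are missing or unsuitable: keys are CID numbers, values are replacement characters. A sketch of consulting the table using nothing but the standard library (the real lookup code is not part of this diff):

    import json

    with open("data/adobe/override_glyphs.json", encoding="utf-8") as f:
        overrides = {int(cid): char for cid, char in json.load(f).items()}
    print(overrides[12107])  # 〻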
10963 data/daijirin2/kana_abbreviations.csv Normal file
File diff suppressed because it is too large
8328 data/daijirin2/phrase_readings.csv Normal file
File diff suppressed because it is too large
290 data/daijirin2/yomichan_name_conversion.json Normal file
@@ -0,0 +1,290 @@
+{
+    "a": {},
+    "br": {},
+    "img": {},
+    "div": {},
+    "span": {},
+    "ruby": {},
+    "rt": {},
+    "語構成": {
+        "name": "span",
+        "style": "margin-right: 0.5em;"
+    },
+    "熟語例G": {
+        "name": "div"
+    },
+    "漢字音G": {
+        "name": "ul"
+    },
+    "漢字音": {
+        "name": "li"
+    },
+    "sup": {
+        "name": "span",
+        "style": "font-size: 0.6em; vertical-align: super;"
+    },
+    "p": {
+        "name": "div",
+        "style": "margin-top: 0.5em; margin-bottom: 0.5em;"
+    },
+    "カット": {
+        "name": "div"
+    },
+    "中語義": {
+        "name": "div"
+    },
+    "副義": {
+        "name": "div"
+    },
+    "異字同訓解説": {
+        "name": "div"
+    },
+    "異字同訓語義G": {
+        "name": "div"
+    },
+    "細義": {
+        "name": "div"
+    },
+    "単位名": {
+        "name": "span",
+        "style": "font-size: 0.6em; vertical-align: super;",
+        "procedures": [
+            {
+                "procedure_name": "wrap",
+                "parameters": {
+                    "l_wrap": "(",
+                    "r_wrap": ")"
+                }
+            }
+        ]
+    },
+    "原籍": {
+        "name": "span",
+        "style": "font-size: 0.7em; vertical-align: super;",
+        "procedures": [
+            {
+                "procedure_name": "wrap",
+                "parameters": {
+                    "l_wrap": "(",
+                    "r_wrap": ")"
+                }
+            }
+        ]
+    },
+    "句仮名": {
+        "name": "span",
+        "style": "font-size: 0.6em; vertical-align: super;"
+    },
+    "品詞行": {
+        "name": "span",
+        "style": "font-size: 0.6em; vertical-align: super;",
+        "procedures": [
+            {
+                "procedure_name": "wrap",
+                "parameters": {
+                    "l_wrap": "(",
+                    "r_wrap": ")"
+                }
+            }
+        ]
+    },
+    "用例": {
+        "name": "div"
+    },
+    "季語G": {
+        "name": "div"
+    },
+    "補説G": {
+        "name": "div",
+        "procedures": [
+            {
+                "procedure_name": "has_previous_sibling",
+                "parameters": {
+                    "name": "語義Gnum",
+                    "key": "name",
+                    "value": "span"
+                }
+            },
+            {
+                "procedure_name": "has_previous_sibling",
+                "parameters": {
+                    "name": "アクセントG",
+                    "key": "name",
+                    "value": "span"
+                }
+            }
+        ]
+    },
+    "語釈": {
+        "name": "span",
+        "procedures": [
+            {
+                "procedure_name": "has_previous_sibling",
+                "parameters": {
+                    "name": "補説G",
+                    "key": "name",
+                    "value": "div"
+                }
+            }
+        ]
+    },
+    "品詞用法": {
+        "name": "span",
+        "style": "font-size: 0.6em; vertical-align: super;",
+        "procedures": [
+            {
+                "procedure_name": "wrap",
+                "parameters": {
+                    "l_wrap": "(",
+                    "r_wrap": ")"
+                }
+            }
+        ]
+    },
+    "大語義": {
+        "name": "div"
+    },
+    "文語形": {
+        "name": "div"
+    },
+    "慣用G": {
+        "name": "div",
+        "style": "margin-top: 0.5em"
+    },
+    "歴史仮名": {
+        "name": "span",
+        "style": "font-size: 0.6em;",
+        "procedures": [
+            {
+                "procedure_name": "wrap",
+                "parameters": {
+                    "l_wrap": "(",
+                    "r_wrap": ")"
+                }
+            }
+        ]
+    },
+    "派生G": {
+        "name": "div",
+        "style": "margin-top: 0.5em"
+    },
+    "準大語義": {
+        "name": "div"
+    },
+    "見出部": {
+        "name": "span"
+    },
+    "解説部": {
+        "name": "div"
+    },
+    "語義G": {
+        "name": "div"
+    },
+    "語義区切": {
+        "name": "span",
+        "style": "font-size: 0.7em; vertical-align: super;"
+    },
+    "返り点": {
+        "name": "span",
+        "style": "font-size: 0.5em; font-weight: normal; vertical-align: super;",
+        "procedures": [
+            {
+                "procedure_name": "has_class",
+                "parameters": {
+                    "class_name": "熟語記号",
+                    "key": "style",
+                    "value": "vertical-align: baseline;"
+                }
+            }
+        ]
+    },
+    "生没年": {
+        "name": "span",
+        "style": "font-size: 0.7em;"
+    },
+    "用法": {
+        "name": "span",
+        "style": "font-size: 0.7em; vertical-align: super;"
+    },
+    "異字同訓": {
+        "name": "div",
+        "style": "margin-top: 0.5em;"
+    },
+    "異字同訓仮名": {
+        "name": "span",
+        "style": "font-weight: bold;"
+    },
+    "異字同訓漢字": {
+        "name": "span",
+        "style": "font-weight: normal;"
+    },
+    "異字同訓表記": {
+        "name": "span",
+        "style": "font-weight: normal;"
+    },
+    "見出仮名": {
+        "name": "span",
+        "style": "font-weight: bold;"
+    },
+    "見出相当部": {
+        "name": "span",
+        "style": "font-weight: bold;"
+    },
+    "カットG": {
+        "name": "div",
+        "style": "margin-top: 0.5em;"
+    },
+    "sm": {
+        "name": "span",
+        "style": "font-size: 0.7em;"
+    },
+    "small": {
+        "name": "span",
+        "style": "font-size: 0.7em;"
+    },
+    "sub": {
+        "name": "span",
+        "style": "font-size: 0.7em; vertical-align: sub;"
+    },
+    "付記": {
+        "name": "span",
+        "procedures": [
+            {
+                "procedure_name": "wrap",
+                "parameters": {
+                    "l_wrap": "(",
+                    "r_wrap": ")"
+                }
+            }
+        ]
+    },
+    "アクセントG": {
+        "name": "span",
+        "style": "margin-left: 0.25em; margin-right: 0.25em; font-size: 0.7em; vertical-align: super;"
+    },
+    "i": {
+        "name": "span",
+        "style": "font-style: italic;"
+    },
+    "h1": {
+        "name": "span",
+        "style": "font-weight: bold;"
+    },
+    "読みG": {
+        "name": "span",
+        "style": "vertical-align: super; font-size: 0.6em;"
+    },
+    "ルビG": {
+        "name": "span",
+        "style": "vertical-align: super; font-size: 0.6em; font-weight: normal;",
+        "procedures": [
+            {
+                "procedure_name": "wrap",
+                "parameters": {
+                    "l_wrap": "(",
+                    "r_wrap": ")"
+                }
+            }
+        ]
+    }
+}
1135 data/smk8/phrase_readings.csv Normal file
File diff suppressed because it is too large
221 data/smk8/yomichan_name_conversion.json Normal file
@@ -0,0 +1,221 @@
+{
+    "a": {},
+    "br": {},
+    "img": {},
+    "div": {},
+    "span": {},
+    "ruby": {},
+    "rt": {},
+    "語義": {
+        "name": "div"
+    },
+    "副義": {
+        "name": "div"
+    },
+    "派生": {
+        "name": "div"
+    },
+    "用例": {
+        "name": "div"
+    },
+    "参照G": {
+        "name": "div"
+    },
+    "用例G": {
+        "name": "div"
+    },
+    "解説部": {
+        "name": "div"
+    },
+    "大語義": {
+        "name": "div"
+    },
+    "名詞形G": {
+        "name": "div"
+    },
+    "可能形G": {
+        "name": "div"
+    },
+    "派生SubG": {
+        "name": "div"
+    },
+    "子解説部": {
+        "name": "div"
+    },
+    "句解説部": {
+        "name": "div"
+    },
+    "運用解説": {
+        "name": "div"
+    },
+    "表記解説": {
+        "name": "div"
+    },
+    "文法解説": {
+        "name": "div"
+    },
+    "派生SubGF": {
+        "name": "div"
+    },
+    "かぞえ方解説": {
+        "name": "div"
+    },
+    "二分": {
+        "name": "span",
+        "style": "margin-right: 1.0em;"
+    },
+    "四分": {
+        "name": "span",
+        "style": "margin-right: 0.5em;"
+    },
+    "言換M": {
+        "name": "span",
+        "style": "font-size: 0.5em;"
+    },
+    "品詞用法": {
+        "name": "span",
+        "style": "font-size: 0.7em;"
+    },
+    "ルビG": {
+        "name": "span",
+        "style": "vertical-align: super; font-size: 0.65em"
+    },
+    "アクセント": {
+        "name": "span",
+        "style": "vertical-align: super; font-size: 0.7em;"
+    },
+    "アクセント組M": {
+        "name": "span",
+        "style": "vertical-align: super; font-size: 0.7em;"
+    },
+    "IT": {
+        "name": "span",
+        "style": "font-style: italic;"
+    },
+    "EXCLAMATION": {
+        "name": "span",
+        "style": "font-style: italic;"
+    },
+    "B": {
+        "name": "span",
+        "style": "font-weight: bold;"
+    },
+    "EM": {
+        "name": "span",
+        "style": "font-weight: bold;"
+    },
+    "出現形": {
+        "name": "span",
+        "style": "font-weight: bold;"
+    },
+    "見出仮名": {
+        "name": "span",
+        "style": "font-weight: bold;"
+    },
+    "基本構文em": {
+        "name": "span",
+        "style": "font-weight: bold;"
+    },
+    "ウ濁音参照": {
+        "name": "span",
+        "style": "font-weight: bold;"
+    },
+    "表外字": {
+        "name": "ruby",
+        "procedures": [
+            {
+                "procedure_name": "add_ruby_text",
+                "parameters": {
+                    "mark": "︿",
+                    "style": "font-size: 2em;"
+                }
+            }
+        ]
+    },
+    "表外音訓": {
+        "name": "ruby",
+        "procedures": [
+            {
+                "procedure_name": "add_ruby_text",
+                "parameters": {
+                    "mark": "︽",
+                    "style": "font-size: 2em;"
+                }
+            }
+        ]
+    },
+    "表音式": {
+        "name": "ruby"
+    },
+    "表音表記": {
+        "name": "rt",
+        "procedures": [
+            {
+                "procedure_name": "replace",
+                "parameters": {
+                    "old": "(",
+                    "new": ""
+                }
+            },
+            {
+                "procedure_name": "replace",
+                "parameters": {
+                    "old": ")",
+                    "new": ""
+                }
+            }
+        ]
+    },
+    "派生見出": {
+        "name": "span",
+        "style": "font-weight: bold;",
+        "procedures": [
+            {
+                "procedure_name": "has_class",
+                "parameters": {
+                    "class_name": "normal",
+                    "key": "style",
+                    "value": "font-weight: normal;"
+                }
+            }
+        ]
+    },
+    "品詞G": {
+        "name": "span",
+        "procedures": [
+            {
+                "procedure_name": "has_parent",
+                "parameters": {
+                    "parent_name": "品詞用法",
+                    "key": "style",
+                    "value": "font-size: 1.43em;"
+                }
+            }
+        ]
+    },
+    "歴史仮名": {
+        "name": "span",
+        "style": "font-size: 0.6em; font-weight: normal;",
+        "procedures": [
+            {
+                "procedure_name": "wrap",
+                "parameters": {
+                    "l_wrap": "(",
+                    "r_wrap": ")"
+                }
+            }
+        ]
+    },
+    "ルビ": {
+        "name": "span",
+        "procedures": [
+            {
+                "procedure_name": "wrap",
+                "parameters": {
+                    "l_wrap": "(",
+                    "r_wrap": ")"
+                }
+            }
+        ]
+    }
+}
@@ -1,19 +1,45 @@
+俠,侠
+俱,倶
 儘,侭
 凜,凛
 剝,剥
+𠮟,叱
 吞,呑
 啞,唖
 噓,嘘
 嚙,噛
+囊,嚢
 塡,填
 姸,妍
+屛,屏
+屢,屡
+拋,抛
 搔,掻
 摑,掴
+攪,撹
 潑,溌
+瀆,涜
+焰,焔
+禱,祷
 竜,龍
+筓,笄
 簞,箪
 籠,篭
+繡,繍
+繫,繋
+腁,胼
+萊,莱
 藪,薮
+蟬,蝉
 蠟,蝋
+軀,躯
 醬,醤
+醱,醗
+頰,頬
+顚,顛
+驒,騨
 鶯,鴬
+鷗,鴎
+鷽,鴬
+鹼,鹸
+麴,麹
@@ -6,5 +6,21 @@
         "keiyoushi": ["形容詞", "ナイ", "タイ", "ラシイ"],
         "kahen": ["カ行変格"],
         "sudachi": []
-    }
+    },
+    "smk8": {
+        "sahen": ["サ", "サ変型"],
+        "godan": ["上二", "下二", "四", "五", "上二型", "下二型", "四段型", "五型", "特殊型"],
+        "ichidan": ["上一", "下一", "上一型", "下一型"],
+        "keiyoushi": ["形", "形型"],
+        "kahen": ["カ"],
+        "sudachi": ["連体"]
+    },
+    "daijirin2": {
+        "sahen": ["サ変", "サ特活"],
+        "godan": ["ナ変", "マ特活", "ラ特活", "上二", "下二", "五", "四"],
+        "ichidan": ["上一", "下一"],
+        "keiyoushi": ["形"],
+        "kahen": ["カ変"],
+        "sudachi": ["助動", "接尾", "枕詞", "連体", "連語"]
+    }
 }
@@ -24,5 +24,28 @@
             "url": "https://kotowaza.jitenon.jp/"
         },
         "tags": []
-    }
+    },
+    "smk8": {
+        "index": {
+            "title": "新明解国語辞典　第八版",
+            "sequenced": true,
+            "format": 3
+        },
+        "tags": [
+            ["子", "name", 0, "子項目", 0],
+            ["句", "expression", 0, "句項目", 0],
+            ["造", "popular", 0, "造語成分項目", 0]
+        ]
+    },
+    "daijirin2": {
+        "index": {
+            "title": "大辞林　第四版",
+            "sequenced": true,
+            "format": 3
+        },
+        "tags": [
+            ["子", "name", 0, "子項目", 0],
+            ["句", "expression", 0, "句項目", 0]
+        ]
+    }
 }
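
Each five-element tag entry appears to follow Yomichan's tag bank layout, i.e. name, category, sorting order, notes, and popularity score:

    name, category, order, notes, score = ["子", "name", 0, "子項目", 0]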
68 jitenbot.py
@@ -16,47 +16,59 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
 """

+import os
 import argparse
 from bot.crawlers import JitenonYojiCrawler
 from bot.crawlers import JitenonKotowazaCrawler
+from bot.crawlers import Smk8Crawler
+from bot.crawlers import Daijirin2Crawler


-crawlers = {
-    "jitenon-yoji": JitenonYojiCrawler,
-    "jitenon-kotowaza": JitenonKotowazaCrawler,
-}
-
-
-def add_target_argument(parser):
-    target_argument_params = {
-        "choices": crawlers.keys(),
-        "help": "Dictionary to convert."
-    }
-    parser.add_argument("target", **target_argument_params)
-
-
-def make_parser():
-    argument_parser_params = {
-        "prog": "jitenbot",
-        "description": "Convert Japanese dictionary files to new formats.",
-    }
-    parser = argparse.ArgumentParser(**argument_parser_params)
-    return parser
-
-
-def parse_args():
-    parser = make_parser()
-    add_target_argument(parser)
+def directory(d):
+    if not os.path.isdir(d):
+        raise argparse.ArgumentTypeError(f"`{d}` is not a valid directory")
+    elif not os.access(d, os.R_OK):
+        raise argparse.ArgumentTypeError(f"Cannot access directory `{d}`")
+    else:
+        return d
+
+
+def parse_args(targets):
+    parser = argparse.ArgumentParser(
+        prog="jitenbot",
+        description="Convert Japanese dictionary files to new formats.",
+    )
+    parser.add_argument(
+        "target",
+        choices=targets,
+        help="name of dictionary to convert"
+    )
+    parser.add_argument(
+        "-p", "--page-dir",
+        help="path to directory containing XML page files",
+        type=directory
+    )
+    parser.add_argument(
+        "-i", "--image-dir",
+        help="path to directory containing image files (gaiji, etc.)",
+        type=directory
+    )
     args = parser.parse_args()
     return args


 def main():
-    args = parse_args()
+    crawlers = {
+        "jitenon-yoji": JitenonYojiCrawler,
+        "jitenon-kotowaza": JitenonKotowazaCrawler,
+        "smk8": Smk8Crawler,
+        "daijirin2": Daijirin2Crawler,
+    }
+    args = parse_args(crawlers.keys())
     crawler_class = crawlers[args.target]
-    crawler = crawler_class()
-    crawler.crawl()
-    crawler.read_entries()
+    crawler = crawler_class(args)
+    crawler.collect_pages()
+    crawler.read_pages()
     crawler.make_yomichan_dictionary()
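
With the new arguments in place, a typical invocation for a locally stored dictionary looks like this (paths are illustrative):

    python jitenbot.py smk8 --page-dir ./smk8/pages --image-dir ./smk8/images
    python jitenbot.py jitenon-kotowaza

The web-sourced jitenon targets still need no directories, since their crawlers fetch pages over HTTP.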
requirements.txt
@@ -5,9 +5,12 @@ charset-normalizer==3.1.0
 css-parser==1.0.8
 html5lib==1.1
 idna==3.4
-requests==2.28.2
+lxml==4.9.2
+Pillow==9.5.0
+platformdirs==3.5.0
+requests==2.29.0
 six==1.16.0
-soupsieve==2.4
+soupsieve==2.4.1
 SudachiDict-full==20230110
 SudachiPy==0.6.7
 urllib3==1.26.15