Add support for Shinmeikai 8th edition & Daijirin 4th edition

stephenmk 2023-05-01 17:31:28 -05:00
parent 0cfa3a19df
commit 5aa954bf2d
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
35 changed files with 37579 additions and 176 deletions

bot/crawlers.py

@@ -1,41 +1,59 @@
import os
import re
from bs4 import BeautifulSoup
import bot.scraper as Scraper
from bot.entries.jitenon import JitenonKotowazaEntry
from bot.yomichan.export import JitenonKotowazaExporter
from bot.entries.jitenon import JitenonYojiEntry
from bot.entries.smk8 import Smk8Entry
from bot.entries.daijirin2 import Daijirin2Entry
from bot.yomichan.export import JitenonKotowazaExporter
from bot.yomichan.export import JitenonYojiExporter
from bot.yomichan.export import Smk8Exporter
from bot.yomichan.export import Daijirin2Exporter
class Crawler():
def __init__(self):
self._crawl_map = {}
self.__entries = []
class _Crawler():
def __init__(self, args):
self._page_dir = args.page_dir
self._image_dir = args.image_dir
self._page_map = {}
self._entries = []
def read_entries(self):
entries_len = len(self._crawl_map)
items = self._crawl_map.items()
for idx, (entry_id, entry_path) in enumerate(items):
update = f"Reading entry {idx+1}/{entries_len}"
def read_pages(self):
pages_len = len(self._page_map)
items = self._page_map.items()
for idx, (page_id, page_path) in enumerate(items):
update = f"Reading page {idx+1}/{pages_len}"
print(update, end='\r', flush=True)
entry = self._entry_class(entry_id)
entry.set_markup(entry_path)
self.__entries.append(entry)
entry = self._entry_class(page_id)
with open(page_path, "r") as f:
page = f.read()
entry.set_page(page)
self._entries.append(entry)
print()
def make_yomichan_dictionary(self):
self._yomi_exporter.export(self.__entries)
self._yomi_exporter.export(self._entries, self._image_dir)
def _parse_page_id(self, page_link):
m = re.search(self._page_id_pattern, page_link)
if not m:
return None
page_id = int(m.group(1))
if page_id in self._page_map:
return None
return page_id
class JitenonCrawler(Crawler):
def __init__(self):
super().__init__()
class _JitenonCrawler(_Crawler):
def __init__(self, args):
super().__init__(args)
def crawl(self):
print(f"Scraping {self._name}...")
def collect_pages(self):
print("Scraping jitenon.jp")
jitenon = Scraper.Jitenon()
gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -44,40 +62,60 @@ class JitenonCrawler(Crawler):
kana_doc, _ = jitenon.scrape(gojuon_href)
kana_soup = BeautifulSoup(kana_doc, features="html.parser")
for kana_a in kana_soup.select(".word_box a", href=True):
entry_link = kana_a['href']
entry_id = self.__parse_entry_id(entry_link)
if entry_id is None:
page_link = kana_a['href']
page_id = self._parse_page_id(page_link)
if page_id is None:
continue
_, entry_path = jitenon.scrape(entry_link)
self._crawl_map[entry_id] = entry_path
entries_len = len(self._crawl_map)
print(f"Finished scraping {entries_len} entries")
def __parse_entry_id(self, entry_link):
m = re.search(self._entry_id_pattern, entry_link)
if not m:
return None
entry_id = int(m.group(1))
if entry_id in self._crawl_map:
return None
return entry_id
_, page_path = jitenon.scrape(page_link)
self._page_map[page_id] = page_path
pages_len = len(self._page_map)
print(f"Finished scraping {pages_len} pages")
class JitenonYojiCrawler(JitenonCrawler):
def __init__(self):
super().__init__()
class JitenonYojiCrawler(_JitenonCrawler):
def __init__(self, args):
super().__init__(args)
self._entry_class = JitenonYojiEntry
self._yomi_exporter = JitenonYojiExporter()
self._name = "jitenon-yoji"
self._yomi_exporter = JitenonYojiExporter(args.target)
self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
self._entry_id_pattern = r"([0-9]+).html"
self._page_id_pattern = r"([0-9]+)\.html$"
class JitenonKotowazaCrawler(JitenonCrawler):
def __init__(self):
super().__init__()
class JitenonKotowazaCrawler(_JitenonCrawler):
def __init__(self, args):
super().__init__(args)
self._entry_class = JitenonKotowazaEntry
self._yomi_exporter = JitenonKotowazaExporter()
self._name = "jitenon-kotowaza"
self._yomi_exporter = JitenonKotowazaExporter(args.target)
self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
self._entry_id_pattern = r"([0-9]+).php"
self._page_id_pattern = r"([0-9]+)\.php$"
class _MonokakidoCrawler(_Crawler):
def __init__(self, args):
super().__init__(args)
self._page_id_pattern = r"^([0-9]+)\.xml$"
def collect_pages(self):
print(f"Searching for page files in `{self._page_dir}`")
for pagefile in os.listdir(self._page_dir):
page_id = self._parse_page_id(pagefile)
if page_id is None or page_id == 0:
continue
path = os.path.join(self._page_dir, pagefile)
self._page_map[page_id] = path
pages_len = len(self._page_map)
print(f"Found {pages_len} page files for processing")
class Smk8Crawler(_MonokakidoCrawler):
def __init__(self, args):
super().__init__(args)
self._entry_class = Smk8Entry
self._yomi_exporter = Smk8Exporter(args.target)
class Daijirin2Crawler(_MonokakidoCrawler):
def __init__(self, args):
super().__init__(args)
self._entry_class = Daijirin2Entry
self._yomi_exporter = Daijirin2Exporter(args.target)
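For reference, the resulting crawler API can be driven roughly as follows (the argparse wiring is outside this diff, so the namespace below is only a sketch; the attribute and method names are the ones introduced above):

import argparse
from bot.crawlers import Smk8Crawler  # assumed module path for this file

args = argparse.Namespace(
    target="smk8",            # dictionary name, forwarded to the exporter
    page_dir="smk8_pages",    # hypothetical directory of page XML files
    image_dir="smk8_images",  # hypothetical directory of gaiji/graphics files
)
crawler = Smk8Crawler(args)
crawler.collect_pages()             # scan page_dir for <page_id>.xml files
crawler.read_pages()                # parse each page into an Smk8Entry
crawler.make_yomichan_dictionary()  # export entries and images via Smk8Exporter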

bot/data.py

@@ -2,11 +2,24 @@ import os
import sys
import json
import csv
from functools import cache
from pathlib import Path
from platformdirs import user_config_dir
@cache
def get_adobe_glyph(code):
adobe_glyphs = __load_adobe_glyphs()
override_adobe_glyphs = __load_override_adobe_glyphs()
if code in override_adobe_glyphs:
return override_adobe_glyphs[code]
if len(adobe_glyphs[code]) > 1:
raise Exception(f"Multiple glyphs available for code {code}")
return adobe_glyphs[code][0]
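Overrides take precedence over the Adobe-Japan1 sequence data, and an ambiguous code raises unless it has been overridden. With the override_glyphs.json added later in this commit:

from bot.data import get_adobe_glyph

get_adobe_glyph(12107)  # -> "〻"; the override table wins for this CID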
@cache
def load_config():
config_dir = user_config_dir("jitenbot")
if not Path(config_dir).is_dir():
@@ -22,18 +35,21 @@ def load_config():
return config
@cache
def load_yomichan_inflection_categories():
file_name = "yomichan_inflection_categories.json"
data = __load_json(file_name)
return data
@cache
def load_yomichan_metadata():
file_name = "yomichan_metadata.json"
data = __load_json(file_name)
return data
@cache
def load_variant_kanji():
def loader(data, row):
data[row[0]] = row[1]
@@ -43,12 +59,94 @@ def load_variant_kanji():
return data
@cache
def load_smk8_phrase_readings():
def loader(data, row):
entry_id = (int(row[0]), int(row[1]))
reading = row[2]
data[entry_id] = reading
file_name = os.path.join("smk8", "phrase_readings.csv")
data = {}
__load_csv(file_name, loader, data)
return data
@cache
def load_daijirin2_phrase_readings():
def loader(data, row):
entry_id = (int(row[0]), int(row[1]))
reading = row[2]
data[entry_id] = reading
file_name = os.path.join("daijirin2", "phrase_readings.csv")
data = {}
__load_csv(file_name, loader, data)
return data
@cache
def load_daijirin2_kana_abbreviations():
def loader(data, row):
entry_id = (int(row[0]), int(row[1]))
abbreviations = []
for abbr in row[2:]:
if abbr.strip() != "":
abbreviations.append(abbr)
data[entry_id] = abbreviations
file_name = os.path.join("daijirin2", "kana_abbreviations.csv")
data = {}
__load_csv(file_name, loader, data)
return data
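For orientation, one hypothetical kana_abbreviations.csv row and what the loader above makes of it:

# hypothetical row: 12345,0,有合,有り合
# result:           data[(12345, 0)] == ["有合", "有り合"]
# (blank trailing columns are skipped by the strip() check)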
@cache
def load_smk8_yomichan_name_conversion():
file_name = os.path.join("smk8", "yomichan_name_conversion.json")
data = __load_json(file_name)
return data
@cache
def load_daijirin2_yomichan_name_conversion():
file_name = os.path.join("daijirin2", "yomichan_name_conversion.json")
data = __load_json(file_name)
return data
@cache
def __load_default_config():
file_name = "default_config.json"
data = __load_json(file_name)
return data
@cache
def __load_adobe_glyphs():
def loader(data, row):
if row[0].startswith("#"):
return
character = chr(int(row[0].split(" ")[0], 16))
code = int(row[2].removeprefix(" CID+"))
if code in data:
if character not in data[code]:
data[code].append(character)
else:
data[code] = [character]
file_name = os.path.join("adobe", "Adobe-Japan1_sequences.txt")
data = {}
__load_csv(file_name, loader, data, delim=';')
return data
@cache
def __load_override_adobe_glyphs():
file_name = os.path.join("adobe", "override_glyphs.json")
json_data = __load_json(file_name)
data = {}
for key, val in json_data.items():
data[int(key)] = val
return data
def __load_json(file_name):
file_path = os.path.join("data", file_name)
if not Path(file_path).is_file():

bot/entries/daijirin2.py (new file, 272 lines)

@@ -0,0 +1,272 @@
import re
from bs4 import BeautifulSoup
import bot.expressions as Expressions
import bot.soup as Soup
from bot.data import load_daijirin2_phrase_readings
from bot.data import load_daijirin2_kana_abbreviations
from bot.entries.entry import Entry
from bot.entries.daijirin2_preprocess import preprocess_page
class _BaseDaijirin2Entry(Entry):
ID_TO_ENTRY = {}
SUBENTRY_ID_TO_ENTRY_ID = {}
def __init__(self, entry_id):
super().__init__(entry_id)
if entry_id not in self.ID_TO_ENTRY:
self.ID_TO_ENTRY[entry_id] = self
else:
raise Exception(f"Duplicate entry ID: {entry_id}")
self.children = []
self.phrases = []
self._kana_abbreviations = load_daijirin2_kana_abbreviations()
def set_page(self, page):
page = self.__decompose_subentries(page)
self._page = page
def get_page_soup(self):
soup = BeautifulSoup(self._page, "xml")
return soup
def get_part_of_speech_tags(self):
if self._part_of_speech_tags is not None:
return self._part_of_speech_tags
self._part_of_speech_tags = []
soup = self.get_page_soup()
for pos_group in soup.find_all("品詞G"):
if pos_group.parent.name == "大語義":
self._set_part_of_speech_tags(pos_group)
return self._part_of_speech_tags
def _set_part_of_speech_tags(self, el):
pos_names = ["品詞", "品詞活用", "品詞行", "用法"]
for child in el.children:
if child.name is not None:
self._set_part_of_speech_tags(child)
continue
pos = str(child)
if el.name not in pos_names:
continue
elif pos in ["(", ")"]:
continue
elif pos in self._part_of_speech_tags:
continue
else:
self._part_of_speech_tags.append(pos)
def get_headwords(self):
if self._headwords is not None:
return self._headwords
self._set_headwords()
self._set_variant_headwords()
return self._headwords
def _set_regular_headwords(self, soup):
self._fill_alts(soup)
reading = soup.find("見出仮名").text
expressions = []
for el in soup.find_all("標準表記"):
expression = self._clean_expression(el.text)
if "" in expression:
kana_abbrs = self._kana_abbreviations[self.entry_id]
for abbr in kana_abbrs:
expression = expression.replace("", abbr, 1)
expressions.append(expression)
expressions = Expressions.expand_abbreviation_list(expressions)
if len(expressions) == 0:
expressions.append(reading)
self._headwords = {reading: expressions}
def _set_variant_headwords(self):
for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._variant_kanji)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
Expressions.add_iteration_mark(expressions)
def __decompose_subentries(self, page):
soup = BeautifulSoup(page, features="xml")
subentry_parameters = [
[Daijirin2ChildEntry, ["子項目"], self.children],
[Daijirin2PhraseEntry, ["句項目"], self.phrases],
]
for x in subentry_parameters:
subentry_class, tags, subentry_list = x
for tag in tags:
tag_soup = soup.find(tag)
while tag_soup is not None:
tag_soup.name = "項目"
subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
subentry = subentry_class(subentry_id)
page = tag_soup.decode()
subentry.set_page(page)
subentry_list.append(subentry)
tag_soup.decompose()
tag_soup = soup.find(tag)
return soup.decode()
@staticmethod
def id_string_to_entry_id(id_string):
parts = id_string.split("-")
if len(parts) == 1:
return (int(parts[0]), 0)
elif len(parts) == 2:
# subentries have a hexadecimal part
return (int(parts[0]), int(parts[1], 16))
else:
raise Exception(f"Invalid entry ID: {id_string}")
@staticmethod
def _delete_unused_nodes(soup):
unused_nodes = [
"漢字音logo", "活用分節", "連語句活用分節", "語構成",
"表外字マーク", "表外字マーク", "ルビG"
]
for name in unused_nodes:
Soup.delete_soup_nodes(soup, name)
@staticmethod
def _clean_expression(expression):
for x in ["", "", "", "", " "]:
expression = expression.replace(x, "")
return expression
@staticmethod
def _fill_alts(soup):
for gaiji in soup.find_all(class_="gaiji"):
if gaiji.name == "img" and gaiji.has_attr("alt"):
gaiji.name = "span"
gaiji.string = gaiji.attrs["alt"]
class Daijirin2Entry(_BaseDaijirin2Entry):
def __init__(self, page_id):
entry_id = (page_id, 0)
super().__init__(entry_id)
def set_page(self, page):
page = preprocess_page(page)
super().set_page(page)
def _set_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
if soup.find("漢字見出") is not None:
self._set_kanji_headwords(soup)
elif soup.find("略語G") is not None:
self._set_acronym_headwords(soup)
else:
self._set_regular_headwords(soup)
def _set_kanji_headwords(self, soup):
readings = []
for el in soup.find_all("漢字音"):
hira = Expressions.kata_to_hira(el.text)
readings.append(hira)
if soup.find("漢字音") is None:
readings.append("")
expressions = []
for el in soup.find_all("漢字見出"):
expressions.append(el.text)
self._headwords = {}
for reading in readings:
self._headwords[reading] = expressions
def _set_acronym_headwords(self, soup):
expressions = []
for el in soup.find_all("略語"):
expression_parts = []
for part in el.find_all(["欧字", "和字"]):
expression_parts.append(part.text)
expression = "".join(expression_parts)
expressions.append(expression)
self._headwords = {"": expressions}
class Daijirin2ChildEntry(_BaseDaijirin2Entry):
def __init__(self, entry_id):
super().__init__(entry_id)
def _set_headwords(self):
soup = self.get_page_soup()
self._delete_unused_nodes(soup)
self._set_regular_headwords(soup)
class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
def __init__(self, entry_id):
super().__init__(entry_id)
self.__phrase_readings = load_daijirin2_phrase_readings()
def get_part_of_speech_tags(self):
# phrases do not contain these tags
return []
def _set_headwords(self):
soup = self.get_page_soup()
headwords = {}
expressions = self._find_expressions(soup)
readings = self._find_readings()
for idx, expression in enumerate(expressions):
reading = readings[idx]
if reading in headwords:
headwords[reading].append(expression)
else:
headwords[reading] = [expression]
self._headwords = headwords
def _find_expressions(self, soup):
self._delete_unused_nodes(soup)
text = soup.find("句表記").text
text = self._clean_expression(text)
alternatives = self.__expand_alternatives(text)
expressions = []
for alt in alternatives:
for exp in Expressions.expand_abbreviation(alt):
expressions.append(exp)
return expressions
def _find_readings(self):
text = self.__phrase_readings[self.entry_id]
alternatives = self.__expand_alternatives(text)
readings = []
for alt in alternatives:
for reading in Expressions.expand_abbreviation(alt):
readings.append(reading)
return readings
@staticmethod
def __expand_alternatives(expression):
"""Return a list of strings described by notation.
eg. "同じ穴の=狢(=狐・狸)" -> [
"同じ穴の狢", "同じ穴の狐", "同じ穴の狸"
]
eg. "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥" -> [
"聞くは一時の恥、聞かぬは末代の恥",
"聞くは一時の恥、聞かぬは一生の恥",
"聞くは一旦の恥、聞かぬは末代の恥",
"聞くは一旦の恥、聞かぬは一生の恥"
]
"""
group_pattern = r"([^=]+)(=([^(]+)(=([^)]+)))?"
groups = re.findall(group_pattern, expression)
expressions = [""]
for group in groups:
new_exps = []
for expression in expressions:
new_exps.append(expression + group[0])
expressions = new_exps.copy()
if group[1] == "":
continue
new_exps = []
for expression in expressions:
new_exps.append(expression + group[2])
for expression in expressions:
for alt in group[3].split("・"):
new_exps.append(expression + alt)
expressions = new_exps.copy()
return expressions
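The expression and reading lists are expanded with the same notation, which is why _set_headwords above can pair them index-by-index; schematically (readings illustrative):

expressions = ["同じ穴の狢", "同じ穴の狐", "同じ穴の狸"]  # from 句表記
readings = ["おなじあなのむじな", "おなじあなのきつね", "おなじあなのたぬき"]  # from phrase_readings.csv
headwords = {}
for expression, reading in zip(expressions, readings):
    headwords.setdefault(reading, []).append(expression)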

bot/entries/daijirin2_preprocess.py (new file, 56 lines)

@@ -0,0 +1,56 @@
import re
from bs4 import BeautifulSoup
from bot.data import get_adobe_glyph
__GAIJI = {
"gaiji/DJRK0002.svg": "𦬇",
"gaiji/U芸E0102.svg": "",
}
def preprocess_page(page):
soup = BeautifulSoup(page, features="xml")
__replace_glyph_codes(soup)
__add_gaiji_alt_text(soup)
__replace_halfwidth_braces(soup)
page = __strip_page(soup)
return page
def __replace_glyph_codes(soup):
for el in soup.find_all(style=True):
m = re.search(r"^glyph:([0-9]+);?$", el.attrs["style"])
if not m:
continue
del el.attrs["style"]
if el.has_attr("alt"):
el.string = el.attrs["alt"]
continue
code = int(m.group(1))
for geta in el.find_all(string="〓"):
glyph = get_adobe_glyph(code)
geta.replace_with(glyph)
def __add_gaiji_alt_text(soup):
for gaiji in soup.find_all(class_="gaiji"):
src = gaiji.attrs["src"] if gaiji.has_attr("src") else ""
if src in __GAIJI:
gaiji.attrs["alt"] = __GAIJI[src]
def __replace_halfwidth_braces(soup):
for x in soup.find_all("送り仮名省略"):
for el in x.find_all(string="("):
el.replace_with("(")
for el in x.find_all(string=")"):
el.replace_with(")")
def __strip_page(soup):
koumoku = soup.find("項目")
if koumoku is None:
raise Exception(f"Primary 項目 not found in page:\n{soup.prettify()}")
return koumoku.decode()

bot/entries/entry.py (new file, 38 lines)

@@ -0,0 +1,38 @@
from abc import ABC, abstractmethod
from bot.data import load_variant_kanji
class Entry(ABC):
def __init__(self, entry_id):
self.entry_id = entry_id
self._page = None
self._headwords = None
self._part_of_speech_tags = None
self._variant_kanji = load_variant_kanji()
@abstractmethod
def set_page(self, page):
pass
@abstractmethod
def get_page_soup(self):
pass
@abstractmethod
def get_headwords(self):
pass
@abstractmethod
def get_part_of_speech_tags(self):
pass
def get_first_expression(self):
headwords = self.get_headwords()
expressions = next(iter(headwords.values()))
expression = expressions[0]
return expression
def get_first_reading(self):
headwords = self.get_headwords()
reading = next(iter(headwords.keys()))
return reading
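A minimal hypothetical subclass illustrates the contract the dictionary entries in this commit implement:

from bs4 import BeautifulSoup
from bot.entries.entry import Entry

class MinimalEntry(Entry):  # sketch only, not part of the commit
    def set_page(self, page):
        self._page = page

    def get_page_soup(self):
        return BeautifulSoup(self._page, "xml")

    def get_headwords(self):
        if self._headwords is None:
            self._headwords = {"よみかた": ["表記"]}  # reading -> expressions
        return self._headwords

    def get_part_of_speech_tags(self):
        return []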

bot/entries/jitenon.py

@@ -2,29 +2,21 @@ import re
from datetime import datetime, date
from bs4 import BeautifulSoup
from bot.data import load_variant_kanji
from bot.entries.entry import Entry
import bot.expressions as Expressions
class JitenonEntry:
_VARIANT_KANJI = None
class _JitenonEntry(Entry):
def __init__(self, entry_id):
if self._VARIANT_KANJI is None:
self._VARIANT_KANJI = load_variant_kanji()
self.entry_id = entry_id
self.markup = ""
super().__init__(entry_id)
self.modified_date = date(1970, 1, 1)
self.attribution = ""
for column in self._COLUMNS.values():
setattr(self, column[0], column[1])
self._headwords = None
def set_markup(self, path):
with open(path, "r") as f:
html = f.read()
soup = BeautifulSoup(html, features="html5lib")
self.__set_modified_date(html)
def set_page(self, page):
soup = BeautifulSoup(page, features="html5lib")
self.__set_modified_date(page)
self.attribution = soup.find(class_="copyright").text
table = soup.find(class_="kanjirighttb")
rows = table.find("tbody").find_all("tr")
@@ -33,7 +25,11 @@ class JitenonEntry:
colname = row.th.text if row.th is not None else colname
colval = self.__clean_text(row.td.text)
self.__set_column(colname, colval)
self.markup = table.decode()
self._page = table.decode()
def get_page_soup(self):
soup = BeautifulSoup(self._page, "html5lib")
return soup
def get_headwords(self):
if self._headwords is not None:
@@ -42,16 +38,9 @@ class JitenonEntry:
self._set_variant_headwords()
return self._headwords
def get_first_expression(self):
headwords = self.get_headwords()
expressions = next(iter(headwords.values()))
expression = expressions[0]
return expression
def get_first_reading(self):
headwords = self.get_headwords()
reading = next(iter(headwords.keys()))
return reading
def get_part_of_speech_tags(self):
# Jitenon doesn't have any
return []
def _set_headwords(self):
headwords = {}
@@ -66,8 +55,8 @@ class JitenonEntry:
headwords[reading].append(expression)
self._headwords = headwords
def __set_modified_date(self, html):
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
def __set_modified_date(self, page):
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
if not m:
return
date = datetime.strptime(m.group(1), '%Y-%m-%d').date()
@@ -94,7 +83,7 @@ class JitenonEntry:
return [m.group(1)]
m = re.search(r"^[ぁ-ヿ、]+[ぁ-ヿ、][ぁ-ヿ、]+$", yomikata)
if m:
return Expressions.expand_shouryaku(yomikata)
return Expressions.expand_abbreviation(yomikata)
m = re.search(r"^([ぁ-ヿ、]+)([ぁ-ヿ/\s、]+)$", yomikata)
if m:
yomikatas = [m.group(1)]
@@ -139,7 +128,7 @@ class JitenonEntry:
return ",".join(colvals)
class JitenonYojiEntry(JitenonEntry):
class JitenonYojiEntry(_JitenonEntry):
_COLUMNS = {
"四字熟語": ["expression", ""],
"読み方": ["yomikata", ""],
@@ -151,15 +140,15 @@ class JitenonYojiEntry(JitenonEntry):
"類義語": ["ruigigo", []],
}
def __init__(self, sequence):
super().__init__(sequence)
def __init__(self, entry_id):
super().__init__(entry_id)
def _set_variant_headwords(self):
for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._VARIANT_KANJI)
Expressions.add_variant_kanji(expressions, self._variant_kanji)
class JitenonKotowazaEntry(JitenonEntry):
class JitenonKotowazaEntry(_JitenonEntry):
_COLUMNS = {
"言葉": ["expression", ""],
"読み方": ["yomikata", ""],
@@ -170,8 +159,8 @@ class JitenonKotowazaEntry(JitenonEntry):
"類句": ["ruiku", []],
}
def __init__(self, sequence):
super().__init__(sequence)
def __init__(self, entry_id):
super().__init__(entry_id)
def _set_headwords(self):
if self.expression == "金棒引き・鉄棒引き":
@@ -183,5 +172,5 @@ class JitenonKotowazaEntry(JitenonEntry):
def _set_variant_headwords(self):
for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._VARIANT_KANJI)
Expressions.add_variant_kanji(expressions, self._variant_kanji)
Expressions.add_fullwidth(expressions)

bot/entries/smk8.py (new file, 242 lines)

@@ -0,0 +1,242 @@
import re
from bs4 import BeautifulSoup
import bot.expressions as Expressions
import bot.soup as Soup
from bot.data import load_smk8_phrase_readings
from bot.entries.entry import Entry
from bot.entries.smk8_preprocess import preprocess_page
class _BaseSmk8Entry(Entry):
ID_TO_ENTRY = {}
SUBENTRY_ID_TO_ENTRY_ID = {}
def __init__(self, entry_id):
super().__init__(entry_id)
if entry_id not in self.ID_TO_ENTRY:
self.ID_TO_ENTRY[entry_id] = self
else:
raise Exception(f"Duplicate entry ID: {entry_id}")
self.children = []
self.phrases = []
self.kanjis = []
def set_page(self, page):
page = self.__decompose_subentries(page)
self._page = page
def get_page_soup(self):
soup = BeautifulSoup(self._page, "xml")
return soup
def get_headwords(self):
if self._headwords is not None:
return self._headwords
self._set_headwords()
self._set_variant_headwords()
return self._headwords
def get_part_of_speech_tags(self):
if self._part_of_speech_tags is not None:
return self._part_of_speech_tags
self._part_of_speech_tags = []
soup = self.get_page_soup()
headword_info = soup.find("見出要素")
if headword_info is None:
return self._part_of_speech_tags
for tag in headword_info.find_all("品詞M"):
if tag.text not in self._part_of_speech_tags:
self._part_of_speech_tags.append(tag.text)
return self._part_of_speech_tags
def _set_variant_headwords(self):
for expressions in self._headwords.values():
Expressions.add_variant_kanji(expressions, self._variant_kanji)
Expressions.add_fullwidth(expressions)
Expressions.remove_iteration_mark(expressions)
Expressions.add_iteration_mark(expressions)
def _find_reading(self, soup):
midasi_kana = soup.find("見出仮名")
reading = midasi_kana.text
for x in [" ", ""]:
reading = reading.replace(x, "")
return reading
def _find_expressions(self, soup):
clean_expressions = []
for expression in soup.find_all("標準表記"):
clean_expression = self._clean_expression(expression.text)
clean_expressions.append(clean_expression)
expressions = Expressions.expand_abbreviation_list(clean_expressions)
return expressions
def __decompose_subentries(self, page):
soup = BeautifulSoup(page, features="xml")
subentry_parameters = [
[Smk8ChildEntry, ["子項目F", "子項目"], self.children],
[Smk8PhraseEntry, ["句項目F", "句項目"], self.phrases],
[Smk8KanjiEntry, ["造語成分項目"], self.kanjis],
]
for x in subentry_parameters:
subentry_class, tags, subentry_list = x
for tag in tags:
tag_soup = soup.find(tag)
while tag_soup is not None:
tag_soup.name = "項目"
subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
subentry = subentry_class(subentry_id)
page = tag_soup.decode()
subentry.set_page(page)
subentry_list.append(subentry)
tag_soup.decompose()
tag_soup = soup.find(tag)
return soup.decode()
@staticmethod
def id_string_to_entry_id(id_string):
parts = id_string.split("-")
if len(parts) == 1:
return (int(parts[0]), 0)
elif len(parts) == 2:
# subentries have a hexadecimal part
return (int(parts[0]), int(parts[1], 16))
else:
raise Exception(f"Invalid entry ID: {id_string}")
@staticmethod
def _clean_expression(expression):
for x in ["", "", "", "", "", " "]:
expression = expression.replace(x, "")
return expression
@staticmethod
def _fill_alts(soup):
for e in soup.find_all(["親見出仮名", "親見出表記"]):
e.string = e.attrs["alt"]
for gaiji in soup.find_all("外字"):
gaiji.string = gaiji.img.attrs["alt"]
class Smk8Entry(_BaseSmk8Entry):
def __init__(self, page_id):
entry_id = (page_id, 0)
super().__init__(entry_id)
def set_page(self, page):
page = preprocess_page(page)
super().set_page(page)
def _set_headwords(self):
soup = self.get_page_soup()
Soup.delete_soup_nodes(soup, "表音表記")
self._fill_alts(soup)
reading = self._find_reading(soup)
expressions = []
if soup.find("見出部").find("標準表記") is None:
expressions.append(reading)
for expression in self._find_expressions(soup):
if expression not in expressions:
expressions.append(expression)
self._headwords = {reading: expressions}
class Smk8ChildEntry(_BaseSmk8Entry):
def __init__(self, entry_id):
super().__init__(entry_id)
def _set_headwords(self):
soup = self.get_page_soup()
Soup.delete_soup_nodes(soup, "表音表記")
self._fill_alts(soup)
reading = self._find_reading(soup)
expressions = []
if soup.find("子見出部").find("標準表記") is None:
expressions.append(reading)
for expression in self._find_expressions(soup):
if expression not in expressions:
expressions.append(expression)
self._headwords = {reading: expressions}
class Smk8PhraseEntry(_BaseSmk8Entry):
def __init__(self, entry_id):
super().__init__(entry_id)
self.__phrase_readings = load_smk8_phrase_readings()
def get_part_of_speech_tags(self):
# phrases do not contain these tags
return []
def _set_headwords(self):
soup = self.get_page_soup()
headwords = {}
expressions = self._find_expressions(soup)
readings = self._find_readings()
for idx, expression in enumerate(expressions):
reading = readings[idx]
if reading in headwords:
headwords[reading].append(expression)
else:
headwords[reading] = [expression]
self._headwords = headwords
def _find_expressions(self, soup):
Soup.delete_soup_nodes(soup, "ルビG")
self._fill_alts(soup)
text = soup.find("標準表記").text
text = self._clean_expression(text)
alternatives = self.__expand_alternatives(text)
expressions = []
for alt in alternatives:
for exp in Expressions.expand_abbreviation(alt):
expressions.append(exp)
return expressions
def _find_readings(self):
text = self.__phrase_readings[self.entry_id]
alternatives = self.__expand_alternatives(text)
readings = []
for alt in alternatives:
for reading in Expressions.expand_abbreviation(alt):
readings.append(reading)
return readings
@staticmethod
def __expand_alternatives(expression):
"""Return a list of strings described by △ notation
eg. "△金(時間・暇)に飽かして" -> [
"金に飽かして", "時間に飽かして", "暇に飽かして"
]
"""
m = re.search(r"△([^]+)([^]+)", expression)
if not m:
return [expression]
alt_parts = [m.group(1)]
for alt_part in m.group(2).split("・"):
alt_parts.append(alt_part)
alts = []
for alt_part in alt_parts:
alt_exp = re.sub(r"△[^(]+([^)]+)", alt_part, expression)
alts.append(alt_exp)
return alts
class Smk8KanjiEntry(_BaseSmk8Entry):
def __init__(self, entry_id):
super().__init__(entry_id)
def _set_headwords(self):
soup = self.get_page_soup()
self._fill_alts(soup)
reading = self.__get_parent_reading()
expressions = self._find_expressions(soup)
self._headwords = {reading: expressions}
def __get_parent_reading(self):
parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
parent = self.ID_TO_ENTRY[parent_id]
reading = parent.get_first_reading()
return reading
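Both lookup tables are class attributes, which is what lets __get_parent_reading resolve a 造語成分 subentry back to its parent across instances; roughly (IDs hypothetical):

# after parsing page 123 with one kanji subentry:
#   _BaseSmk8Entry.ID_TO_ENTRY             == {(123, 0): main_entry, (123, 1): kanji_subentry}
#   _BaseSmk8Entry.SUBENTRY_ID_TO_ENTRY_ID == {(123, 1): (123, 0)}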

bot/entries/smk8_preprocess.py (new file, 91 lines)

@@ -0,0 +1,91 @@
import re
from bs4 import BeautifulSoup
from bot.data import get_adobe_glyph
__GAIJI = {
"gaiji/5350.svg": "",
"gaiji/62cb.svg": "",
"gaiji/7be1.svg": "",
}
def preprocess_page(page):
page = __strip_page(page)
page = __replace_glyph_codes(page)
page = __format_hyougai_marks(page)
return page
def __strip_page(page):
soup = BeautifulSoup(page, features="xml")
koumoku = soup.find(["項目", "字音語参照項目"])
if koumoku is not None:
return koumoku.decode()
else:
raise Exception(f"Primary 項目 not found in page:\n{soup.prettify()}")
def __replace_glyph_codes(page):
soup = BeautifulSoup(page, features="xml")
for span in soup.find_all("span"):
if "style" in span.attrs:
m = re.search(r"^glyph:([0-9]+);$", span.attrs["style"])
del span.attrs["style"]
if m is None:
continue
code = int(m.group(1))
for geta in span.find_all(string="〓"):
glyph = get_adobe_glyph(code)
geta.replace_with(glyph)
for hyouki in soup.find_all("親見出表記"):
if "alt" not in hyouki.attrs:
continue
alt = hyouki.attrs["alt"]
codes = re.findall(r"{CID([0-9]+)}", alt)
for code in codes:
glyph = get_adobe_glyph(int(code))
alt = alt.replace(f"{{CID{code}}}", glyph)
hyouki.attrs["alt"] = alt
for gaiji in soup.find_all("外字"):
img = gaiji.img
src = img.attrs["src"] if img.has_attr("src") else ""
if src in __GAIJI:
img.attrs["alt"] = __GAIJI[src]
return soup.decode()
def __format_hyougai_marks(page):
soup = BeautifulSoup(page, features="xml")
for el in soup.find_all("外字"):
el.string = ""
text = soup.text
for x in ["\n", "\t", " "]:
text = text.replace(x, "")
text = re.sub(r"〈([^〈]+)〉", r"\1", text)
page = re.sub(r"〈([^〈]+)〉", r"\1␃", page)
for mark in re.findall(r"《.", text):
if mark[1] == "":
page = page.replace("", "<表外音訓/>", 1)
else:
page = re.sub(f"《([^{mark[1]}]*)({mark[1]})",
r"\1<表外音訓>\2</表外音訓>",
page, count=1)
for mark in re.findall(r"〈.", text):
if mark[1] == "":
page = page.replace("", "<表外字/>", 1)
else:
page = re.sub(f"〈([^{mark[1]}]*)({mark[1]})",
r"\1<表外字>\2</表外字>",
page, count=1)
page = page.replace("", "")
page = page.replace("", "")
soup = BeautifulSoup(page, features="xml")
for el in soup.find_all("表外音訓"):
if el.text == "":
el.append(el.next_sibling)
for el in soup.find_all("表外字"):
if el.text == "":
el.append(el.next_sibling)
return soup.decode()

bot/expressions.py

@@ -1,14 +1,30 @@
import re
__WIDE_MAP = {i: i + 0xFEE0 for i in range(0x21, 0x7F)}
__KATA_TO_HIRA_MAP = {
i: i - 96 for i in [
*range(0x30A1, 0x30F6),
*range(0x30FD, 0x30FE),
]
}
__HALFWIDTH_TO_FULLWIDTH_MAP = {
i: i + 0xFEE0 for i in [
*range(0x21, 0x7F),
]
}
def kata_to_hira(text):
hira = text.translate(__KATA_TO_HIRA_MAP)
return hira
def add_fullwidth(expressions):
for expression in expressions:
if re.match(r"[A-Za-z0-9]", expression):
new_exp = expression.translate(__WIDE_MAP)
if new_exp not in expressions:
expressions.append(new_exp)
new_exp = expression.translate(__HALFWIDTH_TO_FULLWIDTH_MAP)
if new_exp not in expressions:
expressions.append(new_exp)
def add_variant_kanji(expressions, variant_kanji):
@@ -23,23 +39,50 @@ def add_variant_kanji(expressions, variant_kanji):
expressions.append(new_exp)
def expand_shouryaku(shouryaku):
def remove_iteration_mark(expressions):
iterated_kanji = r"(.)々"
for expression in expressions:
for char in re.findall(iterated_kanji, expression):
new_exp = expression.replace(f"{char}", f"{char}{char}")
if new_exp not in expressions:
expressions.append(new_exp)
def add_iteration_mark(expressions):
repeat_kanji = r"([^0-z0-zぁ-ヿ])\1"
for expression in expressions:
for char in re.findall(repeat_kanji, expression):
new_exp = expression.replace(f"{char}{char}", f"{char}")
if new_exp not in expressions:
expressions.append(new_exp)
def expand_abbreviation(abbreviated_expression):
"""Return a list of words described by a 省略 notation.
eg. "有(り)合(わ)せ" -> [
"有り合わせ", "有合わせ", "有り合せ", "有合せ"
]
"""
groups = re.findall(r"([^(]*)((([^)]+)))?", shouryaku)
forms = [""]
groups = re.findall(r"([^(]*)((([^)]+)))?", abbreviated_expression)
expressions = [""]
for group in groups:
new_forms = []
for form in forms:
new_forms.append(form + group[0])
forms = new_forms.copy()
new_exps = []
for expression in expressions:
new_exps.append(expression + group[0])
expressions = new_exps.copy()
if group[2] == '':
continue
new_forms = []
for form in forms:
new_forms.append(form + group[2])
forms = new_forms.copy() + forms.copy()
return forms
new_exps = []
for expression in expressions:
new_exps.append(expression + group[2])
expressions = new_exps.copy() + expressions.copy()
return expressions
def expand_abbreviation_list(expressions):
new_exps = []
for expression in expressions:
for new_exp in expand_abbreviation(expression):
if new_exp not in new_exps:
new_exps.append(new_exp)
return new_exps
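Worked examples of the helpers above (the first mirrors the docstring):

import bot.expressions as Expressions

Expressions.expand_abbreviation("有(り)合(わ)せ")
# -> ["有り合わせ", "有合わせ", "有り合せ", "有合せ"]

expressions = ["人々"]
Expressions.remove_iteration_mark(expressions)  # mutates the list in place
# expressions == ["人々", "人人"]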

bot/icons.py (new file, 84 lines)

@@ -0,0 +1,84 @@
from bs4 import BeautifulSoup
from PIL import Image
from functools import cache
@cache
def calculate_ratio(path):
if path.endswith(".svg"):
ratio = __calculate_svg_ratio(path)
else:
ratio = __calculate_bitmap_ratio(path)
return ratio
@cache
def make_rectangle(path, text, rect_stroke, rect_fill, text_fill):
svg = __svg_text_rectangle(text, rect_stroke, rect_fill, text_fill)
with open(path, "w", encoding="utf-8") as f:
f.write(svg)
@cache
def make_monochrome_fill_rectangle(path, text):
svg = __svg_masked_rectangle(text)
with open(path, "w", encoding="utf-8") as f:
f.write(svg)
def __calculate_svg_ratio(path):
with open(path, "r") as f:
xml = f.read()
soup = BeautifulSoup(xml, "xml")
svg = soup.svg
if svg.has_attr("width") and svg.has_attr("height"):
width = float(svg.attrs["width"])
height = float(svg.attrs["height"])
ratio = width / height
elif svg.has_attr("viewBox"):
_, _, width, height = svg.attrs["viewBox"].split(" ")
ratio = float(width) / float(height)
else:
raise Exception(f"Cannot calculate ratio for SVG\n{svg.prettify()}")
return ratio
def __calculate_bitmap_ratio(path):
img = Image.open(path)
img_w = img.size[0]
img_h = img.size[1]
ratio = img_w / img_h
return ratio
def __svg_text_rectangle(text, rect_stroke, rect_fill, text_fill):
height = 128
width = len(text) * height
svg = f"""
<svg lang='ja' width='{width}' height='{height}' viewBox='0 0 {width} {height}'
xmlns='http://www.w3.org/2000/svg' version='1.1'>
<rect width='{width}' height='{height}' ry='20' stroke='{rect_stroke}'
fill='{rect_fill}' stroke-width='8'/>
<text text-anchor='middle' x='50%' y='50%' dy='.35em'
font-family='sans-serif' font-size='100px'
fill='{text_fill}'>{text}</text>
</svg>"""
return svg.strip()
def __svg_masked_rectangle(text):
height = 128
width = len(text) * height
svg = f"""
<svg lang='ja' width='{width}' height='{height}' viewBox='0 0 {width} {height}'
xmlns='http://www.w3.org/2000/svg' version='1.1'>
<mask id='a'>
<rect width='{width}' height='{height}' fill='white'/>
<text text-anchor='middle' x='50%' y='50%' dy='.35em'
font-family='sans-serif' font-size='100px'
fill='black'>{text}</text>
</mask>
<rect width='{width}' height='{height}' ry='20'
fill='black' mask='url(#a)'/>
</svg>"""
return svg.strip()
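Usage sketch (path hypothetical):

import bot.icons as Icons

Icons.make_rectangle("/tmp/名-default.svg", "名", "black", "transparent", "black")
Icons.calculate_ratio("/tmp/名-default.svg")  # -> 1.0; one 128px square per character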

bot/scraper.py

@@ -15,11 +15,8 @@ from bot.data import load_config
class Scraper():
__CONFIG = None
def __init__(self):
if self.__CONFIG is None:
self.__CONFIG = load_config()
self._config = load_config()
pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$"
self.netloc_re = re.compile(pattern)
self.__set_session()
@@ -45,7 +42,7 @@ class Scraper():
allowed_methods=["HEAD", "GET", "OPTIONS"]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
headers = self.__CONFIG["http-request-headers"]
headers = self._config["http-request-headers"]
self.session = requests.Session()
self.session.mount("https://", adapter)
self.session.headers.update(headers)

bot/soup.py (new file, 5 lines)

@@ -0,0 +1,5 @@
def delete_soup_nodes(soup, node_name):
node = soup.find(node_name)
while node is not None:
node.decompose()
node = soup.find(node_name)
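A tiny usage example:

from bs4 import BeautifulSoup
from bot.soup import delete_soup_nodes

soup = BeautifulSoup("<項目><ルビG>a</ルビG><ルビG>b</ルビG></項目>", "xml")
delete_soup_nodes(soup, "ルビG")  # removes every ルビG node, one find() at a time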

bot/yomichan/export.py

@@ -9,14 +9,19 @@ from bot.data import load_yomichan_metadata
from bot.yomichan.terms.jitenon import JitenonYojiTerminator
from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
from bot.yomichan.terms.smk8 import Smk8Terminator
from bot.yomichan.terms.daijirin2 import Daijirin2Terminator
class Exporter:
def __init__(self):
def __init__(self, name):
self._name = name
self._build_dir = None
self._terms_per_file = 2000
def export(self, entries):
def export(self, entries, image_dir):
if image_dir is not None:
self.__init_build_image_dir(image_dir)
meta = load_yomichan_metadata()
index = meta[self._name]["index"]
index["revision"] = self._get_revision(entries)
@@ -29,14 +34,20 @@ class Exporter:
if self._build_dir is not None:
return self._build_dir
cache_dir = user_cache_dir("jitenbot")
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
build_directory = os.path.join(cache_dir, f"build_{timestamp}")
build_directory = os.path.join(cache_dir, "yomichan_build")
if Path(build_directory).is_dir():
shutil.rmtree(build_directory)
os.makedirs(build_directory)
self._build_dir = build_directory
return self._build_dir
def __init_build_image_dir(self, image_dir):
print("Copying image files to build directory...")
build_dir = self._get_build_dir()
build_img_dir = os.path.join(build_dir, self._name)
shutil.copytree(image_dir, build_img_dir)
self._terminator.set_image_dir(build_img_dir)
def __get_terms(self, entries):
terms = []
entries_len = len(entries)
@@ -101,15 +112,15 @@ class Exporter:
class JitenonExporter(Exporter):
def __init__(self):
super().__init__()
def __init__(self, name):
super().__init__(name)
def _get_revision(self, entries):
modified_date = None
for entry in entries:
if modified_date is None or entry.modified_date > modified_date:
modified_date = entry.modified_date
revision = f"{self._name}.{modified_date}"
revision = f"{self._name};{modified_date}"
return revision
def _get_attribution(self, entries):
@@ -121,14 +132,38 @@ class JitenonExporter(Exporter):
class JitenonYojiExporter(JitenonExporter):
def __init__(self):
super().__init__()
self._name = "jitenon-yoji"
self._terminator = JitenonYojiTerminator()
def __init__(self, name):
super().__init__(name)
self._terminator = JitenonYojiTerminator(name)
class JitenonKotowazaExporter(JitenonExporter):
def __init__(self):
super().__init__()
self._name = "jitenon-kotowaza"
self._terminator = JitenonKotowazaTerminator()
def __init__(self, name):
super().__init__(name)
self._terminator = JitenonKotowazaTerminator(name)
class Smk8Exporter(Exporter):
def __init__(self, name):
super().__init__(name)
self._terminator = Smk8Terminator(name)
def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y-%m-%d")
return f"{self._name};{timestamp}"
def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2020"
class Daijirin2Exporter(Exporter):
def __init__(self, name):
super().__init__(name)
self._terminator = Daijirin2Terminator(name)
def _get_revision(self, entries):
timestamp = datetime.now().strftime("%Y-%m-%d")
return f"{self._name};{timestamp}"
def _get_attribution(self, entries):
return "© Sanseido Co., LTD. 2019"

bot/yomichan/glossary/daijirin2.py (new file, 238 lines)

@@ -0,0 +1,238 @@
import re
import os
from bs4 import BeautifulSoup
from functools import cache
from pathlib import Path
import bot.icons as Icons
from bot.soup import delete_soup_nodes
from bot.data import load_daijirin2_yomichan_name_conversion
from bot.yomichan.glossary.gloss import make_gloss
from bot.yomichan.glossary.name_conversion import convert_names
def make_glossary(entry, image_dir):
soup = entry.get_page_soup()
__add_rubies(soup)
__hyperlink_parent_expression(soup, entry)
__delete_unused_nodes(soup, image_dir)
__clear_styles(soup)
__set_data_class(soup)
__convert_links(soup, entry)
__convert_gaiji(soup, image_dir)
__convert_graphics(soup, image_dir)
__convert_logos(soup, image_dir)
__convert_kanjion_logos(soup, image_dir)
__convert_daigoginum(soup, image_dir)
__convert_jundaigoginum(soup, image_dir)
name_conversion = load_daijirin2_yomichan_name_conversion()
convert_names(soup, name_conversion)
gloss = make_gloss(soup.span)
glossary = [gloss]
return glossary
def __add_rubies(soup):
for name in ["表外音訓", "表外字"]:
for ruby in soup.find_all(name):
ruby.name = "ruby"
rt = ruby.find("表外字マーク")
rt.name = "rt"
ruby.append(rt)  # needs to be positioned after the text
def __hyperlink_parent_expression(soup, entry):
if soup.find("親表記") is None:
return
parent_entry_id = entry.SUBENTRY_ID_TO_ENTRY_ID[entry.entry_id]
parent_entry = entry.ID_TO_ENTRY[parent_entry_id]
parent_expression = parent_entry.get_first_expression()
for el in soup.find_all("親表記"):
el.name = "a"
el.attrs["href"] = f"?query={parent_expression}&wildcards=off"
def __delete_unused_nodes(soup, image_dir):
if not __graphics_directory_exists(image_dir):
delete_soup_nodes(soup, "カットG")
for el in soup.find_all("logo"):
next_sibling = el.next_sibling
if next_sibling is None:
continue
elif next_sibling.name in ["漢字見出G", "漢字音G"]:
el.decompose()
for el in soup.find_all("漢字音G"):
for child in el.find_all(string=""):
child.replace_with("")
@cache
def __graphics_directory_exists(image_dir):
path = os.path.join(image_dir, "graphics")
return Path(path).is_dir()
def __clear_styles(soup):
for el in soup.select("[style]"):
del el.attrs["style"]
def __set_data_class(soup):
for el in soup.select("[class]"):
el.attrs["data-class"] = el.attrs["class"]
def __convert_links(soup, entry):
for el in soup.find_all("a"):
href = el.attrs["href"]
if re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href):
ref_entry_id = entry.id_string_to_entry_id(href)
ref_entry = entry.ID_TO_ENTRY[ref_entry_id]
expression = ref_entry.get_first_expression()
el.attrs["href"] = f"?query={expression}&wildcards=off"
elif re.match(r"^(?:https?:|\?)[\w\W]*", href):
pass
else:
raise Exception(f"Invalid href format: {href}")
def __convert_gaiji(soup, image_dir):
for el in soup.find_all("img"):
src = el.attrs["src"]
if not src.startswith("gaiji"):
continue
path = image_dir
for part in src.split("/"):
if part.strip() == "":
continue
path = os.path.join(path, part)
ratio = Icons.calculate_ratio(path)
img = BeautifulSoup("<img/>", "xml").img
img.attrs = {
"height": 1.0 if ratio > 1.0 else ratio,
"width": ratio if ratio > 1.0 else 1.0,
"sizeUnits": "em",
"collapsible": False,
"collapsed": False,
"background": False,
"appearance": "monochrome",
"title": el.attrs["alt"] if el.has_attr("alt") else "",
"path": f"{os.path.basename(image_dir)}/{src}",
"src": src,
}
el.name = "span"
el.clear()
el.append(img)
el.attrs["style"] = "vertical-align: text-bottom;"
def __convert_graphics(soup, image_dir):
for el in soup.find_all("img"):
src = el.attrs["src"]
if not src.startswith("graphics"):
continue
el.attrs = {
"collapsible": True,
"collapsed": True,
"title": el.attrs["alt"] if el.has_attr("alt") else "",
"path": f"{os.path.basename(image_dir)}/{src}",
"src": src,
}
def __convert_logos(soup, image_dir):
for el in soup.find_all("logo"):
filename = f"{el.text}-default.svg"
path = os.path.join(image_dir, filename)
Icons.make_rectangle(path, el.text, "black", "transparent", "black")
ratio = Icons.calculate_ratio(path)
img = BeautifulSoup("<img/>", "xml").img
img.attrs = {
"height": 1.0 if ratio > 1.0 else ratio,
"width": ratio if ratio > 1.0 else 1.0,
"sizeUnits": "em",
"collapsible": False,
"collapsed": False,
"background": False,
"appearance": "monochrome",
"title": el.text,
"path": f"{os.path.basename(image_dir)}/{filename}",
}
el.name = "span"
el.clear()
el.append(img)
el.attrs["style"] = "vertical-align: text-bottom; margin-right: 0.25em;"
def __convert_kanjion_logos(soup, image_dir):
for el in soup.find_all("漢字音logo"):
filename = f"{el.text}-default.svg"
path = os.path.join(image_dir, filename)
Icons.make_rectangle(path, el.text, "black", "transparent", "black")
ratio = Icons.calculate_ratio(path)
img = BeautifulSoup("<img/>", "xml").img
img.attrs = {
"height": 1.0 if ratio > 1.0 else ratio,
"width": ratio if ratio > 1.0 else 1.0,
"sizeUnits": "em",
"collapsible": False,
"collapsed": False,
"background": False,
"appearance": "monochrome",
"title": el.text,
"path": f"{os.path.basename(image_dir)}/{filename}",
}
el.name = "span"
el.clear()
el.append(img)
el.attrs["style"] = "vertical-align: text-bottom; margin-left: 0.25em;"
def __convert_daigoginum(soup, image_dir):
for el in soup.find_all("大語義num"):
filename = f"{el.text}-fill.svg"
path = os.path.join(image_dir, filename)
Icons.make_monochrome_fill_rectangle(path, el.text)
ratio = Icons.calculate_ratio(path)
img = BeautifulSoup("<img/>", "xml").img
img.attrs = {
"height": 1.0 if ratio > 1.0 else ratio,
"width": ratio if ratio > 1.0 else 1.0,
"sizeUnits": "em",
"collapsible": False,
"collapsed": False,
"background": False,
"appearance": "monochrome",
"title": el.text,
"path": f"{os.path.basename(image_dir)}/{filename}",
}
el.name = "span"
el.clear()
el.append(img)
el.attrs["style"] = "vertical-align: text-bottom;"
def __convert_jundaigoginum(soup, image_dir):
for el in soup.find_all("準大語義num"):
filename = f"{el.text}-default.svg"
path = os.path.join(image_dir, filename)
Icons.make_rectangle(path, el.text, "black", "transparent", "black")
ratio = Icons.calculate_ratio(path)
img = BeautifulSoup("<img/>", "xml").img
img.attrs = {
"height": 1.0 if ratio > 1.0 else ratio,
"width": ratio if ratio > 1.0 else 1.0,
"sizeUnits": "em",
"collapsible": False,
"collapsed": False,
"background": False,
"appearance": "monochrome",
"title": el.text,
"path": f"{os.path.basename(image_dir)}/{filename}",
}
el.name = "span"
el.clear()
el.append(img)
el.attrs["style"] = "vertical-align: text-bottom;"

bot/yomichan/glossary/gloss.py

@@ -3,14 +3,14 @@ from css_parser import parseStyle
def make_gloss(soup):
node = __get_markup_structure(soup)
node = __get_page_structure(soup)
return {
"type": "structured-content",
"content": node["content"],
}
def __get_markup_structure(soup):
def __get_page_structure(soup):
node = {"tag": soup.name}
content = []
for child in soup.children:
@@ -19,7 +19,7 @@ def __get_markup_structure(soup):
if text != "":
content.append(text)
else:
content.append(__get_markup_structure(child))
content.append(__get_page_structure(child))
attributes = __get_attributes(soup.attrs)
for key, val in attributes.items():
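For a trivial input the result looks roughly like this (attribute handling is truncated in this view):

# make_gloss(BeautifulSoup("<span>foo</span>", "xml").span)
# -> {"type": "structured-content", "content": ["foo"]}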

bot/yomichan/glossary/jitenon.py

@@ -1,11 +1,10 @@
import re
from bs4 import BeautifulSoup
from bot.yomichan.glossary.gloss import make_gloss
def make_glossary(entry):
soup = BeautifulSoup(entry.markup, "html5lib")
soup = entry.get_page_soup()
__replace_punctuation(soup)
__add_internal_links(soup)
__convert_paragraphs(soup)

bot/yomichan/glossary/name_conversion.py (new file, 101 lines)

@@ -0,0 +1,101 @@
from bs4 import BeautifulSoup
def convert_names(soup, name_conversion):
for child in soup.children:
if child.name is None:
continue
else:
convert_names(child, name_conversion)
if child.name in name_conversion.keys():
conversion = name_conversion[child.name]
if "name" in conversion:
child.attrs["data-name"] = child.name
child.name = conversion["name"]
if "style" in conversion:
child.attrs["style"] = conversion["style"]
if "procedures" in conversion:
procedures = conversion["procedures"]
__apply_name_conversion_procedures(child, procedures)
else:
child.attrs["data-name"] = child.name
child.name = "span"
def __apply_name_conversion_procedures(soup, procedures):
functions = {
"has_class": __has_class,
"has_parent": __has_parent,
"has_previous_sibling": __has_previous_sibling,
"replace": __replace,
"wrap": __wrap,
"add_ruby_text": __add_ruby_text,
}
for procedure in procedures:
function = functions[procedure["procedure_name"]]
parameters = procedure["parameters"]
function(soup, **parameters)
def __has_class(soup, class_name, key, value):
if not soup.has_attr("class"):
return
soup_classes = soup.attrs["class"].split(" ")
if class_name not in soup_classes:
return
if key == "style":
soup.attrs["style"] = value
elif key == "name":
soup.name = value
else:
raise Exception()
def __has_parent(soup, parent_name, key, value):
if soup.find_parent(parent_name) is None:
return
if key == "style":
soup.attrs["style"] = value
elif key == "name":
soup.name = value
else:
raise Exception()
def __has_previous_sibling(soup, name, key, value):
sibling = soup.previous_sibling
if sibling is None:
return
elif sibling.name is None:
return
elif sibling.has_attr("data-name"):
previous_sibling_name = sibling.attrs["data-name"]
else:
previous_sibling_name = sibling.name
if previous_sibling_name != name:
return
if key == "style":
soup.attrs["style"] = value
elif key == "name":
soup.name = value
else:
raise Exception()
def __replace(soup, old, new):
soup.string = soup.text.replace(old, new)
def __wrap(soup, l_wrap, r_wrap):
if soup.text.strip() != "":
soup.string = f"{l_wrap}{soup.text}{r_wrap}"
def __add_ruby_text(soup, mark, style):
if style.strip() != "":
markup = f"<rt><span style='{style}'>{mark}</span></rt>"
else:
markup = f"<rt>{mark}</rt>"
rt_soup = BeautifulSoup(markup, "xml")
soup.append(rt_soup.rt)
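The tables driving this live in data/<name>/yomichan_name_conversion.json. Feeding convert_names just the 補説G rule shown at the bottom of this diff would rename the element to div, or to span when it directly follows a 語義Gnum sibling:

convert_names(soup, {
    "補説G": {
        "name": "div",
        "procedures": [{
            "procedure_name": "has_previous_sibling",
            "parameters": {"name": "語義Gnum", "key": "name", "value": "span"},
        }],
    },
})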

bot/yomichan/glossary/smk8.py (new file, 151 lines)

@@ -0,0 +1,151 @@
import re
import os
from bs4 import BeautifulSoup
import bot.icons as Icons
from bot.soup import delete_soup_nodes
from bot.data import load_smk8_yomichan_name_conversion
from bot.yomichan.glossary.gloss import make_gloss
from bot.yomichan.glossary.name_conversion import convert_names
def make_glossary(entry, image_dir):
soup = entry.get_page_soup()
__fill_alts(soup)
__delete_unused_nodes(soup)
__clear_styles(soup)
__set_data_class(soup)
__convert_links(soup, entry)
__convert_priority_markers(soup)
__convert_gaiji(soup, image_dir)
__convert_rectangles(soup, image_dir)
name_conversion = load_smk8_yomichan_name_conversion()
convert_names(soup, name_conversion)
gloss = make_gloss(soup.span)
glossary = [gloss]
return glossary
def __fill_alts(soup):
for name in ["親見出仮名", "親見出表記"]:
for el in soup.find_all(name):
el.name = "a"
alt = el.attrs["alt"]
el.string = alt
el.attrs["href"] = f"?query={alt}&wildcards=off"
del el.attrs["alt"]
def __delete_unused_nodes(soup):
for name in ["audio", "連濁"]:
delete_soup_nodes(soup, name)
def __clear_styles(soup):
for el in soup.select("[style]"):
del el.attrs["style"]
def __set_data_class(soup):
for el in soup.select("[class]"):
el.attrs["data-class"] = el.attrs["class"]
def __convert_links(soup, entry):
for el in soup.find_all("a"):
href = el.attrs["href"]
if href.startswith("$"):
el.unwrap()
elif re.match(r"^[0-9]+(?:-[0-9A-F]{4})?$", href):
ref_entry_id = entry.id_string_to_entry_id(href)
ref_entry = entry.ID_TO_ENTRY[ref_entry_id]
expression = ref_entry.get_first_expression()
el.attrs["href"] = f"?query={expression}&wildcards=off"
elif re.match(r"^(?:https?:|\?)[\w\W]*", href):
pass
else:
raise Exception(f"Invalid href format: {href}")
def __convert_priority_markers(soup):
style = "vertical-align: super; font-size: 0.6em"
for el in soup.find_all("img", attrs={"alt": "*"}):
el.name = "span"
el.string = ""
el.attrs["style"] = style
for el in soup.find_all("img", attrs={"alt": ""}):
el.name = "span"
el.string = ""
el.attrs["style"] = style
def __convert_gaiji(soup, image_dir):
for el in soup.find_all("img"):
src = el.attrs["src"]
path = image_dir
for part in src.split("/"):
if part.strip() == "":
continue
path = os.path.join(path, part)
ratio = Icons.calculate_ratio(path)
img = BeautifulSoup("<img/>", "xml").img
img.attrs = {
"height": 1.0 if ratio > 1.0 else ratio,
"width": ratio if ratio > 1.0 else 1.0,
"sizeUnits": "em",
"collapsible": False,
"collapsed": False,
"background": False,
"appearance": "monochrome",
"title": el.attrs["alt"] if el.has_attr("alt") else "",
"path": f"{os.path.basename(image_dir)}/{src}",
"src": src,
}
el.name = "span"
el.clear()
el.append(img)
el.attrs["style"] = "vertical-align: text-bottom;"
def __convert_rectangles(soup, image_dir):
cls_to_appearance = {
"default": "monochrome",
"fill": "monochrome",
"red": "auto",
"redfill": "auto",
}
for el in soup.find_all("rect"):
cls = el.attrs["class"] if el.has_attr("class") else "default"
filename = f"{el.text}-{cls}.svg"
path = os.path.join(image_dir, filename)
__make_rectangle(path, el.text, cls)
ratio = Icons.calculate_ratio(path)
img = BeautifulSoup("<img/>", "xml").img
img.attrs = {
"height": 1.0 if ratio > 1.0 else ratio,
"width": ratio if ratio > 1.0 else 1.0,
"sizeUnits": "em",
"collapsible": False,
"collapsed": False,
"background": False,
"appearance": cls_to_appearance[cls],
"title": el.text,
"path": f"{os.path.basename(image_dir)}/{filename}",
}
el.name = "span"
el.clear()
el.append(img)
el.attrs["style"] = "vertical-align: text-bottom; margin-right: 0.25em"
def __make_rectangle(path, text, cls):
if cls == "fill":
Icons.make_monochrome_fill_rectangle(path, text)
elif cls == "red":
Icons.make_rectangle(path, text, "red", "white", "red")
elif cls == "redfill":
Icons.make_rectangle(path, text, "red", "red", "white")
else:
Icons.make_rectangle(path, text, "black", "transparent", "black")

bot/yomichan/grammar.py

@@ -7,32 +7,29 @@ __U_KANA_LIST = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
"ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]
__SUDACHI_DICTIONARY = None
__SUDACHI_INFLECTION_TYPES = None
def sudachi_rules(expression):
global __SUDACHI_DICTIONARY
global __SUDACHI_INFLECTION_TYPES
if __SUDACHI_DICTIONARY is None:
__SUDACHI_DICTIONARY = dictionary.Dictionary(dict="full").create()
if __SUDACHI_INFLECTION_TYPES is None:
categories = load_yomichan_inflection_categories()
__SUDACHI_INFLECTION_TYPES = categories["sudachi"]
categories = load_yomichan_inflection_categories()
sudachi_inflection_categories = categories["sudachi"]
splitmode = tokenizer.Tokenizer.SplitMode.A
tokens = __SUDACHI_DICTIONARY.tokenize(expression, splitmode)
if len(tokens) == 0:
return ""
pos = tokens[len(tokens)-1].part_of_speech()[4]
tags = pos.split("-")
rules = tags_to_rules(expression, tags, __SUDACHI_INFLECTION_TYPES)
rules = tags_to_rules(expression, tags, sudachi_inflection_categories)
return rules
def tags_to_rules(expression, tags, inflection_types):
def tags_to_rules(expression, tags, inflection_categories):
rules = set()
exp_final_character = expression[len(expression)-1:]
for tag in tags:
if tag in inflection_types["sahen"]:
if tag in inflection_categories["sahen"]:
if expression.endswith("する"):
rules.add("vs")
elif expression.endswith("為る"):
@@ -41,20 +38,20 @@ def tags_to_rules(expression, tags, inflection_types):
rules.add("vz")
elif expression.endswith("す"):
rules.add("v5")
if tag in inflection_types["godan"]:
if tag in inflection_categories["godan"]:
if exp_final_character in __U_KANA_LIST:
rules.add("v5")
if tag in inflection_types["ichidan"]:
if tag in inflection_categories["ichidan"]:
if expression.endswith("る"):
rules.add("v1")
if tag in inflection_types["keiyoushi"]:
if tag in inflection_categories["keiyoushi"]:
if expression.endswith("い"):
rules.add("adj-i")
if tag in inflection_types["kahen"]:
if tag in inflection_categories["kahen"]:
if expression.endswith("くる"):
rules.add("vk")
elif expression.endswith("来る"):
rules.add("vk")
if tag in inflection_types["sudachi"]:
if tag in inflection_categories["sudachi"]:
return sudachi_rules(expression)
return " ".join(list(rules))

bot/yomichan/terms/daijirin2.py (new file, 53 lines)

@@ -0,0 +1,53 @@
from bot.data import load_yomichan_inflection_categories
from bot.entries.daijirin2 import Daijirin2PhraseEntry as PhraseEntry
from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.daijirin2 import make_glossary
from bot.yomichan.grammar import sudachi_rules, tags_to_rules
class Daijirin2Terminator(Terminator):
def __init__(self, name):
super().__init__(name)
categories = load_yomichan_inflection_categories()
self._inflection_categories = categories[name]
def _definition_tags(self, entry):
return ""
def _inflection_rules(self, entry, expression):
if isinstance(entry, PhraseEntry):
return sudachi_rules(expression)
pos_tags = entry.get_part_of_speech_tags()
if len(pos_tags) > 0:
rules = tags_to_rules(expression, pos_tags,
self._inflection_categories)
else:
rules = sudachi_rules(expression)
return rules
def _glossary(self, entry):
if entry.entry_id in self._glossary_cache:
return self._glossary_cache[entry.entry_id]
glossary = make_glossary(entry, self._image_dir)
self._glossary_cache[entry.entry_id] = glossary
return glossary
def _sequence(self, entry):
return entry.entry_id[0] * 100000 + entry.entry_id[1]
def _term_tags(self, entry):
return ""
def _link_glossary_parameters(self, entry):
return [
[entry.children, "子項目"],
[entry.phrases, "句項目"],
]
def _subentry_lists(self, entry):
return [
entry.children,
entry.phrases,
]

bot/yomichan/terms/jitenon.py

@@ -4,8 +4,8 @@ from bot.yomichan.glossary.jitenon import make_glossary
class JitenonTerminator(Terminator):
def __init__(self):
super().__init__()
def __init__(self, name):
super().__init__(name)
def _definition_tags(self, entry):
return None
@@ -28,8 +28,8 @@ class JitenonTerminator(Terminator):
class JitenonYojiTerminator(JitenonTerminator):
def __init__(self):
super().__init__()
def __init__(self, name):
super().__init__(name)
def _inflection_rules(self, entry, expression):
return ""
@@ -40,8 +40,8 @@ class JitenonYojiTerminator(JitenonTerminator):
class JitenonKotowazaTerminator(JitenonTerminator):
def __init__(self):
super().__init__()
def __init__(self, name):
super().__init__(name)
def _inflection_rules(self, entry, expression):
return sudachi_rules(expression)

View file

@ -0,0 +1,58 @@
from bot.data import load_yomichan_inflection_categories
from bot.entries.smk8 import Smk8KanjiEntry as KanjiEntry
from bot.entries.smk8 import Smk8PhraseEntry as PhraseEntry
from bot.yomichan.terms.terminator import Terminator
from bot.yomichan.glossary.smk8 import make_glossary
from bot.yomichan.grammar import sudachi_rules, tags_to_rules
class Smk8Terminator(Terminator):
def __init__(self, name):
super().__init__(name)
categories = load_yomichan_inflection_categories()
self._inflection_categories = categories[name]
def _definition_tags(self, entry):
if isinstance(entry, KanjiEntry):
return ""
else:
return ""
def _inflection_rules(self, entry, expression):
if isinstance(entry, PhraseEntry):
return sudachi_rules(expression)
elif isinstance(entry, KanjiEntry):
return ""
pos_tags = entry.get_part_of_speech_tags()
if len(pos_tags) == 0:
return sudachi_rules(expression)
else:
return tags_to_rules(expression, pos_tags, self._inflection_categories)
def _glossary(self, entry):
if entry.entry_id in self._glossary_cache:
return self._glossary_cache[entry.entry_id]
glossary = make_glossary(entry, self._image_dir)
self._glossary_cache[entry.entry_id] = glossary
return glossary
def _sequence(self, entry):
return entry.entry_id[0] * 100000 + entry.entry_id[1]
def _term_tags(self, entry):
return ""
def _link_glossary_parameters(self, entry):
return [
[entry.children, ""],
[entry.phrases, ""]
]
def _subentry_lists(self, entry):
return [
entry.children,
entry.phrases,
entry.kanjis
]

View file

@ -1,6 +1,11 @@
class Terminator:
def __init__(self):
def __init__(self, name):
self._name = name
self._glossary_cache = {}
self._image_dir = None
def set_image_dir(self, image_dir):
self._image_dir = image_dir
def make_terms(self, entry):
terms = []
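The base-class hunk is truncated by the diff, but the visible change is the new name parameter and the image-directory hook. A minimal lifecycle sketch, assuming the exporter drives it in this order (the entries variable and path are illustrative):

terminator = Smk8Terminator("smk8")
terminator.set_image_dir("./images/smk8")  # hypothetical path
for entry in entries:                      # entries as produced by read_pages()
    terms = terminator.make_terms(entry)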

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,14 @@
{
"8228": "Ø",
"9772": "",
"9773": "",
"10078": "Т",
"10079": "У",
"10080": "Ф",
"10081": "Х",
"10082": "Ц",
"10083": "Ч",
"10084": "Ш",
"12107": "〻",
"12180": "⮗"
}
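The keys here appear to be glyph codes for gaiji (characters outside standard encodings), with standard Unicode replacements as values. A sketch of how such a table might be consulted during glossary conversion (the file name and fallback behavior are assumptions):

import json

with open("gaiji.json") as f:  # hypothetical file name
    gaiji = json.load(f)

def replace_gaiji(code: str) -> str:
    # Fall back to a visible placeholder when no replacement is known.
    return gaiji.get(code, f"[gaiji:{code}]")

replace_gaiji("12107")  # -> "〻" (kana repetition mark)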

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,290 @@
{
"a": {},
"br": {},
"img": {},
"div": {},
"span": {},
"ruby": {},
"rt": {},
"語構成": {
"name": "span",
"style": "margin-right: 0.5em;"
},
"熟語例G": {
"name": "div"
},
"漢字音G": {
"name": "ul"
},
"漢字音": {
"name": "li"
},
"sup": {
"name": "span",
"style": "font-size: 0.6em; vertical-align: super;"
},
"p": {
"name": "div",
"style": "margin-top: 0.5em; margin-bottom: 0.5em;"
},
"カット": {
"name": "div"
},
"中語義": {
"name": "div"
},
"副義": {
"name": "div"
},
"異字同訓解説": {
"name": "div"
},
"異字同訓語義G": {
"name": "div"
},
"細義": {
"name": "div"
},
"単位名": {
"name": "span",
"style": "font-size: 0.6em; vertical-align: super;",
"procedures": [
{
"procedure_name": "wrap",
"parameters": {
"l_wrap": "(",
"r_wrap": ")"
}
}
]
},
"原籍": {
"name": "span",
"style": "font-size: 0.7em; vertical-align: super;",
"procedures": [
{
"procedure_name": "wrap",
"parameters": {
"l_wrap": "(",
"r_wrap": ")"
}
}
]
},
"句仮名": {
"name": "span",
"style": "font-size: 0.6em; vertical-align: super;"
},
"品詞行": {
"name": "span",
"style": "font-size: 0.6em; vertical-align: super;",
"procedures": [
{
"procedure_name": "wrap",
"parameters": {
"l_wrap": "(",
"r_wrap": ")"
}
}
]
},
"用例": {
"name": "div"
},
"季語G": {
"name": "div"
},
"補説G": {
"name": "div",
"procedures": [
{
"procedure_name": "has_previous_sibling",
"parameters": {
"name": "語義Gnum",
"key": "name",
"value": "span"
}
},
{
"procedure_name": "has_previous_sibling",
"parameters": {
"name": "アクセントG",
"key": "name",
"value": "span"
}
}
]
},
"語釈": {
"name": "span",
"procedures": [
{
"procedure_name": "has_previous_sibling",
"parameters": {
"name": "補説G",
"key": "name",
"value": "div"
}
}
]
},
"品詞用法": {
"name": "span",
"style": "font-size: 0.6em; vertical-align: super;",
"procedures": [
{
"procedure_name": "wrap",
"parameters": {
"l_wrap": "(",
"r_wrap": ")"
}
}
]
},
"大語義": {
"name": "div"
},
"文語形": {
"name": "div"
},
"慣用G": {
"name": "div",
"style": "margin-top: 0.5em"
},
"歴史仮名": {
"name": "span",
"style": "font-size: 0.6em;",
"procedures": [
{
"procedure_name": "wrap",
"parameters": {
"l_wrap": "(",
"r_wrap": ")"
}
}
]
},
"派生G": {
"name": "div",
"style": "margin-top: 0.5em"
},
"準大語義": {
"name": "div"
},
"見出部": {
"name": "span"
},
"解説部": {
"name": "div"
},
"語義G": {
"name": "div"
},
"語義区切": {
"name": "span",
"style": "font-size: 0.7em; vertical-align: super;"
},
"返り点": {
"name": "span",
"style": "font-size: 0.5em; font-weight: normal; vertical-align: super;",
"procedures": [
{
"procedure_name": "has_class",
"parameters": {
"class_name": "熟語記号",
"key": "style",
"value": "vertical-align: baseline;"
}
}
]
},
"生没年": {
"name": "span",
"style": "font-size: 0.7em;"
},
"用法": {
"name": "span",
"style": "font-size: 0.7em; vertical-align: super;"
},
"異字同訓": {
"name": "div",
"style": "margin-top: 0.5em;"
},
"異字同訓仮名": {
"name": "span",
"style": "font-weight: bold;"
},
"異字同訓漢字": {
"name": "span",
"style": "font-weight: normal;"
},
"異字同訓表記": {
"name": "span",
"style": "font-weight: normal;"
},
"見出仮名": {
"name": "span",
"style": "font-weight: bold;"
},
"見出相当部": {
"name": "span",
"style": "font-weight: bold;"
},
"カットG": {
"name": "div",
"style": "margin-top: 0.5em;"
},
"sm": {
"name": "span",
"style": "font-size: 0.7em;"
},
"small": {
"name": "span",
"style": "font-size: 0.7em;"
},
"sub": {
"name": "span",
"style": "font-size: 0.7em; vertical-align: sub;"
},
"付記": {
"name": "span",
"procedures": [
{
"procedure_name": "wrap",
"parameters": {
"l_wrap": "",
"r_wrap": ""
}
}
]
},
"アクセントG": {
"name": "span",
"style": "margin-left: 0.25em; margin-right: 0.25em; font-size: 0.7em; vertical-align: super;"
},
"i": {
"name": "span",
"style": "font-style: italic;"
},
"h1": {
"name": "span",
"style": "font-weight: bold;"
},
"読みG": {
"name": "span",
"style": "vertical-align: super; font-size: 0.6em;"
},
"ルビG": {
"name": "span",
"style": "vertical-align: super; font-size: 0.6em; font-weight: normal;",
"procedures": [
{
"procedure_name": "wrap",
"parameters": {
"l_wrap": "(",
"r_wrap": ")"
}
}
]
}
}
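This table maps the dictionary's custom XML element names onto plain HTML for Yomichan: name is the element to emit, style its inline CSS, and procedures optional post-processing steps. As a sketch, the recurring wrap procedure presumably just brackets the element's text (the signature is a guess based on the parameter names):

def wrap(text: str, l_wrap: str, r_wrap: str) -> str:
    # e.g. the 歴史仮名 rule renders historical kana as a parenthesized aside
    return f"{l_wrap}{text}{r_wrap}"

wrap("ぢ", l_wrap="(", r_wrap=")")  # -> "(ぢ)"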

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,221 @@
{
"a": {},
"br": {},
"img": {},
"div": {},
"span": {},
"ruby": {},
"rt": {},
"語義": {
"name": "div"
},
"副義": {
"name": "div"
},
"派生": {
"name": "div"
},
"用例": {
"name": "div"
},
"参照G": {
"name": "div"
},
"用例G": {
"name": "div"
},
"解説部": {
"name": "div"
},
"大語義": {
"name": "div"
},
"名詞形G": {
"name": "div"
},
"可能形G": {
"name": "div"
},
"派生SubG": {
"name": "div"
},
"子解説部": {
"name": "div"
},
"句解説部": {
"name": "div"
},
"運用解説": {
"name": "div"
},
"表記解説": {
"name": "div"
},
"文法解説": {
"name": "div"
},
"派生SubGF": {
"name": "div"
},
"かぞえ方解説": {
"name": "div"
},
"二分": {
"name": "span",
"style": "margin-right: 1.0em;"
},
"四分": {
"name": "span",
"style": "margin-right: 0.5em;"
},
"言換M": {
"name": "span",
"style": "font-size: 0.5em;"
},
"品詞用法": {
"name": "span",
"style": "font-size: 0.7em;"
},
"ルビG": {
"name": "span",
"style": "vertical-align: super; font-size: 0.65em"
},
"アクセント": {
"name": "span",
"style": "vertical-align: super; font-size: 0.7em;"
},
"アクセント組M": {
"name": "span",
"style": "vertical-align: super; font-size: 0.7em;"
},
"IT": {
"name": "span",
"style": "font-style: italic;"
},
"EXCLAMATION": {
"name": "span",
"style": "font-style: italic;"
},
"B": {
"name": "span",
"style": "font-weight: bold;"
},
"EM": {
"name": "span",
"style": "font-weight: bold;"
},
"出現形": {
"name": "span",
"style": "font-weight: bold;"
},
"見出仮名": {
"name": "span",
"style": "font-weight: bold;"
},
"基本構文em": {
"name": "span",
"style": "font-weight: bold;"
},
"ウ濁音参照": {
"name": "span",
"style": "font-weight: bold;"
},
"表外字": {
"name": "ruby",
"procedures": [
{
"procedure_name": "add_ruby_text",
"parameters": {
"mark": "︿",
"style": "font-size: 2em;"
}
}
]
},
"表外音訓": {
"name": "ruby",
"procedures": [
{
"procedure_name": "add_ruby_text",
"parameters": {
"mark": "︽",
"style": "font-size: 2em;"
}
}
]
},
"表音式": {
"name": "ruby"
},
"表音表記": {
"name": "rt",
"procedures": [
{
"procedure_name": "replace",
"parameters": {
"old": "",
"new": ""
}
},
{
"procedure_name": "replace",
"parameters": {
"old": "",
"new": ""
}
}
]
},
"派生見出": {
"name": "span",
"style": "font-weight: bold;",
"procedures": [
{
"procedure_name": "has_class",
"parameters": {
"class_name": "normal",
"key": "style",
"value": "font-weight: normal;"
}
}
]
},
"品詞G": {
"name": "span",
"procedures": [
{
"procedure_name": "has_parent",
"parameters": {
"parent_name": "品詞用法",
"key": "style",
"value": "font-size: 1.43em;"
}
}
]
},
"歴史仮名": {
"name": "span",
"style": "font-size: 0.6em; font-weight: normal;",
"procedures": [
{
"procedure_name": "wrap",
"parameters": {
"l_wrap": "(",
"r_wrap": ")"
}
}
]
},
"ルビ": {
"name": "span",
"procedures": [
{
"procedure_name": "wrap",
"parameters": {
"l_wrap": "(",
"r_wrap": ")"
}
}
]
}
}
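The Daijirin table follows the same scheme; the distinctive procedure here is add_ruby_text, which marks 表外字 and 表外音訓 (characters and readings outside the jōyō charts) with a warning glyph rendered as ruby text. A hedged BeautifulSoup sketch of what that step might look like (the real implementation may differ):

from bs4 import BeautifulSoup

def add_ruby_text(soup: BeautifulSoup, ruby_tag, mark: str, style: str):
    # Attach the warning mark (e.g. ︿) as ruby text over the base character.
    rt = soup.new_tag("rt")
    rt["style"] = style
    rt.string = mark
    ruby_tag.append(rt)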

View file

@ -1,19 +1,45 @@
俠,侠
俱,倶
儘,侭
凜,凛
剝,剥
𠮟,叱
吞,呑
啞,唖
噓,嘘
嚙,噛
囊,嚢
塡,填
姸,妍
屛,屏
屢,屡
拋,抛
搔,掻
摑,掴
攪,撹
潑,溌
瀆,涜
焰,焔
禱,祷
竜,龍
筓,笄
簞,箪
籠,篭
繡,繍
繫,繋
腁,胼
萊,莱
藪,薮
蟬,蝉
蠟,蝋
軀,躯
醬,醤
醱,醗
頰,頬
顚,顛
驒,騨
鶯,鴬
鷗,鴎
鷽,鴬
鹼,鹸
麴,麹
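These rows pair kyūjitai (traditional) glyphs with their shinjitai (simplified) counterparts, presumably so that headwords printed in old forms also match modern spellings. A sketch of one possible use (the file name and mapping direction are assumptions):

variants = {}
with open("kanji_variants.csv") as f:  # hypothetical file name
    for line in f:
        old, new = line.strip().split(",")
        variants[old] = new

def modernize(expression: str) -> str:
    return "".join(variants.get(ch, ch) for ch in expression)

modernize("𠮟る")  # -> "叱る"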


View file

@ -6,5 +6,21 @@
"keiyoushi": ["形容詞", "ナイ", "タイ", "ラシイ"],
"kahen": ["カ行変格"],
"sudachi": []
},
"smk8": {
"sahen": ["サ", "サ変型"],
"godan": ["上二", "下二", "四", "五", "上二型", "下二型", "四段型", "五型", "特殊型"],
"ichidan": ["上一", "下一", "上一型", "下一型"],
"keiyoushi": ["形", "形型"],
"kahen": ["カ"],
"sudachi": ["連体"]
},
"daijirin2": {
"sahen": ["サ変", "サ特活"],
"godan": ["ナ変", "マ特活", "ラ特活", "上二", "下二", "五", "四"],
"ichidan": ["上一", "下一"],
"keiyoushi": ["形"],
"kahen": ["カ変"],
"sudachi": ["助動", "接尾", "枕詞", "連体", "連語"]
}
}
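These per-dictionary tables feed tags_to_rules above: each bucket lists the POS tags the dictionary uses for a given conjugation class, and tags in the sudachi bucket defer to morphological analysis instead. For example:

from bot.data import load_yomichan_inflection_categories
from bot.yomichan.grammar import tags_to_rules

categories = load_yomichan_inflection_categories()
tags_to_rules("読む", ["五"], categories["daijirin2"])        # -> "v5"
tags_to_rules("勉強する", ["サ変"], categories["daijirin2"])  # -> "vs"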

View file

@ -24,5 +24,28 @@
"url": "https://kotowaza.jitenon.jp/"
},
"tags": []
},
"smk8": {
"index": {
"title": "新明解国語辞典 第八版",
"sequenced": true,
"format": 3
},
"tags": [
["子", "name", 0, "子項目", 0],
["句", "expression", 0, "句項目", 0],
["造", "popular", 0, "造語成分項目", 0]
]
},
"daijirin2": {
"index": {
"title": "大辞林 第四版",
"sequenced": true,
"format": 3
},
"tags": [
["子", "name", 0, "子項目", 0],
["句", "expression", 0, "句項目", 0]
]
}
}
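For orientation, each row in tags appears to follow Yomichan's tag-bank layout, which (to my reading of the schema) is [tag name, category, sorting order, notes, popularity score]:

# ["子", "name", 0, "子項目", 0]  -> a 子 tag in the "name" category,
#                                    annotated 子項目 (sub-entry)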

View file

@ -16,47 +16,59 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import os
import argparse
from bot.crawlers import JitenonYojiCrawler
from bot.crawlers import JitenonKotowazaCrawler
from bot.crawlers import Smk8Crawler
from bot.crawlers import Daijirin2Crawler
crawlers = {
"jitenon-yoji": JitenonYojiCrawler,
"jitenon-kotowaza": JitenonKotowazaCrawler,
}
def directory(d):
if not os.path.isdir(d):
raise argparse.ArgumentTypeError(f"`{d}` is not a valid directory")
elif not os.access(d, os.R_OK):
raise argparse.ArgumentTypeError(f"Cannot access directory `{d}`")
else:
return d
def add_target_argument(parser):
target_argument_params = {
"choices": crawlers.keys(),
"help": "Dictionary to convert."
}
parser.add_argument("target", **target_argument_params)
def make_parser():
argument_parser_params = {
"prog": "jitenbot",
"description": "Convert Japanese dictionary files to new formats.",
}
parser = argparse.ArgumentParser(**argument_parser_params)
return parser
def parse_args():
parser = make_parser()
add_target_argument(parser)
def parse_args(targets):
parser = argparse.ArgumentParser(
prog="jitenbot",
description="Convert Japanese dictionary files to new formats.",
)
parser.add_argument(
"target",
choices=targets,
help="name of dictionary to convert"
)
parser.add_argument(
"-p", "--page-dir",
help="path to directory containing XML page files",
type=directory
)
parser.add_argument(
"-i", "--image-dir",
help="path to directory containing image files (gaiji, etc.)",
type=directory
)
args = parser.parse_args()
return args
def main():
args = parse_args()
crawlers = {
"jitenon-yoji": JitenonYojiCrawler,
"jitenon-kotowaza": JitenonKotowazaCrawler,
"smk8": Smk8Crawler,
"daijirin2": Daijirin2Crawler,
}
args = parse_args(crawlers.keys())
crawler_class = crawlers[args.target]
crawler = crawler_class()
crawler.crawl()
crawler.read_entries()
crawler = crawler_class(args)
crawler.collect_pages()
crawler.read_pages()
crawler.make_yomichan_dictionary()
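End to end, a conversion run would look something like this (paths and the entry-point file name are hypothetical; the smk8 and daijirin2 targets read locally owned page and image data rather than scraping):

# Shell: python jitenbot.py smk8 --page-dir ./pages/smk8 --image-dir ./images/smk8
from types import SimpleNamespace

args = SimpleNamespace(target="smk8", page_dir="./pages/smk8", image_dir="./images/smk8")
crawler = Smk8Crawler(args)
crawler.collect_pages()             # index the local XML pages
crawler.read_pages()                # parse each page into entry objects
crawler.make_yomichan_dictionary()  # export the entries in Yomichan format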

View file

@ -5,9 +5,12 @@ charset-normalizer==3.1.0
css-parser==1.0.8
html5lib==1.1
idna==3.4
requests==2.28.2
lxml==4.9.2
Pillow==9.5.0
platformdirs==3.5.0
requests==2.29.0
six==1.16.0
soupsieve==2.4
soupsieve==2.4.1
SudachiDict-full==20230110
SudachiPy==0.6.7
urllib3==1.26.15