Merge branch 'main' into patch-2
commit 09b585c49d
TODO.md (2 changed lines)
@@ -1,7 +1,7 @@
 ### Todo
 
 - [x] Add factory classes to reduce the amount of class import statements
-- [ ] Add dynamic import functionality to factory classes to reduce boilerplate
+- [x] Add dynamic import functionality to factory classes to reduce boilerplate
 - [x] Support exporting to MDict (.MDX) dictionary format
 - [x] Validate JSON schema of Yomichan terms during export
 - [ ] Add support for monokakido search keys from index files
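The newly completed item above refers to the factory helpers (bot.factory.new_entry, new_yomichan_exporter, new_mdict_exporter) that the new base crawler below imports. The bot/factory module itself is not part of this diff; a minimal sketch of the dynamic-import pattern the TODO item describes, using hypothetical names, might look like this:

# Hypothetical sketch only; the real bot/factory module is not shown in this diff.
import importlib


def new_crawler(target):
    # e.g. target.name == "DAIJIRIN2" resolves to the module bot.crawlers.daijirin2,
    # and each per-target module in this commit exposes a class named Crawler
    module_path = f"bot.crawlers.{target.name.lower()}"
    module = importlib.import_module(module_path)
    return module.Crawler(target)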
bot/crawlers/base/crawler.py (new file, 54 lines)
@@ -0,0 +1,54 @@
import re
from abc import ABC, abstractmethod

from bot.factory import new_entry
from bot.factory import new_yomichan_exporter
from bot.factory import new_mdict_exporter


class BaseCrawler(ABC):
    def __init__(self, target):
        self._target = target
        self._page_map = {}
        self._entries = []
        self._page_id_pattern = None

    @abstractmethod
    def collect_pages(self, page_dir):
        raise NotImplementedError

    def read_pages(self):
        pages_len = len(self._page_map)
        items = self._page_map.items()
        for idx, (page_id, page_path) in enumerate(items):
            update = f"\tReading page {idx+1}/{pages_len}"
            print(update, end='\r', flush=True)
            entry = new_entry(self._target, page_id)
            with open(page_path, "r", encoding="utf-8") as f:
                page = f.read()
            try:
                entry.set_page(page)
            except ValueError as err:
                print(err)
                print("Try deleting and redownloading file:")
                print(f"\t{page_path}\n")
                continue
            self._entries.append(entry)
        print()

    def make_yomichan_dictionary(self, media_dir, validate):
        exporter = new_yomichan_exporter(self._target)
        exporter.export(self._entries, media_dir, validate)

    def make_mdict_dictionary(self, media_dir, icon_file):
        exporter = new_mdict_exporter(self._target)
        exporter.export(self._entries, media_dir, icon_file)

    def _parse_page_id(self, page_link):
        m = re.search(self._page_id_pattern, page_link)
        if m is None:
            return None
        page_id = int(m.group(1))
        if page_id in self._page_map:
            return None
        return page_id
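For context, a concrete crawler built on BaseCrawler is driven end to end roughly as follows. This driver is illustrative only (the CLI entry point is not part of this diff) and the directory paths are placeholders:

# Illustrative usage sketch; paths are placeholders, not paths from this project.
from bot.targets import Targets
from bot.crawlers.daijirin2 import Crawler

crawler = Crawler(Targets.DAIJIRIN2)
crawler.collect_pages("input/daijirin2/pages")    # fills the page-id -> path map
crawler.read_pages()                              # builds Entry objects via new_entry()
crawler.make_yomichan_dictionary("input/daijirin2/media", validate=True)
crawler.make_mdict_dictionary("input/daijirin2/media", "input/daijirin2/icon.png")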
bot/crawlers/base/jitenon.py (new file, 30 lines)
@@ -0,0 +1,30 @@
from bs4 import BeautifulSoup

from bot.time import timestamp
from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper
from bot.crawlers.base.crawler import BaseCrawler


class JitenonCrawler(BaseCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = None

    def collect_pages(self, page_dir):
        print(f"{timestamp()} Scraping {self._gojuon_url}")
        jitenon = JitenonScraper()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            gojuon_href = gojuon_a['href']
            kana_doc, _ = jitenon.scrape(gojuon_href)
            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
            for kana_a in kana_soup.select(".word_box a", href=True):
                page_link = kana_a['href']
                page_id = self._parse_page_id(page_link)
                if page_id is None:
                    continue
                _, page_path = jitenon.scrape(page_link)
                self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
        print(f"\n{timestamp()} Found {pages_len} entry pages")
bot/crawlers/base/monokakido.py (new file, 20 lines)
@@ -0,0 +1,20 @@
import os
from bot.time import timestamp
from bot.crawlers.base.crawler import BaseCrawler


class MonokakidoCrawler(BaseCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._page_id_pattern = r"^([0-9]+)\.xml$"

    def collect_pages(self, page_dir):
        print(f"{timestamp()} Searching for page files in `{page_dir}`")
        for pagefile in os.listdir(page_dir):
            page_id = self._parse_page_id(pagefile)
            if page_id is None or page_id == 0:
                continue
            path = os.path.join(page_dir, pagefile)
            self._page_map[page_id] = path
        pages_len = len(self._page_map)
        print(f"{timestamp()} Found {pages_len} page files for processing")
Deleted file:
@@ -1,158 +0,0 @@
import os
import re
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup

import bot.crawlers.scraper as Scraper
from bot.entries.factory import new_entry
from bot.yomichan.exporters.factory import new_yomi_exporter
from bot.mdict.exporters.factory import new_mdict_exporter


class Crawler(ABC):
    def __init__(self, target):
        self._target = target
        self._page_map = {}
        self._entries = []
        self._page_id_pattern = None

    @abstractmethod
    def collect_pages(self, page_dir):
        pass

    def read_pages(self):
        pages_len = len(self._page_map)
        items = self._page_map.items()
        for idx, (page_id, page_path) in enumerate(items):
            update = f"Reading page {idx+1}/{pages_len}"
            print(update, end='\r', flush=True)
            entry = new_entry(self._target, page_id)
            with open(page_path, "r", encoding="utf-8") as f:
                page = f.read()
            try:
                entry.set_page(page)
            except ValueError as err:
                print(err)
                print("Try deleting and redownloading file:")
                print(f"\t{page_path}\n")
                continue
            self._entries.append(entry)
        print()

    def make_yomichan_dictionary(self, media_dir, validate):
        exporter = new_yomi_exporter(self._target)
        exporter.export(self._entries, media_dir, validate)

    def make_mdict_dictionary(self, media_dir, icon_file):
        exporter = new_mdict_exporter(self._target)
        exporter.export(self._entries, media_dir, icon_file)

    def _parse_page_id(self, page_link):
        m = re.search(self._page_id_pattern, page_link)
        if m is None:
            return None
        page_id = int(m.group(1))
        if page_id in self._page_map:
            return None
        return page_id


class JitenonKokugoCrawler(Crawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
        self._page_id_pattern = r"word/p([0-9]+)$"

    def collect_pages(self, page_dir):
        jitenon = Scraper.Jitenon()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            gojuon_href = gojuon_a['href']
            max_kana_page = 1
            current_kana_page = 1
            while current_kana_page <= max_kana_page:
                kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}")
                current_kana_page += 1
                kana_soup = BeautifulSoup(kana_doc, features="html.parser")
                page_total = kana_soup.find(class_="page_total").text
                m = re.search(r"全([0-9]+)件", page_total)
                if m:
                    max_kana_page = int(m.group(1))
                for kana_a in kana_soup.select(".word_box a", href=True):
                    page_link = kana_a['href']
                    page_id = self._parse_page_id(page_link)
                    if page_id is None:
                        continue
                    _, page_path = jitenon.scrape(page_link)
                    self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
        print(f"Finished scraping {pages_len} pages")


class _JitenonCrawler(Crawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = None

    def collect_pages(self, page_dir):
        print("Scraping jitenon.jp")
        jitenon = Scraper.Jitenon()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            gojuon_href = gojuon_a['href']
            kana_doc, _ = jitenon.scrape(gojuon_href)
            kana_soup = BeautifulSoup(kana_doc, features="html.parser")
            for kana_a in kana_soup.select(".word_box a", href=True):
                page_link = kana_a['href']
                page_id = self._parse_page_id(page_link)
                if page_id is None:
                    continue
                _, page_path = jitenon.scrape(page_link)
                self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
        print(f"Finished scraping {pages_len} pages")


class JitenonYojiCrawler(_JitenonCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
        self._page_id_pattern = r"([0-9]+)\.html$"


class JitenonKotowazaCrawler(_JitenonCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
        self._page_id_pattern = r"([0-9]+)\.php$"


class _MonokakidoCrawler(Crawler):
    def __init__(self, target):
        super().__init__(target)
        self._page_id_pattern = r"^([0-9]+)\.xml$"

    def collect_pages(self, page_dir):
        print(f"Searching for page files in `{page_dir}`")
        for pagefile in os.listdir(page_dir):
            page_id = self._parse_page_id(pagefile)
            if page_id is None or page_id == 0:
                continue
            path = os.path.join(page_dir, pagefile)
            self._page_map[page_id] = path
        pages_len = len(self._page_map)
        print(f"Found {pages_len} page files for processing")


class Smk8Crawler(_MonokakidoCrawler):
    pass


class Daijirin2Crawler(_MonokakidoCrawler):
    pass


class Sankoku8Crawler(_MonokakidoCrawler):
    pass
bot/crawlers/daijirin2.py (new file, 5 lines)
@@ -0,0 +1,5 @@
from bot.crawlers.base.monokakido import MonokakidoCrawler


class Crawler(MonokakidoCrawler):
    pass
Deleted file:
@@ -1,20 +0,0 @@
from bot.targets import Targets

from bot.crawlers.crawlers import JitenonKokugoCrawler
from bot.crawlers.crawlers import JitenonYojiCrawler
from bot.crawlers.crawlers import JitenonKotowazaCrawler
from bot.crawlers.crawlers import Smk8Crawler
from bot.crawlers.crawlers import Daijirin2Crawler
from bot.crawlers.crawlers import Sankoku8Crawler


def new_crawler(target):
    crawler_map = {
        Targets.JITENON_KOKUGO: JitenonKokugoCrawler,
        Targets.JITENON_YOJI: JitenonYojiCrawler,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaCrawler,
        Targets.SMK8: Smk8Crawler,
        Targets.DAIJIRIN2: Daijirin2Crawler,
        Targets.SANKOKU8: Sankoku8Crawler,
    }
    return crawler_map[target](target)
bot/crawlers/jitenon_kokugo.py (new file, 40 lines)
@@ -0,0 +1,40 @@
import re
from bs4 import BeautifulSoup

from bot.time import timestamp
from bot.crawlers.base.crawler import BaseCrawler
from bot.crawlers.scrapers.jitenon import Jitenon as JitenonScraper


class Crawler(BaseCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://kokugo.jitenon.jp/cat/gojuonindex.php"
        self._page_id_pattern = r"word/p([0-9]+)$"

    def collect_pages(self, page_dir):
        print(f"{timestamp()} Scraping {self._gojuon_url}")
        jitenon = JitenonScraper()
        gojuon_doc, _ = jitenon.scrape(self._gojuon_url)
        gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
        for gojuon_a in gojuon_soup.select(".kana_area a", href=True):
            gojuon_href = gojuon_a['href']
            max_kana_page = 1
            current_kana_page = 1
            while current_kana_page <= max_kana_page:
                kana_doc, _ = jitenon.scrape(f"{gojuon_href}&page={current_kana_page}")
                current_kana_page += 1
                kana_soup = BeautifulSoup(kana_doc, features="html.parser")
                page_total = kana_soup.find(class_="page_total").text
                m = re.search(r"全([0-9]+)件", page_total)
                if m:
                    max_kana_page = int(m.group(1))
                for kana_a in kana_soup.select(".word_box a", href=True):
                    page_link = kana_a['href']
                    page_id = self._parse_page_id(page_link)
                    if page_id is None:
                        continue
                    _, page_path = jitenon.scrape(page_link)
                    self._page_map[page_id] = page_path
        pages_len = len(self._page_map)
        print(f"\n{timestamp()} Found {pages_len} entry pages")
bot/crawlers/jitenon_kotowaza.py (new file, 8 lines)
@@ -0,0 +1,8 @@
from bot.crawlers.base.jitenon import JitenonCrawler


class Crawler(JitenonCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://kotowaza.jitenon.jp/cat/gojuon.php"
        self._page_id_pattern = r"([0-9]+)\.php$"
bot/crawlers/jitenon_yoji.py (new file, 8 lines)
@@ -0,0 +1,8 @@
from bot.crawlers.base.jitenon import JitenonCrawler


class Crawler(JitenonCrawler):
    def __init__(self, target):
        super().__init__(target)
        self._gojuon_url = "https://yoji.jitenon.jp/cat/gojuon.html"
        self._page_id_pattern = r"([0-9]+)\.html$"
bot/crawlers/sankoku8.py (new file, 5 lines)
@@ -0,0 +1,5 @@
from bot.crawlers.base.monokakido import MonokakidoCrawler


class Crawler(MonokakidoCrawler):
    pass
bot/crawlers/scrapers/jitenon.py (new file, 10 lines)
@@ -0,0 +1,10 @@
import re
from bot.crawlers.scrapers.scraper import BaseScraper


class Jitenon(BaseScraper):
    def _get_netloc_re(self):
        domain = r"jitenon\.jp"
        pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + domain + r"$"
        netloc_re = re.compile(pattern)
        return netloc_re
Modified file:
@@ -1,24 +1,28 @@
 import time
-import requests
 import re
 import os
 import hashlib
+import random
+import math
 from datetime import datetime
-from pathlib import Path
-
-from platformdirs import user_cache_dir
 from urllib.parse import urlparse
+from pathlib import Path
+from abc import ABC, abstractmethod
+
+import requests
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
+from platformdirs import user_cache_dir
 
+from bot.time import timestamp
 from bot.data import load_config
 
 
-class Scraper():
+class BaseScraper(ABC):
     def __init__(self):
+        self.cache_count = 0
         self._config = load_config()
-        pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + self.domain + r"$"
-        self.netloc_re = re.compile(pattern)
+        self.netloc_re = self._get_netloc_re()
         self.__set_session()
 
     def scrape(self, urlstring):
@@ -31,9 +35,14 @@ class Scraper():
             with open(cache_path, "w", encoding="utf-8") as f:
                 f.write(html)
         else:
-            print("Discovering cached files...", end='\r', flush=True)
+            self.cache_count += 1
+            print(f"\tDiscovering cached file {self.cache_count}", end='\r', flush=True)
         return html, cache_path
 
+    @abstractmethod
+    def _get_netloc_re(self):
+        raise NotImplementedError
+
     def __set_session(self):
         retry_strategy = Retry(
             total=3,
@@ -87,21 +96,14 @@ class Scraper():
     def __get(self, urlstring):
         delay = 10
         time.sleep(delay)
-        now = datetime.now().strftime("%H:%M:%S")
-        print(f"{now} scraping {urlstring} ...", end='')
+        print(f"{timestamp()} Scraping {urlstring} ...", end='')
        try:
             response = self.session.get(urlstring, timeout=10)
-            print("OK")
+            print(f"{timestamp()} OK")
             return response.text
-        except Exception:
-            print("failed")
-            print("resetting session and trying again")
+        except Exception as ex:
+            print(f"\tFailed: {str(ex)}")
+            print(f"{timestamp()} Resetting session and trying again")
             self.__set_session()
             response = self.session.get(urlstring, timeout=10)
             return response.text
-
-
-class Jitenon(Scraper):
-    def __init__(self):
-        self.domain = r"jitenon\.jp"
-        super().__init__()
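The hunks above move the per-site domain handling out of the scraper constructor and behind the abstract _get_netloc_re() hook, so the Jitenon subclass now lives in its own module (shown earlier) and supporting another site would only need a small subclass. A hypothetical example, assuming a placeholder domain that is not part of this project:

# Hypothetical extension sketch; "example.co.jp" is a placeholder domain.
import re

from bot.crawlers.scrapers.scraper import BaseScraper


class ExampleScraper(BaseScraper):
    def _get_netloc_re(self):
        # accept example.co.jp and any of its subdomains, mirroring the Jitenon scraper
        domain = r"example\.co\.jp"
        pattern = r"^(?:([A-Za-z0-9.\-]+)\.)?" + domain + r"$"
        return re.compile(pattern)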
bot/crawlers/smk8.py (new file, 5 lines)
@@ -0,0 +1,5 @@
from bot.crawlers.base.monokakido import MonokakidoCrawler


class Crawler(MonokakidoCrawler):
    pass
Modified file:
@@ -18,15 +18,15 @@ class Entry(ABC):
 
     @abstractmethod
     def get_global_identifier(self):
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     def set_page(self, page):
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     def get_page_soup(self):
-        pass
+        raise NotImplementedError
 
     def get_headwords(self):
         if self._headwords is not None:
@@ -38,15 +38,15 @@ class Entry(ABC):
 
     @abstractmethod
     def _get_headwords(self):
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     def _add_variant_expressions(self, headwords):
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     def get_part_of_speech_tags(self):
-        pass
+        raise NotImplementedError
 
     def get_parent(self):
         if self.entry_id in self.SUBENTRY_ID_TO_ENTRY_ID:
Modified file:
@@ -31,11 +31,14 @@ def add_fullwidth(expressions):
 
 def add_variant_kanji(expressions):
     variant_kanji = load_variant_kanji()
-    for old_kanji, new_kanji in variant_kanji.items():
+    for kyuuji, shinji in variant_kanji.items():
         new_exps = []
         for expression in expressions:
-            if old_kanji in expression:
-                new_exp = expression.replace(old_kanji, new_kanji)
+            if kyuuji in expression:
+                new_exp = expression.replace(kyuuji, shinji)
+                new_exps.append(new_exp)
+            if shinji in expression:
+                new_exp = expression.replace(shinji, kyuuji)
                 new_exps.append(new_exp)
         for new_exp in new_exps:
             if new_exp not in expressions:
@@ -85,40 +88,3 @@ def expand_abbreviation_list(expressions):
             if new_exp not in new_exps:
                 new_exps.append(new_exp)
     return new_exps
-
-
-def expand_smk_alternatives(text):
-    """Return a list of strings described by △ notation."""
-    m = re.search(r"△([^(]+)(([^(]+))", text)
-    if m is None:
-        return [text]
-    alt_parts = [m.group(1)]
-    for alt_part in m.group(2).split("・"):
-        alt_parts.append(alt_part)
-    alts = []
-    for alt_part in alt_parts:
-        alt_exp = re.sub(r"△[^(]+([^(]+)", alt_part, text)
-        alts.append(alt_exp)
-    return alts
-
-
-def expand_daijirin_alternatives(text):
-    """Return a list of strings described by = notation."""
-    group_pattern = r"([^=]+)(=([^(]+)(=([^(]+)))?"
-    groups = re.findall(group_pattern, text)
-    expressions = [""]
-    for group in groups:
-        new_exps = []
-        for expression in expressions:
-            new_exps.append(expression + group[0])
-        expressions = new_exps.copy()
-        if group[1] == "":
-            continue
-        new_exps = []
-        for expression in expressions:
-            new_exps.append(expression + group[2])
-        for expression in expressions:
-            for alt in group[3].split("・"):
-                new_exps.append(expression + alt)
-        expressions = new_exps.copy()
-    return expressions
Modified file:
@@ -3,11 +3,11 @@ from abc import abstractmethod
 from datetime import datetime, date
 from bs4 import BeautifulSoup
 
-from bot.entries.entry import Entry
-import bot.entries.expressions as Expressions
+from bot.entries.base.entry import Entry
+import bot.entries.base.expressions as Expressions
 
 
-class _JitenonEntry(Entry):
+class JitenonEntry(Entry):
     def __init__(self, target, entry_id):
         super().__init__(target, entry_id)
         self.expression = ""
@@ -58,7 +58,7 @@ class _JitenonEntry(Entry):
 
     @abstractmethod
     def _get_column_map(self):
-        pass
+        raise NotImplementedError
 
     def __set_modified_date(self, page):
         m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", page)
@@ -140,104 +140,3 @@ class _JitenonEntry(Entry):
         elif isinstance(attr_val, list):
             colvals.append(";".join(attr_val))
         return ",".join(colvals)
-
-
-class JitenonYojiEntry(_JitenonEntry):
-    def __init__(self, target, entry_id):
-        super().__init__(target, entry_id)
-        self.origin = ""
-        self.kanken_level = ""
-        self.category = ""
-        self.related_expressions = []
-
-    def _get_column_map(self):
-        return {
-            "四字熟語": "expression",
-            "読み方": "yomikata",
-            "意味": "definition",
-            "異形": "other_forms",
-            "出典": "origin",
-            "漢検級": "kanken_level",
-            "場面用途": "category",
-            "類義語": "related_expressions",
-        }
-
-    def _add_variant_expressions(self, headwords):
-        for expressions in headwords.values():
-            Expressions.add_variant_kanji(expressions)
-
-
-class JitenonKotowazaEntry(_JitenonEntry):
-    def __init__(self, target, entry_id):
-        super().__init__(target, entry_id)
-        self.origin = ""
-        self.example = ""
-        self.related_expressions = []
-
-    def _get_column_map(self):
-        return {
-            "言葉": "expression",
-            "読み方": "yomikata",
-            "意味": "definition",
-            "異形": "other_forms",
-            "出典": "origin",
-            "例文": "example",
-            "類句": "related_expressions",
-        }
-
-    def _get_headwords(self):
-        if self.expression == "金棒引き・鉄棒引き":
-            headwords = {
-                "かなぼうひき": ["金棒引き", "鉄棒引き"]
-            }
-        else:
-            headwords = super()._get_headwords()
-        return headwords
-
-    def _add_variant_expressions(self, headwords):
-        for expressions in headwords.values():
-            Expressions.add_variant_kanji(expressions)
-            Expressions.add_fullwidth(expressions)
-
-
-class JitenonKokugoEntry(_JitenonEntry):
-    def __init__(self, target, entry_id):
-        super().__init__(target, entry_id)
-        self.example = ""
-        self.alt_expression = ""
-        self.antonym = ""
-        self.attachments = ""
-        self.compounds = ""
-        self.related_words = ""
-
-    def _get_column_map(self):
-        return {
-            "言葉": "expression",
-            "読み方": "yomikata",
-            "意味": "definition",
-            "例文": "example",
-            "別表記": "alt_expression",
-            "対義語": "antonym",
-            "活用": "attachments",
-            "用例": "compounds",
-            "類語": "related_words",
-        }
-
-    def _get_headwords(self):
-        headwords = {}
-        for reading in self.yomikata.split("・"):
-            if reading not in headwords:
-                headwords[reading] = []
-            for expression in self.expression.split("・"):
-                headwords[reading].append(expression)
-            if self.alt_expression.strip() != "":
-                for expression in self.alt_expression.split("・"):
-                    headwords[reading].append(expression)
-        return headwords
-
-    def _add_variant_expressions(self, headwords):
-        for expressions in headwords.values():
-            Expressions.add_variant_kanji(expressions)
-            Expressions.add_fullwidth(expressions)
-            Expressions.remove_iteration_mark(expressions)
-            Expressions.add_iteration_mark(expressions)
bot/entries/base/sanseido_entry.py (new file, 60 lines)
@@ -0,0 +1,60 @@
from abc import abstractmethod
from bs4 import BeautifulSoup

from bot.entries.base.entry import Entry
import bot.entries.base.expressions as Expressions


class SanseidoEntry(Entry):
    def set_page(self, page):
        page = self._decompose_subentries(page)
        self._page = page

    def get_page_soup(self):
        soup = BeautifulSoup(self._page, "xml")
        return soup

    def get_global_identifier(self):
        parent_part = format(self.entry_id[0], '06')
        child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
        return f"@{self.target.value}-{parent_part}-{child_part}"

    def _decompose_subentries(self, page):
        soup = BeautifulSoup(page, features="xml")
        for x in self._get_subentry_parameters():
            subentry_class, tags, subentry_list = x
            for tag in tags:
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(self.target, subentry_id)
                    page = tag_soup.decode()
                    subentry.set_page(page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @abstractmethod
    def _get_subentry_parameters(self):
        raise NotImplementedError

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    @staticmethod
    def id_string_to_entry_id(id_string):
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        elif len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        else:
            raise Exception(f"Invalid entry ID: {id_string}")
Deleted file:
@@ -1,231 +0,0 @@
from bs4 import BeautifulSoup

import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.data import load_phrase_readings
from bot.data import load_daijirin2_kana_abbreviations
from bot.entries.entry import Entry
from bot.entries.daijirin2_preprocess import preprocess_page


class _BaseDaijirin2Entry(Entry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self._kana_abbreviations = load_daijirin2_kana_abbreviations()

    def get_global_identifier(self):
        parent_part = format(self.entry_id[0], '06')
        child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
        return f"@{self.target.value}-{parent_part}-{child_part}"

    def set_page(self, page):
        page = self.__decompose_subentries(page)
        self._page = page

    def get_page_soup(self):
        soup = BeautifulSoup(self._page, "xml")
        return soup

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        for pos_group in soup.find_all("品詞G"):
            if pos_group.parent.name == "大語義":
                self._set_part_of_speech_tags(pos_group)
        return self._part_of_speech_tags

    def _set_part_of_speech_tags(self, el):
        pos_names = ["品詞", "品詞活用", "品詞行", "用法"]
        for child in el.children:
            if child.name is not None:
                self._set_part_of_speech_tags(child)
                continue
            pos = str(child)
            if el.name not in pos_names:
                continue
            elif pos in ["[", "]"]:
                continue
            elif pos in self._part_of_speech_tags:
                continue
            else:
                self._part_of_speech_tags.append(pos)

    def _get_regular_headwords(self, soup):
        self._fill_alts(soup)
        reading = soup.find("見出仮名").text
        expressions = []
        for el in soup.find_all("標準表記"):
            expression = self._clean_expression(el.text)
            if "—" in expression:
                kana_abbrs = self._kana_abbreviations[self.entry_id]
                for abbr in kana_abbrs:
                    expression = expression.replace("—", abbr, 1)
            expressions.append(expression)
        expressions = Expressions.expand_abbreviation_list(expressions)
        if len(expressions) == 0:
            expressions.append(reading)
        headwords = {reading: expressions}
        return headwords

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    def __decompose_subentries(self, page):
        soup = BeautifulSoup(page, features="xml")
        subentry_parameters = [
            [Daijirin2ChildEntry, ["子項目"], self.children],
            [Daijirin2PhraseEntry, ["句項目"], self.phrases],
        ]
        for x in subentry_parameters:
            subentry_class, tags, subentry_list = x
            for tag in tags:
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(self.target, subentry_id)
                    page = tag_soup.decode()
                    subentry.set_page(page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @staticmethod
    def id_string_to_entry_id(id_string):
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        elif len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        else:
            raise Exception(f"Invalid entry ID: {id_string}")

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "漢字音logo", "活用分節", "連語句活用分節", "語構成",
            "表外字マーク", "表外字マーク", "ルビG"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)

    @staticmethod
    def _clean_expression(expression):
        for x in ["〈", "〉", "《", "》", " "]:
            expression = expression.replace(x, "")
        return expression

    @staticmethod
    def _fill_alts(soup):
        for gaiji in soup.find_all(class_="gaiji"):
            if gaiji.name == "img" and gaiji.has_attr("alt"):
                gaiji.name = "span"
                gaiji.string = gaiji.attrs["alt"]


class Daijirin2Entry(_BaseDaijirin2Entry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        if soup.find("漢字見出") is not None:
            headwords = self._get_kanji_headwords(soup)
        elif soup.find("略語G") is not None:
            headwords = self._get_acronym_headwords(soup)
        else:
            headwords = self._get_regular_headwords(soup)
        return headwords

    def _get_kanji_headwords(self, soup):
        readings = []
        for el in soup.find_all("漢字音"):
            hira = Expressions.kata_to_hira(el.text)
            readings.append(hira)
        if soup.find("漢字音") is None:
            readings.append("")
        expressions = []
        for el in soup.find_all("漢字見出"):
            expressions.append(el.text)
        headwords = {}
        for reading in readings:
            headwords[reading] = expressions
        return headwords

    def _get_acronym_headwords(self, soup):
        expressions = []
        for el in soup.find_all("略語"):
            expression_parts = []
            for part in el.find_all(["欧字", "和字"]):
                expression_parts.append(part.text)
            expression = "".join(expression_parts)
            expressions.append(expression)
        headwords = {"": expressions}
        return headwords


class Daijirin2ChildEntry(_BaseDaijirin2Entry):
    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        headwords = self._get_regular_headwords(soup)
        return headwords


class Daijirin2PhraseEntry(_BaseDaijirin2Entry):
    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        headwords = {}
        expressions = self._find_expressions(soup)
        readings = self._find_readings()
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        self._delete_unused_nodes(soup)
        text = soup.find("句表記").text
        text = self._clean_expression(text)
        alternatives = Expressions.expand_daijirin_alternatives(text)
        expressions = []
        for alt in alternatives:
            for exp in Expressions.expand_abbreviation(alt):
                expressions.append(exp)
        return expressions

    def _find_readings(self):
        phrase_readings = load_phrase_readings(self.target)
        text = phrase_readings[self.entry_id]
        alternatives = Expressions.expand_daijirin_alternatives(text)
        readings = []
        for alt in alternatives:
            for reading in Expressions.expand_abbreviation(alt):
                readings.append(reading)
        return readings
bot/entries/daijirin2/base_entry.py (new file, 88 lines)
@@ -0,0 +1,88 @@
import bot.soup as Soup
from bot.data import load_daijirin2_kana_abbreviations
from bot.entries.base.sanseido_entry import SanseidoEntry
import bot.entries.base.expressions as Expressions


class BaseEntry(SanseidoEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self._kana_abbreviations = load_daijirin2_kana_abbreviations()

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        for pos_group in soup.find_all("品詞G"):
            if pos_group.parent.name == "大語義":
                self._set_part_of_speech_tags(pos_group)
        return self._part_of_speech_tags

    def _set_part_of_speech_tags(self, el):
        pos_names = ["品詞", "品詞活用", "品詞行", "用法"]
        for child in el.children:
            if child.name is not None:
                self._set_part_of_speech_tags(child)
                continue
            pos = str(child)
            if el.name not in pos_names:
                continue
            elif pos in ["[", "]"]:
                continue
            elif pos in self._part_of_speech_tags:
                continue
            else:
                self._part_of_speech_tags.append(pos)

    def _get_regular_headwords(self, soup):
        self._fill_alts(soup)
        reading = soup.find("見出仮名").text
        expressions = []
        for el in soup.find_all("標準表記"):
            expression = self._clean_expression(el.text)
            if "—" in expression:
                kana_abbrs = self._kana_abbreviations[self.entry_id]
                for abbr in kana_abbrs:
                    expression = expression.replace("—", abbr, 1)
            expressions.append(expression)
        expressions = Expressions.expand_abbreviation_list(expressions)
        if len(expressions) == 0:
            expressions.append(reading)
        headwords = {reading: expressions}
        return headwords

    def _get_subentry_parameters(self):
        from bot.entries.daijirin2.child_entry import ChildEntry
        from bot.entries.daijirin2.phrase_entry import PhraseEntry
        subentry_parameters = [
            [ChildEntry, ["子項目"], self.children],
            [PhraseEntry, ["句項目"], self.phrases],
        ]
        return subentry_parameters

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "漢字音logo", "活用分節", "連語句活用分節", "語構成",
            "表外字マーク", "表外字マーク", "ルビG"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)

    @staticmethod
    def _clean_expression(expression):
        for x in ["〈", "〉", "《", "》", " "]:
            expression = expression.replace(x, "")
        return expression

    @staticmethod
    def _fill_alts(soup):
        for gaiji in soup.find_all(class_="gaiji"):
            if gaiji.name == "img" and gaiji.has_attr("alt"):
                gaiji.name = "span"
                gaiji.string = gaiji.attrs["alt"]
bot/entries/daijirin2/child_entry.py (new file, 9 lines)
@@ -0,0 +1,9 @@
from bot.entries.daijirin2.base_entry import BaseEntry


class ChildEntry(BaseEntry):
    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        headwords = self._get_regular_headwords(soup)
        return headwords
bot/entries/daijirin2/entry.py (new file, 50 lines)
@@ -0,0 +1,50 @@
import bot.entries.base.expressions as Expressions
from bot.entries.daijirin2.base_entry import BaseEntry
from bot.entries.daijirin2.preprocess import preprocess_page


class Entry(BaseEntry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        if soup.find("漢字見出") is not None:
            headwords = self._get_kanji_headwords(soup)
        elif soup.find("略語G") is not None:
            headwords = self._get_acronym_headwords(soup)
        else:
            headwords = self._get_regular_headwords(soup)
        return headwords

    def _get_kanji_headwords(self, soup):
        readings = []
        for el in soup.find_all("漢字音"):
            hira = Expressions.kata_to_hira(el.text)
            readings.append(hira)
        if soup.find("漢字音") is None:
            readings.append("")
        expressions = []
        for el in soup.find_all("漢字見出"):
            expressions.append(el.text)
        headwords = {}
        for reading in readings:
            headwords[reading] = expressions
        return headwords

    def _get_acronym_headwords(self, soup):
        expressions = []
        for el in soup.find_all("略語"):
            expression_parts = []
            for part in el.find_all(["欧字", "和字"]):
                expression_parts.append(part.text)
            expression = "".join(expression_parts)
            expressions.append(expression)
        headwords = {"": expressions}
        return headwords
bot/entries/daijirin2/phrase_entry.py (new file, 67 lines)
@@ -0,0 +1,67 @@
import re

import bot.entries.base.expressions as Expressions
from bot.data import load_phrase_readings
from bot.entries.daijirin2.base_entry import BaseEntry


class PhraseEntry(BaseEntry):
    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        headwords = {}
        expressions = self._find_expressions(soup)
        readings = self._find_readings()
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        self._delete_unused_nodes(soup)
        text = soup.find("句表記").text
        text = self._clean_expression(text)
        alternatives = parse_phrase(text)
        expressions = []
        for alt in alternatives:
            for exp in Expressions.expand_abbreviation(alt):
                expressions.append(exp)
        return expressions

    def _find_readings(self):
        phrase_readings = load_phrase_readings(self.target)
        text = phrase_readings[self.entry_id]
        alternatives = parse_phrase(text)
        readings = []
        for alt in alternatives:
            for reading in Expressions.expand_abbreviation(alt):
                readings.append(reading)
        return readings


def parse_phrase(text):
    """Return a list of strings described by = notation."""
    group_pattern = r"([^=]+)(=([^(]+)(=([^(]+)))?"
    groups = re.findall(group_pattern, text)
    expressions = [""]
    for group in groups:
        new_exps = []
        for expression in expressions:
            new_exps.append(expression + group[0])
        expressions = new_exps.copy()
        if group[1] == "":
            continue
        new_exps = []
        for expression in expressions:
            new_exps.append(expression + group[2])
        for expression in expressions:
            for alt in group[3].split("・"):
                new_exps.append(expression + alt)
        expressions = new_exps.copy()
    return expressions
Deleted file:
@@ -1,20 +0,0 @@
from bot.targets import Targets

from bot.entries.jitenon import JitenonKokugoEntry
from bot.entries.jitenon import JitenonYojiEntry
from bot.entries.jitenon import JitenonKotowazaEntry
from bot.entries.smk8 import Smk8Entry
from bot.entries.daijirin2 import Daijirin2Entry
from bot.entries.sankoku8 import Sankoku8Entry


def new_entry(target, page_id):
    entry_map = {
        Targets.JITENON_KOKUGO: JitenonKokugoEntry,
        Targets.JITENON_YOJI: JitenonYojiEntry,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaEntry,
        Targets.SMK8: Smk8Entry,
        Targets.DAIJIRIN2: Daijirin2Entry,
        Targets.SANKOKU8: Sankoku8Entry,
    }
    return entry_map[target](target, page_id)
bot/entries/jitenon_kokugo/entry.py (new file, 45 lines)
@@ -0,0 +1,45 @@
from bot.entries.base.jitenon_entry import JitenonEntry
import bot.entries.base.expressions as Expressions


class Entry(JitenonEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.example = ""
        self.alt_expression = ""
        self.antonym = ""
        self.attachments = ""
        self.compounds = ""
        self.related_words = ""

    def _get_column_map(self):
        return {
            "言葉": "expression",
            "読み方": "yomikata",
            "意味": "definition",
            "例文": "example",
            "別表記": "alt_expression",
            "対義語": "antonym",
            "活用": "attachments",
            "用例": "compounds",
            "類語": "related_words",
        }

    def _get_headwords(self):
        headwords = {}
        for reading in self.yomikata.split("・"):
            if reading not in headwords:
                headwords[reading] = []
            for expression in self.expression.split("・"):
                headwords[reading].append(expression)
            if self.alt_expression.strip() != "":
                for expression in self.alt_expression.split("・"):
                    headwords[reading].append(expression)
        return headwords

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)
bot/entries/jitenon_kotowaza/entry.py (new file, 35 lines)
@@ -0,0 +1,35 @@
from bot.entries.base.jitenon_entry import JitenonEntry
import bot.entries.base.expressions as Expressions


class Entry(JitenonEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.origin = ""
        self.example = ""
        self.related_expressions = []

    def _get_column_map(self):
        return {
            "言葉": "expression",
            "読み方": "yomikata",
            "意味": "definition",
            "異形": "other_forms",
            "出典": "origin",
            "例文": "example",
            "類句": "related_expressions",
        }

    def _get_headwords(self):
        if self.expression == "金棒引き・鉄棒引き":
            headwords = {
                "かなぼうひき": ["金棒引き", "鉄棒引き"]
            }
        else:
            headwords = super()._get_headwords()
        return headwords

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
bot/entries/jitenon_yoji/entry.py (new file, 27 lines)
@@ -0,0 +1,27 @@
import bot.entries.base.expressions as Expressions
from bot.entries.base.jitenon_entry import JitenonEntry


class Entry(JitenonEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.origin = ""
        self.kanken_level = ""
        self.category = ""
        self.related_expressions = []

    def _get_column_map(self):
        return {
            "四字熟語": "expression",
            "読み方": "yomikata",
            "意味": "definition",
            "異形": "other_forms",
            "出典": "origin",
            "漢検級": "kanken_level",
            "場面用途": "category",
            "類義語": "related_expressions",
        }

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
@@ -1,260 +0,0 @@
from bs4 import BeautifulSoup

import bot.entries.expressions as Expressions
import bot.soup as Soup
from bot.entries.entry import Entry
from bot.data import load_phrase_readings
from bot.entries.sankoku8_preprocess import preprocess_page


class _BaseSankoku8Entry(Entry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self._hyouki_name = "表記"
        self._midashi_name = None
        self._midashi_kana_name = None

    def get_global_identifier(self):
        parent_part = format(self.entry_id[0], '06')
        child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
        return f"@{self.target.value}-{parent_part}-{child_part}"

    def set_page(self, page):
        page = self.__decompose_subentries(page)
        self._page = page

    def get_page_soup(self):
        soup = BeautifulSoup(self._page, "xml")
        return soup

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        readings = self._find_readings(soup)
        expressions = self._find_expressions(soup)
        headwords = {}
        for reading in readings:
            headwords[reading] = []
        if len(readings) == 1:
            reading = readings[0]
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                headwords[reading].append(reading)
            for exp in expressions:
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        elif len(readings) > 1 and len(expressions) == 0:
            for reading in readings:
                headwords[reading].append(reading)
        elif len(readings) > 1 and len(expressions) == 1:
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            expression = expressions[0]
            for reading in readings:
                if expression not in headwords[reading]:
                    headwords[reading].append(expression)
        elif len(readings) > 1 and len(expressions) == len(readings):
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            for idx, reading in enumerate(readings):
                exp = expressions[idx]
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        else:
            raise Exception()  # shouldn't happen
        return headwords

    def _add_variant_expressions(self, headwords):
        for expressions in headwords.values():
            Expressions.add_variant_kanji(expressions)
            Expressions.add_fullwidth(expressions)
            Expressions.remove_iteration_mark(expressions)
            Expressions.add_iteration_mark(expressions)

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        for midashi in soup.find_all([self._midashi_name, "見出部要素"]):
            pos_group = midashi.find("品詞G")
            if pos_group is None:
                continue
            for tag in pos_group.find_all("a"):
                if tag.text not in self._part_of_speech_tags:
                    self._part_of_speech_tags.append(tag.text)
        return self._part_of_speech_tags

    def _find_expressions(self, soup):
        expressions = []
        for hyouki in soup.find_all(self._hyouki_name):
            for expression in parse_hyouki_soup(hyouki, [""]):
                expressions.append(expression)
        return expressions

    def _find_readings(self, soup):
        midasi_kana = soup.find(self._midashi_kana_name)
        readings = parse_hyouki_soup(midasi_kana, [""])
        return readings

    def __decompose_subentries(self, page):
        soup = BeautifulSoup(page, features="xml")
        subentry_parameters = [
            [Sankoku8ChildEntry, ["子項目"], self.children],
            [Sankoku8PhraseEntry, ["句項目"], self.phrases],
        ]
        for x in subentry_parameters:
            subentry_class, tags, subentry_list = x
            for tag in tags:
                tag_soup = soup.find(tag)
                while tag_soup is not None:
                    tag_soup.name = "項目"
                    subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
                    self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
                    subentry = subentry_class(self.target, subentry_id)
                    page = tag_soup.decode()
                    subentry.set_page(page)
                    subentry_list.append(subentry)
                    tag_soup.decompose()
                    tag_soup = soup.find(tag)
        return soup.decode()

    @staticmethod
    def id_string_to_entry_id(id_string):
        parts = id_string.split("-")
        if len(parts) == 1:
            return (int(parts[0]), 0)
        elif len(parts) == 2:
            # subentries have a hexadecimal part
            return (int(parts[0]), int(parts[1], 16))
        else:
            raise Exception(f"Invalid entry ID: {id_string}")

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク",
            "アクセント分節", "活用分節", "ルビG", "分書"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)


class Sankoku8Entry(_BaseSankoku8Entry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)
        self._midashi_name = "見出部"
        self._midashi_kana_name = "見出仮名"

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)


class Sankoku8ChildEntry(_BaseSankoku8Entry):
    def __init__(self, target, page_id):
        super().__init__(target, page_id)
        self._midashi_name = "子見出部"
        self._midashi_kana_name = "子見出仮名"


class Sankoku8PhraseEntry(_BaseSankoku8Entry):
    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        expressions = self._find_expressions(soup)
        readings = self._find_readings(soup)
        headwords = {}
        if len(expressions) != len(readings):
            raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}")
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        phrase_soup = soup.find("句表記")
        expressions = parse_hyouki_soup(phrase_soup, [""])
        return expressions

    def _find_readings(self, soup):
        reading_patterns = load_phrase_readings(self.target)
        reading_pattern = reading_patterns[self.entry_id]
        readings = parse_hyouki_pattern(reading_pattern)
        return readings


def parse_hyouki_soup(soup, base_exps):
    omitted_characters = [
        "/", "〈", "〉", "(", ")", "⦅", "⦆", ":", "…"
    ]
    exps = base_exps.copy()
    for child in soup.children:
        new_exps = []
        if child.name == "言換G":
            for alt in child.find_all("言換"):
                parts = parse_hyouki_soup(alt, [""])
                for exp in exps:
                    for part in parts:
                        new_exps.append(exp + part)
        elif child.name == "補足表記":
            alt1 = child.find("表記対象")
            alt2 = child.find("表記内容G")
            parts1 = parse_hyouki_soup(alt1, [""])
            parts2 = parse_hyouki_soup(alt2, [""])
            for exp in exps:
                for part in parts1:
                    new_exps.append(exp + part)
                for part in parts2:
                    new_exps.append(exp + part)
        elif child.name == "省略":
            parts = parse_hyouki_soup(child, [""])
            for exp in exps:
                new_exps.append(exp)
                for part in parts:
                    new_exps.append(exp + part)
        elif child.name is not None:
            new_exps = parse_hyouki_soup(child, exps)
        else:
            text = child.text
            for char in omitted_characters:
                text = text.replace(char, "")
            for exp in exps:
                new_exps.append(exp + text)
        exps = new_exps.copy()
    return exps


def parse_hyouki_pattern(pattern):
    replacements = {
        "(": "<省略>(",
        ")": ")</省略>",
        "{": "<補足表記><表記対象>",
        "・": "</表記対象><表記内容G>(<表記内容>",
        "}": "</表記内容>)</表記内容G></補足表記>",
        "〈": "<言換G>〈<言換>",
        "/": "</言換>/<言換>",
        "〉": "</言換>〉</言換G>",
        "⦅": "<補足表記><表記対象>",
        "\": "</表記対象><表記内容G>⦅<表記内容>",
        "⦆": "</表記内容>⦆</表記内容G></補足表記>",
    }
    markup = f"<span>{pattern}</span>"
    for key, val in replacements.items():
        markup = markup.replace(key, val)
    soup = BeautifulSoup(markup, "xml")
    hyouki_soup = soup.find("span")
    exps = parse_hyouki_soup(hyouki_soup, [""])
    return exps
104 bot/entries/sankoku8/base_entry.py Normal file
@@ -0,0 +1,104 @@
import bot.soup as Soup
from bot.entries.base.sanseido_entry import SanseidoEntry
from bot.entries.sankoku8.parse import parse_hyouki_soup


class BaseEntry(SanseidoEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self._hyouki_name = "表記"
        self._midashi_name = None
        self._midashi_kana_name = None

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        readings = self._find_readings(soup)
        expressions = self._find_expressions(soup)
        headwords = {}
        for reading in readings:
            headwords[reading] = []
        if len(readings) == 1:
            reading = readings[0]
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                headwords[reading].append(reading)
            for exp in expressions:
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        elif len(readings) > 1 and len(expressions) == 0:
            for reading in readings:
                headwords[reading].append(reading)
        elif len(readings) > 1 and len(expressions) == 1:
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            expression = expressions[0]
            for reading in readings:
                if expression not in headwords[reading]:
                    headwords[reading].append(expression)
        elif len(readings) > 1 and len(expressions) == len(readings):
            if soup.find(self._midashi_name).find(self._hyouki_name) is None:
                for reading in readings:
                    headwords[reading].append(reading)
            for idx, reading in enumerate(readings):
                exp = expressions[idx]
                if exp not in headwords[reading]:
                    headwords[reading].append(exp)
        else:
            raise Exception()  # shouldn't happen
        return headwords

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        for midashi in soup.find_all([self._midashi_name, "見出部要素"]):
            pos_group = midashi.find("品詞G")
            if pos_group is None:
                continue
            for tag in pos_group.find_all("a"):
                if tag.text not in self._part_of_speech_tags:
                    self._part_of_speech_tags.append(tag.text)
        return self._part_of_speech_tags

    def _find_expressions(self, soup):
        expressions = []
        for hyouki in soup.find_all(self._hyouki_name):
            self._fill_alts(hyouki)
            for expression in parse_hyouki_soup(hyouki, [""]):
                expressions.append(expression)
        return expressions

    def _find_readings(self, soup):
        midasi_kana = soup.find(self._midashi_kana_name)
        readings = parse_hyouki_soup(midasi_kana, [""])
        return readings

    def _get_subentry_parameters(self):
        from bot.entries.sankoku8.child_entry import ChildEntry
        from bot.entries.sankoku8.phrase_entry import PhraseEntry
        subentry_parameters = [
            [ChildEntry, ["子項目"], self.children],
            [PhraseEntry, ["句項目"], self.phrases],
        ]
        return subentry_parameters

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "語構成", "平板", "アクセント", "表外字マーク", "表外音訓マーク",
            "アクセント分節", "活用分節", "ルビG", "分書"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)

    @staticmethod
    def _fill_alts(soup):
        for img in soup.find_all("img"):
            if img.has_attr("alt"):
                img.string = img.attrs["alt"]
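Every branch of _get_headwords above builds the same structure: a dict that maps each reading to the expressions written under it. A hypothetical illustration of the returned shape (the entry data is invented):

# Hypothetical shape of the value returned by _get_headwords();
# the reading and the expressions are invented for illustration.
headwords = {
    "あおい": ["青い", "蒼い"],
}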
8 bot/entries/sankoku8/child_entry.py Normal file
@@ -0,0 +1,8 @@
from bot.entries.sankoku8.base_entry import BaseEntry


class ChildEntry(BaseEntry):
    def __init__(self, target, page_id):
        super().__init__(target, page_id)
        self._midashi_name = "子見出部"
        self._midashi_kana_name = "子見出仮名"
14 bot/entries/sankoku8/entry.py Normal file
@@ -0,0 +1,14 @@
from bot.entries.sankoku8.base_entry import BaseEntry
from bot.entries.sankoku8.preprocess import preprocess_page


class Entry(BaseEntry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)
        self._midashi_name = "見出部"
        self._midashi_kana_name = "見出仮名"

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)
65 bot/entries/sankoku8/parse.py Normal file
@@ -0,0 +1,65 @@
from bs4 import BeautifulSoup


def parse_hyouki_soup(soup, base_exps):
    omitted_characters = [
        "/", "〈", "〉", "(", ")", "⦅", "⦆", ":", "…"
    ]
    exps = base_exps.copy()
    for child in soup.children:
        new_exps = []
        if child.name == "言換G":
            for alt in child.find_all("言換"):
                parts = parse_hyouki_soup(alt, [""])
                for exp in exps:
                    for part in parts:
                        new_exps.append(exp + part)
        elif child.name == "補足表記":
            alt1 = child.find("表記対象")
            alt2 = child.find("表記内容G")
            parts1 = parse_hyouki_soup(alt1, [""])
            parts2 = parse_hyouki_soup(alt2, [""])
            for exp in exps:
                for part in parts1:
                    new_exps.append(exp + part)
                for part in parts2:
                    new_exps.append(exp + part)
        elif child.name == "省略":
            parts = parse_hyouki_soup(child, [""])
            for exp in exps:
                new_exps.append(exp)
                for part in parts:
                    new_exps.append(exp + part)
        elif child.name is not None:
            new_exps = parse_hyouki_soup(child, exps)
        else:
            text = child.text
            for char in omitted_characters:
                text = text.replace(char, "")
            for exp in exps:
                new_exps.append(exp + text)
        exps = new_exps.copy()
    return exps


def parse_hyouki_pattern(pattern):
    replacements = {
        "(": "<省略>(",
        ")": ")</省略>",
        "{": "<補足表記><表記対象>",
        "・": "</表記対象><表記内容G>(<表記内容>",
        "}": "</表記内容>)</表記内容G></補足表記>",
        "〈": "<言換G>〈<言換>",
        "/": "</言換>/<言換>",
        "〉": "</言換>〉</言換G>",
        "⦅": "<補足表記><表記対象>",
        "\": "</表記対象><表記内容G>⦅<表記内容>",
        "⦆": "</表記内容>⦆</表記内容G></補足表記>",
    }
    markup = f"<span>{pattern}</span>"
    for key, val in replacements.items():
        markup = markup.replace(key, val)
    soup = BeautifulSoup(markup, "xml")
    hyouki_soup = soup.find("span")
    exps = parse_hyouki_soup(hyouki_soup, [""])
    return exps
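A rough sketch of what parse_hyouki_pattern produces; the pattern string below is invented for illustration. Text wrapped in the 省略 markup is treated as optional, so every combination is generated:

# Hypothetical usage sketch; the pattern is made up for illustration.
# "(" and ")" mark optional text, so both short and long forms are produced.
exps = parse_hyouki_pattern("いた(し)かゆ(し)")
# exps == ["いたかゆ", "いたかゆし", "いたしかゆ", "いたしかゆし"]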
37 bot/entries/sankoku8/phrase_entry.py Normal file
@@ -0,0 +1,37 @@
from bot.data import load_phrase_readings
from bot.entries.sankoku8.base_entry import BaseEntry
from bot.entries.sankoku8.parse import parse_hyouki_soup
from bot.entries.sankoku8.parse import parse_hyouki_pattern


class PhraseEntry(BaseEntry):
    def get_part_of_speech_tags(self):
        # phrases do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        expressions = self._find_expressions(soup)
        readings = self._find_readings(soup)
        headwords = {}
        if len(expressions) != len(readings):
            raise Exception(f"{self.entry_id[0]}-{self.entry_id[1]}")
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        phrase_soup = soup.find("句表記")
        expressions = parse_hyouki_soup(phrase_soup, [""])
        return expressions

    def _find_readings(self, soup):
        reading_patterns = load_phrase_readings(self.target)
        reading_pattern = reading_patterns[self.entry_id]
        readings = parse_hyouki_pattern(reading_pattern)
        return readings
@@ -4,9 +4,17 @@ from bs4 import BeautifulSoup
from bot.data import get_adobe_glyph


__GAIJI = {
    "svg-gaiji/byan.svg": "𰻞",
    "svg-gaiji/G16EF.svg": "篡",
}


def preprocess_page(page):
    soup = BeautifulSoup(page, features="xml")
    __replace_glyph_codes(soup)
    __add_image_alt_text(soup)
    __replace_tatehyphen(soup)
    page = __strip_page(soup)
    return page

@@ -20,6 +28,21 @@ def __replace_glyph_codes(soup):
        geta.replace_with(glyph)


def __add_image_alt_text(soup):
    for img in soup.find_all("img"):
        if not img.has_attr("src"):
            continue
        src = img.attrs["src"]
        if src in __GAIJI:
            img.attrs["alt"] = __GAIJI[src]


def __replace_tatehyphen(soup):
    for img in soup.find_all("img", {"src": "svg-gaiji/tatehyphen.svg"}):
        img.string = "−"
        img.unwrap()


def __strip_page(soup):
    koumoku = soup.find(["項目"])
    if koumoku is not None:
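Together with BaseEntry._fill_alts in base_entry.py, the new __add_image_alt_text step lets glyphs that exist only as SVG images in the source data survive headword parsing as plain text. A minimal sketch of the idea (the XML fragment below is invented for illustration):

# Illustrative sketch only; the <表記> fragment below is made up.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<表記><img src="svg-gaiji/byan.svg"/>麺</表記>', "xml")
for img in soup.find_all("img"):   # what __add_image_alt_text does
    if img.attrs.get("src") == "svg-gaiji/byan.svg":
        img.attrs["alt"] = "𰻞"
for img in soup.find_all("img"):   # what BaseEntry._fill_alts does
    if img.has_attr("alt"):
        img.string = img.attrs["alt"]
print(soup.find("表記").text)  # -> "𰻞麺"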
@ -1,221 +0,0 @@
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
import bot.entries.expressions as Expressions
|
|
||||||
import bot.soup as Soup
|
|
||||||
from bot.data import load_phrase_readings
|
|
||||||
from bot.entries.entry import Entry
|
|
||||||
from bot.entries.smk8_preprocess import preprocess_page
|
|
||||||
|
|
||||||
|
|
||||||
class _BaseSmk8Entry(Entry):
|
|
||||||
def __init__(self, target, entry_id):
|
|
||||||
super().__init__(target, entry_id)
|
|
||||||
self.children = []
|
|
||||||
self.phrases = []
|
|
||||||
self.kanjis = []
|
|
||||||
|
|
||||||
def get_global_identifier(self):
|
|
||||||
parent_part = format(self.entry_id[0], '06')
|
|
||||||
child_part = hex(self.entry_id[1]).lstrip('0x').zfill(4).upper()
|
|
||||||
return f"@{self.target.value}-{parent_part}-{child_part}"
|
|
||||||
|
|
||||||
def set_page(self, page):
|
|
||||||
page = self.__decompose_subentries(page)
|
|
||||||
self._page = page
|
|
||||||
|
|
||||||
def get_page_soup(self):
|
|
||||||
soup = BeautifulSoup(self._page, "xml")
|
|
||||||
return soup
|
|
||||||
|
|
||||||
def get_part_of_speech_tags(self):
|
|
||||||
if self._part_of_speech_tags is not None:
|
|
||||||
return self._part_of_speech_tags
|
|
||||||
self._part_of_speech_tags = []
|
|
||||||
soup = self.get_page_soup()
|
|
||||||
headword_info = soup.find("見出要素")
|
|
||||||
if headword_info is None:
|
|
||||||
return self._part_of_speech_tags
|
|
||||||
for tag in headword_info.find_all("品詞M"):
|
|
||||||
if tag.text not in self._part_of_speech_tags:
|
|
||||||
self._part_of_speech_tags.append(tag.text)
|
|
||||||
return self._part_of_speech_tags
|
|
||||||
|
|
||||||
def _add_variant_expressions(self, headwords):
|
|
||||||
for expressions in headwords.values():
|
|
||||||
Expressions.add_variant_kanji(expressions)
|
|
||||||
Expressions.add_fullwidth(expressions)
|
|
||||||
Expressions.remove_iteration_mark(expressions)
|
|
||||||
Expressions.add_iteration_mark(expressions)
|
|
||||||
|
|
||||||
def _find_reading(self, soup):
|
|
||||||
midasi_kana = soup.find("見出仮名")
|
|
||||||
reading = midasi_kana.text
|
|
||||||
for x in [" ", "・"]:
|
|
||||||
reading = reading.replace(x, "")
|
|
||||||
return reading
|
|
||||||
|
|
||||||
def _find_expressions(self, soup):
|
|
||||||
clean_expressions = []
|
|
||||||
for expression in soup.find_all("標準表記"):
|
|
||||||
clean_expression = self._clean_expression(expression.text)
|
|
||||||
clean_expressions.append(clean_expression)
|
|
||||||
expressions = Expressions.expand_abbreviation_list(clean_expressions)
|
|
||||||
return expressions
|
|
||||||
|
|
||||||
def __decompose_subentries(self, page):
|
|
||||||
soup = BeautifulSoup(page, features="xml")
|
|
||||||
subentry_parameters = [
|
|
||||||
[Smk8ChildEntry, ["子項目F", "子項目"], self.children],
|
|
||||||
[Smk8PhraseEntry, ["句項目F", "句項目"], self.phrases],
|
|
||||||
[Smk8KanjiEntry, ["造語成分項目"], self.kanjis],
|
|
||||||
]
|
|
||||||
for x in subentry_parameters:
|
|
||||||
subentry_class, tags, subentry_list = x
|
|
||||||
for tag in tags:
|
|
||||||
tag_soup = soup.find(tag)
|
|
||||||
while tag_soup is not None:
|
|
||||||
tag_soup.name = "項目"
|
|
||||||
subentry_id = self.id_string_to_entry_id(tag_soup.attrs["id"])
|
|
||||||
self.SUBENTRY_ID_TO_ENTRY_ID[subentry_id] = self.entry_id
|
|
||||||
subentry = subentry_class(self.target, subentry_id)
|
|
||||||
page = tag_soup.decode()
|
|
||||||
subentry.set_page(page)
|
|
||||||
subentry_list.append(subentry)
|
|
||||||
tag_soup.decompose()
|
|
||||||
tag_soup = soup.find(tag)
|
|
||||||
return soup.decode()
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def id_string_to_entry_id(id_string):
|
|
||||||
parts = id_string.split("-")
|
|
||||||
if len(parts) == 1:
|
|
||||||
return (int(parts[0]), 0)
|
|
||||||
elif len(parts) == 2:
|
|
||||||
# subentries have a hexadecimal part
|
|
||||||
return (int(parts[0]), int(parts[1], 16))
|
|
||||||
else:
|
|
||||||
raise Exception(f"Invalid entry ID: {id_string}")
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _delete_unused_nodes(soup):
|
|
||||||
"""Remove extra markup elements that appear in the entry
|
|
||||||
headword line which are not part of the entry headword"""
|
|
||||||
unused_nodes = [
|
|
||||||
"表音表記", "表外音訓マーク", "表外字マーク", "ルビG"
|
|
||||||
]
|
|
||||||
for name in unused_nodes:
|
|
||||||
Soup.delete_soup_nodes(soup, name)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _clean_expression(expression):
|
|
||||||
for x in ["〈", "〉", "{", "}", "…", " "]:
|
|
||||||
expression = expression.replace(x, "")
|
|
||||||
return expression
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _fill_alts(soup):
|
|
||||||
for el in soup.find_all(["親見出仮名", "親見出表記"]):
|
|
||||||
el.string = el.attrs["alt"]
|
|
||||||
for gaiji in soup.find_all("外字"):
|
|
||||||
gaiji.string = gaiji.img.attrs["alt"]
|
|
||||||
|
|
||||||
|
|
||||||
class Smk8Entry(_BaseSmk8Entry):
|
|
||||||
def __init__(self, target, page_id):
|
|
||||||
entry_id = (page_id, 0)
|
|
||||||
super().__init__(target, entry_id)
|
|
||||||
|
|
||||||
def set_page(self, page):
|
|
||||||
page = preprocess_page(page)
|
|
||||||
super().set_page(page)
|
|
||||||
|
|
||||||
def _get_headwords(self):
|
|
||||||
soup = self.get_page_soup()
|
|
||||||
self._delete_unused_nodes(soup)
|
|
||||||
self._fill_alts(soup)
|
|
||||||
reading = self._find_reading(soup)
|
|
||||||
expressions = []
|
|
||||||
if soup.find("見出部").find("標準表記") is None:
|
|
||||||
expressions.append(reading)
|
|
||||||
for expression in self._find_expressions(soup):
|
|
||||||
if expression not in expressions:
|
|
||||||
expressions.append(expression)
|
|
||||||
headwords = {reading: expressions}
|
|
||||||
return headwords
|
|
||||||
|
|
||||||
|
|
||||||
class Smk8ChildEntry(_BaseSmk8Entry):
|
|
||||||
def _get_headwords(self):
|
|
||||||
soup = self.get_page_soup()
|
|
||||||
self._delete_unused_nodes(soup)
|
|
||||||
self._fill_alts(soup)
|
|
||||||
reading = self._find_reading(soup)
|
|
||||||
expressions = []
|
|
||||||
if soup.find("子見出部").find("標準表記") is None:
|
|
||||||
expressions.append(reading)
|
|
||||||
for expression in self._find_expressions(soup):
|
|
||||||
if expression not in expressions:
|
|
||||||
expressions.append(expression)
|
|
||||||
headwords = {reading: expressions}
|
|
||||||
return headwords
|
|
||||||
|
|
||||||
|
|
||||||
class Smk8PhraseEntry(_BaseSmk8Entry):
|
|
||||||
def __init__(self, target, entry_id):
|
|
||||||
super().__init__(target, entry_id)
|
|
||||||
self.__phrase_readings = load_phrase_readings(self.target)
|
|
||||||
|
|
||||||
def get_part_of_speech_tags(self):
|
|
||||||
# phrases do not contain these tags
|
|
||||||
return []
|
|
||||||
|
|
||||||
def _get_headwords(self):
|
|
||||||
soup = self.get_page_soup()
|
|
||||||
headwords = {}
|
|
||||||
expressions = self._find_expressions(soup)
|
|
||||||
readings = self._find_readings()
|
|
||||||
for idx, expression in enumerate(expressions):
|
|
||||||
reading = readings[idx]
|
|
||||||
if reading in headwords:
|
|
||||||
headwords[reading].append(expression)
|
|
||||||
else:
|
|
||||||
headwords[reading] = [expression]
|
|
||||||
return headwords
|
|
||||||
|
|
||||||
def _find_expressions(self, soup):
|
|
||||||
self._delete_unused_nodes(soup)
|
|
||||||
self._fill_alts(soup)
|
|
||||||
text = soup.find("標準表記").text
|
|
||||||
text = self._clean_expression(text)
|
|
||||||
alternatives = Expressions.expand_smk_alternatives(text)
|
|
||||||
expressions = []
|
|
||||||
for alt in alternatives:
|
|
||||||
for exp in Expressions.expand_abbreviation(alt):
|
|
||||||
expressions.append(exp)
|
|
||||||
return expressions
|
|
||||||
|
|
||||||
def _find_readings(self):
|
|
||||||
text = self.__phrase_readings[self.entry_id]
|
|
||||||
alternatives = Expressions.expand_smk_alternatives(text)
|
|
||||||
readings = []
|
|
||||||
for alt in alternatives:
|
|
||||||
for reading in Expressions.expand_abbreviation(alt):
|
|
||||||
readings.append(reading)
|
|
||||||
return readings
|
|
||||||
|
|
||||||
|
|
||||||
class Smk8KanjiEntry(_BaseSmk8Entry):
|
|
||||||
def _get_headwords(self):
|
|
||||||
soup = self.get_page_soup()
|
|
||||||
self._delete_unused_nodes(soup)
|
|
||||||
self._fill_alts(soup)
|
|
||||||
reading = self.__get_parent_reading()
|
|
||||||
expressions = self._find_expressions(soup)
|
|
||||||
headwords = {reading: expressions}
|
|
||||||
return headwords
|
|
||||||
|
|
||||||
def __get_parent_reading(self):
|
|
||||||
parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
|
|
||||||
parent = self.ID_TO_ENTRY[parent_id]
|
|
||||||
reading = parent.get_first_reading()
|
|
||||||
return reading
|
|
73 bot/entries/smk8/base_entry.py Normal file
@@ -0,0 +1,73 @@
import bot.soup as Soup
import bot.entries.base.expressions as Expressions
from bot.entries.base.sanseido_entry import SanseidoEntry


class BaseEntry(SanseidoEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.children = []
        self.phrases = []
        self.kanjis = []

    def get_part_of_speech_tags(self):
        if self._part_of_speech_tags is not None:
            return self._part_of_speech_tags
        self._part_of_speech_tags = []
        soup = self.get_page_soup()
        headword_info = soup.find("見出要素")
        if headword_info is None:
            return self._part_of_speech_tags
        for tag in headword_info.find_all("品詞M"):
            if tag.text not in self._part_of_speech_tags:
                self._part_of_speech_tags.append(tag.text)
        return self._part_of_speech_tags

    def _find_reading(self, soup):
        midasi_kana = soup.find("見出仮名")
        reading = midasi_kana.text
        for x in [" ", "・"]:
            reading = reading.replace(x, "")
        return reading

    def _find_expressions(self, soup):
        clean_expressions = []
        for expression in soup.find_all("標準表記"):
            clean_expression = self._clean_expression(expression.text)
            clean_expressions.append(clean_expression)
        expressions = Expressions.expand_abbreviation_list(clean_expressions)
        return expressions

    def _get_subentry_parameters(self):
        from bot.entries.smk8.child_entry import ChildEntry
        from bot.entries.smk8.phrase_entry import PhraseEntry
        from bot.entries.smk8.kanji_entry import KanjiEntry
        subentry_parameters = [
            [ChildEntry, ["子項目F", "子項目"], self.children],
            [PhraseEntry, ["句項目F", "句項目"], self.phrases],
            [KanjiEntry, ["造語成分項目"], self.kanjis],
        ]
        return subentry_parameters

    @staticmethod
    def _delete_unused_nodes(soup):
        """Remove extra markup elements that appear in the entry
        headword line which are not part of the entry headword"""
        unused_nodes = [
            "表音表記", "表外音訓マーク", "表外字マーク", "ルビG"
        ]
        for name in unused_nodes:
            Soup.delete_soup_nodes(soup, name)

    @staticmethod
    def _clean_expression(expression):
        for x in ["〈", "〉", "{", "}", "…", " "]:
            expression = expression.replace(x, "")
        return expression

    @staticmethod
    def _fill_alts(soup):
        for elm in soup.find_all(["親見出仮名", "親見出表記"]):
            elm.string = elm.attrs["alt"]
        for gaiji in soup.find_all("外字"):
            gaiji.string = gaiji.img.attrs["alt"]
17 bot/entries/smk8/child_entry.py Normal file
@@ -0,0 +1,17 @@
from bot.entries.smk8.base_entry import BaseEntry


class ChildEntry(BaseEntry):
    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        reading = self._find_reading(soup)
        expressions = []
        if soup.find("子見出部").find("標準表記") is None:
            expressions.append(reading)
        for expression in self._find_expressions(soup):
            if expression not in expressions:
                expressions.append(expression)
        headwords = {reading: expressions}
        return headwords
26 bot/entries/smk8/entry.py Normal file
@@ -0,0 +1,26 @@
from bot.entries.smk8.base_entry import BaseEntry
from bot.entries.smk8.preprocess import preprocess_page


class Entry(BaseEntry):
    def __init__(self, target, page_id):
        entry_id = (page_id, 0)
        super().__init__(target, entry_id)

    def set_page(self, page):
        page = preprocess_page(page)
        super().set_page(page)

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        reading = self._find_reading(soup)
        expressions = []
        if soup.find("見出部").find("標準表記") is None:
            expressions.append(reading)
        for expression in self._find_expressions(soup):
            if expression not in expressions:
                expressions.append(expression)
        headwords = {reading: expressions}
        return headwords
22 bot/entries/smk8/kanji_entry.py Normal file
@@ -0,0 +1,22 @@
from bot.entries.smk8.base_entry import BaseEntry


class KanjiEntry(BaseEntry):
    def get_part_of_speech_tags(self):
        # kanji entries do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        reading = self.__get_parent_reading()
        expressions = self._find_expressions(soup)
        headwords = {reading: expressions}
        return headwords

    def __get_parent_reading(self):
        parent_id = self.SUBENTRY_ID_TO_ENTRY_ID[self.entry_id]
        parent = self.ID_TO_ENTRY[parent_id]
        reading = parent.get_first_reading()
        return reading
64 bot/entries/smk8/phrase_entry.py Normal file
@@ -0,0 +1,64 @@
import re

import bot.entries.base.expressions as Expressions
from bot.data import load_phrase_readings
from bot.entries.smk8.base_entry import BaseEntry


class PhraseEntry(BaseEntry):
    def __init__(self, target, entry_id):
        super().__init__(target, entry_id)
        self.__phrase_readings = load_phrase_readings(self.target)

    def get_part_of_speech_tags(self):
        # phrase entries do not contain these tags
        return []

    def _get_headwords(self):
        soup = self.get_page_soup()
        headwords = {}
        expressions = self._find_expressions(soup)
        readings = self._find_readings()
        for idx, expression in enumerate(expressions):
            reading = readings[idx]
            if reading in headwords:
                headwords[reading].append(expression)
            else:
                headwords[reading] = [expression]
        return headwords

    def _find_expressions(self, soup):
        self._delete_unused_nodes(soup)
        self._fill_alts(soup)
        text = soup.find("標準表記").text
        text = self._clean_expression(text)
        alternatives = parse_phrase(text)
        expressions = []
        for alt in alternatives:
            for exp in Expressions.expand_abbreviation(alt):
                expressions.append(exp)
        return expressions

    def _find_readings(self):
        text = self.__phrase_readings[self.entry_id]
        alternatives = parse_phrase(text)
        readings = []
        for alt in alternatives:
            for reading in Expressions.expand_abbreviation(alt):
                readings.append(reading)
        return readings


def parse_phrase(text):
    """Return a list of strings described by △ notation."""
    match = re.search(r"△([^(]+)(([^(]+))", text)
    if match is None:
        return [text]
    alt_parts = [match.group(1)]
    for alt_part in match.group(2).split("・"):
        alt_parts.append(alt_part)
    alts = []
    for alt_part in alt_parts:
        alt_exp = re.sub(r"△[^(]+([^(]+)", alt_part, text)
        alts.append(alt_exp)
    return alts
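For reference, a sketch of the △ notation that parse_phrase expands. The phrase below is invented, and the parentheses in the real data are assumed to be the fullwidth （） targeted by the regular expressions above:

# Hypothetical sketch; the phrase is invented and the parentheses are assumed
# to be the fullwidth （） used in the dictionary data.
alts = parse_phrase("△腹（腸）が煮える")
# alts == ["腹が煮える", "腸が煮える"]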
@@ -6,8 +6,8 @@ from bot.data import get_adobe_glyph


__GAIJI = {
    "gaiji/5350.svg": "卐",
    "gaiji/62cb.svg": "抛",
    "gaiji/62cb.svg": "拋",
    "gaiji/7be1.svg": "簒",
    "gaiji/7be1.svg": "篡",
}
37 bot/factory.py Normal file
@@ -0,0 +1,37 @@
import importlib


def new_crawler(target):
    module_path = f"bot.crawlers.{target.name.lower()}"
    module = importlib.import_module(module_path)
    return module.Crawler(target)


def new_entry(target, page_id):
    module_path = f"bot.entries.{target.name.lower()}.entry"
    module = importlib.import_module(module_path)
    return module.Entry(target, page_id)


def new_yomichan_exporter(target):
    module_path = f"bot.yomichan.exporters.{target.name.lower()}"
    module = importlib.import_module(module_path)
    return module.Exporter(target)


def new_yomichan_terminator(target):
    module_path = f"bot.yomichan.terms.{target.name.lower()}"
    module = importlib.import_module(module_path)
    return module.Terminator(target)


def new_mdict_exporter(target):
    module_path = f"bot.mdict.exporters.{target.name.lower()}"
    module = importlib.import_module(module_path)
    return module.Exporter(target)


def new_mdict_terminator(target):
    module_path = f"bot.mdict.terms.{target.name.lower()}"
    module = importlib.import_module(module_path)
    return module.Terminator(target)
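A rough sketch of how these dynamic-import factories are meant to be called in place of the old hand-written factory maps; the snippet assumes the Targets enum from bot.targets and an arbitrary page ID:

# Illustrative only; Targets.SMK8 comes from bot.targets, 1234 is an arbitrary page ID.
from bot.targets import Targets
from bot.factory import new_entry, new_mdict_exporter

entry = new_entry(Targets.SMK8, 1234)        # imports bot.entries.smk8.entry, returns its Entry
exporter = new_mdict_exporter(Targets.SMK8)  # imports bot.mdict.exporters.smk8, returns its Exporter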
@@ -1,20 +1,19 @@
# pylint: disable=too-few-public-methods

import subprocess
import os
import shutil
import subprocess
from abc import ABC, abstractmethod
from pathlib import Path
from datetime import datetime
from platformdirs import user_documents_dir, user_cache_dir

from bot.mdict.terms.factory import new_terminator
from bot.time import timestamp
from bot.factory import new_mdict_terminator


class Exporter(ABC):
class BaseExporter(ABC):
    def __init__(self, target):
        self._target = target
        self._terminator = new_terminator(target)
        self._terminator = new_mdict_terminator(target)
        self._build_dir = None
        self._build_media_dir = None
        self._description_file = None

@@ -34,7 +33,7 @@ class Exporter(ABC):
            return self._build_dir
        cache_dir = user_cache_dir("jitenbot")
        build_directory = os.path.join(cache_dir, "mdict_build")
        print(f"Initializing build directory `{build_directory}`")
        print(f"{timestamp()} Initializing build directory `{build_directory}`")
        if Path(build_directory).is_dir():
            shutil.rmtree(build_directory)
        os.makedirs(build_directory)

@@ -45,7 +44,7 @@ class Exporter(ABC):
        build_dir = self._get_build_dir()
        build_media_dir = os.path.join(build_dir, self._target.value)
        if media_dir is not None:
            print("Copying media files to build directory...")
            print(f"{timestamp()} Copying media files to build directory...")
            shutil.copytree(media_dir, build_media_dir)
        else:
            os.makedirs(build_media_dir)

@@ -71,7 +70,7 @@ class Exporter(ABC):
    def _write_mdx_file(self, entries):
        terms = self._get_terms(entries)
        print(f"Exporting {len(terms)} Mdict keys...")
        print(f"{timestamp()} Exporting {len(terms)} Mdict keys...")
        out_dir = self._get_out_dir()
        out_file = os.path.join(out_dir, f"{self._target.value}.mdx")
        params = [

@@ -87,7 +86,7 @@ class Exporter(ABC):
        terms = []
        entries_len = len(entries)
        for idx, entry in enumerate(entries):
            update = f"Creating Mdict terms for entry {idx+1}/{entries_len}"
            update = f"\tCreating MDict terms for entry {idx+1}/{entries_len}"
            print(update, end='\r', flush=True)
            new_terms = self._terminator.make_terms(entry)
            for term in new_terms:

@@ -126,7 +125,7 @@ class Exporter(ABC):
            return self._out_dir
        out_dir = os.path.join(
            user_documents_dir(), "jitenbot", "mdict", self._target.value)
        print(f"Initializing output directory `{out_dir}`")
        print(f"{timestamp()} Initializing output directory `{out_dir}`")
        if Path(out_dir).is_dir():
            shutil.rmtree(out_dir)
        os.makedirs(out_dir)

@@ -168,58 +167,8 @@ class Exporter(ABC):
    @abstractmethod
    def _get_revision(self, entries):
        pass
        raise NotImplementedError

    @abstractmethod
    def _get_attribution(self, entries):
        pass
        raise NotImplementedError


class _JitenonExporter(Exporter):
    def _get_revision(self, entries):
        modified_date = None
        for entry in entries:
            if modified_date is None or entry.modified_date > modified_date:
                modified_date = entry.modified_date
        revision = modified_date.strftime("%Y年%m月%d日閲覧")
        return revision

    def _get_attribution(self, entries):
        modified_date = None
        for entry in entries:
            if modified_date is None or entry.modified_date > modified_date:
                attribution = entry.attribution
        return attribution


class JitenonKokugoExporter(_JitenonExporter):
    pass


class JitenonYojiExporter(_JitenonExporter):
    pass


class JitenonKotowazaExporter(_JitenonExporter):
    pass


class _MonokakidoExporter(Exporter):
    def _get_revision(self, entries):
        timestamp = datetime.now().strftime("%Y年%m月%d日作成")
        return timestamp


class Smk8Exporter(_MonokakidoExporter):
    def _get_attribution(self, entries):
        return "© Sanseido Co., LTD. 2020"


class Daijirin2Exporter(_MonokakidoExporter):
    def _get_attribution(self, entries):
        return "© Sanseido Co., LTD. 2019"


class Sankoku8Exporter(_MonokakidoExporter):
    def _get_attribution(self, entries):
        return "© Sanseido Co., LTD. 2021"
18 bot/mdict/exporters/base/jitenon.py Normal file
@@ -0,0 +1,18 @@
from bot.mdict.exporters.base.exporter import BaseExporter


class JitenonExporter(BaseExporter):
    def _get_revision(self, entries):
        modified_date = None
        for entry in entries:
            if modified_date is None or entry.modified_date > modified_date:
                modified_date = entry.modified_date
        revision = modified_date.strftime("%Y年%m月%d日閲覧")
        return revision

    def _get_attribution(self, entries):
        modified_date = None
        for entry in entries:
            if modified_date is None or entry.modified_date > modified_date:
                attribution = entry.attribution
        return attribution
8 bot/mdict/exporters/base/monokakido.py Normal file
@@ -0,0 +1,8 @@
from datetime import datetime
from bot.mdict.exporters.base.exporter import BaseExporter


class MonokakidoExporter(BaseExporter):
    def _get_revision(self, entries):
        timestamp = datetime.now().strftime("%Y年%m月%d日作成")
        return timestamp
6 bot/mdict/exporters/daijirin2.py Normal file
@@ -0,0 +1,6 @@
from bot.mdict.exporters.base.monokakido import MonokakidoExporter


class Exporter(MonokakidoExporter):
    def _get_attribution(self, entries):
        return "© Sanseido Co., LTD. 2019"
@ -1,20 +0,0 @@
|
||||||
from bot.targets import Targets
|
|
||||||
|
|
||||||
from bot.mdict.exporters.export import JitenonKokugoExporter
|
|
||||||
from bot.mdict.exporters.export import JitenonYojiExporter
|
|
||||||
from bot.mdict.exporters.export import JitenonKotowazaExporter
|
|
||||||
from bot.mdict.exporters.export import Smk8Exporter
|
|
||||||
from bot.mdict.exporters.export import Daijirin2Exporter
|
|
||||||
from bot.mdict.exporters.export import Sankoku8Exporter
|
|
||||||
|
|
||||||
|
|
||||||
def new_mdict_exporter(target):
|
|
||||||
exporter_map = {
|
|
||||||
Targets.JITENON_KOKUGO: JitenonKokugoExporter,
|
|
||||||
Targets.JITENON_YOJI: JitenonYojiExporter,
|
|
||||||
Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter,
|
|
||||||
Targets.SMK8: Smk8Exporter,
|
|
||||||
Targets.DAIJIRIN2: Daijirin2Exporter,
|
|
||||||
Targets.SANKOKU8: Sankoku8Exporter,
|
|
||||||
}
|
|
||||||
return exporter_map[target](target)
|
|
5 bot/mdict/exporters/jitenon_kokugo.py Normal file
@@ -0,0 +1,5 @@
from bot.mdict.exporters.base.jitenon import JitenonExporter


class Exporter(JitenonExporter):
    pass
5 bot/mdict/exporters/jitenon_kotowaza.py Normal file
@@ -0,0 +1,5 @@
from bot.mdict.exporters.base.jitenon import JitenonExporter


class Exporter(JitenonExporter):
    pass
5 bot/mdict/exporters/jitenon_yoji.py Normal file
@@ -0,0 +1,5 @@
from bot.mdict.exporters.base.jitenon import JitenonExporter


class Exporter(JitenonExporter):
    pass
6 bot/mdict/exporters/sankoku8.py Normal file
@@ -0,0 +1,6 @@
from bot.mdict.exporters.base.monokakido import MonokakidoExporter


class Exporter(MonokakidoExporter):
    def _get_attribution(self, entries):
        return "© Sanseido Co., LTD. 2021"
6 bot/mdict/exporters/smk8.py Normal file
@@ -0,0 +1,6 @@
from bot.mdict.exporters.base.monokakido import MonokakidoExporter


class Exporter(MonokakidoExporter):
    def _get_attribution(self, entries):
        return "© Sanseido Co., LTD. 2020"
20 bot/mdict/terms/base/jitenon.py Normal file
@@ -0,0 +1,20 @@
from bot.mdict.terms.base.terminator import BaseTerminator


class JitenonTerminator(BaseTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = None

    def _glossary(self, entry):
        if entry.entry_id in self._glossary_cache:
            return self._glossary_cache[entry.entry_id]
        glossary = self._glossary_maker.make_glossary(entry, self._media_dir)
        self._glossary_cache[entry.entry_id] = glossary
        return glossary

    def _link_glossary_parameters(self, entry):
        return []

    def _subentry_lists(self, entry):
        return []
@@ -2,7 +2,7 @@ import re
from abc import abstractmethod, ABC


class Terminator(ABC):
class BaseTerminator(ABC):
    def __init__(self, target):
        self._target = target
        self._glossary_cache = {}

@@ -72,12 +72,12 @@ class Terminator(ABC):
    @abstractmethod
    def _glossary(self, entry):
        pass
        raise NotImplementedError

    @abstractmethod
    def _link_glossary_parameters(self, entry):
        pass
        raise NotImplementedError

    @abstractmethod
    def _subentry_lists(self, entry):
        pass
        raise NotImplementedError
@@ -1,8 +1,8 @@
from bot.mdict.terms.terminator import Terminator
from bot.mdict.terms.base.terminator import BaseTerminator
from bot.mdict.glossary.daijirin2 import make_glossary


class Daijirin2Terminator(Terminator):
class Terminator(BaseTerminator):
    def _glossary(self, entry):
        if entry.entry_id in self._glossary_cache:
            return self._glossary_cache[entry.entry_id]
@@ -1,20 +0,0 @@
from bot.targets import Targets

from bot.mdict.terms.jitenon import JitenonKokugoTerminator
from bot.mdict.terms.jitenon import JitenonYojiTerminator
from bot.mdict.terms.jitenon import JitenonKotowazaTerminator
from bot.mdict.terms.smk8 import Smk8Terminator
from bot.mdict.terms.daijirin2 import Daijirin2Terminator
from bot.mdict.terms.sankoku8 import Sankoku8Terminator


def new_terminator(target):
    terminator_map = {
        Targets.JITENON_KOKUGO: JitenonKokugoTerminator,
        Targets.JITENON_YOJI: JitenonYojiTerminator,
        Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator,
        Targets.SMK8: Smk8Terminator,
        Targets.DAIJIRIN2: Daijirin2Terminator,
        Targets.SANKOKU8: Sankoku8Terminator,
    }
    return terminator_map[target](target)
@@ -1,42 +0,0 @@
from bot.mdict.terms.terminator import Terminator

from bot.mdict.glossary.jitenon import JitenonKokugoGlossary
from bot.mdict.glossary.jitenon import JitenonYojiGlossary
from bot.mdict.glossary.jitenon import JitenonKotowazaGlossary


class JitenonTerminator(Terminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = None

    def _glossary(self, entry):
        if entry.entry_id in self._glossary_cache:
            return self._glossary_cache[entry.entry_id]
        glossary = self._glossary_maker.make_glossary(entry, self._media_dir)
        self._glossary_cache[entry.entry_id] = glossary
        return glossary

    def _link_glossary_parameters(self, entry):
        return []

    def _subentry_lists(self, entry):
        return []


class JitenonKokugoTerminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonKokugoGlossary()


class JitenonYojiTerminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonYojiGlossary()


class JitenonKotowazaTerminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonKotowazaGlossary()
8 bot/mdict/terms/jitenon_kokugo.py Normal file
@@ -0,0 +1,8 @@
from bot.mdict.terms.base.jitenon import JitenonTerminator
from bot.mdict.glossary.jitenon import JitenonKokugoGlossary


class Terminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonKokugoGlossary()
8 bot/mdict/terms/jitenon_kotowaza.py Normal file
@@ -0,0 +1,8 @@
from bot.mdict.terms.base.jitenon import JitenonTerminator
from bot.mdict.glossary.jitenon import JitenonKotowazaGlossary


class Terminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonKotowazaGlossary()
8 bot/mdict/terms/jitenon_yoji.py Normal file
@@ -0,0 +1,8 @@
from bot.mdict.terms.base.jitenon import JitenonTerminator
from bot.mdict.glossary.jitenon import JitenonYojiGlossary


class Terminator(JitenonTerminator):
    def __init__(self, target):
        super().__init__(target)
        self._glossary_maker = JitenonYojiGlossary()
@@ -1,8 +1,8 @@
from bot.mdict.terms.terminator import Terminator
from bot.mdict.terms.base.terminator import BaseTerminator
from bot.mdict.glossary.sankoku8 import make_glossary


class Sankoku8Terminator(Terminator):
class Terminator(BaseTerminator):
    def _glossary(self, entry):
        if entry.entry_id in self._glossary_cache:
            return self._glossary_cache[entry.entry_id]
@@ -1,8 +1,8 @@
from bot.mdict.terms.terminator import Terminator
from bot.mdict.terms.base.terminator import BaseTerminator
from bot.mdict.glossary.smk8 import make_glossary


class Smk8Terminator(Terminator):
class Terminator(BaseTerminator):
    def _glossary(self, entry):
        if entry.entry_id in self._glossary_cache:
            return self._glossary_cache[entry.entry_id]
5 bot/time.py Normal file
@@ -0,0 +1,5 @@
import time


def timestamp():
    return time.strftime('%X')
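time.strftime('%X') is the locale's current clock time, so the log lines updated below gain a prefix along these lines (output is illustrative):

# Illustrative only; actual output depends on locale and the current time.
from bot.time import timestamp
print(f"{timestamp()} Copying media files to build directory...")
# e.g. "14:03:27 Copying media files to build directory..."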
@@ -1,24 +1,23 @@
# pylint: disable=too-few-public-methods

import json
import os
import shutil
import copy
from pathlib import Path
from datetime import datetime
from abc import ABC, abstractmethod
from platformdirs import user_documents_dir, user_cache_dir

import fastjsonschema
from platformdirs import user_documents_dir, user_cache_dir

from bot.time import timestamp
from bot.data import load_yomichan_metadata
from bot.yomichan.terms.factory import new_terminator
from bot.data import load_yomichan_term_schema
from bot.factory import new_yomichan_terminator


class Exporter(ABC):
class BaseExporter(ABC):
    def __init__(self, target):
        self._target = target
        self._terminator = new_terminator(target)
        self._terminator = new_yomichan_terminator(target)
        self._build_dir = None
        self._terms_per_file = 2000

@@ -36,18 +35,18 @@ class Exporter(ABC):
    @abstractmethod
    def _get_revision(self, entries):
        pass
        raise NotImplementedError

    @abstractmethod
    def _get_attribution(self, entries):
        pass
        raise NotImplementedError

    def _get_build_dir(self):
        if self._build_dir is not None:
            return self._build_dir
        cache_dir = user_cache_dir("jitenbot")
        build_directory = os.path.join(cache_dir, "yomichan_build")
        print(f"Initializing build directory `{build_directory}`")
        print(f"{timestamp()} Initializing build directory `{build_directory}`")
        if Path(build_directory).is_dir():
            shutil.rmtree(build_directory)
        os.makedirs(build_directory)

@@ -66,8 +65,9 @@ class Exporter(ABC):
        build_dir = self._get_build_dir()
        build_img_dir = os.path.join(build_dir, self._target.value)
        if image_dir is not None:
            print("Copying media files to build directory...")
            print(f"{timestamp()} Copying media files to build directory...")
            shutil.copytree(image_dir, build_img_dir)
            print(f"{timestamp()} Finished copying files")
        else:
            os.makedirs(build_img_dir)
        self._terminator.set_image_dir(build_img_dir)

@@ -76,7 +76,7 @@ class Exporter(ABC):
        terms = []
        entries_len = len(entries)
        for idx, entry in enumerate(entries):
            update = f"Creating Yomichan terms for entry {idx+1}/{entries_len}"
            update = f"\tCreating Yomichan terms for entry {idx+1}/{entries_len}"
            print(update, end='\r', flush=True)
            new_terms = self._terminator.make_terms(entry)
            for term in new_terms:

@@ -85,7 +85,7 @@ class Exporter(ABC):
        return terms

    def __validate_terms(self, terms):
        print("Making a copy of term data for validation...")
        print(f"{timestamp()} Making a copy of term data for validation...")
        terms_copy = copy.deepcopy(terms) # because validator will alter data!
        term_count = len(terms_copy)
        log_dir = self.__get_invalid_term_dir()
@ -93,7 +93,7 @@ class Exporter(ABC):
|
||||||
validator = fastjsonschema.compile(schema)
|
validator = fastjsonschema.compile(schema)
|
||||||
failure_count = 0
|
failure_count = 0
|
||||||
for idx, term in enumerate(terms_copy):
|
for idx, term in enumerate(terms_copy):
|
||||||
update = f"Validating term {idx+1}/{term_count}"
|
update = f"\tValidating term {idx+1}/{term_count}"
|
||||||
print(update, end='\r', flush=True)
|
print(update, end='\r', flush=True)
|
||||||
try:
|
try:
|
||||||
validator([term])
|
validator([term])
|
||||||
|
@ -102,9 +102,9 @@ class Exporter(ABC):
|
||||||
term_file = os.path.join(log_dir, f"{idx}.json")
|
term_file = os.path.join(log_dir, f"{idx}.json")
|
||||||
with open(term_file, "w", encoding='utf8') as f:
|
with open(term_file, "w", encoding='utf8') as f:
|
||||||
json.dump([term], f, indent=4, ensure_ascii=False)
|
json.dump([term], f, indent=4, ensure_ascii=False)
|
||||||
print(f"\nFinished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
|
print(f"\n{timestamp()} Finished validating with {failure_count} error{'' if failure_count == 1 else 's'}")
|
||||||
if failure_count > 0:
|
if failure_count > 0:
|
||||||
print(f"Invalid terms saved to `{log_dir}` for debugging")
|
print(f"{timestamp()} Invalid terms saved to `{log_dir}` for debugging")
|
||||||
|
|
||||||
def __make_dictionary(self, terms, index, tags):
|
def __make_dictionary(self, terms, index, tags):
|
||||||
self.__write_term_banks(terms)
|
self.__write_term_banks(terms)
|
||||||
|
@ -114,14 +114,14 @@ class Exporter(ABC):
|
||||||
self.__rm_build_dir()
|
self.__rm_build_dir()
|
||||||
|
|
||||||
def __write_term_banks(self, terms):
|
def __write_term_banks(self, terms):
|
||||||
print(f"Exporting {len(terms)} JSON terms")
|
print(f"{timestamp()} Exporting {len(terms)} JSON terms")
|
||||||
build_dir = self._get_build_dir()
|
build_dir = self._get_build_dir()
|
||||||
max_i = int(len(terms) / self._terms_per_file) + 1
|
max_i = int(len(terms) / self._terms_per_file) + 1
|
||||||
for i in range(max_i):
|
for i in range(max_i):
|
||||||
|
update = f"\tWriting terms to term bank {i+1}/{max_i}"
|
||||||
|
print(update, end='\r', flush=True)
|
||||||
start = self._terms_per_file * i
|
start = self._terms_per_file * i
|
||||||
end = self._terms_per_file * (i + 1)
|
end = self._terms_per_file * (i + 1)
|
||||||
update = f"Writing terms to term banks {start} - {end}"
|
|
||||||
print(update, end='\r', flush=True)
|
|
||||||
term_file = os.path.join(build_dir, f"term_bank_{i+1}.json")
|
term_file = os.path.join(build_dir, f"term_bank_{i+1}.json")
|
||||||
with open(term_file, "w", encoding='utf8') as f:
|
with open(term_file, "w", encoding='utf8') as f:
|
||||||
json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
|
json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
|
||||||
|
@ -142,8 +142,8 @@ class Exporter(ABC):
|
||||||
json.dump(tags, f, indent=4, ensure_ascii=False)
|
json.dump(tags, f, indent=4, ensure_ascii=False)
|
||||||
|
|
||||||
def __write_archive(self, filename):
|
def __write_archive(self, filename):
|
||||||
print("Archiving data to ZIP file...")
|
|
||||||
archive_format = "zip"
|
archive_format = "zip"
|
||||||
|
print(f"{timestamp()} Archiving data to {archive_format.upper()} file...")
|
||||||
out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
|
out_dir = os.path.join(user_documents_dir(), "jitenbot", "yomichan")
|
||||||
if not Path(out_dir).is_dir():
|
if not Path(out_dir).is_dir():
|
||||||
os.makedirs(out_dir)
|
os.makedirs(out_dir)
|
||||||
|
@ -154,58 +154,8 @@ class Exporter(ABC):
|
||||||
base_filename = os.path.join(out_dir, filename)
|
base_filename = os.path.join(out_dir, filename)
|
||||||
build_dir = self._get_build_dir()
|
build_dir = self._get_build_dir()
|
||||||
shutil.make_archive(base_filename, archive_format, build_dir)
|
shutil.make_archive(base_filename, archive_format, build_dir)
|
||||||
print(f"Dictionary file saved to {out_filepath}")
|
print(f"{timestamp()} Dictionary file saved to `{out_filepath}`")
|
||||||
|
|
||||||
def __rm_build_dir(self):
|
def __rm_build_dir(self):
|
||||||
build_dir = self._get_build_dir()
|
build_dir = self._get_build_dir()
|
||||||
shutil.rmtree(build_dir)
|
shutil.rmtree(build_dir)
|
||||||
|
|
||||||
|
|
||||||
class _JitenonExporter(Exporter):
|
|
||||||
def _get_revision(self, entries):
|
|
||||||
modified_date = None
|
|
||||||
for entry in entries:
|
|
||||||
if modified_date is None or entry.modified_date > modified_date:
|
|
||||||
modified_date = entry.modified_date
|
|
||||||
revision = f"{self._target.value};{modified_date}"
|
|
||||||
return revision
|
|
||||||
|
|
||||||
def _get_attribution(self, entries):
|
|
||||||
modified_date = None
|
|
||||||
for entry in entries:
|
|
||||||
if modified_date is None or entry.modified_date > modified_date:
|
|
||||||
attribution = entry.attribution
|
|
||||||
return attribution
|
|
||||||
|
|
||||||
|
|
||||||
class JitenonKokugoExporter(_JitenonExporter):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class JitenonYojiExporter(_JitenonExporter):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class JitenonKotowazaExporter(_JitenonExporter):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class _MonokakidoExporter(Exporter):
|
|
||||||
def _get_revision(self, entries):
|
|
||||||
timestamp = datetime.now().strftime("%Y-%m-%d")
|
|
||||||
return f"{self._target.value};{timestamp}"
|
|
||||||
|
|
||||||
|
|
||||||
class Smk8Exporter(_MonokakidoExporter):
|
|
||||||
def _get_attribution(self, entries):
|
|
||||||
return "© Sanseido Co., LTD. 2020"
|
|
||||||
|
|
||||||
|
|
||||||
class Daijirin2Exporter(_MonokakidoExporter):
|
|
||||||
def _get_attribution(self, entries):
|
|
||||||
return "© Sanseido Co., LTD. 2019"
|
|
||||||
|
|
||||||
|
|
||||||
class Sankoku8Exporter(_MonokakidoExporter):
|
|
||||||
def _get_attribution(self, entries):
|
|
||||||
return "© Sanseido Co., LTD. 2021"
|
|
18  bot/yomichan/exporters/base/jitenon.py  Normal file
@@ -0,0 +1,18 @@
+from bot.yomichan.exporters.base.exporter import BaseExporter
+
+
+class JitenonExporter(BaseExporter):
+    def _get_revision(self, entries):
+        modified_date = None
+        for entry in entries:
+            if modified_date is None or entry.modified_date > modified_date:
+                modified_date = entry.modified_date
+        revision = f"{self._target.value};{modified_date}"
+        return revision
+
+    def _get_attribution(self, entries):
+        modified_date = None
+        for entry in entries:
+            if modified_date is None or entry.modified_date > modified_date:
+                attribution = entry.attribution
+        return attribution
8  bot/yomichan/exporters/base/monokakido.py  Normal file
@@ -0,0 +1,8 @@
+from datetime import datetime
+from bot.yomichan.exporters.base.exporter import BaseExporter
+
+
+class MonokakidoExporter(BaseExporter):
+    def _get_revision(self, entries):
+        timestamp = datetime.now().strftime("%Y-%m-%d")
+        return f"{self._target.value};{timestamp}"
6  bot/yomichan/exporters/daijirin2.py  Normal file
@@ -0,0 +1,6 @@
+from bot.yomichan.exporters.base.monokakido import MonokakidoExporter
+
+
+class Exporter(MonokakidoExporter):
+    def _get_attribution(self, entries):
+        return "© Sanseido Co., LTD. 2019"
@@ -1,20 +0,0 @@
-from bot.targets import Targets
-
-from bot.yomichan.exporters.export import JitenonKokugoExporter
-from bot.yomichan.exporters.export import JitenonYojiExporter
-from bot.yomichan.exporters.export import JitenonKotowazaExporter
-from bot.yomichan.exporters.export import Smk8Exporter
-from bot.yomichan.exporters.export import Daijirin2Exporter
-from bot.yomichan.exporters.export import Sankoku8Exporter
-
-
-def new_yomi_exporter(target):
-    exporter_map = {
-        Targets.JITENON_KOKUGO: JitenonKokugoExporter,
-        Targets.JITENON_YOJI: JitenonYojiExporter,
-        Targets.JITENON_KOTOWAZA: JitenonKotowazaExporter,
-        Targets.SMK8: Smk8Exporter,
-        Targets.DAIJIRIN2: Daijirin2Exporter,
-        Targets.SANKOKU8: Sankoku8Exporter,
-    }
-    return exporter_map[target](target)
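With the per-module factory above removed, object construction is consolidated in bot.factory (new_crawler, new_yomichan_exporter, new_mdict_exporter, new_yomichan_terminator). That module is not part of this diff, so the following is only a guess at how such a factory could avoid per-class import statements by importing the target's module dynamically; the module-path layout is inferred from the new per-target files, and the name mapping is an assumption:

    from importlib import import_module

    def new_yomichan_exporter(target):
        # Hypothetical sketch: map a target such as Targets.JITENON_KOKUGO to the
        # module bot.yomichan.exporters.jitenon_kokugo and return its Exporter class.
        module_path = "bot.yomichan.exporters." + target.name.lower()
        module = import_module(module_path)
        return module.Exporter(target)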
5  bot/yomichan/exporters/jitenon_kokugo.py  Normal file
@@ -0,0 +1,5 @@
+from bot.yomichan.exporters.base.jitenon import JitenonExporter
+
+
+class Exporter(JitenonExporter):
+    pass
5  bot/yomichan/exporters/jitenon_kotowaza.py  Normal file
@@ -0,0 +1,5 @@
+from bot.yomichan.exporters.base.jitenon import JitenonExporter
+
+
+class Exporter(JitenonExporter):
+    pass
5  bot/yomichan/exporters/jitenon_yoji.py  Normal file
@@ -0,0 +1,5 @@
+from bot.yomichan.exporters.base.jitenon import JitenonExporter
+
+
+class Exporter(JitenonExporter):
+    pass
6  bot/yomichan/exporters/sankoku8.py  Normal file
@@ -0,0 +1,6 @@
+from bot.yomichan.exporters.base.monokakido import MonokakidoExporter
+
+
+class Exporter(MonokakidoExporter):
+    def _get_attribution(self, entries):
+        return "© Sanseido Co., LTD. 2021"
6  bot/yomichan/exporters/smk8.py  Normal file
@@ -0,0 +1,6 @@
+from bot.yomichan.exporters.base.monokakido import MonokakidoExporter
+
+
+class Exporter(MonokakidoExporter):
+    def _get_attribution(self, entries):
+        return "© Sanseido Co., LTD. 2020"
@@ -1,9 +1,10 @@
 import re
 import os
-from bs4 import BeautifulSoup
 from functools import cache
 from pathlib import Path
 
+from bs4 import BeautifulSoup
+
 import bot.yomichan.glossary.icons as Icons
 from bot.soup import delete_soup_nodes
 from bot.data import load_yomichan_name_conversion
26  bot/yomichan/terms/base/jitenon.py  Normal file
@@ -0,0 +1,26 @@
+from bot.yomichan.terms.base.terminator import BaseTerminator
+
+
+class JitenonTerminator(BaseTerminator):
+    def __init__(self, target):
+        super().__init__(target)
+        self._glossary_maker = None
+
+    def _definition_tags(self, entry):
+        return None
+
+    def _glossary(self, entry):
+        if entry.entry_id in self._glossary_cache:
+            return self._glossary_cache[entry.entry_id]
+        glossary = self._glossary_maker.make_glossary(entry, self._image_dir)
+        self._glossary_cache[entry.entry_id] = glossary
+        return glossary
+
+    def _sequence(self, entry):
+        return entry.entry_id
+
+    def _link_glossary_parameters(self, entry):
+        return []
+
+    def _subentry_lists(self, entry):
+        return []
@@ -2,7 +2,7 @@ from abc import abstractmethod, ABC
 from bot.data import load_yomichan_inflection_categories
 
 
-class Terminator(ABC):
+class BaseTerminator(ABC):
     def __init__(self, target):
         self._target = target
         self._glossary_cache = {}

@@ -66,28 +66,28 @@ class Terminator(ABC):
 
     @abstractmethod
     def _definition_tags(self, entry):
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     def _inflection_rules(self, entry, expression):
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     def _glossary(self, entry):
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     def _sequence(self, entry):
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     def _term_tags(self, entry):
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     def _link_glossary_parameters(self, entry):
-        pass
+        raise NotImplementedError
 
     @abstractmethod
     def _subentry_lists(self, entry):
-        pass
+        raise NotImplementedError
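Every abstract hook on BaseTerminator now raises NotImplementedError instead of silently passing, so a subclass that forgets to define one fails loudly the first time the hook is called. A hypothetical minimal subclass, for illustration only (the real per-target terminators appear later in this diff and derive these values from each entry):

    class ExampleTerminator(BaseTerminator):
        # Placeholder return values; a real terminator computes these from the entry.
        def _definition_tags(self, entry):
            return ""

        def _inflection_rules(self, entry, expression):
            return ""

        def _glossary(self, entry):
            return ["example definition"]

        def _sequence(self, entry):
            return entry.entry_id

        def _term_tags(self, entry):
            return ""

        def _link_glossary_parameters(self, entry):
            return []

        def _subentry_lists(self, entry):
            return []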
@@ -1,14 +1,10 @@
-from bot.entries.daijirin2 import Daijirin2PhraseEntry as PhraseEntry
-
-from bot.yomichan.terms.terminator import Terminator
+from bot.entries.daijirin2.phrase_entry import PhraseEntry
+from bot.yomichan.terms.base.terminator import BaseTerminator
 from bot.yomichan.glossary.daijirin2 import make_glossary
 from bot.yomichan.grammar import sudachi_rules, tags_to_rules
 
 
-class Daijirin2Terminator(Terminator):
-    def __init__(self, target):
-        super().__init__(target)
-
+class Terminator(BaseTerminator):
     def _definition_tags(self, entry):
         return ""
 

@@ -1,20 +0,0 @@
-from bot.targets import Targets
-
-from bot.yomichan.terms.jitenon import JitenonKokugoTerminator
-from bot.yomichan.terms.jitenon import JitenonYojiTerminator
-from bot.yomichan.terms.jitenon import JitenonKotowazaTerminator
-from bot.yomichan.terms.smk8 import Smk8Terminator
-from bot.yomichan.terms.daijirin2 import Daijirin2Terminator
-from bot.yomichan.terms.sankoku8 import Sankoku8Terminator
-
-
-def new_terminator(target):
-    terminator_map = {
-        Targets.JITENON_KOKUGO: JitenonKokugoTerminator,
-        Targets.JITENON_YOJI: JitenonYojiTerminator,
-        Targets.JITENON_KOTOWAZA: JitenonKotowazaTerminator,
-        Targets.SMK8: Smk8Terminator,
-        Targets.DAIJIRIN2: Daijirin2Terminator,
-        Targets.SANKOKU8: Sankoku8Terminator,
-    }
-    return terminator_map[target](target)
@@ -1,68 +0,0 @@
-from bot.yomichan.grammar import sudachi_rules
-from bot.yomichan.terms.terminator import Terminator
-
-from bot.yomichan.glossary.jitenon import JitenonKokugoGlossary
-from bot.yomichan.glossary.jitenon import JitenonYojiGlossary
-from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
-
-
-class JitenonTerminator(Terminator):
-    def __init__(self, target):
-        super().__init__(target)
-        self._glossary_maker = None
-
-    def _definition_tags(self, entry):
-        return None
-
-    def _glossary(self, entry):
-        if entry.entry_id in self._glossary_cache:
-            return self._glossary_cache[entry.entry_id]
-        glossary = self._glossary_maker.make_glossary(entry, self._image_dir)
-        self._glossary_cache[entry.entry_id] = glossary
-        return glossary
-
-    def _sequence(self, entry):
-        return entry.entry_id
-
-    def _link_glossary_parameters(self, entry):
-        return []
-
-    def _subentry_lists(self, entry):
-        return []
-
-
-class JitenonKokugoTerminator(JitenonTerminator):
-    def __init__(self, target):
-        super().__init__(target)
-        self._glossary_maker = JitenonKokugoGlossary()
-
-    def _inflection_rules(self, entry, expression):
-        return sudachi_rules(expression)
-
-    def _term_tags(self, entry):
-        return ""
-
-
-class JitenonYojiTerminator(JitenonTerminator):
-    def __init__(self, target):
-        super().__init__(target)
-        self._glossary_maker = JitenonYojiGlossary()
-
-    def _inflection_rules(self, entry, expression):
-        return ""
-
-    def _term_tags(self, entry):
-        tags = entry.kanken_level.split("/")
-        return " ".join(tags)
-
-
-class JitenonKotowazaTerminator(JitenonTerminator):
-    def __init__(self, target):
-        super().__init__(target)
-        self._glossary_maker = JitenonKotowazaGlossary()
-
-    def _inflection_rules(self, entry, expression):
-        return sudachi_rules(expression)
-
-    def _term_tags(self, entry):
-        return ""
15  bot/yomichan/terms/jitenon_kokugo.py  Normal file
@@ -0,0 +1,15 @@
+from bot.yomichan.grammar import sudachi_rules
+from bot.yomichan.glossary.jitenon import JitenonKokugoGlossary
+from bot.yomichan.terms.base.jitenon import JitenonTerminator
+
+
+class Terminator(JitenonTerminator):
+    def __init__(self, target):
+        super().__init__(target)
+        self._glossary_maker = JitenonKokugoGlossary()
+
+    def _inflection_rules(self, entry, expression):
+        return sudachi_rules(expression)
+
+    def _term_tags(self, entry):
+        return ""
15  bot/yomichan/terms/jitenon_kotowaza.py  Normal file
@@ -0,0 +1,15 @@
+from bot.yomichan.grammar import sudachi_rules
+from bot.yomichan.glossary.jitenon import JitenonKotowazaGlossary
+from bot.yomichan.terms.base.jitenon import JitenonTerminator
+
+
+class Terminator(JitenonTerminator):
+    def __init__(self, target):
+        super().__init__(target)
+        self._glossary_maker = JitenonKotowazaGlossary()
+
+    def _inflection_rules(self, entry, expression):
+        return sudachi_rules(expression)
+
+    def _term_tags(self, entry):
+        return ""
15  bot/yomichan/terms/jitenon_yoji.py  Normal file
@@ -0,0 +1,15 @@
+from bot.yomichan.glossary.jitenon import JitenonYojiGlossary
+from bot.yomichan.terms.base.jitenon import JitenonTerminator
+
+
+class Terminator(JitenonTerminator):
+    def __init__(self, target):
+        super().__init__(target)
+        self._glossary_maker = JitenonYojiGlossary()
+
+    def _inflection_rules(self, entry, expression):
+        return ""
+
+    def _term_tags(self, entry):
+        tags = entry.kanken_level.split("/")
+        return " ".join(tags)
@@ -1,14 +1,10 @@
-from bot.entries.sankoku8 import Sankoku8PhraseEntry as PhraseEntry
-
-from bot.yomichan.terms.terminator import Terminator
+from bot.entries.sankoku8.phrase_entry import PhraseEntry
+from bot.yomichan.terms.base.terminator import BaseTerminator
 from bot.yomichan.glossary.sankoku8 import make_glossary
 from bot.yomichan.grammar import sudachi_rules, tags_to_rules
 
 
-class Sankoku8Terminator(Terminator):
-    def __init__(self, target):
-        super().__init__(target)
-
+class Terminator(BaseTerminator):
     def _definition_tags(self, entry):
         return ""
 
@@ -1,12 +1,11 @@
-from bot.entries.smk8 import Smk8KanjiEntry as KanjiEntry
-from bot.entries.smk8 import Smk8PhraseEntry as PhraseEntry
-
-from bot.yomichan.terms.terminator import Terminator
+from bot.entries.smk8.kanji_entry import KanjiEntry
+from bot.entries.smk8.phrase_entry import PhraseEntry
+from bot.yomichan.terms.base.terminator import BaseTerminator
 from bot.yomichan.glossary.smk8 import make_glossary
 from bot.yomichan.grammar import sudachi_rules, tags_to_rules
 
 
-class Smk8Terminator(Terminator):
+class Terminator(BaseTerminator):
     def __init__(self, target):
         super().__init__(target)
 
@@ -1,65 +1,61 @@
-亙,亘
+𠮟,叱
-俠,侠
-俱,倶
-儘,侭
-凜,凛
-剝,剥
 吞,呑
+靭,靱
+臈,﨟
 啞,唖
-噓,嘘
 嚙,噛
-囊,嚢
-塡,填
-壺,壷
 屛,屏
-屢,屡
 幷,并
 彎,弯
 搔,掻
-摑,掴
 攪,撹
-曾,曽
 枡,桝
-檜,桧
-檮,梼
-潑,溌
-濤,涛
 濾,沪
-瀆,涜
-灌,潅
-焰,焔
-瘦,痩
-禰,祢
-禱,祷
-穎,頴
-竈,竃
-簞,箪
-籠,篭
 繡,繍
-繫,繋
-萊,莱
 蔣,蒋
-藪,薮
-蘆,芦
-蟬,蝉
-蠅,蝿
 蠟,蝋
-蠣,蛎
-賤,賎
-軀,躯
-邇,迩
 醬,醤
-醱,醗
+穎,頴
-靱,靭
-頰,頬
-頸,頚
-顚,顛
-驒,騨
-鰺,鯵
-鶯,鴬
 鷗,鴎
 鹼,鹸
 麴,麹
-麵,麺
+俠,侠
-﨟,臈
+俱,倶
-𠮟,叱
+剝,剥
+噓,嘘
+囊,嚢
+塡,填
+屢,屡
+摑,掴
+瀆,涜
+潑,溌
+焰,焔
+簞,箪
+繫,繋
+萊,莱
+蟬,蝉
+軀,躯
+醱,醗
+頰,頬
+顚,顛
+驒,騨
+姸,妍
+攢,攅
+𣜜,杤
+檔,档
+槶,椢
+櫳,槞
+纊,絋
+纘,纉
+隯,陦
+筓,笄
+逬,迸
+腁,胼
+騈,駢
+拋,抛
+篡,簒
+檜,桧
+禰,祢
+禱,祷
+蘆,芦
+凜,凛
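The data file above pairs extended/traditional kanji forms with their common simplified counterparts, one comma-separated pair per line. Judging from the add_variant_kanji tests later in this diff, each pair is applied in both directions so that every reachable spelling of an expression is generated. A rough sketch of that idea under those assumptions (the pair list and helper name here are illustrative, not the project's actual loader):

    VARIANT_PAIRS = [("剝", "剥"), ("摑", "掴")]  # normally read from the data file above

    def expand_variants(expression):
        # Swap each variant form in both directions, accumulating every combination.
        forms = {expression}
        for old, new in VARIANT_PAIRS:
            for form in list(forms):
                forms.add(form.replace(old, new))
                forms.add(form.replace(new, old))
        return forms

    # expand_variants("剝摑") -> {"剝摑", "剥摑", "剝掴", "剥掴"}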
@@ -21,7 +21,7 @@ import sys
 import argparse
 import subprocess
 from bot.targets import Targets
-from bot.crawlers.factory import new_crawler
+from bot.factory import new_crawler
 
 
 def filename(f):
@@ -1,5 +1,7 @@
 #!/bin/sh
 
+export PYTHONPYCACHEPREFIX=/tmp/pycache
+
 python -m unittest discover -s tests
 
 python jitenbot.py jitenon-kokugo
21  tests/test_daijirin_phrases.py  Normal file
@@ -0,0 +1,21 @@
+import unittest
+from bot.entries.daijirin2.phrase_entry import parse_phrase
+
+
+class TestDaijirin2PhraseParse(unittest.TestCase):
+    def test1(self):
+        text = "同じ穴の=狢(=狐・狸)"
+        exps = parse_phrase(text)
+        self.assertEqual(len(exps), 3)
+        self.assertIn("同じ穴の狢", exps)
+        self.assertIn("同じ穴の狐", exps)
+        self.assertIn("同じ穴の狸", exps)
+
+    def test2(self):
+        text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥"
+        exps = parse_phrase(text)
+        self.assertEqual(len(exps), 4)
+        self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps)
+        self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps)
+        self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps)
+        self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps)
@@ -1,5 +1,5 @@
 import unittest
-import bot.entries.expressions as Expressions
+import bot.entries.base.expressions as Expressions
 
 
 class TestExpressions(unittest.TestCase):

@@ -34,8 +34,8 @@ class TestExpressions(unittest.TestCase):
         self.assertIn("凶々しい", exps)
         self.assertIn("凶凶しい", exps)
 
-    def test_add_variant_kanji(self):
-        exps = ["剝く", "掴む", "摑む"]
+    def test_add_variant_kanji1(self):
+        exps = ["剥く", "摑む"]
         Expressions.add_variant_kanji(exps)
         self.assertEqual(len(exps), 4)
         self.assertIn("剥く", exps)

@@ -44,6 +44,15 @@ class TestExpressions(unittest.TestCase):
         self.assertIn("摑む", exps)
 
     def test_add_variant_kanji2(self):
+        exps = ["剝く", "掴む", "摑む"]
+        Expressions.add_variant_kanji(exps)
+        self.assertEqual(len(exps), 4)
+        self.assertIn("剥く", exps)
+        self.assertIn("剝く", exps)
+        self.assertIn("掴む", exps)
+        self.assertIn("摑む", exps)
+
+    def test_add_variant_kanji3(self):
         exps = ["剝摑"]
         Expressions.add_variant_kanji(exps)
         self.assertEqual(len(exps), 4)

@@ -52,6 +61,15 @@ class TestExpressions(unittest.TestCase):
         self.assertIn("剥掴", exps)
         self.assertIn("剥摑", exps)
 
+    def test_add_variant_kanji4(self):
+        exps = ["剥掴"]
+        Expressions.add_variant_kanji(exps)
+        self.assertEqual(len(exps), 4)
+        self.assertIn("剝摑", exps)
+        self.assertIn("剝掴", exps)
+        self.assertIn("剥掴", exps)
+        self.assertIn("剥摑", exps)
+
     def test_expand_abbreviation(self):
         text = "有(り)合(わ)せ"
         abbrs = Expressions.expand_abbreviation(text)

@@ -69,28 +87,3 @@ class TestExpressions(unittest.TestCase):
         self.assertIn("有合わせ", abbrs)
         self.assertIn("有り合せ", abbrs)
         self.assertIn("有合せ", abbrs)
-
-    def test_smk_expand_alternatives(self):
-        text = "△金(時間・暇)に飽かして"
-        exps = Expressions.expand_smk_alternatives(text)
-        self.assertEqual(len(exps), 3)
-        self.assertIn("金に飽かして", exps)
-        self.assertIn("時間に飽かして", exps)
-        self.assertIn("暇に飽かして", exps)
-
-    def test_daijirin_expand_alternatives(self):
-        text = "同じ穴の=狢(=狐・狸)"
-        exps = Expressions.expand_daijirin_alternatives(text)
-        self.assertEqual(len(exps), 3)
-        self.assertIn("同じ穴の狢", exps)
-        self.assertIn("同じ穴の狐", exps)
-        self.assertIn("同じ穴の狸", exps)
-
-    def test_daijirin_expand_alternatives2(self):
-        text = "聞くは=一時(=一旦)の恥、聞かぬは=末代(=一生)の恥"
-        exps = Expressions.expand_daijirin_alternatives(text)
-        self.assertEqual(len(exps), 4)
-        self.assertIn("聞くは一時の恥、聞かぬは末代の恥", exps)
-        self.assertIn("聞くは一時の恥、聞かぬは一生の恥", exps)
-        self.assertIn("聞くは一旦の恥、聞かぬは末代の恥", exps)
-        self.assertIn("聞くは一旦の恥、聞かぬは一生の恥", exps)
@@ -1,16 +1,16 @@
 import unittest
-from bot.entries.sankoku8 import parse_hyouki_pattern
+from bot.entries.sankoku8.parse import parse_hyouki_pattern
 
 
-class TestSankokuPhrases(unittest.TestCase):
-    def test_sankoku_phrases1(self):
+class TestSankoku8PhraseParse(unittest.TestCase):
+    def test1(self):
         pattern = '耳にたこ(ができる)'
         exps = parse_hyouki_pattern(pattern)
         self.assertEqual(len(exps), 2)
         self.assertIn("耳にたこ", exps)
         self.assertIn("耳にたこができる", exps)
 
-    def test_sankoku_phrases2(self):
+    def test2(self):
         pattern = '一斑を〈見て/もって〉全豹を〈卜す/推す〉'
         exps = parse_hyouki_pattern(pattern)
         self.assertEqual(len(exps), 4)

@@ -19,14 +19,14 @@ class TestSankokuPhrases(unittest.TestCase):
         self.assertIn("一斑をもって全豹を卜す", exps)
         self.assertIn("一斑をもって全豹を推す", exps)
 
-    def test_sankoku_phrases3(self):
+    def test3(self):
         pattern = '{かじ・舵}を切る'
         exps = parse_hyouki_pattern(pattern)
         self.assertEqual(len(exps), 2)
         self.assertIn("かじを切る", exps)
         self.assertIn("舵を切る", exps)
 
-    def test_sankoku_phrases4(self):
+    def test4(self):
         pattern = '重箱の隅を(⦅ようじ\楊枝⦆で)〈つつく/ほじくる〉'
         exps = parse_hyouki_pattern(pattern)
         self.assertEqual(len(exps), 6)

@@ -37,7 +37,7 @@ class TestSankokuPhrases(unittest.TestCase):
         self.assertIn("重箱の隅をようじでほじくる", exps)
         self.assertIn("重箱の隅を楊枝でほじくる", exps)
 
-    def test_sankoku_phrases5(self):
+    def test5(self):
         pattern = '群盲象を〈{な・撫}でる/評する〉'
         exps = parse_hyouki_pattern(pattern)
         self.assertEqual(len(exps), 3)
tests/test_smk_phrases.py
Normal file
19
tests/test_smk_phrases.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
import unittest
|
||||||
|
from bot.entries.smk8.phrase_entry import parse_phrase
|
||||||
|
|
||||||
|
|
||||||
|
class TestSmk8PhraseParse(unittest.TestCase):
|
||||||
|
def test1(self):
|
||||||
|
text = "目と鼻の△先(間)"
|
||||||
|
exps = parse_phrase(text)
|
||||||
|
self.assertEqual(len(exps), 2)
|
||||||
|
self.assertIn("目と鼻の先", exps)
|
||||||
|
self.assertIn("目と鼻の間", exps)
|
||||||
|
|
||||||
|
def test2(self):
|
||||||
|
text = "△金(時間・暇)に飽かして"
|
||||||
|
exps = parse_phrase(text)
|
||||||
|
self.assertEqual(len(exps), 3)
|
||||||
|
self.assertIn("金に飽かして", exps)
|
||||||
|
self.assertIn("時間に飽かして", exps)
|
||||||
|
self.assertIn("暇に飽かして", exps)
|