Reorganize file structure

This commit is contained in:
stephenmk 2023-04-10 15:20:33 -05:00
parent 16d694d2d2
commit 83a182e682
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
7 changed files with 231 additions and 131 deletions

View file

@ -2,9 +2,9 @@ import re
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import scraper as Scraper import scraper as Scraper
import yomichan as Yomichan import yomichan.export as YomichanExport
from jitenon_yoji import JitenonYoji from entries.jitenon_kotowaza import JitenonKotowaza
from jitenon_kotowaza import JitenonKotowaza from entries.jitenon_yoji import JitenonYoji
def run_all(): def run_all():
@ -13,7 +13,7 @@ def run_all():
def jitenon_yoji(): def jitenon_yoji():
entries = {} seq_to_entries = {}
jitenon = Scraper.Jitenon() jitenon = Scraper.Jitenon()
gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html") gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@ -24,34 +24,18 @@ def jitenon_yoji():
for kana_a in kana_soup.select(".word_box a", href=True): for kana_a in kana_soup.select(".word_box a", href=True):
kana_href = kana_a['href'] kana_href = kana_a['href']
sequence = int(re.search(r"([0-9]+).html", kana_href).group(1)) sequence = int(re.search(r"([0-9]+).html", kana_href).group(1))
if sequence in entries: if sequence in seq_to_entries:
continue continue
yoji_doc = jitenon.scrape(kana_href) yoji_doc = jitenon.scrape(kana_href)
entry = JitenonYoji(sequence) entry = JitenonYoji(sequence)
entry.add_document(yoji_doc) entry.add_document(yoji_doc)
entries[sequence] = entry seq_to_entries[sequence] = entry
terms = [] entries = seq_to_entries.values()
attribution = "" YomichanExport.jitenon_yoji(entries)
modified_date = None
for entry in entries.values():
if modified_date is None or entry.modified_date > modified_date:
modified_date = entry.modified_date
attribution = entry.attribution
for term in entry.yomichan_terms():
terms.append(term)
index = {
"title": "四字熟語辞典オンライン",
"revision": f"jitenon-yoji.{modified_date}",
"sequenced": True,
"format": 3,
"url": "https://yoji.jitenon.jp/",
"attribution": attribution,
}
Yomichan.create_zip(terms, index)
def jitenon_kotowaza(): def jitenon_kotowaza():
entries = {} seq_to_entries = {}
jitenon = Scraper.Jitenon() jitenon = Scraper.Jitenon()
gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php") gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser") gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@ -65,29 +49,12 @@ def jitenon_kotowaza():
if m: if m:
sequence = int(m.group(1)) sequence = int(m.group(1))
else: else:
# print(f"Skipping {kana_href}")
continue continue
if sequence in entries: if sequence in seq_to_entries:
continue continue
kotowaza_doc = jitenon.scrape(kana_href) kotowaza_doc = jitenon.scrape(kana_href)
entry = JitenonKotowaza(sequence) entry = JitenonKotowaza(sequence)
entry.add_document(kotowaza_doc) entry.add_document(kotowaza_doc)
entries[sequence] = entry seq_to_entries[sequence] = entry
terms = [] entries = seq_to_entries.values()
attribution = "" YomichanExport.jitenon_kotowaza(entries)
modified_date = None
for entry in entries.values():
if modified_date is None or entry.modified_date > modified_date:
modified_date = entry.modified_date
attribution = entry.attribution
for term in entry.yomichan_terms():
terms.append(term)
index = {
"title": "故事・ことわざ・慣用句オンライン",
"revision": f"jitenon-kotowaza.{modified_date}",
"sequenced": True,
"format": 3,
"url": "https://kotowaza.jitenon.jp/",
"attribution": attribution,
}
Yomichan.create_zip(terms, index)

View file

@ -2,22 +2,11 @@ import re
from datetime import datetime, date from datetime import datetime, date
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import yomichan as Yomichan import yomichan.soup as YomichanSoup
import util as Util import util as Util
class JitenonYoji: class Jitenon:
columns = {
"四字熟語": ["expression", ""],
"読み方": ["yomikata", ""],
"意味": ["imi", ""],
"出典": ["shutten", ""],
"漢検級": ["kankenkyuu", ""],
"場面用途": ["bamenyouto", ""],
"異形": ["ikei", []],
"類義語": ["ruigigo", []],
}
def __init__(self, sequence): def __init__(self, sequence):
self.sequence = sequence self.sequence = sequence
self.yomichan_glossary = [""] self.yomichan_glossary = [""]
@ -35,26 +24,10 @@ class JitenonYoji:
colname = "" colname = ""
for row in rows: for row in rows:
colname = row.th.text if row.th is not None else colname colname = row.th.text if row.th is not None else colname
colval = row.td.decode_contents() colval = row.td.text
self.__set_column(colname, colval) self.__set_column(colname, colval)
self.yomichan_glossary = [Yomichan.soup_to_gloss(table)] gloss = YomichanSoup.make_gloss(table) # note: modifies table
self.yomichan_glossary = [gloss]
def yomichan_terms(self):
terms = []
for idx, headword in enumerate(self.__headwords()):
(yoji, reading) = headword
definition_tags = None
inflection_rules = ""
score = -idx
glossary = self.yomichan_glossary
sequence = self.sequence
term_tags = ""
term = [
yoji, reading, definition_tags, inflection_rules,
score, glossary, sequence, term_tags
]
terms.append(term)
return terms
def __set_modified_date(self, html): def __set_modified_date(self, html):
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html) m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
@ -76,7 +49,7 @@ class JitenonYoji:
attr_value.append(colval) attr_value.append(colval)
setattr(self, attr_name, attr_value) setattr(self, attr_name, attr_value)
def __headwords(self): def _headwords(self):
words = [] words = []
for yomikata in self.__yomikatas(): for yomikata in self.__yomikatas():
headword = [self.expression, yomikata] headword = [self.expression, yomikata]
@ -91,33 +64,35 @@ class JitenonYoji:
def __yomikatas(self): def __yomikatas(self):
yomikata = self.yomikata.replace(" ", "") yomikata = self.yomikata.replace(" ", "")
m = re.search(r"^[ぁ-ヿ]+$", yomikata) m = re.search(r"^[ぁ-ヿ]+$", yomikata)
if m: if m:
return [yomikata] return [yomikata]
m = re.search(r"^([ぁ-ヿ]+)<br/>", yomikata) m = re.search(r"^([ぁ-ヿ、]+)※", yomikata)
if m: if m:
return [m.group(1)] return [m.group(1)]
m = re.search(r"^[ぁ-ヿ]+[ぁ-ヿ][ぁ-ヿ]+$", yomikata) m = re.search(r"^[ぁ-ヿ]+[ぁ-ヿ][ぁ-ヿ]+$", yomikata)
if m: if m:
return Util.expand_shouryaku(yomikata) return Util.expand_shouryaku(yomikata)
m = re.search(r"^([ぁ-ヿ]+)([ぁ-ヿ/\s]+)$", yomikata) m = re.search(r"^([ぁ-ヿ]+)([ぁ-ヿ/\s]+)$", yomikata)
if m: if m:
yomikatas = [m.group(1)] yomikatas = [m.group(1)]
alts = m.group(2).split("/") alts = m.group(2).split("/")
for alt in alts: for alt in alts:
yomikatas.append(alt.strip()) yomikatas.append(alt.strip())
return yomikatas return yomikatas
raise Exception(f"Invalid 読み方 format: {self.yomikata}\n{self}") print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
return ""
def __ikei_headwords(self): def __ikei_headwords(self):
ikei_headwords = [] ikei_headwords = []
for val in self.ikei: for val in self.ikei:
m = re.search(r"^([^]+)([ぁ-ヿ]+)$", val) val = val.replace(" ", "")
m = re.search(r"^([^]+)([ぁ-ヿ、]+)$", val)
if m: if m:
headword = [m.group(1), m.group(2)] headword = [m.group(1), m.group(2)]
ikei_headwords.append(headword) ikei_headwords.append(headword)
else: else:
raise Exception(f"Invalid 異形 format: {val}\n{self}") print(f"Invalid 異形 format: {val}\n{self}\n")
return ikei_headwords return ikei_headwords
def __str__(self): def __str__(self):

View file

@ -0,0 +1,34 @@
from entries.jitenon import Jitenon
import yomichan.grammar as Grammar
class JitenonKotowaza(Jitenon):
    """Entry type for the kotowaza (proverb / idiom) dictionary.

    ``columns`` pairs each table heading with ``[attribute name, default
    value]``; presumably consumed by the ``Jitenon`` base class when an entry
    page is parsed -- confirm there.
    """

    columns = {
        "言葉": ["expression", ""],
        "読み方": ["yomikata", ""],
        "意味": ["imi", ""],
        "出典": ["shutten", ""],
        "例文": ["reibun", ""],
        "異形": ["ikei", []],
        "類句": ["ruiku", []],
    }

    def __init__(self, sequence):
        super().__init__(sequence)

    def yomichan_terms(self):
        """Return one Yomichan term row per headword of this entry.

        Each row is ``[expression, reading, definition_tags,
        inflection_rules, score, glossary, sequence, term_tags]``. The score
        is the negated headword position, so the first headword ranks
        highest.
        """
        return [
            [
                expression,
                reading,
                None,  # definition tags
                Grammar.sudachi_rules(expression, reading),
                -rank,
                self.yomichan_glossary,
                self.sequence,
                "",  # term tags
            ]
            for rank, (expression, reading) in enumerate(self._headwords())
        ]

38
entries/jitenon_yoji.py Normal file
View file

@ -0,0 +1,38 @@
from entries.jitenon import Jitenon
class JitenonYoji(Jitenon):
    """Entry type for the yoji (four-character idiom) dictionary.

    ``columns`` pairs each table heading with ``[attribute name, default
    value]``; presumably consumed by the ``Jitenon`` base class when an entry
    page is parsed -- confirm there.
    """

    columns = {
        "四字熟語": ["expression", ""],
        "読み方": ["yomikata", ""],
        "意味": ["imi", ""],
        "出典": ["shutten", ""],
        "漢検級": ["kankenkyuu", ""],
        "場面用途": ["bamenyouto", ""],
        "異形": ["ikei", []],
        "類義語": ["ruigigo", []],
    }

    def __init__(self, sequence):
        super().__init__(sequence)

    def yomichan_terms(self):
        """Return one Yomichan term row per headword of this entry.

        Each row is ``[expression, reading, definition_tags,
        inflection_rules, score, glossary, sequence, term_tags]``. The score
        is the negated headword position, so the first headword ranks
        highest.
        """
        return [
            [
                expression,
                reading,
                None,  # definition tags
                "",  # inflection rules
                -rank,
                self.yomichan_glossary,
                self.sequence,
                self.__term_tags(),
            ]
            for rank, (expression, reading) in enumerate(self._headwords())
        ]

    def __term_tags(self):
        """Return the ``kankenkyuu`` value as space-separated tag names.

        Spaces are removed first, then the "/" separators become single
        spaces.
        """
        return " ".join(self.kankenkyuu.replace(" ", "").split("/"))

88
yomichan/export.py Normal file
View file

@ -0,0 +1,88 @@
import json
import os
import shutil
import uuid
from pathlib import Path
def jitenon_yoji(entries):
    """Export yoji entries as the 四字熟語辞典オンライン dictionary archive.

    Args:
        entries: Iterable of entry objects, forwarded to ``__terms``.

    The collected terms, the index metadata and the Kanji Kentei level tags
    are handed to ``__create_zip``.
    """
    terms, modified_date, attribution = __terms(entries)

    # (tag name, tag description) for every Kanji Kentei level.
    kanken_levels = (
        ("1級", "漢字検定(漢検)1級の四字熟語"),
        ("準1級", "漢字検定(漢検)準1級の四字熟語"),
        ("2級", "漢字検定(漢検)2級の四字熟語"),
        ("準2級", "漢字検定(漢検)準2級の四字熟語"),
        ("3級", "漢字検定(漢検)3級の四字熟語"),
        ("4級", "漢字検定(漢検)4級の四字熟語"),
        ("5級", "漢字検定(漢検)5級の四字熟語"),
    )
    tags = [
        [name, "frequent", 0, description, 0]
        for name, description in kanken_levels
    ]

    index = dict(
        title="四字熟語辞典オンライン",
        revision=f"jitenon-yoji.{modified_date}",
        sequenced=True,
        format=3,
        url="https://yoji.jitenon.jp/",
        attribution=attribution,
    )
    __create_zip(terms, index, tags)
def jitenon_kotowaza(entries):
    """Export kotowaza entries as the 故事・ことわざ・慣用句オンライン archive.

    Args:
        entries: Iterable of entry objects, forwarded to ``__terms``.

    The collected terms and the index metadata are handed to
    ``__create_zip``; no tag bank is supplied for this dictionary.
    """
    terms, modified_date, attribution = __terms(entries)
    index = dict(
        title="故事・ことわざ・慣用句オンライン",
        revision=f"jitenon-kotowaza.{modified_date}",
        sequenced=True,
        format=3,
        url="https://kotowaza.jitenon.jp/",
        attribution=attribution,
    )
    __create_zip(terms, index)
def __terms(entries):
    """Collect term rows and newest-entry metadata from ``entries``.

    Args:
        entries: Iterable of objects exposing ``modified_date``,
            ``attribution`` and ``yomichan_terms()``.

    Returns:
        A ``(terms, modified_date, attribution)`` tuple. ``terms`` is the
        concatenation of every entry's ``yomichan_terms()`` in input order.
        ``modified_date`` and ``attribution`` come from the entry with the
        greatest ``modified_date`` (the first such entry on ties). For empty
        input the result is ``([], None, "")``.
    """
    terms = []
    modified_date = None
    attribution = ""
    for entry in entries:
        if modified_date is None or entry.modified_date > modified_date:
            # Attribution is taken from the same entry as the date.
            modified_date = entry.modified_date
            attribution = entry.attribution
        terms.extend(entry.yomichan_terms())
    return terms, modified_date, attribution
def __create_zip(terms, index, tags=None):
    """Write a dictionary archive to ``output/<title>.zip``.

    Args:
        terms: List of term rows. They are split into ``term_bank_N.json``
            files holding at most 1000 rows each.
        index: Metadata written to ``index.json``. Its ``"title"`` value is
            also used as the archive file name.
        tags: Optional list of tag rows written to ``tag_bank_1.json``.
            When omitted or empty, no tag bank is written.

    A uniquely named build directory is created in the current working
    directory and is always removed again, even when a step fails. An
    existing archive with the same name in ``output`` is replaced.
    """
    # A None default avoids sharing one mutable list between calls.
    tags = [] if tags is None else tags

    build_directory = str(uuid.uuid4())
    os.mkdir(build_directory)

    def dump(filename, payload):
        path = os.path.join(build_directory, filename)
        with open(path, "w", encoding="utf8") as f:
            json.dump(payload, f, indent=4, ensure_ascii=False)

    try:
        terms_per_file = 1000
        # Ceiling division so an exact multiple of terms_per_file does not
        # produce a trailing empty bank; an empty term list still yields one
        # (empty) term_bank_1.json.
        bank_count = max(1, -(-len(terms) // terms_per_file))
        for i in range(bank_count):
            start = terms_per_file * i
            end = terms_per_file * (i + 1)
            dump(f"term_bank_{i+1}.json", terms[start:end])

        dump("index.json", index)

        if tags:
            dump("tag_bank_1.json", tags)

        zip_filename = index["title"]
        zip_file = f"{zip_filename}.zip"
        shutil.make_archive(zip_filename, "zip", build_directory)

        out_dir = "output"
        out_file = os.path.join(out_dir, zip_file)
        if not Path(out_dir).is_dir():
            os.mkdir(out_dir)
        elif Path(out_file).is_file():
            # shutil.move refuses to overwrite an existing destination file.
            os.remove(out_file)
        shutil.move(zip_file, out_dir)
    finally:
        shutil.rmtree(build_directory)

38
yomichan/grammar.py Normal file
View file

@ -0,0 +1,38 @@
from sudachipy import tokenizer
from sudachipy import dictionary
_SUDACHI_TOKENIZER = None


def _sudachi_tokenizer():
    """Return the shared Sudachi tokenizer, creating it on first use."""
    global _SUDACHI_TOKENIZER
    if _SUDACHI_TOKENIZER is None:
        _SUDACHI_TOKENIZER = dictionary.Dictionary().create()
    return _SUDACHI_TOKENIZER


def sudachi_rules(expression, reading):
    """Return the space-separated rule string for ``expression``.

    The expression is tokenized in split mode A. Field 4 of the final
    token's ``part_of_speech()`` is split on "-" and passed, together with
    ``expression`` and ``reading``, to ``__sudachi_tags_to_rules``.

    Args:
        expression: Headword text to tokenize.
        reading: Reading of the headword; only forwarded to the rule mapper.

    Returns:
        The rule string, or ``""`` when tokenization yields no tokens.
    """
    # Reuse one tokenizer instead of building a dictionary for every call.
    splitmode = tokenizer.Tokenizer.SplitMode.A
    tokens = _sudachi_tokenizer().tokenize(expression, splitmode)
    if len(tokens) == 0:
        # No token to inspect (e.g. empty expression): no rules apply.
        return ""
    # NOTE(review): index 4 is presumably the conjugation-type field of the
    # Sudachi part-of-speech tuple -- confirm against the SudachiPy docs.
    pos = tokens[len(tokens) - 1].part_of_speech()[4]
    tags = pos.split("-")
    return __sudachi_tags_to_rules(tags, expression, reading)
# Translate Sudachi tag fragments into the space-separated rule string that
# sudachi_rules() returns ("adj-i", "v1", "v5", "vs", "vz", "vk").
#   tags:       list of strings (sudachi_rules passes pos.split("-"))
#   expression: headword text, tested with endswith() / ==
#   reading:    reading text, only consulted for the "来る" / "くる" check
# NOTE(review): leading indentation is missing from every line below, so the
# nesting of the if-blocks (in particular whether the "来る" check sits inside
# or after the for-loop) cannot be read off this text -- confirm against the
# original file before restructuring.
def __sudachi_tags_to_rules(tags, expression, reading):
# NOTE(review): all 14 entries are empty strings here, and "" also appears in
# the endswith("") and `"" in tag` tests below. In Python those tests are
# always true, so these literals were presumably non-empty originally --
# TODO restore them from the original file.
u_endings = ["", "", "", "", "", "", "",
"", "", "", "", "", "", ""]
# A set, so a rule triggered by several tags is only recorded once.
rules = set()
for tag in tags:
if expression.endswith(""):
if tag == "形容詞" or "ナイ" in tag or "タイ" in tag:
rules.add("adj-i")
if expression.endswith(""):
if "" in tag or tag == "レル":
rules.add("v1")
if "" in tag or "" in tag or "" in tag:
# One matching ending is enough to add "v5"; stop scanning after it.
for u_ending in u_endings:
if expression.endswith(u_ending):
rules.add("v5")
break
if "" in tag and (expression.endswith("する") or expression == "為る"):
rules.add("vs")
if "" in tag and expression.endswith("ずる"):
rules.add("vz")
# Expressions ending in "来る" read as "くる" discard every rule collected so
# far and keep only "vk".
if expression.endswith("来る") and reading.endswith("くる"):
rules = set()
rules.add("vk")
# The joined order follows set iteration order; it is not sorted.
return " ".join(list(rules))

View file

@ -1,49 +1,9 @@
import json
import os
import shutil
import uuid
import re import re
from pathlib import Path
from css_parser import parseStyle from css_parser import parseStyle
def create_zip(terms, index, tags=[]): def make_gloss(soup):
build_directory = str(uuid.uuid4()) __preprocess_soup(soup)
os.mkdir(build_directory)
terms_per_file = 1000
max_i = int(len(terms) / terms_per_file) + 1
for i in range(max_i):
term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
with open(term_file, "w", encoding='utf8') as f:
start = terms_per_file * i
end = terms_per_file * (i + 1)
json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
index_file = os.path.join(build_directory, "index.json")
with open(index_file, 'w', encoding='utf8') as f:
json.dump(index, f, indent=4, ensure_ascii=False)
if len(tags) > 0:
tag_file = os.path.join(build_directory, "tag_bank_1.json")
with open(tag_file, 'w', encoding='utf8') as f:
json.dump(tags, f, indent=4, ensure_ascii=False)
zip_filename = index["title"]
zip_file = f"{zip_filename}.zip"
shutil.make_archive(zip_filename, "zip", build_directory)
out_dir = "output"
out_file = os.path.join(out_dir, zip_file)
if not Path(out_dir).is_dir():
os.mkdir(out_dir)
elif Path(out_file).is_file():
os.remove(out_file)
shutil.move(zip_file, out_dir)
shutil.rmtree(build_directory)
def soup_to_gloss(soup):
__sanitize_soup(soup)
structured_content = __get_markup_structure(soup) structured_content = __get_markup_structure(soup)
return { return {
"type": "structured-content", "type": "structured-content",
@ -51,10 +11,10 @@ def soup_to_gloss(soup):
} }
def __sanitize_soup(soup): def __preprocess_soup(soup):
patterns = [ patterns = [
r"^(.+)[ぁ-ヿ]+$", r"^(.+)[ぁ-ヿ\s]+$",
r"^(.+)[ぁ-ヿ]+[ぁ-ヿ][ぁ-ヿ]+$" r"^(.+)[ぁ-ヿ\s]+[ぁ-ヿ\s][ぁ-ヿ\s]+$"
] ]
for a in soup.find_all("a"): for a in soup.find_all("a"):
for pattern in patterns: for pattern in patterns: