Reorganize file structure

This commit is contained in:
stephenmk 2023-04-10 15:20:33 -05:00
parent 16d694d2d2
commit 83a182e682
No known key found for this signature in database
GPG key ID: B6DA730DB06235F1
7 changed files with 231 additions and 131 deletions

View file

@ -2,9 +2,9 @@ import re
from bs4 import BeautifulSoup
import scraper as Scraper
import yomichan as Yomichan
from jitenon_yoji import JitenonYoji
from jitenon_kotowaza import JitenonKotowaza
import yomichan.export as YomichanExport
from entries.jitenon_kotowaza import JitenonKotowaza
from entries.jitenon_yoji import JitenonYoji
def run_all():
@ -13,7 +13,7 @@ def run_all():
def jitenon_yoji():
entries = {}
seq_to_entries = {}
jitenon = Scraper.Jitenon()
gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@ -24,34 +24,18 @@ def jitenon_yoji():
for kana_a in kana_soup.select(".word_box a", href=True):
kana_href = kana_a['href']
sequence = int(re.search(r"([0-9]+).html", kana_href).group(1))
if sequence in entries:
if sequence in seq_to_entries:
continue
yoji_doc = jitenon.scrape(kana_href)
entry = JitenonYoji(sequence)
entry.add_document(yoji_doc)
entries[sequence] = entry
terms = []
attribution = ""
modified_date = None
for entry in entries.values():
if modified_date is None or entry.modified_date > modified_date:
modified_date = entry.modified_date
attribution = entry.attribution
for term in entry.yomichan_terms():
terms.append(term)
index = {
"title": "四字熟語辞典オンライン",
"revision": f"jitenon-yoji.{modified_date}",
"sequenced": True,
"format": 3,
"url": "https://yoji.jitenon.jp/",
"attribution": attribution,
}
Yomichan.create_zip(terms, index)
seq_to_entries[sequence] = entry
entries = seq_to_entries.values()
YomichanExport.jitenon_yoji(entries)
def jitenon_kotowaza():
entries = {}
seq_to_entries = {}
jitenon = Scraper.Jitenon()
gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@ -65,29 +49,12 @@ def jitenon_kotowaza():
if m:
sequence = int(m.group(1))
else:
# print(f"Skipping {kana_href}")
continue
if sequence in entries:
if sequence in seq_to_entries:
continue
kotowaza_doc = jitenon.scrape(kana_href)
entry = JitenonKotowaza(sequence)
entry.add_document(kotowaza_doc)
entries[sequence] = entry
terms = []
attribution = ""
modified_date = None
for entry in entries.values():
if modified_date is None or entry.modified_date > modified_date:
modified_date = entry.modified_date
attribution = entry.attribution
for term in entry.yomichan_terms():
terms.append(term)
index = {
"title": "故事・ことわざ・慣用句オンライン",
"revision": f"jitenon-kotowaza.{modified_date}",
"sequenced": True,
"format": 3,
"url": "https://kotowaza.jitenon.jp/",
"attribution": attribution,
}
Yomichan.create_zip(terms, index)
seq_to_entries[sequence] = entry
entries = seq_to_entries.values()
YomichanExport.jitenon_kotowaza(entries)

View file

@ -2,22 +2,11 @@ import re
from datetime import datetime, date
from bs4 import BeautifulSoup
import yomichan as Yomichan
import yomichan.soup as YomichanSoup
import util as Util
class JitenonYoji:
columns = {
"四字熟語": ["expression", ""],
"読み方": ["yomikata", ""],
"意味": ["imi", ""],
"出典": ["shutten", ""],
"漢検級": ["kankenkyuu", ""],
"場面用途": ["bamenyouto", ""],
"異形": ["ikei", []],
"類義語": ["ruigigo", []],
}
class Jitenon:
def __init__(self, sequence):
self.sequence = sequence
self.yomichan_glossary = [""]
@ -35,26 +24,10 @@ class JitenonYoji:
colname = ""
for row in rows:
colname = row.th.text if row.th is not None else colname
colval = row.td.decode_contents()
colval = row.td.text
self.__set_column(colname, colval)
self.yomichan_glossary = [Yomichan.soup_to_gloss(table)]
def yomichan_terms(self):
terms = []
for idx, headword in enumerate(self.__headwords()):
(yoji, reading) = headword
definition_tags = None
inflection_rules = ""
score = -idx
glossary = self.yomichan_glossary
sequence = self.sequence
term_tags = ""
term = [
yoji, reading, definition_tags, inflection_rules,
score, glossary, sequence, term_tags
]
terms.append(term)
return terms
gloss = YomichanSoup.make_gloss(table) # note: modifies table
self.yomichan_glossary = [gloss]
def __set_modified_date(self, html):
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
@ -76,7 +49,7 @@ class JitenonYoji:
attr_value.append(colval)
setattr(self, attr_name, attr_value)
def __headwords(self):
def _headwords(self):
words = []
for yomikata in self.__yomikatas():
headword = [self.expression, yomikata]
@ -91,33 +64,35 @@ class JitenonYoji:
def __yomikatas(self):
yomikata = self.yomikata.replace(" ", "")
m = re.search(r"^[ぁ-ヿ]+$", yomikata)
m = re.search(r"^[ぁ-ヿ]+$", yomikata)
if m:
return [yomikata]
m = re.search(r"^([ぁ-ヿ]+)<br/>", yomikata)
m = re.search(r"^([ぁ-ヿ、]+)※", yomikata)
if m:
return [m.group(1)]
m = re.search(r"^[ぁ-ヿ]+[ぁ-ヿ][ぁ-ヿ]+$", yomikata)
m = re.search(r"^[ぁ-ヿ]+[ぁ-ヿ][ぁ-ヿ]+$", yomikata)
if m:
return Util.expand_shouryaku(yomikata)
m = re.search(r"^([ぁ-ヿ]+)([ぁ-ヿ/\s]+)$", yomikata)
m = re.search(r"^([ぁ-ヿ]+)([ぁ-ヿ/\s]+)$", yomikata)
if m:
yomikatas = [m.group(1)]
alts = m.group(2).split("/")
for alt in alts:
yomikatas.append(alt.strip())
return yomikatas
raise Exception(f"Invalid 読み方 format: {self.yomikata}\n{self}")
print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
return ""
def __ikei_headwords(self):
ikei_headwords = []
for val in self.ikei:
m = re.search(r"^([^]+)([ぁ-ヿ]+)$", val)
val = val.replace(" ", "")
m = re.search(r"^([^]+)([ぁ-ヿ、]+)$", val)
if m:
headword = [m.group(1), m.group(2)]
ikei_headwords.append(headword)
else:
raise Exception(f"Invalid 異形 format: {val}\n{self}")
print(f"Invalid 異形 format: {val}\n{self}\n")
return ikei_headwords
def __str__(self):

View file

@ -0,0 +1,34 @@
from entries.jitenon import Jitenon
import yomichan.grammar as Grammar
class JitenonKotowaza(Jitenon):
    """Jitenon entry for the proverb/idiom dictionary.

    ``columns`` is keyed by the Japanese column label; each value is a
    two-item list of an attribute name and that attribute's initial value
    (an empty string or an empty list).
    """

    # NOTE(review): how the base class consumes this mapping is not shown
    # here; presumably it copies each initial value onto the instance.
    columns = {
        "言葉": ["expression", ""],
        "読み方": ["yomikata", ""],
        "意味": ["imi", ""],
        "出典": ["shutten", ""],
        "例文": ["reibun", ""],
        "異形": ["ikei", []],
        "類句": ["ruiku", []],
    }

    def __init__(self, sequence):
        """Initialise the entry with its sequence number."""
        super().__init__(sequence)

    def yomichan_terms(self):
        """Return one term row per headword of this entry.

        Each row is ``[expression, reading, definition_tags,
        inflection_rules, score, glossary, sequence, term_tags]``.
        The score is the negated headword position, so the first headword
        gets 0 and later ones get progressively lower scores.
        """
        return [
            [
                expression,
                reading,
                None,  # definition tags
                Grammar.sudachi_rules(expression, reading),
                -position,
                self.yomichan_glossary,
                self.sequence,
                "",  # term tags
            ]
            for position, (expression, reading)
            in enumerate(self._headwords())
        ]

38
entries/jitenon_yoji.py Normal file
View file

@ -0,0 +1,38 @@
from entries.jitenon import Jitenon
class JitenonYoji(Jitenon):
    """Jitenon entry for the four-character-idiom dictionary.

    ``columns`` is keyed by the Japanese column label; each value is a
    two-item list of an attribute name and that attribute's initial value
    (an empty string or an empty list).
    """

    # NOTE(review): how the base class consumes this mapping is not shown
    # here; presumably it copies each initial value onto the instance.
    columns = {
        "四字熟語": ["expression", ""],
        "読み方": ["yomikata", ""],
        "意味": ["imi", ""],
        "出典": ["shutten", ""],
        "漢検級": ["kankenkyuu", ""],
        "場面用途": ["bamenyouto", ""],
        "異形": ["ikei", []],
        "類義語": ["ruigigo", []],
    }

    def __init__(self, sequence):
        """Initialise the entry with its sequence number."""
        super().__init__(sequence)

    def yomichan_terms(self):
        """Return one term row per headword of this entry.

        Each row is ``[expression, reading, definition_tags,
        inflection_rules, score, glossary, sequence, term_tags]``.
        Inflection rules are always empty for these entries; the score is
        the negated headword position.
        """
        return [
            [
                expression,
                reading,
                None,  # definition tags
                "",  # inflection rules
                -position,
                self.yomichan_glossary,
                self.sequence,
                self.__term_tags(),
            ]
            for position, (expression, reading)
            in enumerate(self._headwords())
        ]

    def __term_tags(self):
        """Return the ``kankenkyuu`` value as space-separated tag names.

        Spaces are removed first, then the value is split on ``/``.
        """
        compact = self.kankenkyuu.replace(" ", "")
        return " ".join(compact.split("/"))

88
yomichan/export.py Normal file
View file

@ -0,0 +1,88 @@
import json
import os
import shutil
import uuid
from pathlib import Path
def jitenon_yoji(entries):
    """Export ``entries`` as the 四字熟語辞典オンライン dictionary archive.

    Collects the term rows, the newest modification date and its
    attribution from the entries, then writes the archive together with a
    tag bank describing the Kanji Kentei level tags.
    """
    terms, modified_date, attribution = __terms(entries)

    # (tag name, notes) pairs; the remaining tag-row fields are constant.
    # NOTE(review): row layout presumably follows the Yomichan tag-bank
    # schema [name, category, order, notes, score] -- confirm.
    kanken_levels = [
        ("1級", "漢字検定(漢検)1級の四字熟語"),
        ("準1級", "漢字検定(漢検)準1級の四字熟語"),
        ("2級", "漢字検定(漢検)2級の四字熟語"),
        ("準2級", "漢字検定(漢検)準2級の四字熟語"),
        ("3級", "漢字検定(漢検)3級の四字熟語"),
        ("4級", "漢字検定(漢検)4級の四字熟語"),
        ("5級", "漢字検定(漢検)5級の四字熟語"),
    ]
    tags = [[name, "frequent", 0, notes, 0] for name, notes in kanken_levels]

    index = {}
    index["title"] = "四字熟語辞典オンライン"
    index["revision"] = f"jitenon-yoji.{modified_date}"
    index["sequenced"] = True
    index["format"] = 3
    index["url"] = "https://yoji.jitenon.jp/"
    index["attribution"] = attribution

    __create_zip(terms, index, tags)
def jitenon_kotowaza(entries):
    """Export ``entries`` as the 故事・ことわざ・慣用句オンライン archive.

    Collects the term rows, the newest modification date and its
    attribution from the entries, then writes the archive. No tag bank is
    written for this dictionary.
    """
    terms, modified_date, attribution = __terms(entries)

    index = {}
    index["title"] = "故事・ことわざ・慣用句オンライン"
    index["revision"] = f"jitenon-kotowaza.{modified_date}"
    index["sequenced"] = True
    index["format"] = 3
    index["url"] = "https://kotowaza.jitenon.jp/"
    index["attribution"] = attribution

    __create_zip(terms, index)
def __terms(entries):
    """Flatten entries into term rows and find the newest entry metadata.

    Args:
        entries: Iterable of objects exposing ``modified_date``,
            ``attribution`` and a ``yomichan_terms()`` method.

    Returns:
        A ``(terms, modified_date, attribution)`` tuple: every term row of
        every entry in iteration order, the greatest ``modified_date``
        seen, and the ``attribution`` of the first entry carrying that
        date. For an empty iterable this is ``([], None, "")``.
    """
    terms = []
    modified_date = None
    attribution = ""
    for entry in entries:
        # Strict ">" keeps the attribution of the first entry that has the
        # newest date when several entries share it.
        if modified_date is None or entry.modified_date > modified_date:
            modified_date = entry.modified_date
            attribution = entry.attribution
        terms.extend(entry.yomichan_terms())
    return terms, modified_date, attribution
def __create_zip(terms, index, tags=None):
    """Write a dictionary archive to ``output/<index["title"]>.zip``.

    The archive contains ``term_bank_N.json`` files holding at most 1000
    term rows each, ``index.json``, and -- only when ``tags`` is non-empty
    -- ``tag_bank_1.json``. Files are staged in a uniquely named directory
    under the current working directory, which is removed afterwards even
    if writing or archiving fails. An existing archive of the same name in
    ``output`` is replaced.

    Args:
        terms: Sequence of JSON-serialisable term rows.
        index: JSON-serialisable mapping; its ``"title"`` names the archive.
        tags: Optional sequence of JSON-serialisable tag rows.
    """
    if tags is None:
        tags = []

    terms_per_file = 1000
    # Ceiling division so an exact multiple of ``terms_per_file`` does not
    # produce a trailing empty bank; an empty term list still gets one file.
    bank_count = max(1, -(-len(terms) // terms_per_file))

    zip_filename = index["title"]
    zip_file = f"{zip_filename}.zip"

    build_directory = str(uuid.uuid4())
    os.mkdir(build_directory)
    try:
        for i in range(bank_count):
            term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
            start = terms_per_file * i
            end = terms_per_file * (i + 1)
            with open(term_file, "w", encoding='utf8') as f:
                json.dump(terms[start:end], f, indent=4, ensure_ascii=False)

        index_file = os.path.join(build_directory, "index.json")
        with open(index_file, 'w', encoding='utf8') as f:
            json.dump(index, f, indent=4, ensure_ascii=False)

        if len(tags) > 0:
            tag_file = os.path.join(build_directory, "tag_bank_1.json")
            with open(tag_file, 'w', encoding='utf8') as f:
                json.dump(tags, f, indent=4, ensure_ascii=False)

        shutil.make_archive(zip_filename, "zip", build_directory)
    finally:
        # Never leave the staging directory behind, even on failure.
        shutil.rmtree(build_directory)

    out_dir = "output"
    out_file = os.path.join(out_dir, zip_file)
    if not Path(out_dir).is_dir():
        os.mkdir(out_dir)
    elif Path(out_file).is_file():
        # shutil.move refuses to overwrite an existing destination file.
        os.remove(out_file)
    shutil.move(zip_file, out_dir)

38
yomichan/grammar.py Normal file
View file

@ -0,0 +1,38 @@
from sudachipy import tokenizer
from sudachipy import dictionary
# Lazily created Sudachi tokenizer, shared by every call: building the
# dictionary is expensive and this function runs once per headword.
_tokenizer_obj = None


def sudachi_rules(expression, reading):
    """Return the inflection-rule string for ``expression``.

    The expression is tokenized with Sudachi in split mode A; element 4 of
    the last token's part-of-speech tuple is split on ``-`` and handed to
    ``__sudachi_tags_to_rules`` together with the expression and reading.

    Args:
        expression: Headword text to analyse.
        reading: Reading of the headword.

    Returns:
        The space-separated rule names produced by
        ``__sudachi_tags_to_rules``. When tokenization yields no tokens,
        no tags are passed on instead of raising ``IndexError``.
    """
    global _tokenizer_obj
    if _tokenizer_obj is None:
        _tokenizer_obj = dictionary.Dictionary().create()
    splitmode = tokenizer.Tokenizer.SplitMode.A
    tokens = _tokenizer_obj.tokenize(expression, splitmode)
    if len(tokens) == 0:
        tags = []
    else:
        # NOTE(review): index 4 is presumably the conjugation-type field
        # of Sudachi's part-of-speech tuple -- confirm against its docs.
        pos = tokens[len(tokens)-1].part_of_speech()[4]
        tags = pos.split("-")
    rules = __sudachi_tags_to_rules(tags, expression, reading)
    return rules
def __sudachi_tags_to_rules(tags, expression, reading):
    """Translate Sudachi tags into a space-separated inflection-rule string.

    Args:
        tags: Strings obtained by splitting a Sudachi part-of-speech field
            on ``-``.
        expression: Headword text; its ending decides which rules apply.
        reading: Reading of the headword; only consulted for the ``vk``
            special case.

    Returns:
        The collected rule names (``adj-i``, ``v1``, ``v5``, ``vs``,
        ``vz``, ``vk``) joined by single spaces, or an empty string when
        none applies. The names come from a set, so their order in the
        result is the set's iteration order, not a sorted order.
    """
    # NOTE(review): every element of this list, and several literals
    # below, appear here as empty strings. They look like single Japanese
    # characters lost in extraction; as written, ``endswith("")`` and
    # ``"" in tag`` are always true. Restore them from the repository
    # before relying on this function.
    u_endings = ["", "", "", "", "", "", "",
                 "", "", "", "", "", "", ""]
    rules = set()
    for tag in tags:
        # i-adjective check (ending literal garbled, see note above).
        if expression.endswith(""):
            if tag == "形容詞" or "ナイ" in tag or "タイ" in tag:
                rules.add("adj-i")
        # v1 check (ending literal and first tag literal garbled).
        if expression.endswith(""):
            if "" in tag or tag == "レル":
                rules.add("v1")
        # v5 check: one matching ending from ``u_endings`` is enough.
        if "" in tag or "" in tag or "" in tag:
            for u_ending in u_endings:
                if expression.endswith(u_ending):
                    rules.add("v5")
                    break
        # vs / vz checks for expressions ending in する / ずる.
        if "" in tag and (expression.endswith("する") or expression == "為る"):
            rules.add("vs")
        if "" in tag and expression.endswith("ずる"):
            rules.add("vz")
    # 来る read as くる overrides everything collected above with "vk".
    # NOTE(review): the source's indentation was lost; this check is
    # placed after the loop here -- confirm against the repository.
    if expression.endswith("来る") and reading.endswith("くる"):
        rules = set()
        rules.add("vk")
    return " ".join(list(rules))

View file

@ -1,49 +1,9 @@
import json
import os
import shutil
import uuid
import re
from pathlib import Path
from css_parser import parseStyle
def create_zip(terms, index, tags=[]):
build_directory = str(uuid.uuid4())
os.mkdir(build_directory)
terms_per_file = 1000
max_i = int(len(terms) / terms_per_file) + 1
for i in range(max_i):
term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
with open(term_file, "w", encoding='utf8') as f:
start = terms_per_file * i
end = terms_per_file * (i + 1)
json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
index_file = os.path.join(build_directory, "index.json")
with open(index_file, 'w', encoding='utf8') as f:
json.dump(index, f, indent=4, ensure_ascii=False)
if len(tags) > 0:
tag_file = os.path.join(build_directory, "tag_bank_1.json")
with open(tag_file, 'w', encoding='utf8') as f:
json.dump(tags, f, indent=4, ensure_ascii=False)
zip_filename = index["title"]
zip_file = f"{zip_filename}.zip"
shutil.make_archive(zip_filename, "zip", build_directory)
out_dir = "output"
out_file = os.path.join(out_dir, zip_file)
if not Path(out_dir).is_dir():
os.mkdir(out_dir)
elif Path(out_file).is_file():
os.remove(out_file)
shutil.move(zip_file, out_dir)
shutil.rmtree(build_directory)
def soup_to_gloss(soup):
__sanitize_soup(soup)
def make_gloss(soup):
__preprocess_soup(soup)
structured_content = __get_markup_structure(soup)
return {
"type": "structured-content",
@ -51,10 +11,10 @@ def soup_to_gloss(soup):
}
def __sanitize_soup(soup):
def __preprocess_soup(soup):
patterns = [
r"^(.+)[ぁ-ヿ]+$",
r"^(.+)[ぁ-ヿ]+[ぁ-ヿ][ぁ-ヿ]+$"
r"^(.+)[ぁ-ヿ\s]+$",
r"^(.+)[ぁ-ヿ\s]+[ぁ-ヿ\s][ぁ-ヿ\s]+$"
]
for a in soup.find_all("a"):
for pattern in patterns: