Reorganize file structure

parent 16d694d2d2
commit 83a182e682

crawlers.py (59 lines changed)
@@ -2,9 +2,9 @@ import re
 from bs4 import BeautifulSoup
 
 import scraper as Scraper
-import yomichan as Yomichan
-from jitenon_yoji import JitenonYoji
-from jitenon_kotowaza import JitenonKotowaza
+import yomichan.export as YomichanExport
+from entries.jitenon_kotowaza import JitenonKotowaza
+from entries.jitenon_yoji import JitenonYoji
 
 
 def run_all():
@@ -13,7 +13,7 @@ def run_all():
 
 
 def jitenon_yoji():
-    entries = {}
+    seq_to_entries = {}
     jitenon = Scraper.Jitenon()
     gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
     gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -24,34 +24,18 @@ def jitenon_yoji():
         for kana_a in kana_soup.select(".word_box a", href=True):
             kana_href = kana_a['href']
             sequence = int(re.search(r"([0-9]+).html", kana_href).group(1))
-            if sequence in entries:
+            if sequence in seq_to_entries:
                 continue
             yoji_doc = jitenon.scrape(kana_href)
             entry = JitenonYoji(sequence)
             entry.add_document(yoji_doc)
-            entries[sequence] = entry
-    terms = []
-    attribution = ""
-    modified_date = None
-    for entry in entries.values():
-        if modified_date is None or entry.modified_date > modified_date:
-            modified_date = entry.modified_date
-            attribution = entry.attribution
-        for term in entry.yomichan_terms():
-            terms.append(term)
-    index = {
-        "title": "四字熟語辞典オンライン",
-        "revision": f"jitenon-yoji.{modified_date}",
-        "sequenced": True,
-        "format": 3,
-        "url": "https://yoji.jitenon.jp/",
-        "attribution": attribution,
-    }
-    Yomichan.create_zip(terms, index)
+            seq_to_entries[sequence] = entry
+    entries = seq_to_entries.values()
+    YomichanExport.jitenon_yoji(entries)
 
 
 def jitenon_kotowaza():
-    entries = {}
+    seq_to_entries = {}
     jitenon = Scraper.Jitenon()
     gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
     gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
@@ -65,29 +49,12 @@ def jitenon_kotowaza():
             if m:
                 sequence = int(m.group(1))
             else:
-                # print(f"Skipping {kana_href}")
                 continue
-            if sequence in entries:
+            if sequence in seq_to_entries:
                 continue
             kotowaza_doc = jitenon.scrape(kana_href)
             entry = JitenonKotowaza(sequence)
             entry.add_document(kotowaza_doc)
-            entries[sequence] = entry
-    terms = []
-    attribution = ""
-    modified_date = None
-    for entry in entries.values():
-        if modified_date is None or entry.modified_date > modified_date:
-            modified_date = entry.modified_date
-            attribution = entry.attribution
-        for term in entry.yomichan_terms():
-            terms.append(term)
-    index = {
-        "title": "故事・ことわざ・慣用句オンライン",
-        "revision": f"jitenon-kotowaza.{modified_date}",
-        "sequenced": True,
-        "format": 3,
-        "url": "https://kotowaza.jitenon.jp/",
-        "attribution": attribution,
-    }
-    Yomichan.create_zip(terms, index)
+            seq_to_entries[sequence] = entry
+    entries = seq_to_entries.values()
+    YomichanExport.jitenon_kotowaza(entries)
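Note: with this change the crawler functions only collect entries; index metadata, tagging, and zip creation now live in yomichan/export.py. A minimal usage sketch under that assumption (running it performs real HTTP requests against jitenon.jp):

import crawlers

crawlers.jitenon_yoji()       # scrape -> entries.jitenon_yoji.JitenonYoji -> yomichan.export.jitenon_yoji()
crawlers.jitenon_kotowaza()   # scrape -> entries.jitenon_kotowaza.JitenonKotowaza -> yomichan.export.jitenon_kotowaza()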
jitenon_yoji.py → entries/jitenon.py
@@ -2,22 +2,11 @@ import re
 from datetime import datetime, date
 from bs4 import BeautifulSoup
 
-import yomichan as Yomichan
+import yomichan.soup as YomichanSoup
 import util as Util
 
 
-class JitenonYoji:
-    columns = {
-        "四字熟語": ["expression", ""],
-        "読み方": ["yomikata", ""],
-        "意味": ["imi", ""],
-        "出典": ["shutten", ""],
-        "漢検級": ["kankenkyuu", ""],
-        "場面用途": ["bamenyouto", ""],
-        "異形": ["ikei", []],
-        "類義語": ["ruigigo", []],
-    }
-
+class Jitenon:
     def __init__(self, sequence):
         self.sequence = sequence
         self.yomichan_glossary = [""]
@@ -35,26 +24,10 @@ class JitenonYoji:
         colname = ""
         for row in rows:
             colname = row.th.text if row.th is not None else colname
-            colval = row.td.decode_contents()
+            colval = row.td.text
             self.__set_column(colname, colval)
-        self.yomichan_glossary = [Yomichan.soup_to_gloss(table)]
-
-    def yomichan_terms(self):
-        terms = []
-        for idx, headword in enumerate(self.__headwords()):
-            (yoji, reading) = headword
-            definition_tags = None
-            inflection_rules = ""
-            score = -idx
-            glossary = self.yomichan_glossary
-            sequence = self.sequence
-            term_tags = ""
-            term = [
-                yoji, reading, definition_tags, inflection_rules,
-                score, glossary, sequence, term_tags
-            ]
-            terms.append(term)
-        return terms
+        gloss = YomichanSoup.make_gloss(table)  # note: modifies table
+        self.yomichan_glossary = [gloss]
 
     def __set_modified_date(self, html):
         m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
@@ -76,7 +49,7 @@ class JitenonYoji:
             attr_value.append(colval)
         setattr(self, attr_name, attr_value)
 
-    def __headwords(self):
+    def _headwords(self):
         words = []
         for yomikata in self.__yomikatas():
             headword = [self.expression, yomikata]
@@ -91,33 +64,35 @@ class JitenonYoji:
 
     def __yomikatas(self):
         yomikata = self.yomikata.replace(" ", "")
-        m = re.search(r"^[ぁ-ヿ]+$", yomikata)
+        m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
         if m:
             return [yomikata]
-        m = re.search(r"^([ぁ-ヿ]+)<br/>", yomikata)
+        m = re.search(r"^([ぁ-ヿ、]+)※", yomikata)
         if m:
             return [m.group(1)]
-        m = re.search(r"^[ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+$", yomikata)
+        m = re.search(r"^[ぁ-ヿ、]+([ぁ-ヿ、])[ぁ-ヿ、]+$", yomikata)
         if m:
             return Util.expand_shouryaku(yomikata)
-        m = re.search(r"^([ぁ-ヿ]+)(([ぁ-ヿ/\s]+))$", yomikata)
+        m = re.search(r"^([ぁ-ヿ、]+)(([ぁ-ヿ/\s、]+))$", yomikata)
         if m:
             yomikatas = [m.group(1)]
             alts = m.group(2).split("/")
             for alt in alts:
                 yomikatas.append(alt.strip())
             return yomikatas
-        raise Exception(f"Invalid 読み方 format: {self.yomikata}\n{self}")
+        print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
+        return ""
 
     def __ikei_headwords(self):
         ikei_headwords = []
         for val in self.ikei:
-            m = re.search(r"^([^(]+)(([ぁ-ヿ]+))$", val)
+            val = val.replace(" ", "")
+            m = re.search(r"^([^(]+)(([ぁ-ヿ、]+))$", val)
             if m:
                 headword = [m.group(1), m.group(2)]
                 ikei_headwords.append(headword)
             else:
-                raise Exception(f"Invalid 異形 format: {val}\n{self}")
+                print(f"Invalid 異形 format: {val}\n{self}\n")
         return ikei_headwords
 
     def __str__(self):
entries/jitenon_kotowaza.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+from entries.jitenon import Jitenon
+import yomichan.grammar as Grammar
+
+
+class JitenonKotowaza(Jitenon):
+    columns = {
+        "言葉": ["expression", ""],
+        "読み方": ["yomikata", ""],
+        "意味": ["imi", ""],
+        "出典": ["shutten", ""],
+        "例文": ["reibun", ""],
+        "異形": ["ikei", []],
+        "類句": ["ruiku", []],
+    }
+
+    def __init__(self, sequence):
+        Jitenon.__init__(self, sequence)
+
+    def yomichan_terms(self):
+        terms = []
+        for idx, headword in enumerate(self._headwords()):
+            (expression, reading) = headword
+            definition_tags = None
+            inflection_rules = Grammar.sudachi_rules(expression, reading)
+            score = -idx
+            glossary = self.yomichan_glossary
+            sequence = self.sequence
+            term_tags = ""
+            term = [
+                expression, reading, definition_tags, inflection_rules,
+                score, glossary, sequence, term_tags
+            ]
+            terms.append(term)
+        return terms
entries/jitenon_yoji.py (new file, 38 lines)
@@ -0,0 +1,38 @@
+from entries.jitenon import Jitenon
+
+
+class JitenonYoji(Jitenon):
+    columns = {
+        "四字熟語": ["expression", ""],
+        "読み方": ["yomikata", ""],
+        "意味": ["imi", ""],
+        "出典": ["shutten", ""],
+        "漢検級": ["kankenkyuu", ""],
+        "場面用途": ["bamenyouto", ""],
+        "異形": ["ikei", []],
+        "類義語": ["ruigigo", []],
+    }
+
+    def __init__(self, sequence):
+        Jitenon.__init__(self, sequence)
+
+    def yomichan_terms(self):
+        terms = []
+        for idx, headword in enumerate(self._headwords()):
+            (expression, reading) = headword
+            definition_tags = None
+            inflection_rules = ""
+            score = -idx
+            glossary = self.yomichan_glossary
+            sequence = self.sequence
+            term_tags = self.__term_tags()
+            term = [
+                expression, reading, definition_tags, inflection_rules,
+                score, glossary, sequence, term_tags
+            ]
+            terms.append(term)
+        return terms
+
+    def __term_tags(self):
+        tags = self.kankenkyuu.replace(" ", "").split("/")
+        return " ".join(tags)
yomichan/export.py (new file, 88 lines)
@@ -0,0 +1,88 @@
+import json
+import os
+import shutil
+import uuid
+from pathlib import Path
+
+
+def jitenon_yoji(entries):
+    terms, modified_date, attribution = __terms(entries)
+    index = {
+        "title": "四字熟語辞典オンライン",
+        "revision": f"jitenon-yoji.{modified_date}",
+        "sequenced": True,
+        "format": 3,
+        "url": "https://yoji.jitenon.jp/",
+        "attribution": attribution,
+    }
+    tags = [
+        ["1級", "frequent", 0, "漢字検定(漢検)1級の四字熟語", 0],
+        ["準1級", "frequent", 0, "漢字検定(漢検)準1級の四字熟語", 0],
+        ["2級", "frequent", 0, "漢字検定(漢検)2級の四字熟語", 0],
+        ["準2級", "frequent", 0, "漢字検定(漢検)準2級の四字熟語", 0],
+        ["3級", "frequent", 0, "漢字検定(漢検)3級の四字熟語", 0],
+        ["4級", "frequent", 0, "漢字検定(漢検)4級の四字熟語", 0],
+        ["5級", "frequent", 0, "漢字検定(漢検)5級の四字熟語", 0],
+    ]
+    __create_zip(terms, index, tags)
+
+
+def jitenon_kotowaza(entries):
+    terms, modified_date, attribution = __terms(entries)
+    index = {
+        "title": "故事・ことわざ・慣用句オンライン",
+        "revision": f"jitenon-kotowaza.{modified_date}",
+        "sequenced": True,
+        "format": 3,
+        "url": "https://kotowaza.jitenon.jp/",
+        "attribution": attribution,
+    }
+    __create_zip(terms, index)
+
+
+def __terms(entries):
+    terms = []
+    modified_date = None
+    attribution = ""
+    for entry in entries:
+        if modified_date is None or entry.modified_date > modified_date:
+            modified_date = entry.modified_date
+            attribution = entry.attribution
+        for term in entry.yomichan_terms():
+            terms.append(term)
+    return terms, modified_date, attribution
+
+
+def __create_zip(terms, index, tags=[]):
+    build_directory = str(uuid.uuid4())
+    os.mkdir(build_directory)
+
+    terms_per_file = 1000
+    max_i = int(len(terms) / terms_per_file) + 1
+    for i in range(max_i):
+        term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
+        with open(term_file, "w", encoding='utf8') as f:
+            start = terms_per_file * i
+            end = terms_per_file * (i + 1)
+            json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
+
+    index_file = os.path.join(build_directory, "index.json")
+    with open(index_file, 'w', encoding='utf8') as f:
+        json.dump(index, f, indent=4, ensure_ascii=False)
+
+    if len(tags) > 0:
+        tag_file = os.path.join(build_directory, "tag_bank_1.json")
+        with open(tag_file, 'w', encoding='utf8') as f:
+            json.dump(tags, f, indent=4, ensure_ascii=False)
+
+    zip_filename = index["title"]
+    zip_file = f"{zip_filename}.zip"
+    shutil.make_archive(zip_filename, "zip", build_directory)
+    out_dir = "output"
+    out_file = os.path.join(out_dir, zip_file)
+    if not Path(out_dir).is_dir():
+        os.mkdir(out_dir)
+    elif Path(out_file).is_file():
+        os.remove(out_file)
+    shutil.move(zip_file, out_dir)
+    shutil.rmtree(build_directory)
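For reference, __create_zip assembles the dictionary in a throwaway UUID-named directory before zipping it and moving the archive to output/. The resulting layout, as implied by the code above:

# <uuid build dir>/
#     index.json         title, revision, sequenced, format 3, url, attribution
#     term_bank_1.json   terms 1-1000
#     term_bank_2.json   terms 1001-2000, and so on
#     tag_bank_1.json    only when tags are passed (the 漢検級 tag list for the yoji dictionary)
# packed into "<index title>.zip" and moved to output/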
yomichan/grammar.py (new file, 38 lines)
@@ -0,0 +1,38 @@
+from sudachipy import tokenizer
+from sudachipy import dictionary
+
+
+def sudachi_rules(expression, reading):
+    tokenizer_obj = dictionary.Dictionary().create()
+    splitmode = tokenizer.Tokenizer.SplitMode.A
+    tokens = tokenizer_obj.tokenize(expression, splitmode)
+    pos = tokens[len(tokens)-1].part_of_speech()[4]
+    tags = pos.split("-")
+    rules = __sudachi_tags_to_rules(tags, expression, reading)
+    return rules
+
+
+def __sudachi_tags_to_rules(tags, expression, reading):
+    u_endings = ["う", "く", "す", "つ", "ぬ", "ふ", "む",
+                 "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ"]
+    rules = set()
+    for tag in tags:
+        if expression.endswith("い"):
+            if tag == "形容詞" or "ナイ" in tag or "タイ" in tag:
+                rules.add("adj-i")
+        if expression.endswith("る"):
+            if "一" in tag or tag == "レル":
+                rules.add("v1")
+        if "二" in tag or "四" in tag or "五" in tag:
+            for u_ending in u_endings:
+                if expression.endswith(u_ending):
+                    rules.add("v5")
+                    break
+        if "サ" in tag and (expression.endswith("する") or expression == "為る"):
+            rules.add("vs")
+        if "サ" in tag and expression.endswith("ずる"):
+            rules.add("vz")
+    if expression.endswith("来る") and reading.endswith("くる"):
+        rules = set()
+        rules.add("vk")
+    return " ".join(list(rules))
yomichan.py → yomichan/soup.py
@@ -1,49 +1,9 @@
-import json
-import os
-import shutil
-import uuid
 import re
-from pathlib import Path
 from css_parser import parseStyle
 
 
-def create_zip(terms, index, tags=[]):
-    build_directory = str(uuid.uuid4())
-    os.mkdir(build_directory)
-
-    terms_per_file = 1000
-    max_i = int(len(terms) / terms_per_file) + 1
-    for i in range(max_i):
-        term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
-        with open(term_file, "w", encoding='utf8') as f:
-            start = terms_per_file * i
-            end = terms_per_file * (i + 1)
-            json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
-
-    index_file = os.path.join(build_directory, "index.json")
-    with open(index_file, 'w', encoding='utf8') as f:
-        json.dump(index, f, indent=4, ensure_ascii=False)
-
-    if len(tags) > 0:
-        tag_file = os.path.join(build_directory, "tag_bank_1.json")
-        with open(tag_file, 'w', encoding='utf8') as f:
-            json.dump(tags, f, indent=4, ensure_ascii=False)
-
-    zip_filename = index["title"]
-    zip_file = f"{zip_filename}.zip"
-    shutil.make_archive(zip_filename, "zip", build_directory)
-    out_dir = "output"
-    out_file = os.path.join(out_dir, zip_file)
-    if not Path(out_dir).is_dir():
-        os.mkdir(out_dir)
-    elif Path(out_file).is_file():
-        os.remove(out_file)
-    shutil.move(zip_file, out_dir)
-    shutil.rmtree(build_directory)
-
-
-def soup_to_gloss(soup):
-    __sanitize_soup(soup)
+def make_gloss(soup):
+    __preprocess_soup(soup)
     structured_content = __get_markup_structure(soup)
     return {
         "type": "structured-content",
@@ -51,10 +11,10 @@ def soup_to_gloss(soup):
     }
 
 
-def __sanitize_soup(soup):
+def __preprocess_soup(soup):
     patterns = [
-        r"^(.+)([ぁ-ヿ]+)$",
-        r"^(.+)([ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+)$"
+        r"^(.+)([ぁ-ヿ、\s]+)$",
+        r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$"
     ]
     for a in soup.find_all("a"):
         for pattern in patterns: