Finishing touches on first Jitenon Yoji version
This commit is contained in:
parent
f9ad9e6d21
commit
79632843cb
24
crawlers.py
24
crawlers.py
|
@ -9,33 +9,37 @@ from jitenon_yoji import JitenonYoji
|
||||||
def jitenon_yoji_crawler():
    """Crawl yoji.jitenon.jp and build a Yomichan dictionary archive.

    The crawl walks three levels: the gojuon index page, every kana page
    linked from its ``.kana_area`` element, and every entry page linked from
    a kana page's ``.word_box`` element. Each entry page is parsed into a
    ``JitenonYoji`` keyed by the number found in its URL, so an entry that is
    linked from several kana pages is only downloaded once.

    After crawling, the terms of all entries are collected and handed to
    ``Yomichan.create_zip`` together with an index whose revision and
    attribution come from the most recently modified entry.
    """
    entries = {}
    jitenon = Scraper.Jitenon()
    gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
    gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
    # ``href=True`` is a find_all()-style filter and is not applied by
    # select(); the CSS attribute selector ``a[href]`` is what actually
    # restricts the result to anchors that carry an href.
    for gojuon_a in gojuon_soup.select(".kana_area a[href]"):
        gojuon_href = gojuon_a['href']
        kana_doc = jitenon.scrape(gojuon_href)
        kana_soup = BeautifulSoup(kana_doc, features="html.parser")
        for kana_a in kana_soup.select(".word_box a[href]"):
            kana_href = kana_a['href']
            # The dot is escaped so that only a literal ".html" matches.
            m = re.search(r"([0-9]+)\.html", kana_href)
            if m is None:
                # Not an entry page link; there is no sequence number to
                # key the entry on, so skip it instead of crashing.
                continue
            sequence = int(m.group(1))
            if sequence in entries:
                continue
            yoji_doc = jitenon.scrape(kana_href)
            entry = JitenonYoji(sequence)
            entry.add_document(yoji_doc)
            entries[sequence] = entry

    terms = []
    attribution = ""
    modified_date = None
    for entry in entries.values():
        # Track the newest entry: its date becomes the dictionary revision
        # and its copyright text becomes the attribution.
        if modified_date is None or entry.modified_date > modified_date:
            modified_date = entry.modified_date
            attribution = entry.attribution
        terms.extend(entry.yomichan_terms())

    # NOTE(review): if nothing was crawled, modified_date is still None and
    # the revision string ends in "None" -- confirm whether that can happen.
    index = {
        "title": "四字熟語辞典オンライン",
        "revision": f"jitenon-yoji.{modified_date}",
        "sequenced": True,
        "format": 3,
        "url": "https://yoji.jitenon.jp/",
        "attribution": attribution,
    }
    Yomichan.create_zip(terms, index)
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
import re
|
import re
|
||||||
|
from datetime import datetime, date
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
import yomichan as Yomichan
|
import yomichan as Yomichan
|
||||||
import util as Util
|
import util as Util
|
||||||
|
@ -19,10 +21,15 @@ class JitenonYoji:
|
||||||
def __init__(self, sequence):
    """Create an empty entry identified by ``sequence``.

    The modified date starts at 1970-01-01 and the attribution starts
    empty. One attribute is then created per value of ``self.columns``:
    element 0 of the value is the attribute name and element 1 is the
    attribute's initial value.
    """
    self.sequence = sequence
    self.yomichan_glossary = [""]
    self.modified_date = date(1970, 1, 1)
    self.attribution = ""
    for spec in self.columns.values():
        attr_name = spec[0]
        initial_value = spec[1]
        # NOTE(review): the initial value object is assigned as-is; if it is
        # mutable it would be shared between instances -- confirm against
        # the definition of ``columns``.
        setattr(self, attr_name, initial_value)
|
||||||
|
|
||||||
def add_soup(self, yoji_soup):
|
def add_document(self, html):
|
||||||
|
yoji_soup = BeautifulSoup(html, features="html5lib")
|
||||||
|
self.__set_modified_date(html)
|
||||||
|
self.attribution = yoji_soup.find(class_="copyright").text
|
||||||
table = yoji_soup.find(class_="kanjirighttb")
|
table = yoji_soup.find(class_="kanjirighttb")
|
||||||
rows = table.find("tbody").find_all("tr")
|
rows = table.find("tbody").find_all("tr")
|
||||||
colname = ""
|
colname = ""
|
||||||
|
@ -49,6 +56,13 @@ class JitenonYoji:
|
||||||
terms.append(term)
|
terms.append(term)
|
||||||
return terms
|
return terms
|
||||||
|
|
||||||
|
def __set_modified_date(self, html):
    """Set ``self.modified_date`` from the page's ``dateModified`` value.

    Searches the raw HTML text for a ``"dateModified": "YYYY-MM-DD..."``
    pair and stores the date part as a ``datetime.date``. Any whitespace
    (or none) is accepted between the colon and the value. When no such
    pair is found, or the digits do not form a real calendar date,
    ``self.modified_date`` is left unchanged.

    Args:
        html: The entry page's HTML source as a string.
    """
    m = re.search(r"\"dateModified\":\s*\"(\d{4}-\d{2}-\d{2})", html)
    if not m:
        return
    try:
        # Named ``parsed`` rather than ``date`` so the imported ``date``
        # class is not shadowed inside this method.
        parsed = datetime.strptime(m.group(1), '%Y-%m-%d').date()
    except ValueError:
        # Digits matched the pattern but are not a valid date (e.g. month
        # 13); keep the existing value, as for the no-match case.
        return
    self.modified_date = parsed
|
||||||
|
|
||||||
def __set_column(self, colname, colval):
|
def __set_column(self, colname, colval):
|
||||||
attr_name = self.columns[colname][0]
|
attr_name = self.columns[colname][0]
|
||||||
attr_value = getattr(self, attr_name)
|
attr_value = getattr(self, attr_name)
|
||||||
|
|
56
yomichan.py
56
yomichan.py
|
@ -2,6 +2,7 @@ import json
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import uuid
|
import uuid
|
||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from css_parser import parseStyle
|
from css_parser import parseStyle
|
||||||
|
|
||||||
|
@ -10,7 +11,7 @@ def create_zip(terms, index, tags=[]):
|
||||||
build_directory = str(uuid.uuid4())
|
build_directory = str(uuid.uuid4())
|
||||||
os.mkdir(build_directory)
|
os.mkdir(build_directory)
|
||||||
|
|
||||||
terms_per_file = 500
|
terms_per_file = 1000
|
||||||
max_i = int(len(terms) / terms_per_file) + 1
|
max_i = int(len(terms) / terms_per_file) + 1
|
||||||
for i in range(max_i):
|
for i in range(max_i):
|
||||||
term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
|
term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
|
||||||
|
@ -30,16 +31,19 @@ def create_zip(terms, index, tags=[]):
|
||||||
|
|
||||||
zip_filename = index["title"]
|
zip_filename = index["title"]
|
||||||
zip_file = f"{zip_filename}.zip"
|
zip_file = f"{zip_filename}.zip"
|
||||||
if Path(zip_file).is_file():
|
|
||||||
os.remove(zip_file)
|
|
||||||
shutil.make_archive(zip_filename, "zip", build_directory)
|
shutil.make_archive(zip_filename, "zip", build_directory)
|
||||||
if not Path("output").is_dir():
|
out_dir = "output"
|
||||||
os.mkdir("output")
|
out_file = os.path.join(out_dir, zip_file)
|
||||||
shutil.move(zip_file, "output")
|
if not Path(out_dir).is_dir():
|
||||||
|
os.mkdir(out_dir)
|
||||||
|
elif Path(out_file).is_file():
|
||||||
|
os.remove(out_file)
|
||||||
|
shutil.move(zip_file, out_dir)
|
||||||
shutil.rmtree(build_directory)
|
shutil.rmtree(build_directory)
|
||||||
|
|
||||||
|
|
||||||
def soup_to_gloss(soup):
|
def soup_to_gloss(soup):
|
||||||
|
__sanitize_soup(soup)
|
||||||
structured_content = __get_markup_structure(soup)
|
structured_content = __get_markup_structure(soup)
|
||||||
return {
|
return {
|
||||||
"type": "structured-content",
|
"type": "structured-content",
|
||||||
|
@ -47,6 +51,23 @@ def soup_to_gloss(soup):
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def __sanitize_soup(soup):
    """Rewrite links, paragraphs and header cells of ``soup`` in place.

    * Every ``<a>`` whose text matches one of the patterns below gets its
      ``href`` replaced by a ``?query=...&wildcards=off`` link built from
      the first capture group of the first matching pattern.
    * Every ``<p>`` is renamed to ``<span>``.
    * Every ``<th>`` gets a centering inline style.

    Nothing is returned; the soup object itself is modified.
    """
    # NOTE(review): as written, any text matching the second pattern also
    # matches the first, so the second looks unreachable. The bare groups
    # may originally have been full-width parentheses -- verify against the
    # repository before relying on these expressions.
    patterns = [
        r"^(.+)([ぁ-ヿ]+)$",
        r"^(.+)([ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+)$"
    ]
    for anchor in soup.find_all("a"):
        # Patterns are tried in order and evaluation stops at the first hit.
        hit = next(
            (
                found
                for found in (re.search(p, anchor.text) for p in patterns)
                if found
            ),
            None,
        )
        if hit is not None:
            anchor['href'] = f"?query={hit.group(1)}&wildcards=off"

    for paragraph in soup.find_all("p"):
        paragraph.name = "span"

    for header_cell in soup.find_all("th"):
        header_cell['style'] = "vertical-align: middle; text-align: center;"
|
||||||
|
|
||||||
|
|
||||||
def __get_markup_structure(soup):
|
def __get_markup_structure(soup):
|
||||||
node = {}
|
node = {}
|
||||||
content = []
|
content = []
|
||||||
|
@ -63,11 +84,6 @@ def __get_markup_structure(soup):
|
||||||
for key, val in attributes.items():
|
for key, val in attributes.items():
|
||||||
node[key] = val
|
node[key] = val
|
||||||
|
|
||||||
if node["tag"] == "th":
|
|
||||||
node["style"] = {"verticalAlign": "middle", "textAlign": "center"}
|
|
||||||
elif node["tag"] == "p":
|
|
||||||
node["tag"] = "span"
|
|
||||||
|
|
||||||
if len(content) == 0:
|
if len(content) == 0:
|
||||||
pass
|
pass
|
||||||
elif len(content) == 1:
|
elif len(content) == 1:
|
||||||
|
@ -94,16 +110,18 @@ def __get_attributes(attrs):
|
||||||
def __get_style(inline_style_string):
    """Convert an inline CSS string into a dict of camelCase style keys.

    The string is parsed with ``parseStyle``. Each supported property whose
    parsed value is not the empty string is copied into the result. The
    ``textDecoration`` value is stored under the key
    ``textDecorationLine``; every other property keeps its own name.

    Args:
        inline_style_string: Contents of an element's ``style`` attribute.

    Returns:
        A dict containing only the supported properties that were set.
    """
    parsed = parseStyle(inline_style_string)
    # (attribute read from the parsed declaration, key written to the result)
    property_map = (
        ("fontStyle", "fontStyle"),
        ("fontWeight", "fontWeight"),
        ("fontSize", "fontSize"),
        ("textDecoration", "textDecorationLine"),
        ("verticalAlign", "verticalAlign"),
        ("textAlign", "textAlign"),
        ("listStyleType", "listStyleType"),
    )
    style = {}
    for source_attr, target_key in property_map:
        value = getattr(parsed, source_attr)
        if value != "":
            style[target_key] = value
    return style
|
||||||
|
|
Loading…
Reference in a new issue