Reorganize file structure
This commit is contained in:
parent
16d694d2d2
commit
83a182e682
59
crawlers.py
59
crawlers.py
|
@ -2,9 +2,9 @@ import re
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
import scraper as Scraper
|
import scraper as Scraper
|
||||||
import yomichan as Yomichan
|
import yomichan.export as YomichanExport
|
||||||
from jitenon_yoji import JitenonYoji
|
from entries.jitenon_kotowaza import JitenonKotowaza
|
||||||
from jitenon_kotowaza import JitenonKotowaza
|
from entries.jitenon_yoji import JitenonYoji
|
||||||
|
|
||||||
|
|
||||||
def run_all():
|
def run_all():
|
||||||
|
@ -13,7 +13,7 @@ def run_all():
|
||||||
|
|
||||||
|
|
||||||
def jitenon_yoji():
|
def jitenon_yoji():
|
||||||
entries = {}
|
seq_to_entries = {}
|
||||||
jitenon = Scraper.Jitenon()
|
jitenon = Scraper.Jitenon()
|
||||||
gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
|
gojuon_doc = jitenon.scrape("https://yoji.jitenon.jp/cat/gojuon.html")
|
||||||
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
|
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
|
||||||
|
@ -24,34 +24,18 @@ def jitenon_yoji():
|
||||||
for kana_a in kana_soup.select(".word_box a", href=True):
|
for kana_a in kana_soup.select(".word_box a", href=True):
|
||||||
kana_href = kana_a['href']
|
kana_href = kana_a['href']
|
||||||
sequence = int(re.search(r"([0-9]+).html", kana_href).group(1))
|
sequence = int(re.search(r"([0-9]+).html", kana_href).group(1))
|
||||||
if sequence in entries:
|
if sequence in seq_to_entries:
|
||||||
continue
|
continue
|
||||||
yoji_doc = jitenon.scrape(kana_href)
|
yoji_doc = jitenon.scrape(kana_href)
|
||||||
entry = JitenonYoji(sequence)
|
entry = JitenonYoji(sequence)
|
||||||
entry.add_document(yoji_doc)
|
entry.add_document(yoji_doc)
|
||||||
entries[sequence] = entry
|
seq_to_entries[sequence] = entry
|
||||||
terms = []
|
entries = seq_to_entries.values()
|
||||||
attribution = ""
|
YomichanExport.jitenon_yoji(entries)
|
||||||
modified_date = None
|
|
||||||
for entry in entries.values():
|
|
||||||
if modified_date is None or entry.modified_date > modified_date:
|
|
||||||
modified_date = entry.modified_date
|
|
||||||
attribution = entry.attribution
|
|
||||||
for term in entry.yomichan_terms():
|
|
||||||
terms.append(term)
|
|
||||||
index = {
|
|
||||||
"title": "四字熟語辞典オンライン",
|
|
||||||
"revision": f"jitenon-yoji.{modified_date}",
|
|
||||||
"sequenced": True,
|
|
||||||
"format": 3,
|
|
||||||
"url": "https://yoji.jitenon.jp/",
|
|
||||||
"attribution": attribution,
|
|
||||||
}
|
|
||||||
Yomichan.create_zip(terms, index)
|
|
||||||
|
|
||||||
|
|
||||||
def jitenon_kotowaza():
|
def jitenon_kotowaza():
|
||||||
entries = {}
|
seq_to_entries = {}
|
||||||
jitenon = Scraper.Jitenon()
|
jitenon = Scraper.Jitenon()
|
||||||
gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
|
gojuon_doc = jitenon.scrape("https://kotowaza.jitenon.jp/cat/gojuon.php")
|
||||||
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
|
gojuon_soup = BeautifulSoup(gojuon_doc, features="html.parser")
|
||||||
|
@ -65,29 +49,12 @@ def jitenon_kotowaza():
|
||||||
if m:
|
if m:
|
||||||
sequence = int(m.group(1))
|
sequence = int(m.group(1))
|
||||||
else:
|
else:
|
||||||
# print(f"Skipping {kana_href}")
|
|
||||||
continue
|
continue
|
||||||
if sequence in entries:
|
if sequence in seq_to_entries:
|
||||||
continue
|
continue
|
||||||
kotowaza_doc = jitenon.scrape(kana_href)
|
kotowaza_doc = jitenon.scrape(kana_href)
|
||||||
entry = JitenonKotowaza(sequence)
|
entry = JitenonKotowaza(sequence)
|
||||||
entry.add_document(kotowaza_doc)
|
entry.add_document(kotowaza_doc)
|
||||||
entries[sequence] = entry
|
seq_to_entries[sequence] = entry
|
||||||
terms = []
|
entries = seq_to_entries.values()
|
||||||
attribution = ""
|
YomichanExport.jitenon_kotowaza(entries)
|
||||||
modified_date = None
|
|
||||||
for entry in entries.values():
|
|
||||||
if modified_date is None or entry.modified_date > modified_date:
|
|
||||||
modified_date = entry.modified_date
|
|
||||||
attribution = entry.attribution
|
|
||||||
for term in entry.yomichan_terms():
|
|
||||||
terms.append(term)
|
|
||||||
index = {
|
|
||||||
"title": "故事・ことわざ・慣用句オンライン",
|
|
||||||
"revision": f"jitenon-kotowaza.{modified_date}",
|
|
||||||
"sequenced": True,
|
|
||||||
"format": 3,
|
|
||||||
"url": "https://kotowaza.jitenon.jp/",
|
|
||||||
"attribution": attribution,
|
|
||||||
}
|
|
||||||
Yomichan.create_zip(terms, index)
|
|
||||||
|
|
|
@ -2,22 +2,11 @@ import re
|
||||||
from datetime import datetime, date
|
from datetime import datetime, date
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
import yomichan as Yomichan
|
import yomichan.soup as YomichanSoup
|
||||||
import util as Util
|
import util as Util
|
||||||
|
|
||||||
|
|
||||||
class JitenonYoji:
|
class Jitenon:
|
||||||
columns = {
|
|
||||||
"四字熟語": ["expression", ""],
|
|
||||||
"読み方": ["yomikata", ""],
|
|
||||||
"意味": ["imi", ""],
|
|
||||||
"出典": ["shutten", ""],
|
|
||||||
"漢検級": ["kankenkyuu", ""],
|
|
||||||
"場面用途": ["bamenyouto", ""],
|
|
||||||
"異形": ["ikei", []],
|
|
||||||
"類義語": ["ruigigo", []],
|
|
||||||
}
|
|
||||||
|
|
||||||
def __init__(self, sequence):
|
def __init__(self, sequence):
|
||||||
self.sequence = sequence
|
self.sequence = sequence
|
||||||
self.yomichan_glossary = [""]
|
self.yomichan_glossary = [""]
|
||||||
|
@ -35,26 +24,10 @@ class JitenonYoji:
|
||||||
colname = ""
|
colname = ""
|
||||||
for row in rows:
|
for row in rows:
|
||||||
colname = row.th.text if row.th is not None else colname
|
colname = row.th.text if row.th is not None else colname
|
||||||
colval = row.td.decode_contents()
|
colval = row.td.text
|
||||||
self.__set_column(colname, colval)
|
self.__set_column(colname, colval)
|
||||||
self.yomichan_glossary = [Yomichan.soup_to_gloss(table)]
|
gloss = YomichanSoup.make_gloss(table) # note: modifies table
|
||||||
|
self.yomichan_glossary = [gloss]
|
||||||
def yomichan_terms(self):
|
|
||||||
terms = []
|
|
||||||
for idx, headword in enumerate(self.__headwords()):
|
|
||||||
(yoji, reading) = headword
|
|
||||||
definition_tags = None
|
|
||||||
inflection_rules = ""
|
|
||||||
score = -idx
|
|
||||||
glossary = self.yomichan_glossary
|
|
||||||
sequence = self.sequence
|
|
||||||
term_tags = ""
|
|
||||||
term = [
|
|
||||||
yoji, reading, definition_tags, inflection_rules,
|
|
||||||
score, glossary, sequence, term_tags
|
|
||||||
]
|
|
||||||
terms.append(term)
|
|
||||||
return terms
|
|
||||||
|
|
||||||
def __set_modified_date(self, html):
|
def __set_modified_date(self, html):
|
||||||
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
|
m = re.search(r"\"dateModified\": \"(\d{4}-\d{2}-\d{2})", html)
|
||||||
|
@ -76,7 +49,7 @@ class JitenonYoji:
|
||||||
attr_value.append(colval)
|
attr_value.append(colval)
|
||||||
setattr(self, attr_name, attr_value)
|
setattr(self, attr_name, attr_value)
|
||||||
|
|
||||||
def __headwords(self):
|
def _headwords(self):
|
||||||
words = []
|
words = []
|
||||||
for yomikata in self.__yomikatas():
|
for yomikata in self.__yomikatas():
|
||||||
headword = [self.expression, yomikata]
|
headword = [self.expression, yomikata]
|
||||||
|
@ -91,33 +64,35 @@ class JitenonYoji:
|
||||||
|
|
||||||
def __yomikatas(self):
|
def __yomikatas(self):
|
||||||
yomikata = self.yomikata.replace(" ", "")
|
yomikata = self.yomikata.replace(" ", "")
|
||||||
m = re.search(r"^[ぁ-ヿ]+$", yomikata)
|
m = re.search(r"^[ぁ-ヿ、]+$", yomikata)
|
||||||
if m:
|
if m:
|
||||||
return [yomikata]
|
return [yomikata]
|
||||||
m = re.search(r"^([ぁ-ヿ]+)<br/>", yomikata)
|
m = re.search(r"^([ぁ-ヿ、]+)※", yomikata)
|
||||||
if m:
|
if m:
|
||||||
return [m.group(1)]
|
return [m.group(1)]
|
||||||
m = re.search(r"^[ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+$", yomikata)
|
m = re.search(r"^[ぁ-ヿ、]+([ぁ-ヿ、])[ぁ-ヿ、]+$", yomikata)
|
||||||
if m:
|
if m:
|
||||||
return Util.expand_shouryaku(yomikata)
|
return Util.expand_shouryaku(yomikata)
|
||||||
m = re.search(r"^([ぁ-ヿ]+)(([ぁ-ヿ/\s]+))$", yomikata)
|
m = re.search(r"^([ぁ-ヿ、]+)(([ぁ-ヿ/\s、]+))$", yomikata)
|
||||||
if m:
|
if m:
|
||||||
yomikatas = [m.group(1)]
|
yomikatas = [m.group(1)]
|
||||||
alts = m.group(2).split("/")
|
alts = m.group(2).split("/")
|
||||||
for alt in alts:
|
for alt in alts:
|
||||||
yomikatas.append(alt.strip())
|
yomikatas.append(alt.strip())
|
||||||
return yomikatas
|
return yomikatas
|
||||||
raise Exception(f"Invalid 読み方 format: {self.yomikata}\n{self}")
|
print(f"Invalid 読み方 format: {self.yomikata}\n{self}\n")
|
||||||
|
return ""
|
||||||
|
|
||||||
def __ikei_headwords(self):
|
def __ikei_headwords(self):
|
||||||
ikei_headwords = []
|
ikei_headwords = []
|
||||||
for val in self.ikei:
|
for val in self.ikei:
|
||||||
m = re.search(r"^([^(]+)(([ぁ-ヿ]+))$", val)
|
val = val.replace(" ", "")
|
||||||
|
m = re.search(r"^([^(]+)(([ぁ-ヿ、]+))$", val)
|
||||||
if m:
|
if m:
|
||||||
headword = [m.group(1), m.group(2)]
|
headword = [m.group(1), m.group(2)]
|
||||||
ikei_headwords.append(headword)
|
ikei_headwords.append(headword)
|
||||||
else:
|
else:
|
||||||
raise Exception(f"Invalid 異形 format: {val}\n{self}")
|
print(f"Invalid 異形 format: {val}\n{self}\n")
|
||||||
return ikei_headwords
|
return ikei_headwords
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
34
entries/jitenon_kotowaza.py
Normal file
34
entries/jitenon_kotowaza.py
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
from entries.jitenon import Jitenon
|
||||||
|
import yomichan.grammar as Grammar
|
||||||
|
|
||||||
|
|
||||||
|
class JitenonKotowaza(Jitenon):
    """Jitenon entry whose headword column is 言葉 (proverbs and idioms).

    Only the column table and the Yomichan term layout are defined here;
    everything else is inherited from ``Jitenon``.
    """

    # Table header -> [attribute name, initial value].
    # NOTE(review): presumably consumed by the base class when it parses
    # the scraped table -- confirm against entries/jitenon.py.
    columns = {
        "言葉": ["expression", ""],
        "読み方": ["yomikata", ""],
        "意味": ["imi", ""],
        "出典": ["shutten", ""],
        "例文": ["reibun", ""],
        "異形": ["ikei", []],
        "類句": ["ruiku", []],
    }

    def __init__(self, sequence):
        super().__init__(sequence)

    def yomichan_terms(self):
        """Return one term row per headword of this entry.

        Each row is laid out as: expression, reading, definition tags,
        inflection rules, score, glossary, sequence, term tags.  The score
        is the negated headword position, so earlier headwords score higher.
        """
        return [
            [
                expression,
                reading,
                None,  # definition tags
                Grammar.sudachi_rules(expression, reading),
                -position,
                self.yomichan_glossary,
                self.sequence,
                "",  # term tags
            ]
            for position, (expression, reading) in enumerate(self._headwords())
        ]
|
38
entries/jitenon_yoji.py
Normal file
38
entries/jitenon_yoji.py
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
from entries.jitenon import Jitenon
|
||||||
|
|
||||||
|
|
||||||
|
class JitenonYoji(Jitenon):
    """Jitenon entry whose headword column is 四字熟語 (four-character idioms).

    Only the column table, the Yomichan term layout and the term tags are
    defined here; everything else is inherited from ``Jitenon``.
    """

    # Table header -> [attribute name, initial value].
    # NOTE(review): presumably consumed by the base class when it parses
    # the scraped table -- confirm against entries/jitenon.py.
    columns = {
        "四字熟語": ["expression", ""],
        "読み方": ["yomikata", ""],
        "意味": ["imi", ""],
        "出典": ["shutten", ""],
        "漢検級": ["kankenkyuu", ""],
        "場面用途": ["bamenyouto", ""],
        "異形": ["ikei", []],
        "類義語": ["ruigigo", []],
    }

    def __init__(self, sequence):
        super().__init__(sequence)

    def yomichan_terms(self):
        """Return one term row per headword of this entry.

        Each row is laid out as: expression, reading, definition tags,
        inflection rules, score, glossary, sequence, term tags.  The score
        is the negated headword position, so earlier headwords score higher.
        """
        return [
            [
                expression,
                reading,
                None,  # definition tags
                "",  # inflection rules
                -position,
                self.yomichan_glossary,
                self.sequence,
                self.__term_tags(),
            ]
            for position, (expression, reading) in enumerate(self._headwords())
        ]

    def __term_tags(self):
        """Return the 漢検級 value as space-separated tags.

        Spaces are removed first, then each "/" separator becomes a single
        space.
        """
        compact = self.kankenkyuu.replace(" ", "")
        return compact.replace("/", " ")
|
88
yomichan/export.py
Normal file
88
yomichan/export.py
Normal file
|
@ -0,0 +1,88 @@
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def jitenon_yoji(entries):
    """Export the 四字熟語辞典オンライン entries as a zipped dictionary.

    Terms, the newest modification date and its attribution are gathered
    from ``entries`` by ``__terms``; the index metadata and the 漢検 grade
    tags are assembled here and everything is handed to ``__create_zip``.
    """
    terms, modified_date, attribution = __terms(entries)
    index = {
        "title": "四字熟語辞典オンライン",
        "revision": f"jitenon-yoji.{modified_date}",
        "sequenced": True,
        "format": 3,
        "url": "https://yoji.jitenon.jp/",
        "attribution": attribution,
    }
    # One tag row per 漢検 grade; only the grade name and its note differ.
    # NOTE(review): row layout presumably follows the Yomichan tag bank
    # schema (name, category, order, notes, score) -- confirm.
    grades = ["1級", "準1級", "2級", "準2級", "3級", "4級", "5級"]
    tags = [
        [grade, "frequent", 0, f"漢字検定(漢検){grade}の四字熟語", 0]
        for grade in grades
    ]
    __create_zip(terms, index, tags)
|
||||||
|
|
||||||
|
|
||||||
|
def jitenon_kotowaza(entries):
    """Export the 故事・ことわざ・慣用句オンライン entries as a zipped dictionary.

    Terms, the newest modification date and its attribution are gathered
    from ``entries`` by ``__terms``; the index metadata is assembled here
    and both are handed to ``__create_zip`` (no tag bank is written).
    """
    terms, modified_date, attribution = __terms(entries)
    revision = f"jitenon-kotowaza.{modified_date}"
    index = {
        "title": "故事・ことわざ・慣用句オンライン",
        "revision": revision,
        "sequenced": True,
        "format": 3,
        "url": "https://kotowaza.jitenon.jp/",
        "attribution": attribution,
    }
    __create_zip(terms, index)
|
||||||
|
|
||||||
|
|
||||||
|
def __terms(entries):
    """Collect every term of ``entries`` plus the newest revision details.

    Returns a ``(terms, modified_date, attribution)`` tuple where ``terms``
    concatenates each entry's ``yomichan_terms()`` in iteration order, and
    the date/attribution pair comes from the entry with the greatest
    ``modified_date`` (the first such entry wins on ties).  For an empty
    input the result is ``([], None, "")``.
    """
    collected = []
    newest_date = None
    newest_attribution = ""
    for entry in entries:
        if newest_date is None or entry.modified_date > newest_date:
            newest_date, newest_attribution = (
                entry.modified_date,
                entry.attribution,
            )
        collected.extend(entry.yomichan_terms())
    return collected, newest_date, newest_attribution
|
||||||
|
|
||||||
|
|
||||||
|
def __create_zip(terms, index, tags=None):
    """Write a dictionary archive to ``output/<index["title"]>.zip``.

    Args:
        terms: list of term rows; written in chunks of 1000 to
            ``term_bank_<n>.json`` files (at least one file is always
            written, even when ``terms`` is empty).
        index: dict written to ``index.json``; its ``"title"`` value names
            the archive.
        tags: optional list of tag rows; written to ``tag_bank_1.json``
            only when non-empty.

    The files are staged in a uniquely named directory under the current
    working directory, which is removed again even if a step fails.  An
    existing archive of the same name in ``output`` is replaced.
    """
    # A None sentinel avoids sharing one default list between calls.
    tags = [] if tags is None else tags

    terms_per_file = 1000
    # Ceiling division: an exact multiple of terms_per_file must not
    # produce an extra, empty trailing bank.  Keep one bank as a minimum.
    bank_count = max(1, -(-len(terms) // terms_per_file))

    build_directory = str(uuid.uuid4())
    os.mkdir(build_directory)
    try:
        for i in range(bank_count):
            term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
            start = terms_per_file * i
            end = start + terms_per_file
            with open(term_file, "w", encoding='utf8') as f:
                json.dump(terms[start:end], f, indent=4, ensure_ascii=False)

        index_file = os.path.join(build_directory, "index.json")
        with open(index_file, 'w', encoding='utf8') as f:
            json.dump(index, f, indent=4, ensure_ascii=False)

        if tags:
            tag_file = os.path.join(build_directory, "tag_bank_1.json")
            with open(tag_file, 'w', encoding='utf8') as f:
                json.dump(tags, f, indent=4, ensure_ascii=False)

        zip_filename = index["title"]
        zip_file = f"{zip_filename}.zip"
        shutil.make_archive(zip_filename, "zip", build_directory)

        out_dir = "output"
        out_file = os.path.join(out_dir, zip_file)
        if not Path(out_dir).is_dir():
            os.mkdir(out_dir)
        elif Path(out_file).is_file():
            # shutil.move refuses to overwrite, so drop the stale archive.
            os.remove(out_file)
        shutil.move(zip_file, out_dir)
    finally:
        # Never leave the staging directory behind, success or failure.
        shutil.rmtree(build_directory)
|
38
yomichan/grammar.py
Normal file
38
yomichan/grammar.py
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
from sudachipy import tokenizer
|
||||||
|
from sudachipy import dictionary
|
||||||
|
|
||||||
|
|
||||||
|
def sudachi_rules(expression, reading):
    """Return the space-separated inflection rules for ``expression``.

    The expression is tokenized with Sudachi (split mode A); element 4 of
    the last token's ``part_of_speech()`` is split on "-" and the resulting
    tags are mapped to rules by ``__sudachi_tags_to_rules``.

    Returns an empty string when tokenizing yields no tokens (for example
    an empty expression) instead of failing on the last-token lookup.
    """
    # Building the dictionary/tokenizer is costly and this function is
    # called once per headword, so build it on first use and keep it on
    # the function object for later calls.
    tokenizer_obj = getattr(sudachi_rules, "_tokenizer_obj", None)
    if tokenizer_obj is None:
        tokenizer_obj = dictionary.Dictionary().create()
        sudachi_rules._tokenizer_obj = tokenizer_obj
    splitmode = tokenizer.Tokenizer.SplitMode.A
    tokens = tokenizer_obj.tokenize(expression, splitmode)
    if len(tokens) == 0:
        return ""
    pos = tokens[len(tokens) - 1].part_of_speech()[4]
    tags = pos.split("-")
    rules = __sudachi_tags_to_rules(tags, expression, reading)
    return rules
|
||||||
|
|
||||||
|
|
||||||
|
def __sudachi_tags_to_rules(tags, expression, reading):
    """Map Sudachi tags for ``expression`` to a rule string.

    Args:
        tags: iterable of tag strings to inspect.
        expression: the headword; its ending decides which rules can apply.
        reading: the headword's reading; only used for the 来る special case.

    Returns:
        The matching rules ("adj-i", "v1", "v5", "vs", "vz", "vk") joined by
        single spaces in sorted order, or "" when nothing matches.  Sorting
        keeps the output identical between runs; joining a bare set would
        not, because set iteration order of strings is not stable.
    """
    u_endings = ("う", "く", "す", "つ", "ぬ", "ふ", "む",
                 "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ")
    # The expression never changes inside the loop, so test its endings once.
    ends_with_i = expression.endswith("い")
    ends_with_ru = expression.endswith("る")
    ends_with_u_row = expression.endswith(u_endings)
    is_suru = expression.endswith("する") or expression == "為る"
    is_zuru = expression.endswith("ずる")

    rules = set()
    for tag in tags:
        if ends_with_i and (tag == "形容詞" or "ナイ" in tag or "タイ" in tag):
            rules.add("adj-i")
        if ends_with_ru and ("一" in tag or tag == "レル"):
            rules.add("v1")
        if ends_with_u_row and ("二" in tag or "四" in tag or "五" in tag):
            rules.add("v5")
        if "サ" in tag:
            if is_suru:
                rules.add("vs")
            if is_zuru:
                rules.add("vz")
    # 来る read as くる overrides whatever the tags suggested.
    # NOTE(review): assumed to run once after the tag loop -- confirm
    # against the original file's indentation.
    if expression.endswith("来る") and reading.endswith("くる"):
        rules = {"vk"}
    return " ".join(sorted(rules))
|
|
@ -1,49 +1,9 @@
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
import uuid
|
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
|
||||||
from css_parser import parseStyle
|
from css_parser import parseStyle
|
||||||
|
|
||||||
|
|
||||||
def create_zip(terms, index, tags=[]):
|
def make_gloss(soup):
|
||||||
build_directory = str(uuid.uuid4())
|
__preprocess_soup(soup)
|
||||||
os.mkdir(build_directory)
|
|
||||||
|
|
||||||
terms_per_file = 1000
|
|
||||||
max_i = int(len(terms) / terms_per_file) + 1
|
|
||||||
for i in range(max_i):
|
|
||||||
term_file = os.path.join(build_directory, f"term_bank_{i+1}.json")
|
|
||||||
with open(term_file, "w", encoding='utf8') as f:
|
|
||||||
start = terms_per_file * i
|
|
||||||
end = terms_per_file * (i + 1)
|
|
||||||
json.dump(terms[start:end], f, indent=4, ensure_ascii=False)
|
|
||||||
|
|
||||||
index_file = os.path.join(build_directory, "index.json")
|
|
||||||
with open(index_file, 'w', encoding='utf8') as f:
|
|
||||||
json.dump(index, f, indent=4, ensure_ascii=False)
|
|
||||||
|
|
||||||
if len(tags) > 0:
|
|
||||||
tag_file = os.path.join(build_directory, "tag_bank_1.json")
|
|
||||||
with open(tag_file, 'w', encoding='utf8') as f:
|
|
||||||
json.dump(tags, f, indent=4, ensure_ascii=False)
|
|
||||||
|
|
||||||
zip_filename = index["title"]
|
|
||||||
zip_file = f"{zip_filename}.zip"
|
|
||||||
shutil.make_archive(zip_filename, "zip", build_directory)
|
|
||||||
out_dir = "output"
|
|
||||||
out_file = os.path.join(out_dir, zip_file)
|
|
||||||
if not Path(out_dir).is_dir():
|
|
||||||
os.mkdir(out_dir)
|
|
||||||
elif Path(out_file).is_file():
|
|
||||||
os.remove(out_file)
|
|
||||||
shutil.move(zip_file, out_dir)
|
|
||||||
shutil.rmtree(build_directory)
|
|
||||||
|
|
||||||
|
|
||||||
def soup_to_gloss(soup):
|
|
||||||
__sanitize_soup(soup)
|
|
||||||
structured_content = __get_markup_structure(soup)
|
structured_content = __get_markup_structure(soup)
|
||||||
return {
|
return {
|
||||||
"type": "structured-content",
|
"type": "structured-content",
|
||||||
|
@ -51,10 +11,10 @@ def soup_to_gloss(soup):
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def __sanitize_soup(soup):
|
def __preprocess_soup(soup):
|
||||||
patterns = [
|
patterns = [
|
||||||
r"^(.+)([ぁ-ヿ]+)$",
|
r"^(.+)([ぁ-ヿ、\s]+)$",
|
||||||
r"^(.+)([ぁ-ヿ]+([ぁ-ヿ])[ぁ-ヿ]+)$"
|
r"^(.+)([ぁ-ヿ、\s]+([ぁ-ヿ、\s])[ぁ-ヿ、\s]+)$"
|
||||||
]
|
]
|
||||||
for a in soup.find_all("a"):
|
for a in soup.find_all("a"):
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
Loading…
Reference in a new issue