from functools import lru_cache

from sudachipy import tokenizer
from sudachipy import dictionary

# Kana endings that qualify an expression for the "v5" rule.
_U_ENDINGS = (
    "う", "く", "す", "つ", "ぬ", "ふ", "む",
    "ゆ", "る", "ぐ", "ず", "づ", "ぶ", "ぷ",
)


@lru_cache(maxsize=1)
def _load_dictionary():
    """Build the Sudachi dictionary once and reuse it on later calls."""
    return dictionary.Dictionary()


def sudachi_rules(expression, reading):
    """Return the inflection rule identifiers for a dictionary headword.

    The expression is tokenized with Sudachi in split mode A, and the
    part-of-speech data of the *last* token is used to derive the rules.

    Args:
        expression: The headword text to analyse.
        reading: The reading of the headword; only used to recognise
            expressions ending in "来る" that are read "くる".

    Returns:
        A space-separated string of rule identifiers in sorted order
        (for example ``"v1 v5"``), or an empty string when no rule applies
        or the tokenizer produces no tokens.
    """
    # The dictionary is cached; a fresh tokenizer is still created per call
    # so no tokenizer object is shared between callers.
    tokenizer_obj = _load_dictionary().create()
    splitmode = tokenizer.Tokenizer.SplitMode.A
    tokens = tokenizer_obj.tokenize(expression, splitmode)

    token_count = len(tokens)
    if token_count == 0:
        # Nothing to inspect (e.g. an empty expression): no tags at all.
        tags = []
    else:
        # Element 4 of the part-of-speech tuple is split on "-" into tags.
        # NOTE(review): presumably the conjugation-type field in Sudachi's
        # part-of-speech layout -- confirm against the SudachiPy docs.
        # The last token is addressed by explicit index so that negative
        # indexing support on the morpheme list is not required.
        pos = tokens[token_count - 1].part_of_speech()[4]
        tags = pos.split("-")

    return __sudachi_tags_to_rules(tags, expression, reading)


def __sudachi_tags_to_rules(tags, expression, reading):
    """Translate Sudachi tags into a space-separated rule string.

    Args:
        tags: Iterable of tag strings taken from the last token.
        expression: The headword text; its ending decides which rules
            are eligible.
        reading: The reading of the headword.

    Returns:
        The applicable rule identifiers joined by single spaces, in sorted
        order so that the result is stable between runs.
    """
    # An expression ending in "来る" and read "くる" overrides everything
    # else: the result is always exactly "vk".
    if expression.endswith("来る") and reading.endswith("くる"):
        return "vk"

    # These checks depend only on the expression, so evaluate them once
    # instead of once per tag.
    ends_with_i = expression.endswith("い")
    ends_with_ru = expression.endswith("る")
    has_u_ending = expression.endswith(_U_ENDINGS)
    is_suru = expression.endswith("する") or expression == "為る"
    is_zuru = expression.endswith("ずる")

    rules = set()
    for tag in tags:
        if ends_with_i and (tag == "形容詞" or "ナイ" in tag or "タイ" in tag):
            rules.add("adj-i")
        if ends_with_ru and ("一" in tag or tag == "レル"):
            rules.add("v1")
        if has_u_ending and ("二" in tag or "四" in tag or "五" in tag):
            rules.add("v5")
        if "サ" in tag:
            if is_suru:
                rules.add("vs")
            if is_zuru:
                rules.add("vz")

    # Sets of strings iterate in a hash-dependent order; sort so the output
    # does not change from one interpreter run to the next.
    return " ".join(sorted(rules))