2021-01-01 22:31:58 +00:00
|
|
|
|
package yomichan
|
2018-02-17 19:29:06 +00:00
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"regexp"
|
|
|
|
|
"strings"
|
2021-01-01 05:53:10 +00:00
|
|
|
|
|
2022-07-04 03:59:33 +00:00
|
|
|
|
zig "foosoft.net/projects/zero-epwing-go"
|
2018-02-17 19:29:06 +00:00
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// gakkenExtractor turns book entries into dictionary terms for the "gakken"
// extractor. It only holds the precompiled regular expressions its methods
// rely on.
type gakkenExtractor struct {
	// Heading handling (see extractTerms):
	//   partsExp     - applied to the entry heading; group 1 is used as the
	//                  reading and group 2 as the expression text.
	//   readGroupExp - every match is deleted from the reading.
	//   expVarExp    - applied to each expression variant, once keeping the
	//                  captured group and once dropping the whole match.
	partsExp, readGroupExp, expVarExp *regexp.Regexp

	// metaExp is deleted from the expression text and is also matched against
	// each line of the entry text; its first group is split into tags.
	metaExp *regexp.Regexp

	// v5Exp and v1Exp are matched against tags in exportRules to decide
	// whether the "v5" or "v1" rule is attached to a term.
	v5Exp, v1Exp *regexp.Regexp
}
|
|
|
|
|
|
|
|
|
|
func makeGakkenExtractor() epwingExtractor {
|
|
|
|
|
return &gakkenExtractor{
|
|
|
|
|
partsExp: regexp.MustCompile(`([\p{Hiragana}\p{Katakana}ー‐・]*)?(?:【(.*)】)?`),
|
|
|
|
|
readGroupExp: regexp.MustCompile(`[‐・]+`),
|
|
|
|
|
expVarExp: regexp.MustCompile(`\(([^\)]*)\)`),
|
|
|
|
|
metaExp: regexp.MustCompile(`(([^)]*))`),
|
|
|
|
|
v5Exp: regexp.MustCompile(`(動.[四五]([[^]]+])?)|(動..二)`),
|
|
|
|
|
v1Exp: regexp.MustCompile(`(動..一)`),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// cosmetics rewrites entry text for display: parenthesised numbers 1-20 become
// circled digits, and a katakana followed by a detached voiced sound mark is
// merged into the single voiced katakana.
var cosmetics = strings.NewReplacer(
	// Parenthesised numbers -> circled numbers.
	"(1)", "①",
	"(2)", "②",
	"(3)", "③",
	"(4)", "④",
	"(5)", "⑤",
	"(6)", "⑥",
	"(7)", "⑦",
	"(8)", "⑧",
	"(9)", "⑨",
	"(10)", "⑩",
	"(11)", "⑪",
	"(12)", "⑫",
	"(13)", "⑬",
	"(14)", "⑭",
	"(15)", "⑮",
	"(16)", "⑯",
	"(17)", "⑰",
	"(18)", "⑱",
	"(19)", "⑲",
	"(20)", "⑳",

	// Katakana + detached voiced mark -> voiced katakana.
	"カ゛", "ガ",
	"キ゛", "ギ",
	"ク゛", "グ",
	"ケ゛", "ゲ",
	"コ゛", "ゴ",
	"タ゛", "ダ",
	"チ゛", "ヂ",
	"ツ゛", "ヅ",
	"テ゛", "デ",
	"ト゛", "ド",
	"ハ゛", "バ",
	"ヒ゛", "ビ",
	"フ゛", "ブ",
	"ヘ゛", "ベ",
	"ホ゛", "ボ",
	"サ゛", "ザ",
	"シ゛", "ジ",
	"ス゛", "ズ",
	"セ゛", "ゼ",
	"ソ゛", "ゾ",
)
|
|
|
|
|
|
2021-01-01 05:53:10 +00:00
|
|
|
|
func (e *gakkenExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm {
|
2018-02-17 19:29:06 +00:00
|
|
|
|
matches := e.partsExp.FindStringSubmatch(entry.Heading)
|
|
|
|
|
if matches == nil {
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var expressions, readings []string
|
|
|
|
|
if expression := matches[2]; len(expression) > 0 {
|
|
|
|
|
expression = e.metaExp.ReplaceAllLiteralString(expression, "")
|
|
|
|
|
for _, split := range regexp.MustCompile("(・|】【)").Split(expression, -1) {
|
|
|
|
|
splitInc := e.expVarExp.ReplaceAllString(split, "$1")
|
|
|
|
|
expressions = append(expressions, splitInc)
|
|
|
|
|
if split != splitInc {
|
|
|
|
|
splitExc := e.expVarExp.ReplaceAllLiteralString(split, "")
|
|
|
|
|
expressions = append(expressions, splitExc)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if reading := matches[1]; len(reading) > 0 {
|
|
|
|
|
reading = e.readGroupExp.ReplaceAllLiteralString(reading, "")
|
|
|
|
|
readings = append(readings, reading)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var tags []string
|
|
|
|
|
|
|
|
|
|
entryText := cosmetics.Replace(entry.Text)
|
|
|
|
|
|
|
|
|
|
for _, split := range strings.Split(entryText, "\n") {
|
|
|
|
|
if matches := e.metaExp.FindStringSubmatch(split); matches != nil {
|
|
|
|
|
for _, tag := range strings.Split(matches[1], "・") {
|
|
|
|
|
tags = append(tags, tag)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var terms []dbTerm
|
|
|
|
|
if len(expressions) == 0 {
|
|
|
|
|
for _, reading := range readings {
|
|
|
|
|
term := dbTerm{
|
|
|
|
|
Expression: reading,
|
2023-01-22 20:14:33 +00:00
|
|
|
|
Glossary: []any{entryText},
|
2018-02-17 19:29:06 +00:00
|
|
|
|
Sequence: sequence,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
e.exportRules(&term, tags)
|
|
|
|
|
terms = append(terms, term)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
if len(readings) == 0 {
|
|
|
|
|
readings = append(readings, "")
|
|
|
|
|
}
|
|
|
|
|
for _, expression := range expressions {
|
|
|
|
|
for _, reading := range readings {
|
|
|
|
|
term := dbTerm{
|
|
|
|
|
Expression: expression,
|
|
|
|
|
Reading: reading,
|
2023-01-22 20:14:33 +00:00
|
|
|
|
Glossary: []any{entryText},
|
2018-02-17 19:29:06 +00:00
|
|
|
|
Sequence: sequence,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
e.exportRules(&term, tags)
|
|
|
|
|
terms = append(terms, term)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return terms
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-01 05:53:10 +00:00
|
|
|
|
func (*gakkenExtractor) extractKanji(entry zig.BookEntry) []dbKanji {
|
2018-02-17 19:29:06 +00:00
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (e *gakkenExtractor) exportRules(term *dbTerm, tags []string) {
|
|
|
|
|
for _, tag := range tags {
|
|
|
|
|
if tag == "形" {
|
|
|
|
|
term.addRules("adj-i")
|
|
|
|
|
} else if tag == "動サ変" && (strings.HasSuffix(term.Expression, "する") || strings.HasSuffix(term.Expression, "為る")) {
|
|
|
|
|
term.addRules("vs")
|
|
|
|
|
} else if term.Expression == "来る" {
|
|
|
|
|
term.addRules("vk")
|
|
|
|
|
} else if e.v5Exp.MatchString(tag) {
|
|
|
|
|
term.addRules("v5")
|
|
|
|
|
} else if e.v1Exp.MatchString(tag) {
|
|
|
|
|
term.addRules("v1")
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (*gakkenExtractor) getRevision() string {
|
|
|
|
|
return "gakken"
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (*gakkenExtractor) getFontNarrow() map[int]string {
|
|
|
|
|
return map[int]string{
|
|
|
|
|
41550: "ī",
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (*gakkenExtractor) getFontWide() map[int]string {
|
|
|
|
|
return map[int]string{
|
|
|
|
|
42017: "国",
|
|
|
|
|
42018: "古",
|
|
|
|
|
42019: "故",
|
|
|
|
|
42021: "(拡)",
|
|
|
|
|
42020: "漢",
|
|
|
|
|
42033: "",
|
|
|
|
|
42034: "",
|
|
|
|
|
42070: "㋐",
|
|
|
|
|
42071: "㋑",
|
|
|
|
|
42072: "㋒",
|
|
|
|
|
42073: "㋓",
|
|
|
|
|
42074: "㋔",
|
|
|
|
|
42075: "㋕",
|
|
|
|
|
42076: "㋖",
|
|
|
|
|
42077: "㋗",
|
|
|
|
|
42078: "㋘",
|
|
|
|
|
42079: "㋙",
|
|
|
|
|
42080: "㋚",
|
|
|
|
|
42081: "㋛",
|
|
|
|
|
42082: "㋜",
|
|
|
|
|
42083: "㋝",
|
|
|
|
|
42084: "🈩",
|
|
|
|
|
42085: "🈔",
|
|
|
|
|
42086: "🈪",
|
|
|
|
|
42087: "[四]",
|
|
|
|
|
42088: "[五]",
|
|
|
|
|
42089: "❶",
|
|
|
|
|
42090: "❷",
|
|
|
|
|
42091: "❸",
|
|
|
|
|
42092: "❹",
|
|
|
|
|
42093: "❺",
|
|
|
|
|
42094: "❻",
|
|
|
|
|
42095: "❼",
|
|
|
|
|
42096: "❽",
|
|
|
|
|
42097: "❾",
|
|
|
|
|
42098: "❿",
|
|
|
|
|
42099: "⓫",
|
|
|
|
|
42100: "⓬",
|
|
|
|
|
42101: "⓭",
|
|
|
|
|
42102: "⓮",
|
|
|
|
|
42103: "⓯",
|
|
|
|
|
42104: "⓰",
|
|
|
|
|
42105: "⓱",
|
|
|
|
|
42106: "⓲",
|
|
|
|
|
42107: "㊀",
|
|
|
|
|
42108: "㊁",
|
|
|
|
|
42109: "㊂",
|
|
|
|
|
42110: "㊃",
|
|
|
|
|
43599: "咍",
|
|
|
|
|
46176: "(扌)",
|
|
|
|
|
48753: "灾",
|
|
|
|
|
48936: "烖",
|
|
|
|
|
58176: "(呉)",
|
|
|
|
|
58177: "(漢)",
|
|
|
|
|
}
|
|
|
|
|
}
|