2021-01-01 22:31:58 +00:00
|
|
|
|
package yomichan
|
2017-02-19 23:31:37 +00:00
|
|
|
|
|
2017-02-20 23:06:35 +00:00
|
|
|
|
import (
|
|
|
|
|
"regexp"
|
|
|
|
|
"strings"
|
2021-01-01 05:53:10 +00:00
|
|
|
|
|
2023-12-31 04:43:50 +00:00
|
|
|
|
zig "git.foosoft.net/alex/zero-epwing-go"
|
2017-02-20 23:06:35 +00:00
|
|
|
|
)
|
|
|
|
|
|
2017-02-19 23:31:37 +00:00
|
|
|
|
// wadaiExtractor bundles the compiled regular expressions that the extractor
// methods apply to entry headings. It carries no other state.
type wadaiExtractor struct {
	// partsExp splits a heading of the form `literal<reading【expressions】>`
	// into its literal, reading and expression-list parts; everything from
	// `<` onwards is optional.
	partsExp *regexp.Regexp

	// literalPartsExp detects an optional leading "¶" on the literal part;
	// readPartsExp captures the first run of characters in the reading that
	// are not ASCII digits.
	literalPartsExp, readPartsExp *regexp.Regexp

	// quotedExp captures the text up to the first "」", skipping one leading
	// "「" if present; alphaExp finds a run of lowercase ASCII letters.
	quotedExp, alphaExp *regexp.Regexp
}
|
|
|
|
|
|
|
|
|
|
func makeWadaiExtractor() epwingExtractor {
|
2017-02-20 23:06:35 +00:00
|
|
|
|
return &wadaiExtractor{
|
|
|
|
|
partsExp: regexp.MustCompile(`([^<]+)(?:<([^>【]+)(?:【([^】]+)】)?>)?`),
|
|
|
|
|
literalPartsExp: regexp.MustCompile(`(¶)?(.*)`),
|
|
|
|
|
readPartsExp: regexp.MustCompile(`([^1234567890]+)(.*)`),
|
|
|
|
|
quotedExp: regexp.MustCompile(`「?([^」]+)`),
|
|
|
|
|
alphaExp: regexp.MustCompile(`[a-z]+`),
|
|
|
|
|
}
|
2017-02-19 23:31:37 +00:00
|
|
|
|
}
|
|
|
|
|
|
2021-01-01 05:53:10 +00:00
|
|
|
|
func (e *wadaiExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm {
|
2017-02-20 23:06:35 +00:00
|
|
|
|
matches := e.partsExp.FindStringSubmatch(entry.Heading)
|
|
|
|
|
if matches == nil {
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
preset := false
|
|
|
|
|
literal := matches[1]
|
|
|
|
|
if literalMatches := e.literalPartsExp.FindStringSubmatch(literal); literalMatches != nil {
|
|
|
|
|
preset = len(literalMatches[1]) > 0
|
|
|
|
|
literal = literalMatches[2]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
reading := matches[2]
|
|
|
|
|
if readMatches := e.readPartsExp.FindStringSubmatch(reading); readMatches != nil {
|
|
|
|
|
reading = readMatches[1]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
expressions := strings.Split(matches[3], "・")
|
|
|
|
|
if len(expressions) == 0 {
|
|
|
|
|
expressions = append(expressions, "")
|
|
|
|
|
}
|
|
|
|
|
|
2017-02-19 23:31:37 +00:00
|
|
|
|
var terms []dbTerm
|
2017-02-20 23:06:35 +00:00
|
|
|
|
for _, expression := range expressions {
|
|
|
|
|
if preset {
|
|
|
|
|
expression = literal
|
|
|
|
|
reading = ""
|
|
|
|
|
} else if len(expression) == 0 {
|
|
|
|
|
expression = literal
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if quotedMatches := e.quotedExp.FindStringSubmatch(reading); quotedMatches != nil {
|
|
|
|
|
reading = quotedMatches[1]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if alphaMatches := e.alphaExp.FindStringSubmatch(expression); alphaMatches != nil && len(reading) > 0 {
|
|
|
|
|
expression = reading
|
|
|
|
|
reading = ""
|
|
|
|
|
}
|
|
|
|
|
|
2018-02-17 20:10:34 +00:00
|
|
|
|
expression = strings.TrimSpace(expression)
|
|
|
|
|
if len(expression) == 0 {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
2017-02-20 23:06:35 +00:00
|
|
|
|
term := dbTerm{
|
|
|
|
|
Expression: expression,
|
|
|
|
|
Reading: reading,
|
2023-01-22 20:14:33 +00:00
|
|
|
|
Glossary: []any{entry.Text},
|
2017-10-12 23:48:58 +00:00
|
|
|
|
Sequence: sequence,
|
2017-02-20 23:06:35 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
terms = append(terms, term)
|
|
|
|
|
}
|
|
|
|
|
|
2017-02-19 23:31:37 +00:00
|
|
|
|
return terms
|
|
|
|
|
}
|
|
|
|
|
|
2021-01-01 05:53:10 +00:00
|
|
|
|
func (e *wadaiExtractor) extractKanji(entry zig.BookEntry) []dbKanji {
|
2017-02-19 23:31:37 +00:00
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (*wadaiExtractor) getRevision() string {
|
|
|
|
|
return "wadai1"
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (*wadaiExtractor) getFontNarrow() map[int]string {
|
|
|
|
|
return map[int]string{
|
|
|
|
|
41267: "﹢",
|
|
|
|
|
41269: "*",
|
|
|
|
|
41270: "ᐦ",
|
|
|
|
|
41284: "Á",
|
|
|
|
|
41285: "É",
|
|
|
|
|
41287: "Ó",
|
|
|
|
|
41288: "Ú",
|
|
|
|
|
41290: "á",
|
|
|
|
|
41291: "é",
|
|
|
|
|
41292: "í",
|
|
|
|
|
41293: "ó",
|
|
|
|
|
41294: "ú",
|
|
|
|
|
41295: "ý",
|
|
|
|
|
41313: "À",
|
|
|
|
|
41314: "È",
|
|
|
|
|
41319: "à",
|
|
|
|
|
41320: "è",
|
|
|
|
|
41321: "ì",
|
|
|
|
|
41322: "ò",
|
|
|
|
|
41323: "ù",
|
|
|
|
|
41505: "Ö",
|
|
|
|
|
41506: "Ü",
|
|
|
|
|
41508: "ä",
|
|
|
|
|
41509: "ë",
|
|
|
|
|
41510: "ï",
|
|
|
|
|
41511: "ö",
|
|
|
|
|
41512: "ü",
|
|
|
|
|
41513: "ÿ",
|
|
|
|
|
41515: "Â",
|
|
|
|
|
41516: "Ê",
|
|
|
|
|
41517: "Î",
|
|
|
|
|
41520: "â",
|
|
|
|
|
41521: "ê",
|
|
|
|
|
41522: "î",
|
|
|
|
|
41523: "ô",
|
|
|
|
|
41524: "û",
|
|
|
|
|
41525: "ā",
|
|
|
|
|
41526: "ē",
|
|
|
|
|
41527: "ī",
|
|
|
|
|
41528: "ō",
|
|
|
|
|
41529: "ū",
|
|
|
|
|
41530: "ȳ",
|
|
|
|
|
41532: "Ç",
|
|
|
|
|
41533: "ç",
|
|
|
|
|
41534: "ɘ́",
|
|
|
|
|
41538: "ɔ́",
|
|
|
|
|
41561: "˜",
|
|
|
|
|
41566: "ã",
|
|
|
|
|
41567: "ñ",
|
|
|
|
|
41581: "ʌ",
|
|
|
|
|
41582: "ø",
|
|
|
|
|
41583: "ə",
|
|
|
|
|
41585: "ε",
|
|
|
|
|
41587: "ɔ",
|
|
|
|
|
41588: "℧",
|
|
|
|
|
41590: "ð",
|
|
|
|
|
41593: "ŋ",
|
|
|
|
|
41594: "ː",
|
|
|
|
|
41596: "Ø",
|
|
|
|
|
41762: "\\",
|
|
|
|
|
41768: "˘",
|
|
|
|
|
41773: "Ŭ",
|
|
|
|
|
41775: "ă",
|
|
|
|
|
41776: "ĕ",
|
|
|
|
|
41777: "ğ",
|
|
|
|
|
41778: "ĭ",
|
|
|
|
|
41779: "ŏ",
|
|
|
|
|
41780: "ŭ",
|
|
|
|
|
41784: "Č",
|
|
|
|
|
41788: "Š",
|
|
|
|
|
41791: "č",
|
|
|
|
|
41792: "ě",
|
|
|
|
|
41794: "ň",
|
|
|
|
|
41795: "ř",
|
|
|
|
|
41796: "š",
|
|
|
|
|
41797: "ž",
|
|
|
|
|
41804: "ą",
|
|
|
|
|
41805: "ę",
|
|
|
|
|
41811: "ș",
|
|
|
|
|
41812: "ț",
|
|
|
|
|
41822: "Ś",
|
|
|
|
|
41823: "ć",
|
|
|
|
|
41824: "ń",
|
|
|
|
|
41825: "ś",
|
|
|
|
|
41826: "ź",
|
|
|
|
|
42061: "‘",
|
|
|
|
|
42063: "Ł",
|
|
|
|
|
42068: "ł",
|
|
|
|
|
42071: "õ",
|
|
|
|
|
42075: "Å",
|
|
|
|
|
42076: "å",
|
|
|
|
|
42077: "ů",
|
|
|
|
|
42081: "Ḥ",
|
|
|
|
|
42089: "ḍ",
|
|
|
|
|
42090: "ḥ",
|
|
|
|
|
42092: "ṃ",
|
|
|
|
|
42093: "ṇ",
|
|
|
|
|
42095: "ṣ",
|
|
|
|
|
42102: "İ",
|
|
|
|
|
42104: "Ż",
|
|
|
|
|
42109: "ṅ",
|
|
|
|
|
42287: "‴",
|
|
|
|
|
42316: "Ō",
|
|
|
|
|
42322: "b̄",
|
|
|
|
|
42324: "d̅",
|
|
|
|
|
42325: "h̄",
|
|
|
|
|
42327: "s̅",
|
|
|
|
|
42330: "z̅",
|
|
|
|
|
42344: "〚",
|
|
|
|
|
42345: "〛",
|
|
|
|
|
42356: "ǔ",
|
|
|
|
|
42357: "ż",
|
|
|
|
|
42358: "Ž",
|
|
|
|
|
42359: "ž",
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (*wadaiExtractor) getFontWide() map[int]string {
|
|
|
|
|
return map[int]string{
|
|
|
|
|
45380: "☞",
|
|
|
|
|
45397: "æ",
|
|
|
|
|
45402: "œ",
|
|
|
|
|
45406: "Æ",
|
|
|
|
|
45429: "©",
|
|
|
|
|
45613: "<",
|
|
|
|
|
45614: ">",
|
|
|
|
|
45629: "┏",
|
|
|
|
|
45653: "⛤",
|
|
|
|
|
45662: "嗉",
|
|
|
|
|
45665: "圳",
|
|
|
|
|
45666: "拼",
|
|
|
|
|
45667: "攩",
|
|
|
|
|
45671: "烤",
|
|
|
|
|
45673: "玢",
|
|
|
|
|
45674: "癤",
|
|
|
|
|
45675: "皶",
|
|
|
|
|
45676: "磠",
|
|
|
|
|
45677: "稃",
|
|
|
|
|
45681: "蔲",
|
|
|
|
|
45684: "顬",
|
|
|
|
|
45685: "骶",
|
|
|
|
|
45689: "榍",
|
|
|
|
|
45857: "倻",
|
|
|
|
|
45870: "噯",
|
|
|
|
|
45876: "垜",
|
|
|
|
|
45898: "愷",
|
|
|
|
|
45900: "擤",
|
|
|
|
|
45906: "晷",
|
|
|
|
|
45909: "枘",
|
|
|
|
|
45910: "不",
|
|
|
|
|
45913: "楣",
|
|
|
|
|
45916: "梲",
|
|
|
|
|
45919: "桛",
|
|
|
|
|
45921: "楤",
|
|
|
|
|
45922: "橅",
|
|
|
|
|
45923: "檉",
|
|
|
|
|
45933: "淄",
|
|
|
|
|
46125: "煆",
|
|
|
|
|
46135: "珅",
|
|
|
|
|
46137: "琛",
|
|
|
|
|
46141: "痤",
|
|
|
|
|
46142: "癭",
|
|
|
|
|
46143: "瘭",
|
|
|
|
|
46152: "窠",
|
|
|
|
|
46154: "笯",
|
|
|
|
|
46155: "筠",
|
|
|
|
|
46156: "簎",
|
|
|
|
|
46157: "糝",
|
|
|
|
|
46161: "翟",
|
|
|
|
|
46163: "翮",
|
|
|
|
|
46166: "腊",
|
|
|
|
|
46168: "舢",
|
|
|
|
|
46169: "芷",
|
|
|
|
|
46177: "蒴",
|
|
|
|
|
46181: "蕙",
|
|
|
|
|
46190: "蚉",
|
|
|
|
|
46191: "蝲",
|
|
|
|
|
46197: "豇",
|
|
|
|
|
46198: "跑",
|
|
|
|
|
46200: "跗",
|
|
|
|
|
46201: "跆",
|
|
|
|
|
46202: "蒁",
|
|
|
|
|
46372: "鄱",
|
|
|
|
|
46374: "鄧",
|
|
|
|
|
46388: "卍",
|
|
|
|
|
46390: "𨫤",
|
|
|
|
|
46391: "鈹",
|
|
|
|
|
46398: "顥",
|
|
|
|
|
46404: "駃",
|
|
|
|
|
46405: "騠",
|
|
|
|
|
46406: "髁",
|
|
|
|
|
46409: "魳",
|
|
|
|
|
46410: "鱏",
|
|
|
|
|
46411: "鱓",
|
|
|
|
|
46414: "鱮",
|
|
|
|
|
46415: "鰶",
|
|
|
|
|
46416: "魬",
|
|
|
|
|
46417: "𩸽",
|
|
|
|
|
46418: "鯥",
|
|
|
|
|
46419: "鰙",
|
|
|
|
|
46422: "鮄",
|
|
|
|
|
46423: "鱵",
|
|
|
|
|
46424: "鷴",
|
|
|
|
|
46425: "鶍",
|
|
|
|
|
46426: "鵟",
|
|
|
|
|
46428: "鼯",
|
|
|
|
|
46449: "▶",
|
|
|
|
|
46459: "㧍",
|
|
|
|
|
46460: "嘈",
|
|
|
|
|
46461: "愈",
|
|
|
|
|
46462: "淝",
|
|
|
|
|
46634: "灤",
|
|
|
|
|
46635: "焮",
|
|
|
|
|
46636: "獮",
|
|
|
|
|
46637: "瓚",
|
|
|
|
|
46638: "絓",
|
|
|
|
|
46639: "芎",
|
|
|
|
|
46650: "薏",
|
|
|
|
|
46651: "辶",
|
|
|
|
|
46652: "醞",
|
|
|
|
|
46653: "挵",
|
|
|
|
|
46654: "飥",
|
|
|
|
|
46655: "鬐",
|
|
|
|
|
46656: "俏",
|
|
|
|
|
46657: "啐",
|
|
|
|
|
46658: "塼",
|
|
|
|
|
46659: "濰",
|
|
|
|
|
46660: "磲",
|
|
|
|
|
46661: "篊",
|
|
|
|
|
46662: "菀",
|
|
|
|
|
46663: "芩",
|
|
|
|
|
46664: "𧿹",
|
|
|
|
|
46665: "鈸",
|
|
|
|
|
46666: "驎",
|
|
|
|
|
46667: "硨",
|
|
|
|
|
46668: "蘞",
|
|
|
|
|
46669: "梣",
|
|
|
|
|
46670: "槵",
|
|
|
|
|
46671: "橉",
|
|
|
|
|
46672: "莧",
|
|
|
|
|
46682: "彔",
|
|
|
|
|
46683: "噦",
|
|
|
|
|
46684: "袘",
|
|
|
|
|
46685: "餺",
|
|
|
|
|
46686: "►",
|
|
|
|
|
46688: "棈",
|
|
|
|
|
46689: "▷",
|
|
|
|
|
46695: "[ローマ字]",
|
|
|
|
|
46699: "◧",
|
|
|
|
|
46700: "◨",
|
|
|
|
|
}
|
|
|
|
|
}
|