853d0b33dc
Necesssary for structured content support
349 lines
6.1 KiB
Go
349 lines
6.1 KiB
Go
package yomichan
|
||
|
||
import (
|
||
"regexp"
|
||
"strings"
|
||
|
||
zig "foosoft.net/projects/zero-epwing-go"
|
||
)
|
||
|
||
type wadaiExtractor struct {
|
||
partsExp *regexp.Regexp
|
||
literalPartsExp *regexp.Regexp
|
||
readPartsExp *regexp.Regexp
|
||
quotedExp *regexp.Regexp
|
||
alphaExp *regexp.Regexp
|
||
}
|
||
|
||
func makeWadaiExtractor() epwingExtractor {
|
||
return &wadaiExtractor{
|
||
partsExp: regexp.MustCompile(`([^<]+)(?:<([^>【]+)(?:【([^】]+)】)?>)?`),
|
||
literalPartsExp: regexp.MustCompile(`(¶)?(.*)`),
|
||
readPartsExp: regexp.MustCompile(`([^1234567890]+)(.*)`),
|
||
quotedExp: regexp.MustCompile(`「?([^」]+)`),
|
||
alphaExp: regexp.MustCompile(`[a-z]+`),
|
||
}
|
||
}
|
||
|
||
func (e *wadaiExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm {
|
||
matches := e.partsExp.FindStringSubmatch(entry.Heading)
|
||
if matches == nil {
|
||
return nil
|
||
}
|
||
|
||
preset := false
|
||
literal := matches[1]
|
||
if literalMatches := e.literalPartsExp.FindStringSubmatch(literal); literalMatches != nil {
|
||
preset = len(literalMatches[1]) > 0
|
||
literal = literalMatches[2]
|
||
}
|
||
|
||
reading := matches[2]
|
||
if readMatches := e.readPartsExp.FindStringSubmatch(reading); readMatches != nil {
|
||
reading = readMatches[1]
|
||
}
|
||
|
||
expressions := strings.Split(matches[3], "・")
|
||
if len(expressions) == 0 {
|
||
expressions = append(expressions, "")
|
||
}
|
||
|
||
var terms []dbTerm
|
||
for _, expression := range expressions {
|
||
if preset {
|
||
expression = literal
|
||
reading = ""
|
||
} else if len(expression) == 0 {
|
||
expression = literal
|
||
}
|
||
|
||
if quotedMatches := e.quotedExp.FindStringSubmatch(reading); quotedMatches != nil {
|
||
reading = quotedMatches[1]
|
||
}
|
||
|
||
if alphaMatches := e.alphaExp.FindStringSubmatch(expression); alphaMatches != nil && len(reading) > 0 {
|
||
expression = reading
|
||
reading = ""
|
||
}
|
||
|
||
expression = strings.TrimSpace(expression)
|
||
if len(expression) == 0 {
|
||
continue
|
||
}
|
||
|
||
term := dbTerm{
|
||
Expression: expression,
|
||
Reading: reading,
|
||
Glossary: []any{entry.Text},
|
||
Sequence: sequence,
|
||
}
|
||
|
||
terms = append(terms, term)
|
||
}
|
||
|
||
return terms
|
||
}
|
||
|
||
func (e *wadaiExtractor) extractKanji(entry zig.BookEntry) []dbKanji {
|
||
return nil
|
||
}
|
||
|
||
func (*wadaiExtractor) getRevision() string {
|
||
return "wadai1"
|
||
}
|
||
|
||
func (*wadaiExtractor) getFontNarrow() map[int]string {
|
||
return map[int]string{
|
||
41267: "﹢",
|
||
41269: "*",
|
||
41270: "ᐦ",
|
||
41284: "Á",
|
||
41285: "É",
|
||
41287: "Ó",
|
||
41288: "Ú",
|
||
41290: "á",
|
||
41291: "é",
|
||
41292: "í",
|
||
41293: "ó",
|
||
41294: "ú",
|
||
41295: "ý",
|
||
41313: "À",
|
||
41314: "È",
|
||
41319: "à",
|
||
41320: "è",
|
||
41321: "ì",
|
||
41322: "ò",
|
||
41323: "ù",
|
||
41505: "Ö",
|
||
41506: "Ü",
|
||
41508: "ä",
|
||
41509: "ë",
|
||
41510: "ï",
|
||
41511: "ö",
|
||
41512: "ü",
|
||
41513: "ÿ",
|
||
41515: "Â",
|
||
41516: "Ê",
|
||
41517: "Î",
|
||
41520: "â",
|
||
41521: "ê",
|
||
41522: "î",
|
||
41523: "ô",
|
||
41524: "û",
|
||
41525: "ā",
|
||
41526: "ē",
|
||
41527: "ī",
|
||
41528: "ō",
|
||
41529: "ū",
|
||
41530: "ȳ",
|
||
41532: "Ç",
|
||
41533: "ç",
|
||
41534: "ɘ́",
|
||
41538: "ɔ́",
|
||
41561: "˜",
|
||
41566: "ã",
|
||
41567: "ñ",
|
||
41581: "ʌ",
|
||
41582: "ø",
|
||
41583: "ə",
|
||
41585: "ε",
|
||
41587: "ɔ",
|
||
41588: "℧",
|
||
41590: "ð",
|
||
41593: "ŋ",
|
||
41594: "ː",
|
||
41596: "Ø",
|
||
41762: "\\",
|
||
41768: "˘",
|
||
41773: "Ŭ",
|
||
41775: "ă",
|
||
41776: "ĕ",
|
||
41777: "ğ",
|
||
41778: "ĭ",
|
||
41779: "ŏ",
|
||
41780: "ŭ",
|
||
41784: "Č",
|
||
41788: "Š",
|
||
41791: "č",
|
||
41792: "ě",
|
||
41794: "ň",
|
||
41795: "ř",
|
||
41796: "š",
|
||
41797: "ž",
|
||
41804: "ą",
|
||
41805: "ę",
|
||
41811: "ș",
|
||
41812: "ț",
|
||
41822: "Ś",
|
||
41823: "ć",
|
||
41824: "ń",
|
||
41825: "ś",
|
||
41826: "ź",
|
||
42061: "‘",
|
||
42063: "Ł",
|
||
42068: "ł",
|
||
42071: "õ",
|
||
42075: "Å",
|
||
42076: "å",
|
||
42077: "ů",
|
||
42081: "Ḥ",
|
||
42089: "ḍ",
|
||
42090: "ḥ",
|
||
42092: "ṃ",
|
||
42093: "ṇ",
|
||
42095: "ṣ",
|
||
42102: "İ",
|
||
42104: "Ż",
|
||
42109: "ṅ",
|
||
42287: "‴",
|
||
42316: "Ō",
|
||
42322: "b̄",
|
||
42324: "d̅",
|
||
42325: "h̄",
|
||
42327: "s̅",
|
||
42330: "z̅",
|
||
42344: "〚",
|
||
42345: "〛",
|
||
42356: "ǔ",
|
||
42357: "ż",
|
||
42358: "Ž",
|
||
42359: "ž",
|
||
}
|
||
}
|
||
|
||
func (*wadaiExtractor) getFontWide() map[int]string {
|
||
return map[int]string{
|
||
45380: "☞",
|
||
45397: "æ",
|
||
45402: "œ",
|
||
45406: "Æ",
|
||
45429: "©",
|
||
45613: "<",
|
||
45614: ">",
|
||
45629: "┏",
|
||
45653: "⛤",
|
||
45662: "嗉",
|
||
45665: "圳",
|
||
45666: "拼",
|
||
45667: "攩",
|
||
45671: "烤",
|
||
45673: "玢",
|
||
45674: "癤",
|
||
45675: "皶",
|
||
45676: "磠",
|
||
45677: "稃",
|
||
45681: "蔲",
|
||
45684: "顬",
|
||
45685: "骶",
|
||
45689: "榍",
|
||
45857: "倻",
|
||
45870: "噯",
|
||
45876: "垜",
|
||
45898: "愷",
|
||
45900: "擤",
|
||
45906: "晷",
|
||
45909: "枘",
|
||
45910: "不",
|
||
45913: "楣",
|
||
45916: "梲",
|
||
45919: "桛",
|
||
45921: "楤",
|
||
45922: "橅",
|
||
45923: "檉",
|
||
45933: "淄",
|
||
46125: "煆",
|
||
46135: "珅",
|
||
46137: "琛",
|
||
46141: "痤",
|
||
46142: "癭",
|
||
46143: "瘭",
|
||
46152: "窠",
|
||
46154: "笯",
|
||
46155: "筠",
|
||
46156: "簎",
|
||
46157: "糝",
|
||
46161: "翟",
|
||
46163: "翮",
|
||
46166: "腊",
|
||
46168: "舢",
|
||
46169: "芷",
|
||
46177: "蒴",
|
||
46181: "蕙",
|
||
46190: "蚉",
|
||
46191: "蝲",
|
||
46197: "豇",
|
||
46198: "跑",
|
||
46200: "跗",
|
||
46201: "跆",
|
||
46202: "蒁",
|
||
46372: "鄱",
|
||
46374: "鄧",
|
||
46388: "卍",
|
||
46390: "𨫤",
|
||
46391: "鈹",
|
||
46398: "顥",
|
||
46404: "駃",
|
||
46405: "騠",
|
||
46406: "髁",
|
||
46409: "魳",
|
||
46410: "鱏",
|
||
46411: "鱓",
|
||
46414: "鱮",
|
||
46415: "鰶",
|
||
46416: "魬",
|
||
46417: "𩸽",
|
||
46418: "鯥",
|
||
46419: "鰙",
|
||
46422: "鮄",
|
||
46423: "鱵",
|
||
46424: "鷴",
|
||
46425: "鶍",
|
||
46426: "鵟",
|
||
46428: "鼯",
|
||
46449: "▶",
|
||
46459: "㧍",
|
||
46460: "嘈",
|
||
46461: "愈",
|
||
46462: "淝",
|
||
46634: "灤",
|
||
46635: "焮",
|
||
46636: "獮",
|
||
46637: "瓚",
|
||
46638: "絓",
|
||
46639: "芎",
|
||
46650: "薏",
|
||
46651: "辶",
|
||
46652: "醞",
|
||
46653: "挵",
|
||
46654: "飥",
|
||
46655: "鬐",
|
||
46656: "俏",
|
||
46657: "啐",
|
||
46658: "塼",
|
||
46659: "濰",
|
||
46660: "磲",
|
||
46661: "篊",
|
||
46662: "菀",
|
||
46663: "芩",
|
||
46664: "𧿹",
|
||
46665: "鈸",
|
||
46666: "驎",
|
||
46667: "硨",
|
||
46668: "蘞",
|
||
46669: "梣",
|
||
46670: "槵",
|
||
46671: "橉",
|
||
46672: "莧",
|
||
46682: "彔",
|
||
46683: "噦",
|
||
46684: "袘",
|
||
46685: "餺",
|
||
46686: "►",
|
||
46688: "棈",
|
||
46689: "▷",
|
||
46695: "[ローマ字]",
|
||
46699: "◧",
|
||
46700: "◨",
|
||
}
|
||
}
|