448 lines
9.0 KiB
Go
448 lines
9.0 KiB
Go
/*
|
||
* Copyright (c) 2016-2021 Alex Yatskov <alex@foosoft.net>
|
||
*
|
||
* Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||
* this software and associated documentation files (the "Software"), to deal in
|
||
* the Software without restriction, including without limitation the rights to
|
||
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||
* the Software, and to permit persons to whom the Software is furnished to do so,
|
||
* subject to the following conditions:
|
||
*
|
||
* The above copyright notice and this permission notice shall be included in all
|
||
* copies or substantial portions of the Software.
|
||
*
|
||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||
*/
|
||
|
||
package yomichan
|
||
|
||
import (
|
||
"regexp"
|
||
"strings"
|
||
|
||
zig "github.com/FooSoft/zero-epwing-go"
|
||
)
|
||
|
||
type daijisenExtractor struct {
|
||
partsExp *regexp.Regexp
|
||
expShapesExp *regexp.Regexp
|
||
expVarExp *regexp.Regexp
|
||
readGroupExp *regexp.Regexp
|
||
metaExp *regexp.Regexp
|
||
v5Exp *regexp.Regexp
|
||
v1Exp *regexp.Regexp
|
||
}
|
||
|
||
func makeDaijisenExtractor() epwingExtractor {
|
||
return &daijisenExtractor{
|
||
partsExp: regexp.MustCompile(`([^【]+)(?:【(.*)】)?`),
|
||
expShapesExp: regexp.MustCompile(`[×△]+`),
|
||
expVarExp: regexp.MustCompile(`(([^)]*))`),
|
||
readGroupExp: regexp.MustCompile(`[‐・]+`),
|
||
metaExp: regexp.MustCompile(`[([^]]*)]`),
|
||
v5Exp: regexp.MustCompile(`(動.[四五]([[^]]+])?)|(動..二)`),
|
||
v1Exp: regexp.MustCompile(`(動..一)`),
|
||
}
|
||
}
|
||
|
||
func (e *daijisenExtractor) extractTerms(entry zig.BookEntry, sequence int) []dbTerm {
|
||
matches := e.partsExp.FindStringSubmatch(entry.Heading)
|
||
if matches == nil {
|
||
return nil
|
||
}
|
||
|
||
var expressions []string
|
||
if expression := matches[2]; len(expression) > 0 {
|
||
expression = e.expShapesExp.ReplaceAllString(expression, "")
|
||
for _, split := range strings.Split(expression, "・") {
|
||
splitInc := e.expVarExp.ReplaceAllString(split, "$1")
|
||
expressions = append(expressions, splitInc)
|
||
if split != splitInc {
|
||
splitExc := e.expVarExp.ReplaceAllLiteralString(split, "")
|
||
expressions = append(expressions, splitExc)
|
||
}
|
||
}
|
||
}
|
||
|
||
var reading string
|
||
if reading = matches[1]; len(reading) > 0 {
|
||
reading = e.readGroupExp.ReplaceAllLiteralString(reading, "")
|
||
reading = e.expVarExp.ReplaceAllLiteralString(reading, "")
|
||
}
|
||
|
||
var tags []string
|
||
for _, split := range strings.Split(entry.Text, "\n") {
|
||
if matches := e.metaExp.FindStringSubmatch(split); matches != nil {
|
||
for _, tag := range strings.Split(matches[1], "・") {
|
||
tags = append(tags, tag)
|
||
}
|
||
}
|
||
}
|
||
|
||
var terms []dbTerm
|
||
if len(expressions) == 0 {
|
||
term := dbTerm{
|
||
Expression: reading,
|
||
Glossary: []string{entry.Text},
|
||
Sequence: sequence,
|
||
}
|
||
|
||
e.exportRules(&term, tags)
|
||
terms = append(terms, term)
|
||
|
||
} else {
|
||
for _, expression := range expressions {
|
||
term := dbTerm{
|
||
Expression: expression,
|
||
Reading: reading,
|
||
Glossary: []string{entry.Text},
|
||
Sequence: sequence,
|
||
}
|
||
|
||
e.exportRules(&term, tags)
|
||
terms = append(terms, term)
|
||
}
|
||
}
|
||
|
||
return terms
|
||
}
|
||
|
||
func (*daijisenExtractor) extractKanji(entry zig.BookEntry) []dbKanji {
|
||
return nil
|
||
}
|
||
|
||
func (e *daijisenExtractor) exportRules(term *dbTerm, tags []string) {
|
||
for _, tag := range tags {
|
||
if tag == "形" {
|
||
term.addRules("adj-i")
|
||
} else if tag == "動サ変" && (strings.HasSuffix(term.Expression, "する") || strings.HasSuffix(term.Expression, "為る")) {
|
||
term.addRules("vs")
|
||
} else if term.Expression == "来る" {
|
||
term.addRules("vk")
|
||
} else if e.v5Exp.MatchString(tag) {
|
||
term.addRules("v5")
|
||
} else if e.v1Exp.MatchString(tag) {
|
||
term.addRules("v1")
|
||
}
|
||
}
|
||
}
|
||
|
||
func (*daijisenExtractor) getRevision() string {
|
||
return "daijisen1"
|
||
}
|
||
|
||
func (*daijisenExtractor) getFontNarrow() map[int]string {
|
||
return map[int]string{
|
||
0xa121: " ",
|
||
0xa122: "¡",
|
||
0xa123: "¢",
|
||
0xa124: "£",
|
||
0xa125: "¤",
|
||
0xa126: "¥",
|
||
0xa127: "¦",
|
||
0xa128: "§",
|
||
0xa129: "¨",
|
||
0xa12a: "©",
|
||
0xa12b: "ª",
|
||
0xa12c: "«",
|
||
0xa12d: "¬",
|
||
0xa12e: "",
|
||
0xa12f: "®",
|
||
0xa130: "¯",
|
||
0xa131: "°",
|
||
0xa132: "±",
|
||
0xa133: "²",
|
||
0xa134: "³",
|
||
0xa135: "´",
|
||
0xa136: "µ",
|
||
0xa137: "¶",
|
||
0xa138: "·",
|
||
0xa139: "¸",
|
||
0xa13a: "¹",
|
||
0xa13b: "º",
|
||
0xa13c: "»",
|
||
0xa13d: "¼",
|
||
0xa13e: "½",
|
||
0xa13f: "¾",
|
||
0xa140: "¿",
|
||
0xa141: "À",
|
||
0xa142: "Á",
|
||
0xa143: "Â",
|
||
0xa144: "Ã",
|
||
0xa145: "Ä",
|
||
0xa146: "Å",
|
||
0xa147: "Æ",
|
||
0xa148: "Ç",
|
||
0xa149: "È",
|
||
0xa14a: "É",
|
||
0xa14b: "Ê",
|
||
0xa14c: "Ë",
|
||
0xa14d: "Ì",
|
||
0xa14e: "Í",
|
||
0xa14f: "Î",
|
||
0xa150: "Ï",
|
||
0xa151: "Ð",
|
||
0xa152: "Ñ",
|
||
0xa153: "Ò",
|
||
0xa154: "Ó",
|
||
0xa155: "Ô",
|
||
0xa156: "Õ",
|
||
0xa157: "Ö",
|
||
0xa158: "×",
|
||
0xa159: "Ø",
|
||
0xa15a: "Ù",
|
||
0xa15b: "Ú",
|
||
0xa15c: "Û",
|
||
0xa15d: "Ü",
|
||
0xa15e: "Ý",
|
||
0xa15f: "Þ",
|
||
0xa160: "ß",
|
||
0xa161: "à",
|
||
0xa162: "á",
|
||
0xa163: "â",
|
||
0xa164: "ã",
|
||
0xa165: "ä",
|
||
0xa166: "å",
|
||
0xa167: "æ",
|
||
0xa168: "ç",
|
||
0xa169: "è",
|
||
0xa16a: "é",
|
||
0xa16b: "ê",
|
||
0xa16c: "ë",
|
||
0xa16d: "ì",
|
||
0xa16e: "í",
|
||
0xa16f: "î",
|
||
0xa170: "ï",
|
||
0xa171: "ð",
|
||
0xa172: "ñ",
|
||
0xa173: "ò",
|
||
0xa174: "ó",
|
||
0xa175: "ô",
|
||
0xa176: "õ",
|
||
0xa177: "ö",
|
||
0xa178: "÷",
|
||
0xa179: "ø",
|
||
0xa17a: "ù",
|
||
0xa17b: "ú",
|
||
0xa17c: "û",
|
||
0xa17d: "ü",
|
||
0xa17e: "ý",
|
||
0xa221: "þ",
|
||
0xa222: "ÿ",
|
||
0xa223: "Ā",
|
||
0xa224: "ā",
|
||
0xa225: "Ă",
|
||
0xa226: "ă",
|
||
0xa227: "Ą",
|
||
0xa228: "ą",
|
||
0xa229: "Ć",
|
||
0xa22a: "ć",
|
||
0xa22b: "Ĉ",
|
||
0xa22c: "ĉ",
|
||
0xa22d: "Ċ",
|
||
0xa22e: "ċ",
|
||
0xa22f: "Č",
|
||
0xa230: "č",
|
||
0xa231: "Ď",
|
||
0xa232: "ď",
|
||
0xa233: "Đ",
|
||
0xa234: "đ",
|
||
0xa235: "Ē",
|
||
0xa236: "ē",
|
||
0xa237: "Ĕ",
|
||
0xa238: "ĕ",
|
||
0xa239: "Ė",
|
||
0xa23a: "ė",
|
||
0xa23b: "Ę",
|
||
0xa23c: "ę",
|
||
0xa23d: "Ě",
|
||
0xa23e: "ě",
|
||
0xa23f: "Ĝ",
|
||
0xa240: "ĝ",
|
||
0xa241: "Ğ",
|
||
0xa242: "ğ",
|
||
0xa243: "Ġ",
|
||
0xa244: "ġ",
|
||
0xa245: "Ģ",
|
||
0xa246: "ģ",
|
||
0xa247: "Ĥ",
|
||
0xa248: "ĥ",
|
||
0xa249: "Ħ",
|
||
0xa24a: "ħ",
|
||
0xa24b: "Ĩ",
|
||
0xa24c: "ĩ",
|
||
0xa24d: "Ī",
|
||
0xa24e: "ī",
|
||
0xa24f: "Ĭ",
|
||
0xa250: "ĭ",
|
||
0xa251: "Į",
|
||
0xa252: "į",
|
||
0xa253: "İ",
|
||
0xa254: "ı",
|
||
0xa255: "IJ",
|
||
0xa256: "ij",
|
||
0xa257: "Ĵ",
|
||
0xa258: "ĵ",
|
||
0xa259: "Ķ",
|
||
0xa25a: "ķ",
|
||
0xa25b: "ĸ",
|
||
0xa25c: "Ĺ",
|
||
0xa25d: "ĺ",
|
||
0xa25e: "Ļ",
|
||
0xa25f: "ļ",
|
||
0xa260: "Ľ",
|
||
0xa261: "ľ",
|
||
0xa262: "Ŀ",
|
||
0xa263: "ŀ",
|
||
0xa264: "Ł",
|
||
0xa265: "ł",
|
||
0xa266: "Ń",
|
||
0xa267: "ń",
|
||
0xa268: "Ņ",
|
||
0xa269: "ņ",
|
||
0xa26a: "Ň",
|
||
0xa26b: "ň",
|
||
0xa26c: "ʼn",
|
||
0xa26d: "Ŋ",
|
||
0xa26e: "ŋ",
|
||
0xa26f: "Ō",
|
||
0xa270: "ō",
|
||
0xa271: "Ŏ",
|
||
0xa272: "ŏ",
|
||
0xa273: "Ő",
|
||
0xa274: "ő",
|
||
0xa275: "Œ",
|
||
0xa276: "œ",
|
||
0xa277: "Ŕ",
|
||
0xa278: "ŕ",
|
||
0xa279: "Ŗ",
|
||
0xa27a: "ŗ",
|
||
0xa27b: "Ř",
|
||
0xa27c: "ř",
|
||
0xa27d: "Ś",
|
||
0xa27e: "ś",
|
||
0xa321: "Ŝ",
|
||
0xa322: "ŝ",
|
||
0xa323: "Ş",
|
||
0xa324: "ş",
|
||
0xa325: "Š",
|
||
0xa326: "š",
|
||
0xa327: "Ţ",
|
||
0xa328: "ţ",
|
||
0xa329: "Ť",
|
||
0xa32a: "ť",
|
||
0xa32b: "Ŧ",
|
||
0xa32c: "ŧ",
|
||
0xa32d: "Ũ",
|
||
0xa32e: "ũ",
|
||
0xa32f: "Ū",
|
||
0xa330: "ū",
|
||
0xa331: "Ŭ",
|
||
0xa332: "ŭ",
|
||
0xa333: "Ů",
|
||
0xa334: "ů",
|
||
0xa335: "Ű",
|
||
0xa336: "ű",
|
||
0xa337: "Ų",
|
||
0xa338: "ų",
|
||
0xa339: "Ŵ",
|
||
0xa33a: "ŵ",
|
||
0xa33b: "Ŷ",
|
||
0xa33c: "ŷ",
|
||
0xa33d: "Ÿ",
|
||
0xa33e: "Ź",
|
||
0xa33f: "ź",
|
||
0xa340: "Ż",
|
||
0xa341: "ż",
|
||
0xa342: "Ž",
|
||
0xa343: "ž",
|
||
0xa344: "ſ",
|
||
0xa34d: "ƒ",
|
||
0xa34e: "ˆ",
|
||
0xa34f: "˜",
|
||
}
|
||
}
|
||
|
||
func (*daijisenExtractor) getFontWide() map[int]string {
|
||
return map[int]string{
|
||
0xb322: "㋘",
|
||
0xb323: "㋙",
|
||
0xb324: "㋚",
|
||
0xb325: "㋛",
|
||
0xb326: "㋜",
|
||
0xb327: "㋝",
|
||
0xb424: "↔",
|
||
0xb646: "㋐",
|
||
0xb647: "㋑",
|
||
0xb648: "㋒",
|
||
0xb649: "㋓",
|
||
0xb64a: "㋔",
|
||
0xb64b: "㋕",
|
||
0xb64c: "㋖",
|
||
0xb64d: "㋗",
|
||
0xb852: "⇒",
|
||
0xbc2c: "・",
|
||
0xc36e: "❶",
|
||
0xc36f: "❷",
|
||
0xc370: "❸",
|
||
0xc371: "❹",
|
||
0xc372: "❺",
|
||
0xc373: "①",
|
||
0xc374: "②",
|
||
0xc375: "③",
|
||
0xc376: "④",
|
||
0xc377: "⑤",
|
||
0xc378: "⑥",
|
||
0xc379: "⑦",
|
||
0xc37a: "⑧",
|
||
0xc37b: "⑨",
|
||
0xc37c: "⑩",
|
||
0xc37d: "⑪",
|
||
0xc37e: "⑫",
|
||
0xc421: "⑬",
|
||
0xc422: "⑭",
|
||
0xc423: "⑮",
|
||
0xc424: "⑯",
|
||
0xc425: "⑰",
|
||
0xc426: "⑱",
|
||
0xc427: "⑲",
|
||
0xc428: "⑳",
|
||
0xc429: "㉑",
|
||
0xc42a: "㉒",
|
||
0xc42b: "㉓",
|
||
0xc42c: "㉔",
|
||
0xc42d: "㉕",
|
||
0xc431: "Ⅰ",
|
||
0xc432: "Ⅱ",
|
||
0xc437: "㊀",
|
||
0xc438: "㊁",
|
||
0xc439: "㊂",
|
||
0xc43a: "㊃",
|
||
0xc43b: "㊄",
|
||
0xc43c: "㊅",
|
||
0xc43d: "㊆",
|
||
0xc43e: "㊇",
|
||
0xc43f: "㊈",
|
||
0xc440: "㉖",
|
||
0xc441: "㉗",
|
||
0xc442: "㉘",
|
||
0xc443: "㉙",
|
||
0xc444: "㉚",
|
||
0xc445: "㉛",
|
||
0xc446: "㉜",
|
||
0xc447: "㉜",
|
||
0xc448: "㉝",
|
||
0xc449: "㉞",
|
||
0xc44a: "㉟",
|
||
0xc455: "[",
|
||
0xc463: "[",
|
||
0xc464: "[",
|
||
0xc465: "♪",
|
||
}
|
||
}
|