1
This commit is contained in:
Alex Yatskov 2017-01-29 12:18:05 -08:00
parent 6012bb555f
commit d41e2aa3c8

445
daijisen.go Normal file
View File

@ -0,0 +1,445 @@
/*
* Copyright (c) 2016 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"regexp"
"strings"
)
type daijisenExtractor struct {
partsExp *regexp.Regexp
readGroupExp *regexp.Regexp
expVarExp *regexp.Regexp
metaExp *regexp.Regexp
v5Exp *regexp.Regexp
v1Exp *regexp.Regexp
}
func makeDaijisenExtractor() epwingExtractor {
return &daijisenExtractor{
partsExp: regexp.MustCompile(`([^(【〖]+)(?:【(.*)】)?(?:〖(.*)〗)?(?:(.*))?`),
readGroupExp: regexp.MustCompile(`[-・]+`),
expVarExp: regexp.MustCompile(`\(([^\)]*)\)`),
metaExp: regexp.MustCompile(`([^]*)`),
v5Exp: regexp.MustCompile(`(動.[四五]([^]+)?)|(動..二)`),
v1Exp: regexp.MustCompile(`(動..一)`),
}
}
func (e *daijisenExtractor) extractTerms(entry epwingEntry) []dbTerm {
matches := e.partsExp.FindStringSubmatch(entry.Heading)
if matches == nil {
return nil
}
var expressions, readings []string
if expression := matches[2]; len(expression) > 0 {
expression = e.metaExp.ReplaceAllLiteralString(expression, "")
for _, split := range strings.Split(expression, "・") {
splitInc := e.expVarExp.ReplaceAllString(split, "$1")
expressions = append(expressions, splitInc)
if split != splitInc {
splitExc := e.expVarExp.ReplaceAllLiteralString(split, "")
expressions = append(expressions, splitExc)
}
}
}
if reading := matches[1]; len(reading) > 0 {
reading = e.readGroupExp.ReplaceAllLiteralString(reading, "")
readings = append(readings, reading)
}
var tags []string
for _, split := range strings.Split(entry.Text, "\n") {
if matches := e.metaExp.FindStringSubmatch(split); matches != nil {
for _, tag := range strings.Split(matches[1], "・") {
tags = append(tags, tag)
}
}
}
var terms []dbTerm
if len(expressions) == 0 {
for _, reading := range readings {
term := dbTerm{
Expression: reading,
Glossary: []string{entry.Text},
}
e.exportRules(&term, tags)
terms = append(terms, term)
}
} else {
for _, expression := range expressions {
for _, reading := range readings {
term := dbTerm{
Expression: expression,
Reading: reading,
Glossary: []string{entry.Text},
}
e.exportRules(&term, tags)
terms = append(terms, term)
}
}
}
return terms
}
func (*daijisenExtractor) extractKanji(entry epwingEntry) []dbKanji {
return nil
}
func (e *daijisenExtractor) exportRules(term *dbTerm, tags []string) {
for _, tag := range tags {
if tag == "形" {
term.addRules("adj-i")
} else if tag == "動サ変" && (strings.HasSuffix(term.Expression, "する") || strings.HasSuffix(term.Expression, "為る")) {
term.addRules("vs")
} else if term.Expression == "来る" {
term.addRules("vk")
} else if e.v5Exp.MatchString(tag) {
term.addRules("v5")
} else if e.v1Exp.MatchString(tag) {
term.addRules("v1")
}
}
}
func (*daijisenExtractor) getRevision() string {
return "daijisen1"
}
func (*daijisenExtractor) getFontNarrow() map[int]string {
return map[int]string{
0xa121: " ",
0xa122: "¡",
0xa123: "¢",
0xa124: "£",
0xa125: "¤",
0xa126: "¥",
0xa127: "¦",
0xa128: "§",
0xa129: "¨",
0xa12a: "©",
0xa12b: "ª",
0xa12c: "«",
0xa12d: "¬",
0xa12e: "­",
0xa12f: "®",
0xa130: "¯",
0xa131: "°",
0xa132: "±",
0xa133: "²",
0xa134: "³",
0xa135: "´",
0xa136: "µ",
0xa137: "¶",
0xa138: "·",
0xa139: "¸",
0xa13a: "¹",
0xa13b: "º",
0xa13c: "»",
0xa13d: "¼",
0xa13e: "½",
0xa13f: "¾",
0xa140: "¿",
0xa141: "À",
0xa142: "Á",
0xa143: "Â",
0xa144: "Ã",
0xa145: "Ä",
0xa146: "Å",
0xa147: "Æ",
0xa148: "Ç",
0xa149: "È",
0xa14a: "É",
0xa14b: "Ê",
0xa14c: "Ë",
0xa14d: "Ì",
0xa14e: "Í",
0xa14f: "Î",
0xa150: "Ï",
0xa151: "Ð",
0xa152: "Ñ",
0xa153: "Ò",
0xa154: "Ó",
0xa155: "Ô",
0xa156: "Õ",
0xa157: "Ö",
0xa158: "×",
0xa159: "Ø",
0xa15a: "Ù",
0xa15b: "Ú",
0xa15c: "Û",
0xa15d: "Ü",
0xa15e: "Ý",
0xa15f: "Þ",
0xa160: "ß",
0xa161: "à",
0xa162: "á",
0xa163: "â",
0xa164: "ã",
0xa165: "ä",
0xa166: "å",
0xa167: "æ",
0xa168: "ç",
0xa169: "è",
0xa16a: "é",
0xa16b: "ê",
0xa16c: "ë",
0xa16d: "ì",
0xa16e: "í",
0xa16f: "î",
0xa170: "ï",
0xa171: "ð",
0xa172: "ñ",
0xa173: "ò",
0xa174: "ó",
0xa175: "ô",
0xa176: "õ",
0xa177: "ö",
0xa178: "÷",
0xa179: "ø",
0xa17a: "ù",
0xa17b: "ú",
0xa17c: "û",
0xa17d: "ü",
0xa17e: "ý",
0xa221: "þ",
0xa222: "ÿ",
0xa223: "Ā",
0xa224: "ā",
0xa225: "Ă",
0xa226: "ă",
0xa227: "Ą",
0xa228: "ą",
0xa229: "Ć",
0xa22a: "ć",
0xa22b: "Ĉ",
0xa22c: "ĉ",
0xa22d: "Ċ",
0xa22e: "ċ",
0xa22f: "Č",
0xa230: "č",
0xa231: "Ď",
0xa232: "ď",
0xa233: "Đ",
0xa234: "đ",
0xa235: "Ē",
0xa236: "ē",
0xa237: "Ĕ",
0xa238: "ĕ",
0xa239: "Ė",
0xa23a: "ė",
0xa23b: "Ę",
0xa23c: "ę",
0xa23d: "Ě",
0xa23e: "ě",
0xa23f: "Ĝ",
0xa240: "ĝ",
0xa241: "Ğ",
0xa242: "ğ",
0xa243: "Ġ",
0xa244: "ġ",
0xa245: "Ģ",
0xa246: "ģ",
0xa247: "Ĥ",
0xa248: "ĥ",
0xa249: "Ħ",
0xa24a: "ħ",
0xa24b: "Ĩ",
0xa24c: "ĩ",
0xa24d: "Ī",
0xa24e: "ī",
0xa24f: "Ĭ",
0xa250: "ĭ",
0xa251: "Į",
0xa252: "į",
0xa253: "İ",
0xa254: "ı",
0xa255: "IJ",
0xa256: "ij",
0xa257: "Ĵ",
0xa258: "ĵ",
0xa259: "Ķ",
0xa25a: "ķ",
0xa25b: "ĸ",
0xa25c: "Ĺ",
0xa25d: "ĺ",
0xa25e: "Ļ",
0xa25f: "ļ",
0xa260: "Ľ",
0xa261: "ľ",
0xa262: "Ŀ",
0xa263: "ŀ",
0xa264: "Ł",
0xa265: "ł",
0xa266: "Ń",
0xa267: "ń",
0xa268: "Ņ",
0xa269: "ņ",
0xa26a: "Ň",
0xa26b: "ň",
0xa26c: "ʼn",
0xa26d: "Ŋ",
0xa26e: "ŋ",
0xa26f: "Ō",
0xa270: "ō",
0xa271: "Ŏ",
0xa272: "ŏ",
0xa273: "Ő",
0xa274: "ő",
0xa275: "Œ",
0xa276: "œ",
0xa277: "Ŕ",
0xa278: "ŕ",
0xa279: "Ŗ",
0xa27a: "ŗ",
0xa27b: "Ř",
0xa27c: "ř",
0xa27d: "Ś",
0xa27e: "ś",
0xa321: "Ŝ",
0xa322: "ŝ",
0xa323: "Ş",
0xa324: "ş",
0xa325: "Š",
0xa326: "š",
0xa327: "Ţ",
0xa328: "ţ",
0xa329: "Ť",
0xa32a: "ť",
0xa32b: "Ŧ",
0xa32c: "ŧ",
0xa32d: "Ũ",
0xa32e: "ũ",
0xa32f: "Ū",
0xa330: "ū",
0xa331: "Ŭ",
0xa332: "ŭ",
0xa333: "Ů",
0xa334: "ů",
0xa335: "Ű",
0xa336: "ű",
0xa337: "Ų",
0xa338: "ų",
0xa339: "Ŵ",
0xa33a: "ŵ",
0xa33b: "Ŷ",
0xa33c: "ŷ",
0xa33d: "Ÿ",
0xa33e: "Ź",
0xa33f: "ź",
0xa340: "Ż",
0xa341: "ż",
0xa342: "Ž",
0xa343: "ž",
0xa344: "ſ",
0xa34d: "ƒ",
0xa34e: "ˆ",
0xa34f: "˜",
}
}
func (*daijisenExtractor) getFontWide() map[int]string {
return map[int]string{
0xb322: "㋘",
0xb323: "㋙",
0xb324: "㋚",
0xb325: "㋛",
0xb326: "㋜",
0xb327: "㋝",
0xb424: "↔",
0xb646: "㋐",
0xb647: "㋑",
0xb648: "㋒",
0xb649: "㋓",
0xb64a: "㋔",
0xb64b: "㋕",
0xb64c: "㋖",
0xb64d: "㋗",
0xb852: "⇒",
0xbc2c: "・",
0xc36e: "❶",
0xc36f: "❷",
0xc370: "❸",
0xc371: "❹",
0xc372: "❺",
0xc373: "①",
0xc374: "②",
0xc375: "③",
0xc376: "④",
0xc377: "⑤",
0xc378: "⑥",
0xc379: "⑦",
0xc37a: "⑧",
0xc37b: "⑨",
0xc37c: "⑩",
0xc37d: "⑪",
0xc37e: "⑫",
0xc421: "⑬",
0xc422: "⑭",
0xc423: "⑮",
0xc424: "⑯",
0xc425: "⑰",
0xc426: "⑱",
0xc427: "⑲",
0xc428: "⑳",
0xc429: "㉑",
0xc42a: "㉒",
0xc42b: "㉓",
0xc42c: "㉔",
0xc42d: "㉕",
0xc431: "",
0xc432: "Ⅱ",
0xc437: "㊀",
0xc438: "㊁",
0xc439: "㊂",
0xc43a: "㊃",
0xc43b: "㊄",
0xc43c: "㊅",
0xc43d: "㊆",
0xc43e: "㊇",
0xc43f: "㊈",
0xc440: "㉖",
0xc441: "㉗",
0xc442: "㉘",
0xc443: "㉙",
0xc444: "㉚",
0xc445: "㉛",
0xc446: "㉜",
0xc447: "㉜",
0xc448: "㉝",
0xc449: "㉞",
0xc44a: "㉟",
0xc455: "[",
0xc463: "[",
0xc464: "[",
0xc465: "♪",
}
}