1

add support for gakken and koujien

This commit is contained in:
Alex Yatskov 2018-02-17 11:29:06 -08:00
parent f0e6fa2812
commit 78c48a0a55
3 changed files with 457 additions and 0 deletions

View File

@ -144,6 +144,12 @@ func epwingExportDb(inputPath, outputPath, language, title string, stride int, p
"明鏡国語辞典": makeMeikyouExtractor(),
"故事ことわざの辞典": makeKotowazaExtractor(),
"研究社 新和英大辞典 第5版": makeWadaiExtractor(),
"広辞苑第六版": makeKoujienExtractor(),
"付属資料": makeKoujienExtractor(),
"学研国語大辞典": makeGakkenExtractor(),
"古語辞典": makeGakkenExtractor(),
"故事ことわざ辞典": makeGakkenExtractor(),
"学研漢和大字典": makeGakkenExtractor(),
}
var (

230
gakken.go Normal file
View File

@ -0,0 +1,230 @@
/*
* Copyright (c) 2016 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"regexp"
"strings"
)
// gakkenExtractor bundles the compiled regular expressions that the extractor
// applies to entry headings, entry text and the tags found in that text.
type gakkenExtractor struct {
	// Applied to the heading and to the reading/expression parts taken from it.
	partsExp, readGroupExp, expVarExp *regexp.Regexp

	// Captures the tag list on a line of entry text; its matches are also
	// removed from heading expressions.
	metaExp *regexp.Regexp

	// Matched against individual tags to select the "v5" and "v1" rules.
	v5Exp, v1Exp *regexp.Regexp
}
// makeGakkenExtractor returns an epwingExtractor backed by a gakkenExtractor
// whose patterns are all compiled here, once per extractor.
//
// Pattern roles, as used by the methods of gakkenExtractor:
//   - partsExp splits a heading into an optional kana reading (group 1) and an
//     optional 【...】 part (group 2).
//   - readGroupExp matches runs of the separator characters removed from readings.
//   - expVarExp matches an ASCII-parenthesised segment and captures its contents.
//   - metaExp captures the tag list; v5Exp and v1Exp classify individual tags.
//
// NOTE(review): as rendered in this file, metaExp and v5Exp contain `[^]*)` and
// `[^]+)`, i.e. a character class that is never closed. Go's regexp parser
// rejects that, so MustCompile would panic. It looks like the delimiter
// characters around the capture group were lost somewhere -- TODO confirm
// against the original source before relying on these patterns.
func makeGakkenExtractor() epwingExtractor {
	return &gakkenExtractor{
		partsExp:     regexp.MustCompile(`([\p{Hiragana}\p{Katakana}ー‐・]*)?(?:【(.*)】)?`),
		readGroupExp: regexp.MustCompile(`[‐・]+`),
		expVarExp:    regexp.MustCompile(`\(([^\)]*)\)`),
		metaExp:      regexp.MustCompile(`([^]*)`),
		v5Exp:        regexp.MustCompile(`(動.[四五]([^]+)?)|(動..二)`),
		v1Exp:        regexp.MustCompile(`(動..一)`),
	}
}
// cosmetics rewrites entry text in a single pass: parenthesised numbers from
// (1) to (20) become the matching circled numbers, and a katakana followed by a
// separate voiced sound mark is folded into the single voiced katakana.
var cosmetics = strings.NewReplacer(
	// Parenthesised numbers -> circled numbers.
	"(1)", "①",
	"(2)", "②",
	"(3)", "③",
	"(4)", "④",
	"(5)", "⑤",
	"(6)", "⑥",
	"(7)", "⑦",
	"(8)", "⑧",
	"(9)", "⑨",
	"(10)", "⑩",
	"(11)", "⑪",
	"(12)", "⑫",
	"(13)", "⑬",
	"(14)", "⑭",
	"(15)", "⑮",
	"(16)", "⑯",
	"(17)", "⑰",
	"(18)", "⑱",
	"(19)", "⑲",
	"(20)", "⑳",

	// Katakana + standalone voiced sound mark -> voiced katakana.
	"カ゛", "ガ",
	"キ゛", "ギ",
	"ク゛", "グ",
	"ケ゛", "ゲ",
	"コ゛", "ゴ",
	"タ゛", "ダ",
	"チ゛", "ヂ",
	"ツ゛", "ヅ",
	"テ゛", "デ",
	"ト゛", "ド",
	"ハ゛", "バ",
	"ヒ゛", "ビ",
	"フ゛", "ブ",
	"ヘ゛", "ベ",
	"ホ゛", "ボ",
	"サ゛", "ザ",
	"シ゛", "ジ",
	"ス゛", "ズ",
	"セ゛", "ゼ",
	"ソ゛", "ゾ",
)
// gakkenExpressionSep matches the separators between alternative expressions
// inside the 【...】 part of a heading. It is compiled once at package scope
// instead of on every extractTerms call.
var gakkenExpressionSep = regexp.MustCompile("(・|】【)")

// extractTerms converts one dictionary entry into zero or more dbTerm values.
//
// The heading is split by partsExp into a reading (group 1) and an expression
// part (group 2). The expression part has metaExp matches removed and is split
// on "・" and "】【"; for every piece containing an expVarExp segment, both the
// form that keeps the segment's contents and the form that drops the segment
// are emitted. The entry text is passed through cosmetics, and the tags captured
// by metaExp on each line of it are handed to exportRules for every term.
//
// When the heading has no expression, each reading becomes a term's expression.
// When it has expressions but no reading, the terms are emitted with an empty
// reading. It returns nil if partsExp does not match the heading.
func (e *gakkenExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
	matches := e.partsExp.FindStringSubmatch(entry.Heading)
	if matches == nil {
		return nil
	}

	var expressions, readings []string
	if expression := matches[2]; len(expression) > 0 {
		expression = e.metaExp.ReplaceAllLiteralString(expression, "")
		for _, split := range gakkenExpressionSep.Split(expression, -1) {
			// Variant that keeps the contents of the parenthesised segment.
			splitInc := e.expVarExp.ReplaceAllString(split, "$1")
			expressions = append(expressions, splitInc)
			if split != splitInc {
				// Variant with the parenthesised segment removed entirely.
				splitExc := e.expVarExp.ReplaceAllLiteralString(split, "")
				expressions = append(expressions, splitExc)
			}
		}
	}

	if reading := matches[1]; len(reading) > 0 {
		reading = e.readGroupExp.ReplaceAllLiteralString(reading, "")
		readings = append(readings, reading)
	}

	var tags []string
	entryText := cosmetics.Replace(entry.Text)
	for _, line := range strings.Split(entryText, "\n") {
		// Only the first metaExp match on a line contributes tags.
		if meta := e.metaExp.FindStringSubmatch(line); meta != nil {
			tags = append(tags, strings.Split(meta[1], "・")...)
		}
	}

	var terms []dbTerm
	if len(expressions) == 0 {
		for _, reading := range readings {
			term := dbTerm{
				Expression: reading,
				Glossary:   []string{entryText},
				Sequence:   sequence,
			}

			e.exportRules(&term, tags)
			terms = append(terms, term)
		}

		return terms
	}

	// Expressions without a reading are still emitted, with an empty reading.
	if len(readings) == 0 {
		readings = append(readings, "")
	}

	for _, expression := range expressions {
		for _, reading := range readings {
			term := dbTerm{
				Expression: expression,
				Reading:    reading,
				Glossary:   []string{entryText},
				Sequence:   sequence,
			}

			e.exportRules(&term, tags)
			terms = append(terms, term)
		}
	}

	return terms
}
// extractKanji never produces kanji entries; it always returns nil.
func (*gakkenExtractor) extractKanji(entry epwingEntry) []dbKanji {
	var none []dbKanji
	return none
}
// exportRules adds rules to term according to tags. Each tag is checked
// against the cases below in order and only the first matching case applies:
//
//   - the tag "形" adds "adj-i";
//   - the tag "動サ変" adds "vs" when the expression ends in "する" or "為る";
//   - the expression "来る" adds "vk", whatever the tag is;
//   - a tag matching v5Exp adds "v5";
//   - a tag matching v1Exp adds "v1".
func (e *gakkenExtractor) exportRules(term *dbTerm, tags []string) {
	for _, tag := range tags {
		switch {
		case tag == "形":
			term.addRules("adj-i")
		case tag == "動サ変" && (strings.HasSuffix(term.Expression, "する") || strings.HasSuffix(term.Expression, "為る")):
			term.addRules("vs")
		case term.Expression == "来る":
			term.addRules("vk")
		case e.v5Exp.MatchString(tag):
			term.addRules("v5")
		case e.v1Exp.MatchString(tag):
			term.addRules("v1")
		}
	}
}
// getRevision returns the fixed revision string of this extractor.
func (*gakkenExtractor) getRevision() string {
	const revision = "gakken"
	return revision
}
// getFontNarrow returns a fresh code-to-text table holding the single
// narrow-font entry this extractor defines.
func (*gakkenExtractor) getFontNarrow() map[int]string {
	narrow := make(map[int]string, 1)
	narrow[41550] = "ī"
	return narrow
}
// getFontWide returns a fresh table from integer codes to the text that
// replaces them.
//
// NOTE(review): presumably the keys are wide-font gaiji codes of the EPWING
// book -- confirm against the caller of getFontWide.
func (*gakkenExtractor) getFontWide() map[int]string {
	return map[int]string{
		42017: "国",
		42018: "古",
		42019: "故",
		42021: "(拡)",
		42020: "漢",
		// NOTE(review): the next two codes map to the empty string as rendered
		// here; confirm whether that is intended or characters were lost.
		42033: "",
		42034: "",
		// Circled katakana.
		42070: "㋐",
		42071: "㋑",
		42072: "㋒",
		42073: "㋓",
		42074: "㋔",
		42075: "㋕",
		42076: "㋖",
		42077: "㋗",
		42078: "㋘",
		42079: "㋙",
		42080: "㋚",
		42081: "㋛",
		42082: "㋜",
		42083: "㋝",
		// Squared and bracketed numerals.
		42084: "🈩",
		42085: "🈔",
		42086: "🈪",
		42087: "[四]",
		42088: "[五]",
		// Negative circled numbers.
		42089: "❶",
		42090: "❷",
		42091: "❸",
		42092: "❹",
		42093: "❺",
		42094: "❻",
		42095: "❼",
		42096: "❽",
		42097: "❾",
		42098: "❿",
		42099: "⓫",
		42100: "⓬",
		42101: "⓭",
		42102: "⓮",
		42103: "⓯",
		42104: "⓰",
		42105: "⓱",
		42106: "⓲",
		// Circled ideograph numerals.
		42107: "㊀",
		42108: "㊁",
		42109: "㊂",
		42110: "㊃",
		// Individual characters and parenthesised markers.
		43599: "咍",
		46176: "(扌)",
		48753: "灾",
		48936: "烖",
		58176: "(呉)",
		58177: "(漢)",
	}
}

221
koujien.go Normal file
View File

@ -0,0 +1,221 @@
/*
* Copyright (c) 2016 Alex Yatskov <alex@foosoft.net>
* Author: Alex Yatskov <alex@foosoft.net>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package main
import (
"regexp"
"strings"
)
// koujienExtractor bundles the compiled regular expressions that the extractor
// applies to entry headings, entry text and the tags found in that text.
type koujienExtractor struct {
	// Applied to the heading and to the reading/expression parts taken from it.
	partsExp, readGroupExp, expVarExp *regexp.Regexp

	// Captures the tag list on a line of entry text; its matches are also
	// removed from heading expressions.
	metaExp *regexp.Regexp

	// Matched against individual tags to select the "v5" and "v1" rules.
	v5Exp, v1Exp *regexp.Regexp
}
// makeKoujienExtractor returns an epwingExtractor backed by a koujienExtractor
// whose patterns are all compiled here, once per extractor.
//
// Pattern roles, as used by the methods of koujienExtractor:
//   - partsExp splits a heading into a leading reading (group 1), an optional
//     【...】 part (group 2), an optional 〖...〗 part (group 3) and an optional
//     trailing group (group 4); only groups 1 and 2 are read by extractTerms.
//   - readGroupExp matches runs of the separator characters removed from readings.
//   - expVarExp matches an ASCII-parenthesised segment and captures its contents.
//   - metaExp captures the tag list; v5Exp and v1Exp classify individual tags.
//
// NOTE(review): as rendered in this file, metaExp and v5Exp contain `[^]*)` and
// `[^]+)`, i.e. a character class that is never closed. Go's regexp parser
// rejects that, so MustCompile would panic. The delimiter characters around the
// capture group (and possibly around the last group of partsExp) look like they
// were lost somewhere -- TODO confirm against the original source.
func makeKoujienExtractor() epwingExtractor {
	return &koujienExtractor{
		partsExp:     regexp.MustCompile(`([^(【〖]+)(?:【(.*)】)?(?:〖(.*)〗)?(?:(.*))?`),
		readGroupExp: regexp.MustCompile(`[‐・]+`),
		expVarExp:    regexp.MustCompile(`\(([^\)]*)\)`),
		metaExp:      regexp.MustCompile(`([^]*)`),
		v5Exp:        regexp.MustCompile(`(動.[四五]([^]+)?)|(動..二)`),
		v1Exp:        regexp.MustCompile(`(動..一)`),
	}
}
// makeFuzokuExtractor returns an epwingExtractor backed by a koujienExtractor.
// Its patterns are the same as those of makeKoujienExtractor except for
// readGroupExp, which is written here with a different hyphen character.
//
// NOTE(review): no caller of this constructor is visible in this change; the
// "付属資料" title is registered with makeKoujienExtractor instead. Confirm
// whether that title was meant to use this constructor.
//
// NOTE(review): as rendered in this file, metaExp and v5Exp contain `[^]*)` and
// `[^]+)`, i.e. a character class that is never closed. Go's regexp parser
// rejects that, so MustCompile would panic. The delimiter characters around the
// capture group look like they were lost somewhere -- TODO confirm against the
// original source.
func makeFuzokuExtractor() epwingExtractor {
	return &koujienExtractor{
		partsExp:     regexp.MustCompile(`([^(【〖]+)(?:【(.*)】)?(?:〖(.*)〗)?(?:(.*))?`),
		readGroupExp: regexp.MustCompile(`[-・]+`),
		expVarExp:    regexp.MustCompile(`\(([^\)]*)\)`),
		metaExp:      regexp.MustCompile(`([^]*)`),
		v5Exp:        regexp.MustCompile(`(動.[四五]([^]+)?)|(動..二)`),
		v1Exp:        regexp.MustCompile(`(動..一)`),
	}
}
// extractTerms converts one dictionary entry into zero or more dbTerm values.
//
// partsExp splits the heading into a reading (group 1) and an expression part
// (group 2). The expression part has metaExp matches removed and is split on
// "・"; every piece containing an expVarExp segment yields both the form that
// keeps the segment's contents and the form that drops the segment. Tags are
// the "・"-separated pieces captured by metaExp on each line of the entry text
// and are passed to exportRules for every term.
//
// With no expressions, each reading is emitted as a term's expression;
// otherwise one term is emitted per expression/reading pair. It returns nil if
// partsExp does not match the heading.
func (e *koujienExtractor) extractTerms(entry epwingEntry, sequence int) []dbTerm {
	parts := e.partsExp.FindStringSubmatch(entry.Heading)
	if parts == nil {
		return nil
	}
	rawReading, rawExpression := parts[1], parts[2]

	var expressions []string
	if rawExpression != "" {
		stripped := e.metaExp.ReplaceAllLiteralString(rawExpression, "")
		for _, variant := range strings.Split(stripped, "・") {
			// Keep the contents of the parenthesised segment...
			kept := e.expVarExp.ReplaceAllString(variant, "$1")
			expressions = append(expressions, kept)
			if kept == variant {
				continue
			}
			// ...and also offer the form without the segment.
			expressions = append(expressions, e.expVarExp.ReplaceAllLiteralString(variant, ""))
		}
	}

	var readings []string
	if rawReading != "" {
		readings = append(readings, e.readGroupExp.ReplaceAllLiteralString(rawReading, ""))
	}

	var tags []string
	for _, line := range strings.Split(entry.Text, "\n") {
		if meta := e.metaExp.FindStringSubmatch(line); meta != nil {
			tags = append(tags, strings.Split(meta[1], "・")...)
		}
	}

	var terms []dbTerm
	emit := func(expression, reading string) {
		term := dbTerm{
			Expression: expression,
			Reading:    reading,
			Glossary:   []string{entry.Text},
			Sequence:   sequence,
		}
		e.exportRules(&term, tags)
		terms = append(terms, term)
	}

	if len(expressions) == 0 {
		// No expression in the heading: the reading stands in for it.
		for _, reading := range readings {
			emit(reading, "")
		}
		return terms
	}

	for _, expression := range expressions {
		for _, reading := range readings {
			emit(expression, reading)
		}
	}
	return terms
}
// extractKanji never produces kanji entries; it always returns nil.
func (*koujienExtractor) extractKanji(entry epwingEntry) []dbKanji {
	var none []dbKanji
	return none
}
// exportRules adds rules to term according to tags. Each tag is checked
// against the cases below in order and only the first matching case applies:
//
//   - the tag "形" adds "adj-i";
//   - the tag "動サ変" adds "vs" when the expression ends in "する" or "為る";
//   - the expression "来る" adds "vk", whatever the tag is;
//   - a tag matching v5Exp adds "v5";
//   - a tag matching v1Exp adds "v1".
func (e *koujienExtractor) exportRules(term *dbTerm, tags []string) {
	for _, tag := range tags {
		switch {
		case tag == "形":
			term.addRules("adj-i")
		case tag == "動サ変" && (strings.HasSuffix(term.Expression, "する") || strings.HasSuffix(term.Expression, "為る")):
			term.addRules("vs")
		case term.Expression == "来る":
			term.addRules("vk")
		case e.v5Exp.MatchString(tag):
			term.addRules("v5")
		case e.v1Exp.MatchString(tag):
			term.addRules("v1")
		}
	}
}
// getRevision returns the fixed revision string of this extractor.
func (*koujienExtractor) getRevision() string {
	const revision = "koujien"
	return revision
}
// getFontNarrow returns a fresh, empty (non-nil) code-to-text table: this
// extractor defines no narrow-font entries.
func (*koujienExtractor) getFontNarrow() map[int]string {
	return make(map[int]string)
}
// getFontWide returns a fresh table from integer codes to the text that
// replaces them.
//
// NOTE(review): presumably the keys are wide-font gaiji codes of the EPWING
// book -- confirm against the caller of getFontWide.
func (*koujienExtractor) getFontWide() map[int]string {
	return map[int]string{
		41531: "⟨",
		41532: "⟩",
		42017: "⇿",
		42018: "🈑",
		42023: "🈩",
		42024: "🈔",
		42025: "㊇",
		42026: "3",
		// Negative circled numbers (codes are not contiguous).
		42027: "❷",
		42028: "❶",
		42031: "❸",
		42037: "❹",
		42043: "❺",
		42045: "❻",
		42057: "❼",
		42083: "❽",
		42284: "❾",
		42544: "❿",
		42561: "鉏",
		43611: "⓫",
		43612: "⓬",
		44142: "𑖀",
		44856: "㉑",
		44857: "㉒",
		// NOTE(review): the next two codes map to the empty string as rendered
		// here; confirm whether that is intended or characters were lost.
		46374: "",
		46375: "",
		// Circled numbers 1-20.
		46390: "①",
		46391: "②",
		46392: "③",
		46393: "④",
		46394: "⑤",
		46395: "⑥",
		46396: "⑦",
		46397: "⑧",
		46398: "⑨",
		46399: "⑩",
		46400: "⑪",
		46401: "⑫",
		46402: "⑬",
		46403: "⑭",
		46404: "⑮",
		46405: "⑯",
		46406: "⑰",
		46407: "⑱",
		46408: "⑲",
		46409: "⑳",
		// Arrows and markers.
		46677: "⇀",
		46420: "⇨",
		47175: "(季)",
		// Circled katakana.
		56383: "㋐",
		56384: "㋑",
		56385: "㋒",
		56386: "㋓",
		56387: "㋔",
		56388: "㋕",
		56389: "㋖",
		56390: "㋗",
		56391: "㋘",
		56392: "㋙",
		56393: "㋚",
		56394: "㋛",
		56395: "㋜",
		56396: "㋝",
		56397: "㋞",
		56398: "▷",
	}
}