1
This commit is contained in:
Alex Yatskov 2017-01-29 14:03:51 -08:00
parent d41e2aa3c8
commit c932dc4f0f
2 changed files with 25 additions and 32 deletions

View File

@ -29,8 +29,9 @@ import (
type daijisenExtractor struct { type daijisenExtractor struct {
partsExp *regexp.Regexp partsExp *regexp.Regexp
expShapesExp *regexp.Regexp
readVarExp *regexp.Regexp
readGroupExp *regexp.Regexp readGroupExp *regexp.Regexp
expVarExp *regexp.Regexp
metaExp *regexp.Regexp metaExp *regexp.Regexp
v5Exp *regexp.Regexp v5Exp *regexp.Regexp
v1Exp *regexp.Regexp v1Exp *regexp.Regexp
@ -38,10 +39,11 @@ type daijisenExtractor struct {
func makeDaijisenExtractor() epwingExtractor { func makeDaijisenExtractor() epwingExtractor {
return &daijisenExtractor{ return &daijisenExtractor{
partsExp: regexp.MustCompile(`([^(【〖]+)(?:【(.*)】)?(?:〖(.*)〗)?(?:(.*))?`), partsExp: regexp.MustCompile(`([^【]+)(?:【(.*)】)?`),
expShapesExp: regexp.MustCompile(`[×△]+`),
readVarExp: regexp.MustCompile(`\(([^\)]*)\)`),
readGroupExp: regexp.MustCompile(`[-・]+`), readGroupExp: regexp.MustCompile(`[-・]+`),
expVarExp: regexp.MustCompile(`\(([^\)]*)\)`), metaExp: regexp.MustCompile(`([^]*)`),
metaExp: regexp.MustCompile(`([^]*)`),
v5Exp: regexp.MustCompile(`(動.[四五]([^]+)?)|(動..二)`), v5Exp: regexp.MustCompile(`(動.[四五]([^]+)?)|(動..二)`),
v1Exp: regexp.MustCompile(`(動..一)`), v1Exp: regexp.MustCompile(`(動..一)`),
} }
@ -53,22 +55,16 @@ func (e *daijisenExtractor) extractTerms(entry epwingEntry) []dbTerm {
return nil return nil
} }
var expressions, readings []string var expressions []string
if expression := matches[2]; len(expression) > 0 { if expression := matches[2]; len(expression) > 0 {
expression = e.metaExp.ReplaceAllLiteralString(expression, "") expression = e.expShapesExp.ReplaceAllString(expression, "")
for _, split := range strings.Split(expression, "・") { expressions = strings.Split(expression, "・")
splitInc := e.expVarExp.ReplaceAllString(split, "$1")
expressions = append(expressions, splitInc)
if split != splitInc {
splitExc := e.expVarExp.ReplaceAllLiteralString(split, "")
expressions = append(expressions, splitExc)
}
}
} }
if reading := matches[1]; len(reading) > 0 { var reading string
if reading = matches[1]; len(reading) > 0 {
reading = e.readGroupExp.ReplaceAllLiteralString(reading, "") reading = e.readGroupExp.ReplaceAllLiteralString(reading, "")
readings = append(readings, reading) reading = e.readVarExp.ReplaceAllLiteralString(reading, "")
} }
var tags []string var tags []string
@ -82,29 +78,25 @@ func (e *daijisenExtractor) extractTerms(entry epwingEntry) []dbTerm {
var terms []dbTerm var terms []dbTerm
if len(expressions) == 0 { if len(expressions) == 0 {
for _, reading := range readings { term := dbTerm{
Expression: reading,
Glossary: []string{entry.Text},
}
e.exportRules(&term, tags)
terms = append(terms, term)
} else {
for _, expression := range expressions {
term := dbTerm{ term := dbTerm{
Expression: reading, Expression: expression,
Reading: reading,
Glossary: []string{entry.Text}, Glossary: []string{entry.Text},
} }
e.exportRules(&term, tags) e.exportRules(&term, tags)
terms = append(terms, term) terms = append(terms, term)
} }
} else {
for _, expression := range expressions {
for _, reading := range readings {
term := dbTerm{
Expression: expression,
Reading: reading,
Glossary: []string{entry.Text},
}
e.exportRules(&term, tags)
terms = append(terms, term)
}
}
} }
return terms return terms

View File

@ -137,6 +137,7 @@ func epwingExportDb(inputPath, outputDir, title string, stride int, pretty bool)
translateExp := regexp.MustCompile(`{{([nw])_(\d+)}}`) translateExp := regexp.MustCompile(`{{([nw])_(\d+)}}`)
epwingExtractors := map[string]epwingExtractor{ epwingExtractors := map[string]epwingExtractor{
"三省堂 スーパー大辞林": makeDaijirinExtractor(), "三省堂 スーパー大辞林": makeDaijirinExtractor(),
"大辞泉": makeDaijisenExtractor(),
} }
var ( var (