From c932dc4f0fd3cba920c35b3754c2ba22539db27a Mon Sep 17 00:00:00 2001 From: Alex Yatskov Date: Sun, 29 Jan 2017 14:03:51 -0800 Subject: [PATCH] wip --- daijisen.go | 56 +++++++++++++++++++++++------------------------------ epwing.go | 1 + 2 files changed, 25 insertions(+), 32 deletions(-) diff --git a/daijisen.go b/daijisen.go index e0ad9de..fe2726b 100644 --- a/daijisen.go +++ b/daijisen.go @@ -29,8 +29,9 @@ import ( type daijisenExtractor struct { partsExp *regexp.Regexp + expShapesExp *regexp.Regexp + readVarExp *regexp.Regexp readGroupExp *regexp.Regexp - expVarExp *regexp.Regexp metaExp *regexp.Regexp v5Exp *regexp.Regexp v1Exp *regexp.Regexp @@ -38,10 +39,11 @@ type daijisenExtractor struct { func makeDaijisenExtractor() epwingExtractor { return &daijisenExtractor{ - partsExp: regexp.MustCompile(`([^(【〖]+)(?:【(.*)】)?(?:〖(.*)〗)?(?:((.*)))?`), + partsExp: regexp.MustCompile(`([^【]+)(?:【(.*)】)?`), + expShapesExp: regexp.MustCompile(`[×△]+`), + readVarExp: regexp.MustCompile(`\(([^\)]*)\)`), readGroupExp: regexp.MustCompile(`[-・]+`), - expVarExp: regexp.MustCompile(`\(([^\)]*)\)`), - metaExp: regexp.MustCompile(`(([^)]*))`), + metaExp: regexp.MustCompile(`[([^]]*)]`), v5Exp: regexp.MustCompile(`(動.[四五]([[^]]+])?)|(動..二)`), v1Exp: regexp.MustCompile(`(動..一)`), } @@ -53,22 +55,16 @@ func (e *daijisenExtractor) extractTerms(entry epwingEntry) []dbTerm { return nil } - var expressions, readings []string + var expressions []string if expression := matches[2]; len(expression) > 0 { - expression = e.metaExp.ReplaceAllLiteralString(expression, "") - for _, split := range strings.Split(expression, "・") { - splitInc := e.expVarExp.ReplaceAllString(split, "$1") - expressions = append(expressions, splitInc) - if split != splitInc { - splitExc := e.expVarExp.ReplaceAllLiteralString(split, "") - expressions = append(expressions, splitExc) - } - } + expression = e.expShapesExp.ReplaceAllString(expression, "") + expressions = strings.Split(expression, "・") } - if reading := matches[1]; len(reading) > 0 { + var reading string + if reading = matches[1]; len(reading) > 0 { reading = e.readGroupExp.ReplaceAllLiteralString(reading, "") - readings = append(readings, reading) + reading = e.readVarExp.ReplaceAllLiteralString(reading, "") } var tags []string @@ -82,29 +78,25 @@ func (e *daijisenExtractor) extractTerms(entry epwingEntry) []dbTerm { var terms []dbTerm if len(expressions) == 0 { - for _, reading := range readings { + term := dbTerm{ + Expression: reading, + Glossary: []string{entry.Text}, + } + + e.exportRules(&term, tags) + terms = append(terms, term) + + } else { + for _, expression := range expressions { term := dbTerm{ - Expression: reading, + Expression: expression, + Reading: reading, Glossary: []string{entry.Text}, } e.exportRules(&term, tags) terms = append(terms, term) } - - } else { - for _, expression := range expressions { - for _, reading := range readings { - term := dbTerm{ - Expression: expression, - Reading: reading, - Glossary: []string{entry.Text}, - } - - e.exportRules(&term, tags) - terms = append(terms, term) - } - } } return terms diff --git a/epwing.go b/epwing.go index f7a12a2..8ad84b5 100644 --- a/epwing.go +++ b/epwing.go @@ -137,6 +137,7 @@ func epwingExportDb(inputPath, outputDir, title string, stride int, pretty bool) translateExp := regexp.MustCompile(`{{([nw])_(\d+)}}`) epwingExtractors := map[string]epwingExtractor{ "三省堂 スーパー大辞林": makeDaijirinExtractor(), + "大辞泉": makeDaijisenExtractor(), } var (