1

Add daijirin data

This commit is contained in:
Alex Yatskov 2016-12-13 17:17:19 -08:00
parent 8a363c52fd
commit 95ca3cc700
2 changed files with 1475 additions and 55 deletions

1475
daijirin.go Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -27,7 +27,6 @@ import (
"io"
"io/ioutil"
"regexp"
"strings"
)
type epwingEntry struct {
@@ -59,60 +58,6 @@ type daijirinExtractor struct {
annotExp *regexp.Regexp
}
// makeDaijirinExtractor builds a daijirinExtractor with its four regular
// expressions compiled and returns it as an epwingExtractor.
//
// regexp.MustCompile panics on an invalid pattern; all patterns here are
// constants, so a panic would indicate a programming error.
func makeDaijirinExtractor() epwingExtractor {
	extractor := new(daijirinExtractor)

	// Splits a heading into the named groups "reading", "expression",
	// "native" and "tag".
	extractor.partsExp = regexp.MustCompile(`(?P<reading>[^(【〖]+)(?:【(?P<expression>.*)】)?(?:〖(?P<native>.*)〗)?(?:(?P<tag>.*))?`)

	// Matches runs of "-" and "・".
	extractor.phonExp = regexp.MustCompile(`[-・]+`)

	// Matches a parenthesised span and captures its contents.
	extractor.variantExp = regexp.MustCompile(`\((.*)\)`)

	// NOTE(review): as written this pattern matches any string, including the
	// empty one — confirm the literal has not lost characters.
	extractor.annotExp = regexp.MustCompile(`(.*)`)

	return extractor
}
// extractTerms parses a dictionary entry's heading and text with the
// extractor's regular expressions.
//
// The heading is matched against partsExp. The "expression" group has
// annotExp matches removed, is split, and each piece is recorded once with the
// span matched by variantExp replaced by its captured contents and, when that
// changed the piece, once more with the span removed entirely. The "reading"
// group is recorded with phonExp matches removed. The text is split on
// newlines into glossary lines, and the first line is additionally run through
// annotExp to collect tags.
//
// A heading that partsExp does not match is skipped instead of being indexed,
// so empty or malformed headings do not cause an out-of-range panic.
//
// NOTE: the collected expressions, readings, glossary and tags are not yet
// converted into dbTerm values; the function currently always returns nil.
func (e *daijirinExtractor) extractTerms(entry epwingEntry) []dbTerm {
	var expressions, readings, glossary, tags []string

	// FindStringSubmatch returns nil when there is no match, so the groups may
	// only be indexed after checking the result.
	if matches := e.partsExp.FindStringSubmatch(entry.Heading); matches != nil {
		for i, name := range e.partsExp.SubexpNames() {
			// Index 0 is the whole match; an empty group carries no data.
			if i == 0 || len(matches[i]) == 0 {
				continue
			}

			value := matches[i]
			switch name {
			case "expression":
				expression := e.annotExp.ReplaceAllLiteralString(value, "")
				// NOTE(review): the separator is the empty string, which makes
				// Split break the value after every UTF-8 sequence — confirm
				// the literal has not lost its separator character.
				for _, split := range strings.Split(expression, ``) {
					splitInc := e.variantExp.ReplaceAllString(split, "$1")
					expressions = append(expressions, splitInc)
					if split != splitInc {
						splitExc := e.variantExp.ReplaceAllLiteralString(split, "")
						expressions = append(expressions, splitExc)
					}
				}
			case "reading":
				reading := e.phonExp.ReplaceAllLiteralString(value, "")
				readings = append(readings, reading)
			}
		}
	}

	for i, split := range strings.Split(entry.Text, "\n") {
		if i == 0 {
			// matches[1] only exists when annotExp matched and has a capture
			// group, hence the length check is against 1, not 0.
			if matches := e.annotExp.FindStringSubmatch(split); len(matches) > 1 {
				// NOTE(review): empty separator again, see above.
				tags = append(tags, strings.Split(matches[1], ``)...)
			}
		}

		glossary = append(glossary, split)
	}

	return nil
}
// extractKanji produces no kanji records: it ignores the entry it is given
// and always returns a nil slice.
func (e *daijirinExtractor) extractKanji(entry epwingEntry) []dbKanji {
	var none []dbKanji
	return none
}
func exportEpwingDb(outputDir, title string, reader io.Reader, flags int) error {
data, err := ioutil.ReadAll(reader)
if err != nil {