1
This commit is contained in:
Alex Yatskov 2016-12-13 13:29:05 -08:00
parent 63c8268fcc
commit 8faa2c3354

View File

@ -24,9 +24,11 @@ package main
import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"regexp"
"strings"
)
type epwingEntry struct {
@ -46,36 +48,62 @@ type epwingBook struct {
Subbooks []epwingSubbook `json:"subbooks"`
}
// 3934
// (?P<kana>[^(【<]+)(?:【(?P<kanji>.*)】)?(?:<\?>(?P<native>.*)<\?>)?(?:(?P<tag>.*))?
// (?P<kana>[^(【〖]+)(?:【(?P<expression>.*)】)?(?:〖(?P<native>.*)〗)?(?:(?P<tag>.*))?
// "heading": "きれ‐あが・る【切れ上がる】",
// "text": "きれ‐あが・る【切れ上がる】\n動ラ五上の方まで切れる。また、目尻や額の生え際などが上の方へ上がっている。「―・った目元」\n"
// },
// {
// "heading": "きれ‐あじ【切れ味】‐あぢ",
// "text": "きれ‐あじ【切れ味】‐あぢ\n<?>刃物の切れぐあい。「―のいいナイフ」<?>才能・技などの鋭さ。「鋭い―の批評」「―のいいショット」\n"
// },
// {
// "heading": "き‐れい【×綺麗・奇麗】",
// "text": "き‐れい【×綺麗・奇麗】\n形動<?>[ナリ]<?>色・形などが華やかな美しさをもっているさま。「―な花」「―に着飾る」<?>姿・顔かたちが整っていて美しいさま。「―な脚」「―な女性」<?>声などが快く聞こえるさま。「―な発音」<?>よごれがなく清潔なさま。「手を―に洗う」「―な空気」「―な選挙」<?>男女間に肉体的な交渉がないさま。清純。「―な関係」<?>乱れたところがないさま。整然としているさま。「机の上を―に片づける」<?>(「きれいに」の形で)残りなく物事が行われるさま。すっかり。「―に忘れる」「―にたいらげる」→美しい[用法]\n派生きれいさ\n類語<?>)美しい・美美(びび)しい・煌(きら)やか・鮮やか・美麗・華麗・華美・鮮麗・流麗・優美・美的/(<?>)麗(うるわ)しい・見目よい・端整・端麗・秀麗・佳麗(かれい)・艶美(えんび)・艶麗(えんれい)・あでやか/(<?>)清い・清らか・清潔・清浄(せいじよう・しようじよう)・清澄・清冽(せいれつ)・無垢(むく)・純潔・潔白(けつぱく)\n"
// },
type epwingExtractor interface {
extractTerms(entry epwingEntry) []dbTerm
extractKanji(entry epwingEntry) []dbKanji
}
func extractDaijirinTerms(entry epwingEntry) []dbTerm {
exp := regexp.MustCompile(`(?P<kana>[^(【〖]+)(?:【(?P<expression>.*)】)?(?:〖(?P<native>.*)〗)?(?:(?P<tag>.*))?`)
matches := exp.FindStringSubmatch(entry.Heading)
type daijirinExtractor struct {
partsExp *regexp.Regexp
phonExp *regexp.Regexp
variantExp *regexp.Regexp
annotExp *regexp.Regexp
}
results := make(map[string]string)
for i, name := range exp.SubexpNames() {
if i > 0 {
results[name] = matches[i]
func makeDaijirinExtractor() epwingExtractor {
return &daijirinExtractor{
partsExp: regexp.MustCompile(`(?P<reading>[^(【〖]+)(?:【(?P<expression>.*)】)?(?:〖(?P<native>.*)〗)?(?:(?P<tag>.*))?`),
phonExp: regexp.MustCompile(`[-・]+`),
variantExp: regexp.MustCompile(`\((.*)\)`),
annotExp: regexp.MustCompile(`.*`),
}
}
func (e *daijirinExtractor) extractTerms(entry epwingEntry) []dbTerm {
var readings []string
var expressions []string
matches := e.partsExp.FindStringSubmatch(entry.Heading)
for i, name := range e.partsExp.SubexpNames() {
value := matches[i]
if i == 0 || len(value) == 0 {
continue
}
switch name {
case "expression":
expression := e.annotExp.ReplaceAllLiteralString(value, "")
for _, split := range strings.Split(expression, ``) {
splitInc := e.variantExp.ReplaceAllString(split, "$1")
expressions = append(expressions, splitInc)
if split != splitInc {
splitExc := e.variantExp.ReplaceAllLiteralString(split, "")
expressions = append(expressions, splitExc)
}
}
case "reading":
reading := e.phonExp.ReplaceAllLiteralString(value, "")
readings = append(readings, reading)
}
}
fmt.Printf("%q\n", expressions)
fmt.Printf("%q\n", readings)
return nil
}
func extractDaijirinKanji(entry epwingEntry) []dbKanji {
func (e *daijirinExtractor) extractKanji(entry epwingEntry) []dbKanji {
return nil
}
@ -90,26 +118,18 @@ func exportEpwingDb(outputDir, title string, reader io.Reader, flags int) error
return err
}
termExtractors := map[string]func(epwingEntry) []dbTerm{
"三省堂 スーパー大辞林": extractDaijirinTerms,
epwingExtractors := map[string]epwingExtractor{
"三省堂 スーパー大辞林": makeDaijirinExtractor(),
}
var terms dbTermList
for _, subbook := range book.Subbooks {
if extractor, ok := termExtractors[subbook.Title]; ok {
for _, entry := range subbook.Entries {
terms = append(terms, extractor(entry)...)
}
}
}
kanjiExtractors := map[string]func(epwingEntry) []dbKanji{}
var kanji dbKanjiList
for _, subbook := range book.Subbooks {
if extractor, ok := kanjiExtractors[subbook.Title]; ok {
if extractor, ok := epwingExtractors[subbook.Title]; ok {
for _, entry := range subbook.Entries {
kanji = append(kanji, extractor(entry)...)
terms = append(terms, extractor.extractTerms(entry)...)
kanji = append(kanji, extractor.extractKanji(entry)...)
}
}
}